Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiPage.java |
— | — | @@ -74,6 +74,8 @@ |
75 | 75 | |
76 | 76 | public WikiLink getRedirect(); |
77 | 77 | |
| 78 | + public WikiLink getAliasFor(); |
| 79 | + |
78 | 80 | public CharSequence getTitleSuffix(); |
79 | 81 | |
80 | 82 | public CharSequence getTitlePrefix(); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java |
— | — | @@ -103,6 +103,11 @@ |
104 | 104 | /** |
105 | 105 | * List of extractors for determining alias targets, i.e. the page a page is an alias for. |
106 | 106 | */ |
| 107 | + public List<ValueExtractor> aliasExtractors = new ArrayList<ValueExtractor>(); |
| 108 | + |
| 109 | + /** |
| 110 | + * List of extractors for determining redirect targets. |
| 111 | + */ |
107 | 112 | public List<ValueExtractor> redirectExtractors = new ArrayList<ValueExtractor>(); |
108 | 113 | |
109 | 114 | /** |
— | — | @@ -286,7 +291,7 @@ |
287 | 292 | protected String wikiName; |
288 | 293 | |
289 | 294 | public Set<Integer> conceptNamespacecs = new HashSet<Integer>(); |
290 | | - |
| 295 | + |
291 | 296 | protected WikiConfiguration() { |
292 | 297 | this(null); |
293 | 298 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateParameterValueExtractor.java |
— | — | @@ -7,6 +7,7 @@ |
8 | 8 | import de.brightbyte.data.MultiMap; |
9 | 9 | import de.brightbyte.wikiword.analyzer.AnalyzerUtils; |
10 | 10 | import de.brightbyte.wikiword.analyzer.WikiPage; |
| 11 | +import de.brightbyte.wikiword.analyzer.mangler.Mangler; |
11 | 12 | import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher; |
12 | 13 | import de.brightbyte.wikiword.analyzer.matcher.NameMatcher; |
13 | 14 | import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher; |
— | — | @@ -18,6 +19,7 @@ |
19 | 20 | protected NameMatcher template; |
20 | 21 | protected String parameter; |
21 | 22 | protected String prefix = null; |
| 23 | + protected Mangler mangler = null; |
22 | 24 | |
23 | 25 | public TemplateParameterValueExtractor(String template, int flags, String parameter) { |
24 | 26 | this(new ExactNameMatcher(template), parameter); |
— | — | @@ -40,6 +42,11 @@ |
41 | 43 | this.parameter = parameter; |
42 | 44 | } |
43 | 45 | |
| 46 | + public TemplateParameterValueExtractor setManger(Mangler m) { //NOTE(review): name looks like a typo for "setMangler"; kept because callers use it |
| 47 | + this.mangler = m; //applied to each extracted value in extract(); null disables mangling |
| 48 | + return this; //returns this extractor so calls can be chained |
| 49 | + } |
| 50 | + |
44 | 51 | public Set<CharSequence> extract(WikiPage page, Set<CharSequence> into) { |
45 | 52 | MultiMap<String, TemplateData, List<TemplateData>> tpl = page.getTemplates(); |
46 | 53 | |
— | — | @@ -47,7 +54,10 @@ |
48 | 55 | for (TemplateData m: list) { |
49 | 56 | CharSequence v = m.getParameter(parameter); |
50 | 57 | if (prefix!=null && v!=null) v = prefix+v; //only prefix real values; prefix+null would yield "<prefix>null" |
51 | | - if (v!=null) AnalyzerUtils.addToSet(into, v); |
| 58 | + if (v!=null) { |
| 59 | + if (mangler!=null) v = mangler.mangle(v); //optional post-processing of the raw parameter value |
| 60 | + AnalyzerUtils.addToSet(into, v); |
| 61 | + } |
52 | 62 | } |
53 | 63 | } |
54 | 64 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java |
— | — | @@ -446,8 +446,11 @@ |
447 | 447 | |
448 | 448 | protected Set<CharSequence> titleTerms = null; |
449 | 449 | protected Set<CharSequence> pageTerms = null; |
| 450 | + |
450 | 451 | protected WikiLink redirect = null; |
451 | 452 | protected boolean redirectKnown = false; |
| 453 | + protected WikiLink aliasFor = null; |
| 454 | + protected boolean aliasForKnown = false; |
452 | 455 | |
453 | 456 | protected CharSequence cleaned = null; |
454 | 457 | protected CharSequence flat = null; |
— | — | @@ -461,7 +464,7 @@ |
462 | 465 | protected MultiMap<String, CharSequence, Set<CharSequence>> properties = null; |
463 | 466 | protected Set<CharSequence> supplementLinks = null; |
464 | 467 | protected Holder<CharSequence> supplementedConcept = null; |
465 | | - protected List<WikiLink> links = null; |
| 468 | + protected List<WikiLink> links = null; |
466 | 469 | protected List<WikiLink> disambig = null; |
467 | 470 | protected Set<String> categories = null; |
468 | 471 | protected Set<String> sections = null; |
— | — | @@ -832,6 +835,18 @@ |
833 | 836 | } |
834 | 837 | |
835 | 838 | /* (non-Javadoc) |
| 839 | + * @see de.brightbyte.wikiword.analyzer.WikiPage#getAliasFor() |
| 840 | + */ |
| 841 | + public WikiLink getAliasFor() { |
| 842 | + if (!aliasForKnown) { //evaluate lazily and cache the result |
| 843 | + aliasFor = extractAliasLink( this ); //uses config.aliasExtractors; null if no alias target is found |
| 844 | + aliasForKnown = true; //remember the result, even if it is null |
| 845 | + } |
| 846 | + |
| 847 | + return aliasFor; |
| 848 | + } |
| 849 | + |
| 850 | + /* (non-Javadoc) |
836 | 851 | * @see de.brightbyte.wikiword.analyzer.WikiPage#getTitleSuffix() |
837 | 852 | */ |
838 | 853 | public CharSequence getTitleSuffix() { |
— | — | @@ -1070,6 +1085,7 @@ |
1071 | 1086 | config.propertyExtractors, |
1072 | 1087 | config.pageTermExtractors, |
1073 | 1088 | config.redirectExtractors, |
| 1089 | + config.aliasExtractors, |
1074 | 1090 | extraTemplateUsers |
1075 | 1091 | ); |
1076 | 1092 | |
— | — | @@ -1293,6 +1309,15 @@ |
1294 | 1310 | |
1295 | 1311 | return makeLink(page.getName(), target, null, null); |
1296 | 1312 | } |
| 1313 | + |
| 1314 | + protected WikiLink extractAliasLink(WikiPage page) { |
| 1315 | + Set<CharSequence> targets = evalExtractors(config.aliasExtractors, page); |
| 1316 | + if (targets==null || targets.isEmpty()) return null; //no alias target found |
| 1317 | + |
| 1318 | + //only the first extracted value is used as the alias target |
| 1319 | + CharSequence first = targets.iterator().next(); |
| 1320 | + return makeLink(page.getName(), first, null, null); |
| 1321 | + } |
1297 | 1322 | |
1298 | 1323 | /** Link targets in MediaWiki may be given in url-encoded form, that is, |
1299 | 1324 | * using codes like %3A for : (Colon). |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java |
— | — | @@ -276,6 +276,10 @@ |
277 | 277 | } else { |
278 | 278 | int conceptId = store.storeAbout(rcId, rcName, name); |
279 | 279 | |
| 280 | + //if the cat page contains a reference to the main topic page, store it. |
| 281 | + WikiTextAnalyzer.WikiLink aliasFor = analyzerPage.getAliasFor(); //may be null if no main topic is referenced |
| 282 | + if (aliasFor!=null) storeConceptAlias(rcId, conceptId, name, -1, aliasFor.getTargetConcept().toString(), AliasScope.CATEGORY); |
| 283 | + |
280 | 284 | List<WikiTextAnalyzer.WikiLink> links = analyzerPage.getLinks(); |
281 | 285 | linkTracker.step(links.size()); |
282 | 286 | |
— | — | @@ -293,7 +297,6 @@ |
294 | 298 | } |
295 | 299 | } |
296 | 300 | |
297 | | - |
298 | 301 | //TODO: langlinks from category! |
299 | 302 | // need resolve-ids on langlink, then! |
300 | 303 | // beware aliased categories! |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -5,6 +5,7 @@ |
6 | 6 | import de.brightbyte.wikiword.ConceptType; |
7 | 7 | import de.brightbyte.wikiword.ResourceType; |
8 | 8 | import de.brightbyte.wikiword.analyzer.WikiConfiguration; |
| 9 | +import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterValueExtractor; |
9 | 10 | import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler; |
10 | 11 | import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor; |
11 | 12 | import de.brightbyte.wikiword.analyzer.sensor.HasCategorySensor; |
— | — | @@ -178,6 +179,10 @@ |
179 | 180 | disambigStripSectionPattern = sectionPattern("See also", 0); |
180 | 181 | //FIXME: disambig pages marked with {{shipindex}} are tabular! |
181 | 182 | |
| 183 | + aliasExtractors.add( new TemplateParameterValueExtractor("Catmore?", 0, "1") ); //FIXME: testme |
| 184 | + aliasExtractors.add( new TemplateParameterValueExtractor("Catmore1", 0, "1").setManger( new RegularExpressionMangler("^.*\\[\\[ *(.+?) *(\\||\\]\\])", "$1", 0) ) ); |
| 185 | + //TODO: Catmoresub |
| 186 | + |
182 | 187 | useCategoryAliases = true; //enwiki uses plural category names. resolve them. |
183 | 188 | } |
184 | 189 | |