r53363 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r53362‎ | r53363 | r53364 >
Date:16:11, 16 July 2009
Author:daniel
Status:deferred
Tags:
Comment:
support {{catmore}}
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiPage.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateParameterValueExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiPage.java
@@ -74,6 +74,8 @@
7575
7676 public WikiLink getRedirect();
7777
 78+ public WikiLink getAliasFor();
 79+
7880 public CharSequence getTitleSuffix();
7981
8082 public CharSequence getTitlePrefix();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java
@@ -103,6 +103,11 @@
104104 /**
105105 * List of extractors for determining alias targets (the page this page is an alias for).
106106 */
 107+ public List<ValueExtractor> aliasExtractors = new ArrayList<ValueExtractor>();
 108+
 109+ /**
 110+ * List of extractors for determining redirect targets.
 111+ */
107112 public List<ValueExtractor> redirectExtractors = new ArrayList<ValueExtractor>();
108113
109114 /**
@@ -286,7 +291,7 @@
287292 protected String wikiName;
288293
289294 public Set<Integer> conceptNamespacecs = new HashSet<Integer>();
290 -
 295+
291296 protected WikiConfiguration() {
292297 this(null);
293298 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateParameterValueExtractor.java
@@ -7,6 +7,7 @@
88 import de.brightbyte.data.MultiMap;
99 import de.brightbyte.wikiword.analyzer.AnalyzerUtils;
1010 import de.brightbyte.wikiword.analyzer.WikiPage;
 11+import de.brightbyte.wikiword.analyzer.mangler.Mangler;
1112 import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher;
1213 import de.brightbyte.wikiword.analyzer.matcher.NameMatcher;
1314 import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher;
@@ -18,6 +19,7 @@
1920 protected NameMatcher template;
2021 protected String parameter;
2122 protected String prefix = null;
 23+ protected Mangler mangler = null;
2224
2325 public TemplateParameterValueExtractor(String template, int flags, String parameter) {
2426 this(new ExactNameMatcher(template), parameter);
@@ -40,6 +42,11 @@
4143 this.parameter = parameter;
4244 }
4345
 46+ public TemplateParameterValueExtractor setManger(Mangler m) {
 47+ mangler = m;
 48+ return this;
 49+ }
 50+
4451 public Set<CharSequence> extract(WikiPage page, Set<CharSequence> into) {
4552 MultiMap<String, TemplateData, List<TemplateData>> tpl = page.getTemplates();
4653
@@ -47,7 +54,10 @@
4855 for (TemplateData m: list) {
4956 CharSequence v = m.getParameter(parameter);
5057 if (prefix!=null) v = prefix+v;
51 - if (v!=null) AnalyzerUtils.addToSet(into, v);
 58+ if (v!=null) {
 59+ if (mangler!=null) v = mangler.mangle(v);
 60+ AnalyzerUtils.addToSet(into, v);
 61+ }
5262 }
5363 }
5464
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java
@@ -446,8 +446,11 @@
447447
448448 protected Set<CharSequence> titleTerms = null;
449449 protected Set<CharSequence> pageTerms = null;
 450+
450451 protected WikiLink redirect = null;
451452 protected boolean redirectKnown = false;
 453+ protected WikiLink aliasFor = null;
 454+ protected boolean aliasForKnown = false;
452455
453456 protected CharSequence cleaned = null;
454457 protected CharSequence flat = null;
@@ -461,7 +464,7 @@
462465 protected MultiMap<String, CharSequence, Set<CharSequence>> properties = null;
463466 protected Set<CharSequence> supplementLinks = null;
464467 protected Holder<CharSequence> supplementedConcept = null;
465 - protected List<WikiLink> links = null;
 468+ protected List<WikiLink> links = null;
466469 protected List<WikiLink> disambig = null;
467470 protected Set<String> categories = null;
468471 protected Set<String> sections = null;
@@ -832,6 +835,18 @@
833836 }
834837
835838 /* (non-Javadoc)
 839+ * @see de.brightbyte.wikiword.analyzer.WikiPage#getAliasFor()
 840+ */
 841+ public WikiLink getAliasFor() {
 842+ if (!aliasForKnown) {
 843+ aliasFor = extractRedirectLink( this );
 844+ aliasForKnown = true;
 845+ }
 846+
 847+ return aliasFor;
 848+ }
 849+
 850+ /* (non-Javadoc)
836851 * @see de.brightbyte.wikiword.analyzer.WikiPage#getTitleSuffix()
837852 */
838853 public CharSequence getTitleSuffix() {
@@ -1070,6 +1085,7 @@
10711086 config.propertyExtractors,
10721087 config.pageTermExtractors,
10731088 config.redirectExtractors,
 1089+ config.aliasExtractors,
10741090 extraTemplateUsers
10751091 );
10761092
@@ -1293,6 +1309,15 @@
12941310
12951311 return makeLink(page.getName(), target, null, null);
12961312 }
 1313+
 1314+ protected WikiLink extractAliasLink(WikiPage page) {
 1315+ Set<CharSequence> t = evalExtractors(config.aliasExtractors, page);
 1316+ if (t==null || t.isEmpty()) return null;
 1317+
 1318+ CharSequence target = t.iterator().next(); //first item
 1319+
 1320+ return makeLink(page.getName(), target, null, null);
 1321+ }
12971322
12981323 /** Link targets in MediaWiki may be given in url-encoded form, that is,
12991324 * using codes like %3A for : (Colon).
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java
@@ -276,6 +276,10 @@
277277 } else {
278278 int conceptId = store.storeAbout(rcId, rcName, name);
279279
 280+ //if the cat page contains a reference to the main topic page, store it.
 281+ WikiTextAnalyzer.WikiLink aliasFor = analyzerPage.getAliasFor();
 282+ storeConceptAlias(rcId, conceptId, name, -1, aliasFor.getTargetConcept().toString(), AliasScope.CATEGORY);
 283+
280284 List<WikiTextAnalyzer.WikiLink> links = analyzerPage.getLinks();
281285 linkTracker.step(links.size());
282286
@@ -293,7 +297,6 @@
294298 }
295299 }
296300
297 -
298301 //TODO: langlinks from category!
299302 // need resolve-ids on langling, then!
300303 // beware aliased categories!
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java
@@ -5,6 +5,7 @@
66 import de.brightbyte.wikiword.ConceptType;
77 import de.brightbyte.wikiword.ResourceType;
88 import de.brightbyte.wikiword.analyzer.WikiConfiguration;
 9+import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterValueExtractor;
910 import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
1011 import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor;
1112 import de.brightbyte.wikiword.analyzer.sensor.HasCategorySensor;
@@ -178,6 +179,10 @@
179180 disambigStripSectionPattern = sectionPattern("See also", 0);
180181 //FIXME: disambig pages marked with {{shipindex}} are tabular!
181182
 183+ aliasExtractors.add( new TemplateParameterValueExtractor("Catmore?", 0, "1") ); //FIXME: testme
 184+ aliasExtractors.add( new TemplateParameterValueExtractor("Catmore1", 0, "1").setManger( new RegularExpressionMangler("^.*\\[\\[ *(.+?) *(\\||\\]\\])", "$1", 0) ) );
 185+ //TODO: Catmoresub
 186+
182187 useCategoryAliases = true; //enwiki uses plural category names. resolve them.
183188 }
184189

Status & tagging log