r53362 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r53361‎ | r53362 | r53363 >
Date:15:29, 16 July 2009
Author:daniel
Status:deferred
Tags:
Comment:
sorting out target resource vs. target concept. ugh, what a mess.
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java
@@ -3,9 +3,11 @@
44 import java.io.IOException;
55 import java.util.ArrayList;
66 import java.util.HashMap;
 7+import java.util.HashSet;
78 import java.util.List;
89 import java.util.Map;
910 import java.util.Properties;
 11+import java.util.Set;
1012 import java.util.regex.Pattern;
1113
1214 import de.brightbyte.data.measure.Measure;
@@ -282,6 +284,8 @@
283285 protected WikiTextAnalyzer analyzer;
284286
285287 protected String wikiName;
 288+
 289+ public Set<Integer> conceptNamespacecs = new HashSet<Integer>();
286290
287291 protected WikiConfiguration() {
288292 this(null);
@@ -456,6 +460,9 @@
457461
458462 String category_redirect_templates = pyBotFamily.getProperty("category_redirect_templates");
459463 if (category_redirect_templates!=null) this.redirectExtractors.add( new TemplateParameterValueExtractor(category_redirect_templates.replace(' ', '_'), Pattern.CASE_INSENSITIVE, "1").setPrefix("Category:") );
 464+
 465+ conceptNamespacecs.add(Namespace.MAIN);
 466+ conceptNamespacecs.add(Namespace.CATEGORY);
460467 }
461468
462469 public void attach(WikiTextAnalyzer analyzer) {
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java
@@ -254,6 +254,8 @@
255255 private CharSequence interwiki;
256256 private int namespace;
257257 private CharSequence target;
 258+ private CharSequence targetConcept;
 259+ private CharSequence targetConceptPage;
258260 private CharSequence title;
259261 private CharSequence section;
260262 private CharSequence text;
@@ -399,7 +401,29 @@
400402 return false;
401403 return true;
402404 }
403 -
 405+
 406+ public CharSequence getTargetConcept() {
 407+ if (targetConcept==null) {
 408+ targetConcept = getTargetConceptPage();
 409+ if (section!=null) targetConcept = targetConcept + "#" + section;
 410+ }
 411+
 412+ return targetConcept;
 413+ }
 414+
 415+ public CharSequence getTargetConceptPage() {
 416+ if (targetConceptPage==null) {
 417+ if (namespace!=Namespace.MAIN && !isConceptNamespace(namespace)) {
 418+ targetConceptPage = target;
 419+ int idx = StringUtils.indexOf('#', targetConceptPage);
 420+ if (idx>=0) targetConceptPage= targetConceptPage.subSequence(0, idx);
 421+ } else {
 422+ targetConceptPage = title;
 423+ }
 424+ }
 425+
 426+ return targetConceptPage;
 427+ }
404428 }
405429
406430 protected class Page implements WikiPage {
@@ -1891,6 +1915,15 @@
18921916 return mainArtikeMarkerMatcher.matches();
18931917 }
18941918
 1919+ /**
 1920+ * returns true if the given namespace ID identifies a namespace
 1921+ * that contains pages about concepts. This is usually the main namespace
 1922+ * and the category namespace.
 1923+ */
 1924+ public boolean isConceptNamespace(int ns) {
 1925+ return config.conceptNamespacecs.contains(ns);
 1926+ }
 1927+
18951928 public boolean mayBeFormOf(CharSequence form, CharSequence base) {
18961929 form = AnalyzerUtils.toLowerCase(form);
18971930 base = AnalyzerUtils.toLowerCase(base);
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java
@@ -210,9 +210,11 @@
211211 if (m==WikiTextAnalyzer.LinkMagic.NONE) {
212212 if (link.getNamespace()!=Namespace.MAIN) continue;
213213 if (link.getInterwiki()!=null) continue;
 214+
 215+ String tgt = link.getTargetConcept().toString();
214216
215 - storeReference(rcId, link.getText().toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK);
216 - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString());
 217+ storeReference(rcId, link.getText().toString(), -1, tgt, ExtractionRule.TERM_FROM_LINK);
 218+ if (link.getSection()!=null) storeSection(rcId, tgt, link.getTargetPage().toString());
217219 }
218220 }
219221 }
@@ -224,9 +226,11 @@
225227 if (m==WikiTextAnalyzer.LinkMagic.NONE) {
226228 if (link.getNamespace()!=Namespace.MAIN && link.getNamespace()!=Namespace.CATEGORY) continue;
227229 if (link.getInterwiki()!=null) continue;
 230+
 231+ String tgt = link.getTargetConcept().toString();
228232
229 - storeLink(rcId, conceptId, conceptName, link.getText().toString(), link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK);
230 - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString());
 233+ storeLink(rcId, conceptId, conceptName, link.getText().toString(), tgt, ExtractionRule.TERM_FROM_LINK);
 234+ if (link.getSection()!=null) storeSection(rcId, tgt, link.getTargetConceptPage().toString());
231235 }
232236 }
233237 }
@@ -280,7 +284,7 @@
281285
282286 if (m==WikiTextAnalyzer.LinkMagic.CATEGORY) {
283287 //FIXME: store this also as a reference to the category's concept under its original title!
284 - storeConceptBroader(rcId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT);
 288+ storeConceptBroader(rcId, name, link.getTitle().toString(), ExtractionRule.BROADER_FROM_CAT);
285289 }
286290
287291 if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) {
@@ -375,7 +379,7 @@
376380 //NOTE: the alias is preliminary: if a article with the name of the category
377381 // exists, the alias will be ignored. See DatabaseLocalConceptBuilder.finishBadLinks
378382
379 - storeConceptAlias(rcId, -1, link.getTarget().toString(), conceptId, name, AliasScope.CATEGORY);
 383+ storeConceptAlias(rcId, -1, link.getTargetConcept().toString(), conceptId, name, AliasScope.CATEGORY);
380384 categorize = false;
381385 }
382386 }
@@ -388,8 +392,10 @@
389393 //XXX: if {{DEFAULTSORT}} is handled for PageTerms, apply for each category again?
390394 storeReference(rcId, sortKey, conceptId, name, ExtractionRule.TERM_FROM_SORTKEY); //sort key is a name for this page
391395 }
392 -
393 - storeConceptBroader(rcId, conceptId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT);
 396+
 397+ if (!StringUtils.equals(link.getTitle(),name) ) {
 398+ storeConceptBroader(rcId, conceptId, name, link.getTitle().toString(), ExtractionRule.BROADER_FROM_CAT);
 399+ }
394400 }
395401 }
396402 else if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) {
@@ -414,11 +420,11 @@
415421 WikiTextAnalyzer.LinkMagic m = link.getMagic();
416422
417423 if (m==WikiTextAnalyzer.LinkMagic.NONE) {
418 - if (link.getNamespace()!=Namespace.MAIN) continue;
 424+ if (!analyzer.isConceptNamespace(link.getNamespace())) continue;
419425 if (link.getInterwiki()!=null) continue;
420426
421427 for (CharSequence term : terms) {
422 - storeReference(rcId, term.toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_DISAMBIG);
 428+ storeReference(rcId, term.toString(), -1, link.getTitle().toString(), ExtractionRule.TERM_FROM_DISAMBIG);
423429 }
424430 }
425431 }
@@ -452,8 +458,9 @@
453459 String name = analyzerPage.getConceptName();
454460 String rcName = analyzerPage.getResourceName();
455461 String text = analyzerPage.getText().toString();
456 -
 462+
457463 WikiTextAnalyzer.WikiLink link = analyzerPage.getRedirect();
 464+ String tgtConcept = link.getTargetConcept().toString();
458465
459466 int conceptId = 0;
460467
@@ -465,34 +472,39 @@
466473 out.info("skipped interwiki redirect "+rcName+" -> "+link);
467474 }
468475 else if (link.getNamespace()!=analyzerPage.getNamespace()) {
469 - if (link.getNamespace()==Namespace.CATEGORY && analyzerPage.getNamespace()==Namespace.MAIN) {
470 - if ( StringUtils.equals(link.getTarget(), rcName) ) {
471 - out.debug("ignored redundant category redirect "+rcName+" -> "+link);
 476+ if ( analyzer.isConceptNamespace(link.getNamespace()) ) {
 477+ if ( StringUtils.equals(tgtConcept, name) ) {
 478+ out.debug("ignored redundant inter-namespace redirect "+rcName+" -> "+link);
472479 } else {
473480 out.debug("processing redirect to category "+rcName+" -> "+link);
474 - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT );
475 - String tgtConcept = link.getTarget().toString();
476481
 482+ storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, tgtConcept, ExtractionRule.TERM_FROM_REDIRECT );
 483+
477484 if (!name.equals(tgtConcept)) {
478485 conceptId = store.storeAbout(rcId, rcName, name);
479486 storeConceptAlias(rcId, conceptId, name, -1, tgtConcept, AliasScope.REDIRECT);
480487 } else {
481 - out.debug("skipping redirect to category with the same name");
 488+ out.debug("skipping inter-namespace redirect to page with the same title");
482489 }
483490 }
484491 } else {
485492 warn(rcId, "bad redirect (inter-namespace)", rcName+" -> "+link, null);
486493 }
487494 }
488 - else if (rcName.equals(link.getTarget().toString())) {
489 - warn(rcId, "bad redirect (self-link)", "page "+name, null);
 495+ else if (StringUtils.equals(rcName, link.getTarget().toString())) {
 496+ warn(rcId, "bad redirect (self-link)", "page "+rcName, null);
490497 }
491 - else {
492 - conceptId = store.storeAbout(rcId, rcName, name);
493 - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT );
494 - storeConceptAlias(rcId, conceptId, name, -1, link.getTarget().toString(), AliasScope.REDIRECT);
495 -
496 - //FIXME: redir to section!
 498+ else if ( analyzer.isConceptNamespace(link.getNamespace()) ) {
 499+ if (StringUtils.equals(name, tgtConcept)) {
 500+ warn(rcId, "bad redirect (self-link)", "page "+rcName, null);
 501+ } else {
 502+ conceptId = store.storeAbout(rcId, rcName, name);
 503+ storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, tgtConcept, ExtractionRule.TERM_FROM_REDIRECT );
 504+ storeConceptAlias(rcId, conceptId, name, -1, tgtConcept, AliasScope.REDIRECT);
 505+ if (link.getSection()!=null) storeSection(rcId, link.getTargetConcept().toString(), link.getTargetConceptPage().toString());
 506+ }
 507+ } else if (link.getInterwiki()!=null ) {
 508+ out.info("skipped uninterresting redirect "+rcName+" -> "+link);
497509 }
498510
499511 return conceptId;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java
@@ -17,6 +17,8 @@
1818 public WikiConfiguration_enwiki() {
1919 super();
2020
 21+ //conceptNamespacecs.add(Namespace.PORTAL); //FIXME: how to add portal namespace?!
 22+
2123 /*
2224 stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("^\\{\\{(wrapper)\\s*(\\|[^\\}\\r\\n]*)?\\}\\}\\s*$", "{|", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE));
2325 stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("^\\{\\{(end|col-end)\\s*\\}\\}\\s*$", "|}", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE));
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java
@@ -16,6 +16,8 @@
1717 public WikiConfiguration_dewiki() {
1818 super();
1919
 20+ //conceptNamespacecs.add(Namespace.PORTAL); //FIXME: how to add portal namespace?!
 21+
2022 /*
2123 stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("\\{\\{[Oo]kina\\}\\}", "\u02BB", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE));
2224 stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("\\{\\{\\s*(IPA(?:-Text)|IAST|Unicode|Musik)\\s*\\|\\s*([^|}]+)\\s*(\\|.*?)?\\s*(\\|.*?)?\\}\\}", "$2", Pattern.CASE_INSENSITIVE));

Status & tagging log