Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java |
— | — | @@ -3,9 +3,11 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.HashMap; |
| 7 | +import java.util.HashSet; |
7 | 8 | import java.util.List; |
8 | 9 | import java.util.Map; |
9 | 10 | import java.util.Properties; |
| 11 | +import java.util.Set; |
10 | 12 | import java.util.regex.Pattern; |
11 | 13 | |
12 | 14 | import de.brightbyte.data.measure.Measure; |
— | — | @@ -282,6 +284,8 @@ |
283 | 285 | protected WikiTextAnalyzer analyzer; |
284 | 286 | |
285 | 287 | protected String wikiName; |
| 288 | + |
| 289 | + public Set<Integer> conceptNamespacecs = new HashSet<Integer>(); /* IDs of the namespaces treated as concept namespaces; WikiTextAnalyzer.isConceptNamespace() tests membership in this set */ |
286 | 290 | |
287 | 291 | protected WikiConfiguration() { |
288 | 292 | this(null); |
— | — | @@ -456,6 +460,9 @@ |
457 | 461 | |
458 | 462 | String category_redirect_templates = pyBotFamily.getProperty("category_redirect_templates"); |
459 | 463 | if (category_redirect_templates!=null) this.redirectExtractors.add( new TemplateParameterValueExtractor(category_redirect_templates.replace(' ', '_'), Pattern.CASE_INSENSITIVE, "1").setPrefix("Category:") ); |
| 464 | + |
| 465 | + conceptNamespacecs.add(Namespace.MAIN); |
| 466 | + conceptNamespacecs.add(Namespace.CATEGORY); |
460 | 467 | } |
461 | 468 | |
462 | 469 | public void attach(WikiTextAnalyzer analyzer) { |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java |
— | — | @@ -254,6 +254,8 @@ |
255 | 255 | private CharSequence interwiki; |
256 | 256 | private int namespace; |
257 | 257 | private CharSequence target; |
| 258 | + private CharSequence targetConcept; |
| 259 | + private CharSequence targetConceptPage; |
258 | 260 | private CharSequence title; |
259 | 261 | private CharSequence section; |
260 | 262 | private CharSequence text; |
— | — | @@ -399,7 +401,29 @@ |
400 | 402 | return false; |
401 | 403 | return true; |
402 | 404 | } |
403 | | - |
| 405 | + |
| 406 | + public CharSequence getTargetConcept() { /* Returns the concept this link points to: the value of getTargetConceptPage(), followed by "#" and the section when a section is set. */ |
| 407 | + if (targetConcept==null) { /* computed on the first call, then kept in the targetConcept field */ |
| 408 | + targetConcept = getTargetConceptPage(); |
| 409 | + if (section!=null) targetConcept = targetConcept + "#" + section; /* string concatenation: the cached value becomes a String here */ |
| 410 | + } |
| 411 | + |
| 412 | + return targetConcept; |
| 413 | + } |
| 414 | + |
| 415 | + public CharSequence getTargetConceptPage() { /* Returns the page part of the link's concept, without any section suffix: the title for links into the main or another concept namespace, otherwise the target cut before '#'. */ |
| 416 | + if (targetConceptPage==null) { /* computed on the first call, then kept in the targetConceptPage field */ |
| 417 | + if (namespace!=Namespace.MAIN && !isConceptNamespace(namespace)) { /* link into a namespace that is neither MAIN nor registered as a concept namespace */ |
| 418 | + targetConceptPage = target; |
| 419 | + int idx = StringUtils.indexOf('#', targetConceptPage); /* presumably the position of the first '#', negative if absent - confirm against StringUtils */ |
| 420 | + if (idx>=0) targetConceptPage= targetConceptPage.subSequence(0, idx); /* keep only the part before idx */ |
| 421 | + } else { /* MAIN or another concept namespace: use the title field as is */ |
| 422 | + targetConceptPage = title; |
| 423 | + } |
| 424 | + } |
| 425 | + |
| 426 | + return targetConceptPage; |
| 427 | + } |
404 | 428 | } |
405 | 429 | |
406 | 430 | protected class Page implements WikiPage { |
— | — | @@ -1891,6 +1915,15 @@ |
1892 | 1916 | return mainArtikeMarkerMatcher.matches(); |
1893 | 1917 | } |
1894 | 1918 | |
| 1919 | + /** |
| 1920 | + * Returns true if the given namespace ID identifies a namespace |
| 1921 | + * that contains pages about concepts. This is usually the main namespace |
| 1922 | + * and the category namespace. |
| 1923 | + */ |
| 1924 | + public boolean isConceptNamespace(int ns) { |
| 1925 | + return config.conceptNamespacecs.contains(ns); /* membership test against the set held by the configuration; ns is presumably auto-boxed to Integer for the lookup */ |
| 1926 | + } |
| 1927 | + |
1895 | 1928 | public boolean mayBeFormOf(CharSequence form, CharSequence base) { |
1896 | 1929 | form = AnalyzerUtils.toLowerCase(form); |
1897 | 1930 | base = AnalyzerUtils.toLowerCase(base); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java |
— | — | @@ -210,9 +210,11 @@ |
211 | 211 | if (m==WikiTextAnalyzer.LinkMagic.NONE) { |
212 | 212 | if (link.getNamespace()!=Namespace.MAIN) continue; |
213 | 213 | if (link.getInterwiki()!=null) continue; |
| 214 | + |
| 215 | + String tgt = link.getTargetConcept().toString(); |
214 | 216 | |
215 | | - storeReference(rcId, link.getText().toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK); |
216 | | - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString()); |
| 217 | + storeReference(rcId, link.getText().toString(), -1, tgt, ExtractionRule.TERM_FROM_LINK); |
| 218 | + if (link.getSection()!=null) storeSection(rcId, tgt, link.getTargetPage().toString()); |
217 | 219 | } |
218 | 220 | } |
219 | 221 | } |
— | — | @@ -224,9 +226,11 @@ |
225 | 227 | if (m==WikiTextAnalyzer.LinkMagic.NONE) { |
226 | 228 | if (link.getNamespace()!=Namespace.MAIN && link.getNamespace()!=Namespace.CATEGORY) continue; |
227 | 229 | if (link.getInterwiki()!=null) continue; |
| 230 | + |
| 231 | + String tgt = link.getTargetConcept().toString(); |
228 | 232 | |
229 | | - storeLink(rcId, conceptId, conceptName, link.getText().toString(), link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK); |
230 | | - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString()); |
| 233 | + storeLink(rcId, conceptId, conceptName, link.getText().toString(), tgt, ExtractionRule.TERM_FROM_LINK); |
| 234 | + if (link.getSection()!=null) storeSection(rcId, tgt, link.getTargetConceptPage().toString()); |
231 | 235 | } |
232 | 236 | } |
233 | 237 | } |
— | — | @@ -280,7 +284,7 @@ |
281 | 285 | |
282 | 286 | if (m==WikiTextAnalyzer.LinkMagic.CATEGORY) { |
283 | 287 | //FIXME: store this also as a reference to the category's concept under its original title! |
284 | | - storeConceptBroader(rcId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT); |
| 288 | + storeConceptBroader(rcId, name, link.getTitle().toString(), ExtractionRule.BROADER_FROM_CAT); |
285 | 289 | } |
286 | 290 | |
287 | 291 | if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) { |
— | — | @@ -375,7 +379,7 @@ |
376 | 380 | //NOTE: the alias is preliminary: if an article with the name of the category |
377 | 381 | // exists, the alias will be ignored. See DatabaseLocalConceptBuilder.finishBadLinks |
378 | 382 | |
379 | | - storeConceptAlias(rcId, -1, link.getTarget().toString(), conceptId, name, AliasScope.CATEGORY); |
| 383 | + storeConceptAlias(rcId, -1, link.getTargetConcept().toString(), conceptId, name, AliasScope.CATEGORY); |
380 | 384 | categorize = false; |
381 | 385 | } |
382 | 386 | } |
— | — | @@ -388,8 +392,10 @@ |
389 | 393 | //XXX: if {{DEFAULTSORT}} is handled for PageTerms, apply for each category again? |
390 | 394 | storeReference(rcId, sortKey, conceptId, name, ExtractionRule.TERM_FROM_SORTKEY); //sort key is a name for this page |
391 | 395 | } |
392 | | - |
393 | | - storeConceptBroader(rcId, conceptId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT); |
| 396 | + |
| 397 | + if (!StringUtils.equals(link.getTitle(),name) ) { |
| 398 | + storeConceptBroader(rcId, conceptId, name, link.getTitle().toString(), ExtractionRule.BROADER_FROM_CAT); |
| 399 | + } |
394 | 400 | } |
395 | 401 | } |
396 | 402 | else if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) { |
— | — | @@ -414,11 +420,11 @@ |
415 | 421 | WikiTextAnalyzer.LinkMagic m = link.getMagic(); |
416 | 422 | |
417 | 423 | if (m==WikiTextAnalyzer.LinkMagic.NONE) { |
418 | | - if (link.getNamespace()!=Namespace.MAIN) continue; |
| 424 | + if (!analyzer.isConceptNamespace(link.getNamespace())) continue; |
419 | 425 | if (link.getInterwiki()!=null) continue; |
420 | 426 | |
421 | 427 | for (CharSequence term : terms) { |
422 | | - storeReference(rcId, term.toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_DISAMBIG); |
| 428 | + storeReference(rcId, term.toString(), -1, link.getTitle().toString(), ExtractionRule.TERM_FROM_DISAMBIG); |
423 | 429 | } |
424 | 430 | } |
425 | 431 | } |
— | — | @@ -452,8 +458,9 @@ |
453 | 459 | String name = analyzerPage.getConceptName(); |
454 | 460 | String rcName = analyzerPage.getResourceName(); |
455 | 461 | String text = analyzerPage.getText().toString(); |
456 | | - |
| 462 | + |
457 | 463 | WikiTextAnalyzer.WikiLink link = analyzerPage.getRedirect(); |
| 464 | + String tgtConcept = link.getTargetConcept().toString(); |
458 | 465 | |
459 | 466 | int conceptId = 0; |
460 | 467 | |
— | — | @@ -465,34 +472,39 @@ |
466 | 473 | out.info("skipped interwiki redirect "+rcName+" -> "+link); |
467 | 474 | } |
468 | 475 | else if (link.getNamespace()!=analyzerPage.getNamespace()) { |
469 | | - if (link.getNamespace()==Namespace.CATEGORY && analyzerPage.getNamespace()==Namespace.MAIN) { |
470 | | - if ( StringUtils.equals(link.getTarget(), rcName) ) { |
471 | | - out.debug("ignored redundant category redirect "+rcName+" -> "+link); |
| 476 | + if ( analyzer.isConceptNamespace(link.getNamespace()) ) { |
| 477 | + if ( StringUtils.equals(tgtConcept, name) ) { |
| 478 | + out.debug("ignored redundant inter-namespace redirect "+rcName+" -> "+link); |
472 | 479 | } else { |
473 | 480 | out.debug("processing redirect to category "+rcName+" -> "+link); |
474 | | - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT ); |
475 | | - String tgtConcept = link.getTarget().toString(); |
476 | 481 | |
| 482 | + storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, tgtConcept, ExtractionRule.TERM_FROM_REDIRECT ); |
| 483 | + |
477 | 484 | if (!name.equals(tgtConcept)) { |
478 | 485 | conceptId = store.storeAbout(rcId, rcName, name); |
479 | 486 | storeConceptAlias(rcId, conceptId, name, -1, tgtConcept, AliasScope.REDIRECT); |
480 | 487 | } else { |
481 | | - out.debug("skipping redirect to category with the same name"); |
| 488 | + out.debug("skipping inter-namespace redirect to page with the same title"); |
482 | 489 | } |
483 | 490 | } |
484 | 491 | } else { |
485 | 492 | warn(rcId, "bad redirect (inter-namespace)", rcName+" -> "+link, null); |
486 | 493 | } |
487 | 494 | } |
488 | | - else if (rcName.equals(link.getTarget().toString())) { |
489 | | - warn(rcId, "bad redirect (self-link)", "page "+name, null); |
| 495 | + else if (StringUtils.equals(rcName, link.getTarget().toString())) { |
| 496 | + warn(rcId, "bad redirect (self-link)", "page "+rcName, null); |
490 | 497 | } |
491 | | - else { |
492 | | - conceptId = store.storeAbout(rcId, rcName, name); |
493 | | - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT ); |
494 | | - storeConceptAlias(rcId, conceptId, name, -1, link.getTarget().toString(), AliasScope.REDIRECT); |
495 | | - |
496 | | - //FIXME: redir to section! |
| 498 | + else if ( analyzer.isConceptNamespace(link.getNamespace()) ) { |
| 499 | + if (StringUtils.equals(name, tgtConcept)) { |
| 500 | + warn(rcId, "bad redirect (self-link)", "page "+rcName, null); |
| 501 | + } else { |
| 502 | + conceptId = store.storeAbout(rcId, rcName, name); |
| 503 | + storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, tgtConcept, ExtractionRule.TERM_FROM_REDIRECT ); |
| 504 | + storeConceptAlias(rcId, conceptId, name, -1, tgtConcept, AliasScope.REDIRECT); |
| 505 | + if (link.getSection()!=null) storeSection(rcId, link.getTargetConcept().toString(), link.getTargetConceptPage().toString()); |
| 506 | + } |
| 507 | + } else if (link.getInterwiki()!=null ) { |
| 508 | + out.info("skipped uninterresting redirect "+rcName+" -> "+link); |
497 | 509 | } |
498 | 510 | |
499 | 511 | return conceptId; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -17,6 +17,8 @@ |
18 | 18 | public WikiConfiguration_enwiki() { |
19 | 19 | super(); |
20 | 20 | |
| 21 | + //conceptNamespacecs.add(Namespace.PORTAL); //FIXME: how to add portal namespace?! |
| 22 | + |
21 | 23 | /* |
22 | 24 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("^\\{\\{(wrapper)\\s*(\\|[^\\}\\r\\n]*)?\\}\\}\\s*$", "{|", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE)); |
23 | 25 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("^\\{\\{(end|col-end)\\s*\\}\\}\\s*$", "|}", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE)); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java |
— | — | @@ -16,6 +16,8 @@ |
17 | 17 | public WikiConfiguration_dewiki() { |
18 | 18 | super(); |
19 | 19 | |
| 20 | + //conceptNamespacecs.add(Namespace.PORTAL); //FIXME: how to add portal namespace?! |
| 21 | + |
20 | 22 | /* |
21 | 23 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("\\{\\{[Oo]kina\\}\\}", "\u02BB", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE)); |
22 | 24 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("\\{\\{\\s*(IPA(?:-Text)|IAST|Unicode|Musik)\\s*\\|\\s*([^|}]+)\\s*(\\|.*?)?\\s*(\\|.*?)?\\}\\}", "$2", Pattern.CASE_INSENSITIVE)); |