r50513 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r50512 | r50513 | r50514 >
Date:10:18, 12 May 2009
Author:daniel
Status:deferred
Tags:
Comment:
food & stubs
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/AbstractPatternParameterExtractor.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/CategoryPatternParameterExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder4LifeScience/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder4LifeScience/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java
@@ -8,8 +8,10 @@
99 import de.brightbyte.wikiword.ResourceType;
1010 import de.brightbyte.wikiword.analyzer.WikiConfiguration;
1111 import de.brightbyte.wikiword.analyzer.WikiPage;
 12+import de.brightbyte.wikiword.analyzer.extractor.CategoryPatternParameterExtractor;
1213 import de.brightbyte.wikiword.analyzer.extractor.PagePropertyValueExtractor;
1314 import de.brightbyte.wikiword.analyzer.extractor.PropertyValueExtractor;
 15+import de.brightbyte.wikiword.analyzer.extractor.TemplateNamePatternParameterExtractor;
1416 import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor;
1517 import de.brightbyte.wikiword.analyzer.extractor.TitlePartExtractor;
1618 import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
@@ -150,7 +152,7 @@
151153 private static final String neuroNamesChars = "["+alphabeticChars+"]+-["+numericChars+"]+";
152154
153155 //TODO: exclude "Biography"...
154 - public static final String lifeScienceJournalPattern = "(^|[ _])(Chem[a-z]*|Bio[a-z]*|Gen[eo][a-z]*|Med[a-z]*|Cell[a-z]*|DNA|RNA|Nucleic|EMBO|FEBS|Onco[a-z]*|Blood|Immono[a-z]*|Cancer|Virol[a-z]*|Med[a-z]*|Clin[a-z]*|Lancet|Neuro[a-z]*|Zootaxa|JAMA|FASEB|Bacter[a-z]*|Mutat[a-z]*|Mol[a-z]*|Protein|Dermat[a-z]*|Pathol[a-z]*|Endocr[a-z]*|Microbio[a-z]*)($|[_ ])";
 156+ public static final String lifeScienceJournalPattern = "(^|[ _])(Chem[a-z]*|Biol?[.a-z]*|Gen[eo][a-z]*|Med[a-z]*|Cell[a-z]*|DNA|RNA|Nucleic|EMBO|FEBS|Onco[a-z]*|Blood|Immono[a-z]*|Cancer|Virol[a-z]*|Med[a-z]*|Clin[a-z]*|Lancet|Nature|PLoS|Neuro[a-z]*|Zootaxa|JAMA|FASEB|Bacter[a-z]*|Mutat[a-z]*|Mol[a-z]*|Protein|Dermat[a-z]*|Pathol[a-z]*|Endocr[a-z]*|Microbio[a-z]*)($|[_ ])";
155157
156158
157159 protected static DefaultTemplateParameterPropertySpec makeNamePropertySpec(String param, String prop, boolean multi, boolean space) {
@@ -606,6 +608,9 @@
607609 new DefaultTemplateParameterPropertySpec("nationality", "person-nationality").setStripMarkup(true)
608610 ) );
609611
 612+ propertyExtractors.add(new CategoryPatternParameterExtractor("(_|$)([Ff]oods|[Vv]egetables|[Ff]ruits)", null, 0, "food-group"));
 613+ propertyExtractors.add(new TemplateNamePatternParameterExtractor("((.+-)?(Med(ical)?|Treatment|Pathology|Anatomy|Antibiotic|Disease)(-.+)?)-stub", "$2", 0, "med-stub-group")); //TODO: no limits to this one
 614+
610615 pageTermExtractors.add( new PagePropertyValueExtractor("IUPAC") );
611616 pageTermExtractors.add( new PagePropertyValueExtractor("AnatomyLatin") );
612617 pageTermExtractors.add( new PagePropertyValueExtractor("ProteinSymbol") );
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java
@@ -0,0 +1,29 @@
 2+/**
 3+ *
 4+ */
 5+package de.brightbyte.wikiword.analyzer.extractor;
 6+
 7+import java.util.regex.Matcher;
 8+import java.util.regex.Pattern;
 9+
 10+import de.brightbyte.wikiword.analyzer.WikiPage;
 11+
 12+public class TemplateNamePatternParameterExtractor extends AbstractPatternParameterExtractor {
 13+
 14+ public TemplateNamePatternParameterExtractor(String pattern, String replacement, int flags, String property) {
 15+ this(Pattern.compile(pattern, flags), replacement, property);
 16+ }
 17+
 18+ public TemplateNamePatternParameterExtractor(Pattern pattern, String replacement, String property) {
 19+ this(pattern.matcher(""), replacement, property);
 20+ }
 21+
 22+ public TemplateNamePatternParameterExtractor(Matcher matcher, String replacement, String property) {
 23+ super(matcher, replacement, property);
 24+ }
 25+
 26+ @Override
 27+ protected Iterable<? extends CharSequence> getPageStrings(WikiPage page) {
 28+ return page.getTemplates().keySet();
 29+ }
 30+}
\ No newline at end of file
Property changes on: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java
___________________________________________________________________
Name: svn:mergeinfo
131 +
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/CategoryPatternParameterExtractor.java
@@ -3,21 +3,12 @@
44 */
55 package de.brightbyte.wikiword.analyzer.extractor;
66
7 -import java.util.Set;
87 import java.util.regex.Matcher;
98 import java.util.regex.Pattern;
109
11 -import de.brightbyte.data.MultiMap;
12 -import de.brightbyte.data.ValueSetMultiMap;
13 -import de.brightbyte.wikiword.analyzer.AnalyzerUtils;
1410 import de.brightbyte.wikiword.analyzer.WikiPage;
1511
16 -public class CategoryPatternParameterExtractor implements PropertyExtractor {
17 - protected String property;
18 - protected Matcher matcher;
19 - protected String replacement;
20 - private boolean capitalize = false;
21 -
 12+public class CategoryPatternParameterExtractor extends AbstractPatternParameterExtractor {
2213 public CategoryPatternParameterExtractor(String pattern, String replacement, int flags, String property) {
2314 this(Pattern.compile(pattern, flags), replacement, property);
2415 }
@@ -27,33 +18,11 @@
2819 }
2920
3021 public CategoryPatternParameterExtractor(Matcher matcher, String replacement, String property) {
31 - this.property = property;
32 - this.matcher = matcher;
33 - this.replacement = replacement;
 22+ super(matcher, replacement, property);
3423 }
3524
36 - public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) {
37 - for(CharSequence s: page.getCategories()) {
38 - matcher.reset(s);
39 - if (matcher.matches()) {
40 - CharSequence v = matcher.group();
41 - v = matcher.replaceAll(replacement);
42 - v = AnalyzerUtils.replaceUnderscoreBySpace(v);
43 - v = AnalyzerUtils.trim(v);
44 -
45 - if (capitalize)
46 - v = AnalyzerUtils.titleCase(v);
47 -
48 - if (into==null) into = new ValueSetMultiMap<String, CharSequence>();
49 - into.put(property, v);
50 - }
51 - }
52 -
53 - return into;
 25+ @Override
 26+ protected Iterable<? extends CharSequence> getPageStrings(WikiPage page) {
 27+ return page.getCategories();
5428 }
55 -
56 - public PropertyExtractor setCapitalize(boolean capitalize) {
57 - this.capitalize = capitalize;
58 - return this;
59 - }
6029 }
\ No newline at end of file
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/AbstractPatternParameterExtractor.java
@@ -0,0 +1,60 @@
 2+package de.brightbyte.wikiword.analyzer.extractor;
 3+
 4+import java.util.Set;
 5+import java.util.regex.Matcher;
 6+import java.util.regex.Pattern;
 7+
 8+import de.brightbyte.data.MultiMap;
 9+import de.brightbyte.data.ValueSetMultiMap;
 10+import de.brightbyte.wikiword.analyzer.AnalyzerUtils;
 11+import de.brightbyte.wikiword.analyzer.WikiPage;
 12+
 13+public abstract class AbstractPatternParameterExtractor implements PropertyExtractor {
 14+
 15+ protected String property;
 16+ protected Matcher matcher;
 17+ protected String replacement;
 18+ private boolean capitalize = false;
 19+
 20+ public AbstractPatternParameterExtractor(String pattern, String replacement, int flags, String property) {
 21+ this(Pattern.compile(pattern, flags), replacement, property);
 22+ }
 23+
 24+ public AbstractPatternParameterExtractor(Pattern pattern, String replacement, String property) {
 25+ this(pattern.matcher(""), replacement, property);
 26+ }
 27+
 28+ public AbstractPatternParameterExtractor(Matcher matcher, String replacement, String property) {
 29+ this.property = property;
 30+ this.matcher = matcher;
 31+ this.replacement = replacement;
 32+ }
 33+
 34+ public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) {
 35+ for(CharSequence s: getPageStrings(page)) {
 36+ matcher.reset(s);
 37+ if (matcher.matches()) {
 38+ CharSequence v = matcher.group();
 39+ v = replacement == null ? s : matcher.replaceAll(replacement);
 40+ v = AnalyzerUtils.replaceUnderscoreBySpace(v);
 41+ v = AnalyzerUtils.trim(v);
 42+
 43+ if (capitalize)
 44+ v = AnalyzerUtils.titleCase(v);
 45+
 46+ if (into==null) into = new ValueSetMultiMap<String, CharSequence>();
 47+ into.put(property, v);
 48+ }
 49+ }
 50+
 51+ return into;
 52+ }
 53+
 54+ protected abstract Iterable<? extends CharSequence> getPageStrings(WikiPage page);
 55+
 56+ public PropertyExtractor setCapitalize(boolean capitalize) {
 57+ this.capitalize = capitalize;
 58+ return this;
 59+ }
 60+
 61+}
\ No newline at end of file
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java
@@ -362,7 +362,8 @@
363363 String cat = link.getPage().toString();
364364
365365 if (!cat.equals(name) && analyzer.mayBeFormOf(link.getLenientPage(), analyzerPage.getTitleBaseName())) {
366 - storePageTerms(rcId, analyzer.determineTitleTerms(link.getPage()), conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME);
 366+ Set<CharSequence> terms = analyzer.determineTitleTerms(link.getPage());
 367+ storePageTerms(rcId, terms, conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME);
367368
368369 //NOTE: the alias is preliminary: if a article with the name of the category
369370 // exists, the alias will be ignored. See DatabaseLocalConceptBuilder.finishBadLinks

Status & tagging log