Index: trunk/WikiWord/WikiWordBuilder4LifeScience/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -8,8 +8,10 @@ |
9 | 9 | import de.brightbyte.wikiword.ResourceType; |
10 | 10 | import de.brightbyte.wikiword.analyzer.WikiConfiguration; |
11 | 11 | import de.brightbyte.wikiword.analyzer.WikiPage; |
| 12 | +import de.brightbyte.wikiword.analyzer.extractor.CategoryPatternParameterExtractor; |
12 | 13 | import de.brightbyte.wikiword.analyzer.extractor.PagePropertyValueExtractor; |
13 | 14 | import de.brightbyte.wikiword.analyzer.extractor.PropertyValueExtractor; |
| 15 | +import de.brightbyte.wikiword.analyzer.extractor.TemplateNamePatternParameterExtractor; |
14 | 16 | import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor; |
15 | 17 | import de.brightbyte.wikiword.analyzer.extractor.TitlePartExtractor; |
16 | 18 | import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler; |
— | — | @@ -150,7 +152,7 @@ |
151 | 153 | private static final String neuroNamesChars = "["+alphabeticChars+"]+-["+numericChars+"]+"; |
152 | 154 | |
153 | 155 | //TODO: exclude "Biography"... |
154 | | - public static final String lifeScienceJournalPattern = "(^|[ _])(Chem[a-z]*|Bio[a-z]*|Gen[eo][a-z]*|Med[a-z]*|Cell[a-z]*|DNA|RNA|Nucleic|EMBO|FEBS|Onco[a-z]*|Blood|Immono[a-z]*|Cancer|Virol[a-z]*|Med[a-z]*|Clin[a-z]*|Lancet|Neuro[a-z]*|Zootaxa|JAMA|FASEB|Bacter[a-z]*|Mutat[a-z]*|Mol[a-z]*|Protein|Dermat[a-z]*|Pathol[a-z]*|Endocr[a-z]*|Microbio[a-z]*)($|[_ ])"; |
| 156 | + public static final String lifeScienceJournalPattern = "(^|[ _])(Chem[a-z]*|Biol?[.a-z]*|Gen[eo][a-z]*|Med[a-z]*|Cell[a-z]*|DNA|RNA|Nucleic|EMBO|FEBS|Onco[a-z]*|Blood|Immono[a-z]*|Cancer|Virol[a-z]*|Med[a-z]*|Clin[a-z]*|Lancet|Nature|PLoS|Neuro[a-z]*|Zootaxa|JAMA|FASEB|Bacter[a-z]*|Mutat[a-z]*|Mol[a-z]*|Protein|Dermat[a-z]*|Pathol[a-z]*|Endocr[a-z]*|Microbio[a-z]*)($|[_ ])"; |
155 | 157 | |
156 | 158 | |
157 | 159 | protected static DefaultTemplateParameterPropertySpec makeNamePropertySpec(String param, String prop, boolean multi, boolean space) { |
— | — | @@ -606,6 +608,9 @@ |
607 | 609 | new DefaultTemplateParameterPropertySpec("nationality", "person-nationality").setStripMarkup(true) |
608 | 610 | ) ); |
609 | 611 | |
| 612 | + propertyExtractors.add(new CategoryPatternParameterExtractor("(_|$)([Ff]oods|[Vv]egetables|[Ff]ruits)", null, 0, "food-group")); |
| 613 | + propertyExtractors.add(new TemplateNamePatternParameterExtractor("((.+-)?(Med(ical)?|Treatment|Pathology|Anatomy|Antibiotic|Disease)(-.+)?)-stub", "$2", 0, "med-stub-group")); //TODO: no limits to this one |
| 614 | + |
610 | 615 | pageTermExtractors.add( new PagePropertyValueExtractor("IUPAC") ); |
611 | 616 | pageTermExtractors.add( new PagePropertyValueExtractor("AnatomyLatin") ); |
612 | 617 | pageTermExtractors.add( new PagePropertyValueExtractor("ProteinSymbol") ); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java |
— | — | @@ -0,0 +1,29 @@ |
| 2 | +/** |
| 3 | + * |
| 4 | + */ |
| 5 | +package de.brightbyte.wikiword.analyzer.extractor; |
| 6 | + |
| 7 | +import java.util.regex.Matcher; |
| 8 | +import java.util.regex.Pattern; |
| 9 | + |
| 10 | +import de.brightbyte.wikiword.analyzer.WikiPage; |
| 11 | + |
| 12 | +public class TemplateNamePatternParameterExtractor extends AbstractPatternParameterExtractor { |
| 13 | + |
| 14 | + public TemplateNamePatternParameterExtractor(String pattern, String replacement, int flags, String property) { |
| 15 | + this(Pattern.compile(pattern, flags), replacement, property); |
| 16 | + } |
| 17 | + |
| 18 | + public TemplateNamePatternParameterExtractor(Pattern pattern, String replacement, String property) { |
| 19 | + this(pattern.matcher(""), replacement, property); |
| 20 | + } |
| 21 | + |
| 22 | + public TemplateNamePatternParameterExtractor(Matcher matcher, String replacement, String property) { |
| 23 | + super(matcher, replacement, property); |
| 24 | + } |
| 25 | + |
| 26 | + @Override |
| 27 | + protected Iterable<? extends CharSequence> getPageStrings(WikiPage page) { |
| 28 | + return page.getTemplates().keySet(); |
| 29 | + } |
| 30 | +} |
\ No newline at end of file |
Property changes on: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 31 | + |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/CategoryPatternParameterExtractor.java |
— | — | @@ -3,21 +3,12 @@ |
4 | 4 | */ |
5 | 5 | package de.brightbyte.wikiword.analyzer.extractor; |
6 | 6 | |
7 | | -import java.util.Set; |
8 | 7 | import java.util.regex.Matcher; |
9 | 8 | import java.util.regex.Pattern; |
10 | 9 | |
11 | | -import de.brightbyte.data.MultiMap; |
12 | | -import de.brightbyte.data.ValueSetMultiMap; |
13 | | -import de.brightbyte.wikiword.analyzer.AnalyzerUtils; |
14 | 10 | import de.brightbyte.wikiword.analyzer.WikiPage; |
15 | 11 | |
16 | | -public class CategoryPatternParameterExtractor implements PropertyExtractor { |
17 | | - protected String property; |
18 | | - protected Matcher matcher; |
19 | | - protected String replacement; |
20 | | - private boolean capitalize = false; |
21 | | - |
| 12 | +public class CategoryPatternParameterExtractor extends AbstractPatternParameterExtractor { |
22 | 13 | public CategoryPatternParameterExtractor(String pattern, String replacement, int flags, String property) { |
23 | 14 | this(Pattern.compile(pattern, flags), replacement, property); |
24 | 15 | } |
— | — | @@ -27,33 +18,11 @@ |
28 | 19 | } |
29 | 20 | |
30 | 21 | public CategoryPatternParameterExtractor(Matcher matcher, String replacement, String property) { |
31 | | - this.property = property; |
32 | | - this.matcher = matcher; |
33 | | - this.replacement = replacement; |
| 22 | + super(matcher, replacement, property); |
34 | 23 | } |
35 | 24 | |
36 | | - public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) { |
37 | | - for(CharSequence s: page.getCategories()) { |
38 | | - matcher.reset(s); |
39 | | - if (matcher.matches()) { |
40 | | - CharSequence v = matcher.group(); |
41 | | - v = matcher.replaceAll(replacement); |
42 | | - v = AnalyzerUtils.replaceUnderscoreBySpace(v); |
43 | | - v = AnalyzerUtils.trim(v); |
44 | | - |
45 | | - if (capitalize) |
46 | | - v = AnalyzerUtils.titleCase(v); |
47 | | - |
48 | | - if (into==null) into = new ValueSetMultiMap<String, CharSequence>(); |
49 | | - into.put(property, v); |
50 | | - } |
51 | | - } |
52 | | - |
53 | | - return into; |
| 25 | + @Override |
| 26 | + protected Iterable<? extends CharSequence> getPageStrings(WikiPage page) { |
| 27 | + return page.getCategories(); |
54 | 28 | } |
55 | | - |
56 | | - public PropertyExtractor setCapitalize(boolean capitalize) { |
57 | | - this.capitalize = capitalize; |
58 | | - return this; |
59 | | - } |
60 | 29 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/AbstractPatternParameterExtractor.java |
— | — | @@ -0,0 +1,60 @@ |
| 2 | +package de.brightbyte.wikiword.analyzer.extractor; |
| 3 | + |
| 4 | +import java.util.Set; |
| 5 | +import java.util.regex.Matcher; |
| 6 | +import java.util.regex.Pattern; |
| 7 | + |
| 8 | +import de.brightbyte.data.MultiMap; |
| 9 | +import de.brightbyte.data.ValueSetMultiMap; |
| 10 | +import de.brightbyte.wikiword.analyzer.AnalyzerUtils; |
| 11 | +import de.brightbyte.wikiword.analyzer.WikiPage; |
| 12 | + |
| 13 | +public abstract class AbstractPatternParameterExtractor implements PropertyExtractor { |
| 14 | + |
| 15 | + protected String property; |
| 16 | + protected Matcher matcher; |
| 17 | + protected String replacement; |
| 18 | + private boolean capitalize = false; |
| 19 | + |
| 20 | + public AbstractPatternParameterExtractor(String pattern, String replacement, int flags, String property) { |
| 21 | + this(Pattern.compile(pattern, flags), replacement, property); |
| 22 | + } |
| 23 | + |
| 24 | + public AbstractPatternParameterExtractor(Pattern pattern, String replacement, String property) { |
| 25 | + this(pattern.matcher(""), replacement, property); |
| 26 | + } |
| 27 | + |
| 28 | + public AbstractPatternParameterExtractor(Matcher matcher, String replacement, String property) { |
| 29 | + this.property = property; |
| 30 | + this.matcher = matcher; |
| 31 | + this.replacement = replacement; |
| 32 | + } |
| 33 | + |
| 34 | + public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) { |
| 35 | + for(CharSequence s: getPageStrings(page)) { |
| 36 | + matcher.reset(s); |
| 37 | + if (matcher.matches()) { |
| 38 | + CharSequence v = matcher.group(); |
| 39 | + v = replacement == null ? s : matcher.replaceAll(replacement); |
| 40 | + v = AnalyzerUtils.replaceUnderscoreBySpace(v); |
| 41 | + v = AnalyzerUtils.trim(v); |
| 42 | + |
| 43 | + if (capitalize) |
| 44 | + v = AnalyzerUtils.titleCase(v); |
| 45 | + |
| 46 | + if (into==null) into = new ValueSetMultiMap<String, CharSequence>(); |
| 47 | + into.put(property, v); |
| 48 | + } |
| 49 | + } |
| 50 | + |
| 51 | + return into; |
| 52 | + } |
| 53 | + |
| 54 | + protected abstract Iterable<? extends CharSequence> getPageStrings(WikiPage page); |
| 55 | + |
| 56 | + public PropertyExtractor setCapitalize(boolean capitalize) { |
| 57 | + this.capitalize = capitalize; |
| 58 | + return this; |
| 59 | + } |
| 60 | + |
| 61 | +} |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java |
— | — | @@ -362,7 +362,8 @@ |
363 | 363 | String cat = link.getPage().toString(); |
364 | 364 | |
365 | 365 | if (!cat.equals(name) && analyzer.mayBeFormOf(link.getLenientPage(), analyzerPage.getTitleBaseName())) { |
366 | | - storePageTerms(rcId, analyzer.determineTitleTerms(link.getPage()), conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME); |
| 366 | + Set<CharSequence> terms = analyzer.determineTitleTerms(link.getPage()); |
| 367 | + storePageTerms(rcId, terms, conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME); |
367 | 368 | |
368 | 369 | //NOTE: the alias is preliminary: if a article with the name of the category |
369 | 370 | // exists, the alias will be ignored. See DatabaseLocalConceptBuilder.finishBadLinks |