r50513 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r50512 | r50513 | r50514 >
Date:	10:18, 12 May 2009
Author:	daniel
Status:	deferred
Tags:
Comment:	food & stubs
Modified paths:	/trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/AbstractPatternParameterExtractor.java (added) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/CategoryPatternParameterExtractor.java (modified) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java (added) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java (modified) (history) /trunk/WikiWord/WikiWordBuilder4LifeScience/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder4LifeScience/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java
—	—	@@ -8,8 +8,10 @@
9	9	import de.brightbyte.wikiword.ResourceType;
10	10	import de.brightbyte.wikiword.analyzer.WikiConfiguration;
11	11	import de.brightbyte.wikiword.analyzer.WikiPage;
	12	+import de.brightbyte.wikiword.analyzer.extractor.CategoryPatternParameterExtractor;
12	13	import de.brightbyte.wikiword.analyzer.extractor.PagePropertyValueExtractor;
13	14	import de.brightbyte.wikiword.analyzer.extractor.PropertyValueExtractor;
	15	+import de.brightbyte.wikiword.analyzer.extractor.TemplateNamePatternParameterExtractor;
14	16	import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor;
15	17	import de.brightbyte.wikiword.analyzer.extractor.TitlePartExtractor;
16	18	import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
—	—	@@ -150,7 +152,7 @@
151	153	private static final String neuroNamesChars = "["+alphabeticChars+"]+-["+numericChars+"]+";
152	154
153	155	//TODO: exclude "Biography"...
154		- public static final String lifeScienceJournalPattern = "(^\|[ _])(Chem[a-z]\|Bio[a-z]\|Gen[eo][a-z]\|Med[a-z]\|Cell[a-z]\|DNA\|RNA\|Nucleic\|EMBO\|FEBS\|Onco[a-z]\|Blood\|Immono[a-z]\|Cancer\|Virol[a-z]\|Med[a-z]\|Clin[a-z]\|Lancet\|Neuro[a-z]\|Zootaxa\|JAMA\|FASEB\|Bacter[a-z]\|Mutat[a-z]\|Mol[a-z]\|Protein\|Dermat[a-z]\|Pathol[a-z]\|Endocr[a-z]\|Microbio[a-z])($\|[_ ])";
	156	+ public static final String lifeScienceJournalPattern = "(^\|[ _])(Chem[a-z]\|Biol?[.a-z]\|Gen[eo][a-z]\|Med[a-z]\|Cell[a-z]\|DNA\|RNA\|Nucleic\|EMBO\|FEBS\|Onco[a-z]\|Blood\|Immono[a-z]\|Cancer\|Virol[a-z]\|Med[a-z]\|Clin[a-z]\|Lancet\|Nature\|PLoS\|Neuro[a-z]\|Zootaxa\|JAMA\|FASEB\|Bacter[a-z]\|Mutat[a-z]\|Mol[a-z]\|Protein\|Dermat[a-z]\|Pathol[a-z]\|Endocr[a-z]\|Microbio[a-z])($\|[_ ])";
155	157
156	158
157	159	protected static DefaultTemplateParameterPropertySpec makeNamePropertySpec(String param, String prop, boolean multi, boolean space) {
—	—	@@ -606,6 +608,9 @@
607	609	new DefaultTemplateParameterPropertySpec("nationality", "person-nationality").setStripMarkup(true)
608	610	) );
609	611
	612	+ propertyExtractors.add(new CategoryPatternParameterExtractor("(_\|$)([Ff]oods\|[Vv]egetables\|[Ff]ruits)", null, 0, "food-group"));
	613	+ propertyExtractors.add(new TemplateNamePatternParameterExtractor("((.+-)?(Med(ical)?\|Treatment\|Pathology\|Anatomy\|Antibiotic\|Disease)(-.+)?)-stub", "$2", 0, "med-stub-group")); //TODO: no limits to this one
	614	+
610	615	pageTermExtractors.add( new PagePropertyValueExtractor("IUPAC") );
611	616	pageTermExtractors.add( new PagePropertyValueExtractor("AnatomyLatin") );
612	617	pageTermExtractors.add( new PagePropertyValueExtractor("ProteinSymbol") );
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java
—	—	@@ -0,0 +1,29 @@
	2	+/**
	3	+ *
	4	+ */
	5	+package de.brightbyte.wikiword.analyzer.extractor;
	6	+
	7	+import java.util.regex.Matcher;
	8	+import java.util.regex.Pattern;
	9	+
	10	+import de.brightbyte.wikiword.analyzer.WikiPage;
	11	+
	12	+public class TemplateNamePatternParameterExtractor extends AbstractPatternParameterExtractor {
	13	+
	14	+ public TemplateNamePatternParameterExtractor(String pattern, String replacement, int flags, String property) {
	15	+ this(Pattern.compile(pattern, flags), replacement, property);
	16	+ }
	17	+
	18	+ public TemplateNamePatternParameterExtractor(Pattern pattern, String replacement, String property) {
	19	+ this(pattern.matcher(""), replacement, property);
	20	+ }
	21	+
	22	+ public TemplateNamePatternParameterExtractor(Matcher matcher, String replacement, String property) {
	23	+ super(matcher, replacement, property);
	24	+ }
	25	+
	26	+ @Override
	27	+ protected Iterable<? extends CharSequence> getPageStrings(WikiPage page) {
	28	+ return page.getTemplates().keySet();
	29	+ }
	30	+}
\ No newline at end of file
Property changes on: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/TemplateNamePatternParameterExtractor.java
___________________________________________________________________
Name: svn:mergeinfo
1	31	+
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/CategoryPatternParameterExtractor.java
—	—	@@ -3,21 +3,12 @@
4	4	*/
5	5	package de.brightbyte.wikiword.analyzer.extractor;
6	6
7		~~-import java.util.Set;~~
8	7	import java.util.regex.Matcher;
9	8	import java.util.regex.Pattern;
10	9
11		~~-import de.brightbyte.data.MultiMap;~~
12		~~-import de.brightbyte.data.ValueSetMultiMap;~~
13		~~-import de.brightbyte.wikiword.analyzer.AnalyzerUtils;~~
14	10	import de.brightbyte.wikiword.analyzer.WikiPage;
15	11
16		~~-public class CategoryPatternParameterExtractor implements PropertyExtractor {~~
17		~~- protected String property;~~
18		~~- protected Matcher matcher;~~
19		~~- protected String replacement;~~
20		~~- private boolean capitalize = false;~~
21		-
	12	+public class CategoryPatternParameterExtractor extends AbstractPatternParameterExtractor {
22	13	public CategoryPatternParameterExtractor(String pattern, String replacement, int flags, String property) {
23	14	this(Pattern.compile(pattern, flags), replacement, property);
24	15	}
—	—	@@ -27,33 +18,11 @@
28	19	}
29	20
30	21	public CategoryPatternParameterExtractor(Matcher matcher, String replacement, String property) {
31		~~- this.property = property;~~
32		~~- this.matcher = matcher;~~
33		~~- this.replacement = replacement;~~
	22	+ super(matcher, replacement, property);
34	23	}
35	24
36		~~- public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) {~~
37		~~- for(CharSequence s: page.getCategories()) {~~
38		~~- matcher.reset(s);~~
39		~~- if (matcher.matches()) {~~
40		~~- CharSequence v = matcher.group();~~
41		~~- v = matcher.replaceAll(replacement);~~
42		~~- v = AnalyzerUtils.replaceUnderscoreBySpace(v);~~
43		~~- v = AnalyzerUtils.trim(v);~~
44		-
45		~~- if (capitalize)~~
46		~~- v = AnalyzerUtils.titleCase(v);~~
47		-
48		~~- if (into==null) into = new ValueSetMultiMap<String, CharSequence>();~~
49		~~- into.put(property, v);~~
50		~~- }~~
51		~~- }~~
52		-
53		~~- return into;~~
	25	+ @Override
	26	+ protected Iterable<? extends CharSequence> getPageStrings(WikiPage page) {
	27	+ return page.getCategories();
54	28	}
55		-
56		~~- public PropertyExtractor setCapitalize(boolean capitalize) {~~
57		~~- this.capitalize = capitalize;~~
58		~~- return this;~~
59		~~- }~~
60	29	}
\ No newline at end of file
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/AbstractPatternParameterExtractor.java
—	—	@@ -0,0 +1,60 @@
	2	+package de.brightbyte.wikiword.analyzer.extractor;
	3	+
	4	+import java.util.Set;
	5	+import java.util.regex.Matcher;
	6	+import java.util.regex.Pattern;
	7	+
	8	+import de.brightbyte.data.MultiMap;
	9	+import de.brightbyte.data.ValueSetMultiMap;
	10	+import de.brightbyte.wikiword.analyzer.AnalyzerUtils;
	11	+import de.brightbyte.wikiword.analyzer.WikiPage;
	12	+
	13	+public abstract class AbstractPatternParameterExtractor implements PropertyExtractor {
	14	+
	15	+ protected String property;
	16	+ protected Matcher matcher;
	17	+ protected String replacement;
	18	+ private boolean capitalize = false;
	19	+
	20	+ public AbstractPatternParameterExtractor(String pattern, String replacement, int flags, String property) {
	21	+ this(Pattern.compile(pattern, flags), replacement, property);
	22	+ }
	23	+
	24	+ public AbstractPatternParameterExtractor(Pattern pattern, String replacement, String property) {
	25	+ this(pattern.matcher(""), replacement, property);
	26	+ }
	27	+
	28	+ public AbstractPatternParameterExtractor(Matcher matcher, String replacement, String property) {
	29	+ this.property = property;
	30	+ this.matcher = matcher;
	31	+ this.replacement = replacement;
	32	+ }
	33	+
	34	+ public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) {
	35	+ for(CharSequence s: getPageStrings(page)) {
	36	+ matcher.reset(s);
	37	+ if (matcher.matches()) {
	38	+ CharSequence v = matcher.group();
	39	+ v = replacement == null ? s : matcher.replaceAll(replacement);
	40	+ v = AnalyzerUtils.replaceUnderscoreBySpace(v);
	41	+ v = AnalyzerUtils.trim(v);
	42	+
	43	+ if (capitalize)
	44	+ v = AnalyzerUtils.titleCase(v);
	45	+
	46	+ if (into==null) into = new ValueSetMultiMap<String, CharSequence>();
	47	+ into.put(property, v);
	48	+ }
	49	+ }
	50	+
	51	+ return into;
	52	+ }
	53	+
	54	+ protected abstract Iterable<? extends CharSequence> getPageStrings(WikiPage page);
	55	+
	56	+ public PropertyExtractor setCapitalize(boolean capitalize) {
	57	+ this.capitalize = capitalize;
	58	+ return this;
	59	+ }
	60	+
	61	+}
\ No newline at end of file
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java
—	—	@@ -362,7 +362,8 @@
363	363	String cat = link.getPage().toString();
364	364
365	365	if (!cat.equals(name) && analyzer.mayBeFormOf(link.getLenientPage(), analyzerPage.getTitleBaseName())) {
366		~~- storePageTerms(rcId, analyzer.determineTitleTerms(link.getPage()), conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME);~~
	366	+ Set<CharSequence> terms = analyzer.determineTitleTerms(link.getPage());
	367	+ storePageTerms(rcId, terms, conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME);
367	368
368	369	//NOTE: the alias is preliminary: if an article with the name of the category
369	370	// exists, the alias will be ignored. See DatabaseLocalConceptBuilder.finishBadLinks

Status & tagging log

16:00, 26 May 2009 Tim Starling (talk | contribs) changed the status of r50513 [removed: new added: deferred]
20:50, 14 May 2009 Brion VIBBER (talk | contribs) changed the status of r50513 [removed: new added: deferred]