r66198 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r66197‎ | r66198 | r66199 >
Date:15:43, 11 May 2010
Author:daniel
Status:deferred
Tags:
Comment:
property extraction for geography: coordinates and nested templates
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AbstractAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/DeepTemplateExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/DefaultTemplateParameterPropertySpec.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/FlatTemplateExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/TemplateExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/TemplateParameterPropertySpec.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/geography/wikis/WikiConfiguration_enwiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AbstractAnalyzer.java
@@ -24,7 +24,8 @@
2525 if (manglers==null) return text;
2626
2727 for (Mangler mangler : manglers) {
28 - text = mangler.mangle(text);
 28+ CharSequence t = mangler.mangle(text);
 29+ text = t;
2930 }
3031
3132 return text;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java
@@ -48,7 +48,9 @@
4949 import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
5050 import de.brightbyte.wikiword.analyzer.mangler.TextArmor;
5151 import de.brightbyte.wikiword.analyzer.sensor.Sensor;
 52+import de.brightbyte.wikiword.analyzer.template.DeepTemplateExtractor;
5253 import de.brightbyte.wikiword.analyzer.template.DummyTemplateUser;
 54+import de.brightbyte.wikiword.analyzer.template.FlatTemplateExtractor;
5355 import de.brightbyte.wikiword.analyzer.template.TemplateData;
5456 import de.brightbyte.wikiword.analyzer.template.TemplateExtractor;
5557 import de.brightbyte.wikiword.analyzer.template.TemplateUser;
@@ -427,7 +429,7 @@
428430 return targetConceptPage;
429431 }
430432 }
431 -
 433+
432434 protected class Page implements WikiPage {
433435 protected int namespace;
434436 protected String title;
@@ -529,8 +531,12 @@
530532 */
531533 public TemplateExtractor getTemplateExtractor() {
532534 if (templateExtractor==null) {
533 - templateExtractor =
534 - WikiTextAnalyzer.this.config.templateExtractorFactory.newTemplateExtractor(WikiTextAnalyzer.this, armor);
 535+ if ( WikiTextAnalyzer.this.config.nestedTemplateFields==null || WikiTextAnalyzer.this.config.nestedTemplateFields.isEmpty() ) {
 536+ templateExtractor = new FlatTemplateExtractor(WikiTextAnalyzer.this, armor);
 537+ } else {
 538+ templateExtractor = new DeepTemplateExtractor(WikiTextAnalyzer.this, armor);
 539+ ((DeepTemplateExtractor)templateExtractor).addContainerFields(WikiTextAnalyzer.this.config.nestedTemplateFields);
 540+ }
535541 }
536542
537543 return templateExtractor;
@@ -973,7 +979,7 @@
974980 private WikiTextSniffer sniffer = new WikiTextSniffer();
975981 private Map<String, String> languageNames;
976982 private Map<String, Interwiki> interwikiMap;
977 -
 983+
978984 public WikiTextAnalyzer(PlainTextAnalyzer language) throws IOException {
979985 this.language = language;
980986 this.corpus = language.getCorpus();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java
@@ -10,6 +10,8 @@
1111 import java.util.Set;
1212 import java.util.regex.Pattern;
1313
 14+import de.brightbyte.data.MultiMap;
 15+import de.brightbyte.data.ValueSetMultiMap;
1416 import de.brightbyte.data.measure.Measure;
1517 import de.brightbyte.util.StringUtils;
1618 import de.brightbyte.wikiword.ConceptType;
@@ -32,8 +34,6 @@
3335 import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor;
3436 import de.brightbyte.wikiword.analyzer.sensor.HasTemplateLikeSensor;
3537 import de.brightbyte.wikiword.analyzer.sensor.Sensor;
36 -import de.brightbyte.wikiword.analyzer.template.FlatTemplateExtractor;
37 -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor;
3838
3939 /**
4040 * A WikiConfiguration represents knowledge about language specific and project specific
@@ -284,13 +284,13 @@
285285 /** Flag indicating whether extraction of definitions is supported for this wiki **/
286286 public boolean definitionsSupported = true;
287287
288 - public TemplateExtractor.Factory templateExtractorFactory;
289 -
290288 protected WikiTextAnalyzer analyzer;
291289
292290 protected String wikiName;
293291
294292 public Set<Integer> conceptNamespacecs = new HashSet<Integer>();
 293+
 294+ protected MultiMap<CharSequence, CharSequence, Set<CharSequence>> nestedTemplateFields = new ValueSetMultiMap<CharSequence, CharSequence>();
295295
296296 protected WikiConfiguration() {
297297 this(null);
@@ -407,8 +407,6 @@
408408 useCategoryAliases = true;
409409 mainArtikeMarkerPattern = Pattern.compile("^[- !_*$@#+~/%]?"); //use "category main articles" to resolve plural names
410410
411 - this.templateExtractorFactory = FlatTemplateExtractor.factory;
412 -
413411 this.useSuffixAsCategory = false;
414412 this.definitionsSupported = true;
415413 this.flatTextSupported = true;
@@ -497,10 +495,10 @@
498496 supplementedConceptExtractors.addAll(with.supplementedConceptExtractors);
499497
500498 extraTemplatePatterns.addAll(with.extraTemplatePatterns);
 499+ nestedTemplateFields.putAll(with.nestedTemplateFields);
501500
502501 //if (with.language!=null) language = with.language;
503502
504 - if (with.templateExtractorFactory!=null) templateExtractorFactory = with.templateExtractorFactory;
505503 if (with.linkTrail!=null) linkTrail = with.linkTrail;
506504 if (with.badLinkPattern!=null) badLinkPattern = with.badLinkPattern;
507505
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/DefaultTemplateParameterPropertySpec.java
@@ -105,17 +105,17 @@
106106 return this;
107107 }
108108
109 - public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> values) {
 109+ public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) {
110110 CharSequence v = params.getParameter(parameter);
111 - if (v==null) return values;
112 - if (v.length()==0) return values;
 111+ if (v==null) return intoValues;
 112+ if (v.length()==0) return intoValues;
113113
114114 if (clean!=null) {
115115 for (Mangler m: clean) v = m.mangle(v);
116116 }
117117
118118 if (cond!=null) {
119 - if (!cond.matches(v)) return values;
 119+ if (!cond.matches(v)) return intoValues;
120120 }
121121
122122 if (split!=null) {
@@ -139,7 +139,7 @@
140140 if (done) i = j;
141141 else i = split.end();
142142
143 - values = addValue(w, page, values);
 143+ intoValues = addValue(w, page, intoValues);
144144 }
145145 }
146146 else if (find!=null) {
@@ -147,14 +147,14 @@
148148 while (find.find()) {
149149 CharSequence w = find.groupCount() > 0 ? find.group(1) : find.group();
150150
151 - values = addValue(w, page, values);
 151+ intoValues = addValue(w, page, intoValues);
152152 }
153153 }
154154 else if (split==null) {
155 - values = addValue(v, page, values);
 155+ intoValues = addValue(v, page, intoValues);
156156 }
157157
158 - return values;
 158+ return intoValues;
159159 }
160160
161161 protected Set<CharSequence> addValue(CharSequence w, WikiPage page, Set<CharSequence> values) {
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/DeepTemplateExtractor.java
@@ -21,12 +21,6 @@
2222
2323 public class DeepTemplateExtractor extends AbstractTemplateExtractor {
2424
25 - public static final Factory factory = new Factory() {
26 - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) {
27 - return new DeepTemplateExtractor(context, armor);
28 - }
29 - };
30 -
3125 public DeepTemplateExtractor(Context context, TextArmor armor) {
3226 super(context, armor);
3327 }
@@ -251,11 +245,12 @@
252246 v = markerScanner.getText().subSequence(start, end);
253247
254248 v = AnalyzerUtils.trim(v);
255 - v = stripMarkup(v, true);
256249
257 - v = HtmlEntities.decodeEntities(v);
258 -
259 - data.setParameter(n, v);
 250+ if (v.length()>0) {
 251+ v = stripMarkup(v, true);
 252+ v = HtmlEntities.decodeEntities(v);
 253+ data.setParameter(n, v);
 254+ }
260255 }
261256 }
262257
@@ -263,16 +258,29 @@
264259
265260 private String getPrefix(CharSequence template, CharSequence parameter) {
266261 if (containerFields==null) return null;
267 - if (!containerFields.contains(template, parameter)) return null;
 262+
 263+ if (!containerFields.contains(template, parameter)) {
 264+ if (containerFields.contains("", parameter)) {
 265+ return template + "." + parameter;
 266+ } else {
 267+ return null;
 268+ }
 269+ }
268270
269271 return template + "." + parameter;
270272 }
271273
272274 public void addContainerField(CharSequence template, CharSequence parameter) {
 275+ if (template==null) template = "";
273276 if (containerFields==null) containerFields = new ValueSetMultiMap<CharSequence, CharSequence>();
274277 containerFields.put(template, parameter);
275278 }
276279
 280+ public void addContainerFields(MultiMap<CharSequence, CharSequence, Set<CharSequence>> nestedTemplateFields) {
 281+ if (containerFields==null) containerFields = new ValueSetMultiMap<CharSequence, CharSequence>();
 282+ containerFields.putAll(nestedTemplateFields);
 283+ }
 284+
277285 private void parseLink(String prefix) {
278286 int m;
279287
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/TemplateExtractor.java
@@ -13,9 +13,5 @@
1414 public boolean isRelevantTemplate(CharSequence name);
1515 }
1616
17 - public interface Factory {
18 - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor);
19 - }
20 -
2117 public MultiMap<String, TemplateData, List<TemplateData>> extractTemplates(CharSequence text);
2218 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/TemplateParameterPropertySpec.java
@@ -11,7 +11,7 @@
1212
1313 /** determines a property value from a map of template parameters.
1414 **/
15 - public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> values);
 15+ public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues);
1616
1717 public String getPropertyName();
1818 }
\ No newline at end of file
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/FlatTemplateExtractor.java
@@ -13,12 +13,6 @@
1414
1515 public class FlatTemplateExtractor extends AbstractTemplateExtractor {
1616
17 - public static final Factory factory = new Factory() {
18 - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) {
19 - return new FlatTemplateExtractor(context, armor);
20 - }
21 - };
22 -
2317 private Matcher templateMarkerMatcher = Pattern.compile("\\{\\{([^|]+?)(?=\\||\\}\\}|\\{\\{)|\\}\\}").matcher("");
2418 private Matcher templateParamMatcher = Pattern.compile("\\||\\{\\{!\\}\\}").matcher("");
2519
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java
@@ -96,7 +96,7 @@
9797 "Portal|Stub[-\\w]*|commons|Cong(Bio2?|Links)|" +
9898 "Tnavbar|Navbox([ _]generic)?|redirect|pp-.*?|" +
9999 "ambox|wikify|pov|cleanup|globalize|split|current|issue|merge|" +
100 - "Coor([ _]\\w+)?|Coord|reflist|precision[-\\w\\d]+|nowrap[ _]begin|" +
 100+ "reflist|precision[-\\w\\d]+|nowrap[ _]begin|" +
101101 "Audio|\\w+[ _]icon|lang-\\w+|Flagicon|Flag|Flagcountry|" +
102102 "Main|" +
103103 "redirect" //maybe keep that? but we need this for the :'' stripping
@@ -143,6 +143,9 @@
144144 //cruft regarding english/welsh census templates
145145 stripClutterManglers.add( new RegularExpressionMangler("rank\\s*=\\s*\\[\\[List[ _]of[ _][-\\w\\d\\s]+?\\|\\s*Ranked\\s+\\{\\{[-\\w\\d\\s]+?counties\\s*\\|\\s*\\w+=[-\\w\\d\\s]+\\}\\}\\]\\]", "", 0));
146146
 147+ //strip coordinate boxes only after template processing
 148+ stripBoxesManglers.add( new RegularExpressionMangler(templatePattern("Coor([ _]\\w+)?", 0, true), ""));
 149+
147150 conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PLACE,
148151 "^(NUTS|Geography_of|Places|Villages|Towns|Cities|Captials?|Constituencies|Counties|Countries|Municipalities|Settlements|States|Provinces|Territories|Federal_states|Islands|Regions|Domains|Communes|Districts|Locations)" +
149152 "(_|$)|_(places|villages|towns|cities|capitals|constituencies(_.*)?|counties|countries|municipalities|settlements|states|provinces|territories|federal_states|islands|regions|domains|communes|districts|locations)$", 0));
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/geography/wikis/WikiConfiguration_enwiki.java
@@ -1,44 +1,156 @@
22 package de.brightbyte.wikiword.geography.wikis;
33
 4+import java.util.Set;
 5+
46 import de.brightbyte.wikiword.ConceptType;
 7+import de.brightbyte.wikiword.analyzer.AnalyzerUtils;
58 import de.brightbyte.wikiword.analyzer.WikiConfiguration;
 9+import de.brightbyte.wikiword.analyzer.WikiPage;
610 import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor;
7 -import de.brightbyte.wikiword.analyzer.mangler.TextArmor;
8 -import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher;
911 import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher;
1012 import de.brightbyte.wikiword.analyzer.sensor.HasPropertySensor;
11 -import de.brightbyte.wikiword.analyzer.template.DeepTemplateExtractor;
1213 import de.brightbyte.wikiword.analyzer.template.DefaultTemplateParameterPropertySpec;
13 -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor;
14 -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor.Context;
 14+import de.brightbyte.wikiword.analyzer.template.TemplateData;
 15+import de.brightbyte.wikiword.analyzer.template.TemplateParameterPropertySpec;
1516
1617 public class WikiConfiguration_enwiki extends WikiConfiguration {
1718
 19+ protected static class CoordinatePropertySpec implements TemplateParameterPropertySpec {
 20+ protected String name;
 21+
 22+ protected String lat_deg;
 23+ protected String lat_min;
 24+ protected String lat_sec;
 25+ protected String lat_NS;
 26+
 27+ protected String long_deg;
 28+ protected String long_min;
 29+ protected String long_sec;
 30+ protected String long_EW;
 31+
 32+ public CoordinatePropertySpec(String name, String lat_deg, String lat_min, String lat_sec, String lat_NS, String long_deg, String long_min, String long_sec, String long_EW) {
 33+ super();
 34+ this.name = name;
 35+ this.lat_deg = lat_deg;
 36+ this.lat_min = lat_min;
 37+ this.lat_sec = lat_sec;
 38+ this.lat_NS = lat_NS;
 39+ this.long_deg = long_deg;
 40+ this.long_min = long_min;
 41+ this.long_sec = long_sec;
 42+ this.long_EW = long_EW;
 43+ }
 44+
 45+ public String getPropertyName() {
 46+ return name;
 47+ }
 48+
 49+ public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) {
 50+ StringBuilder b = new StringBuilder();
 51+
 52+ if (params.getParameter(lat_deg)==null) return intoValues;
 53+ if (params.getParameter(long_deg)==null) return intoValues;
 54+
 55+ addChunk(params, lat_deg, "°", b);
 56+ addChunk(params, lat_min, "'", b);
 57+ addChunk(params, lat_sec, "\"", b);
 58+ addChunk(params, lat_NS, null, b);
 59+
 60+ b.append(" ");
 61+
 62+ addChunk(params, long_deg, "°", b);
 63+ addChunk(params, long_min, "'", b);
 64+ addChunk(params, long_sec, "\"", b);
 65+ addChunk(params, long_EW, null, b);
 66+
 67+ intoValues = addValue(b, page, intoValues);
 68+ return intoValues;
 69+ }
 70+
 71+ protected boolean addChunk(TemplateData params, CharSequence key, String unit, StringBuilder buffer) {
 72+ if (key==null) return false;
 73+
 74+ CharSequence v = params.getParameter(key);
 75+ if (v==null) return false;
 76+
 77+ buffer.append(v);
 78+ if (unit!=null) buffer.append(unit);
 79+
 80+ return true;
 81+ }
 82+
 83+ protected Set<CharSequence> addValue(CharSequence w, WikiPage page, Set<CharSequence> intoValues) {
 84+ if (w==null || w.length()==0) return intoValues;
 85+
 86+ w = AnalyzerUtils.trim(w);
 87+ if (w.length()==0) return intoValues;
 88+
 89+ intoValues = AnalyzerUtils.addToSet(intoValues, w);
 90+ return intoValues;
 91+ }
 92+
 93+ }
 94+
 95+ protected static class Positional8CoordinatePropertySpec extends CoordinatePropertySpec {
 96+ public Positional8CoordinatePropertySpec(String name) {
 97+ super(name, "1", "2", "3", "4", "5", "6", "7", "8");
 98+ }
 99+
 100+ public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) {
 101+ if (params.getParameter("8")==null || params.getParameter("10")!=null) return intoValues;
 102+ return super.getPropertyValues(page, params, intoValues);
 103+ }
 104+ }
 105+
 106+ protected static class Positional6CoordinatePropertySpec extends CoordinatePropertySpec {
 107+ public Positional6CoordinatePropertySpec(String name) {
 108+ super(name, "1", "2", null, "3", "4", "5", null, "6");
 109+ }
 110+
 111+ public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) {
 112+ if (params.getParameter("6")==null || params.getParameter("8")!=null) return intoValues;
 113+ return super.getPropertyValues(page, params, intoValues);
 114+ }
 115+ }
 116+
 117+ protected static class Positional4CoordinatePropertySpec extends CoordinatePropertySpec {
 118+ public Positional4CoordinatePropertySpec(String name) {
 119+ super(name, "1", null, null, "2", "3", null, null, "4");
 120+ }
 121+
 122+ public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) {
 123+ if (params.getParameter("4")==null || params.getParameter("6")!=null) return intoValues;
 124+ return super.getPropertyValues(page, params, intoValues);
 125+ }
 126+ }
 127+
 128+ protected static class Positional2CoordinatePropertySpec extends CoordinatePropertySpec {
 129+ public Positional2CoordinatePropertySpec(String name) {
 130+ super(name, "1", null, null, null, "2", null, null, null);
 131+ }
 132+
 133+ public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) {
 134+ if (params.getParameter("2")==null || params.getParameter("4")!=null) return intoValues;
 135+ return super.getPropertyValues(page, params, intoValues);
 136+ }
 137+ }
 138+
18139 public WikiConfiguration_enwiki() {
19140 super();
20141
21 - templateExtractorFactory= new TemplateExtractor.Factory() {
22 - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) {
23 - DeepTemplateExtractor extractor = new DeepTemplateExtractor(context, armor);
24 - //FIXME: this needs to accumulate!!!! //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME
25 - return extractor;
26 - }
27 - };
 142+ nestedTemplateFields.put("Protbox", "coord");
 143+ nestedTemplateFields.put("Protbox", "coords");
 144+ nestedTemplateFields.put("Protbox", "coordinates");
28145
29146 //XXX: coord may appear nested. check if it works.
30 - propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Coord"),
31 - new DefaultTemplateParameterPropertySpec("1", "coord-lat-deg").setStripMarkup(true),
32 - new DefaultTemplateParameterPropertySpec("2", "coord-lat-min").setStripMarkup(true),
33 - new DefaultTemplateParameterPropertySpec("3", "coord-lat-sec").setStripMarkup(true),
34 - new DefaultTemplateParameterPropertySpec("4", "coord-lat-NS").setStripMarkup(true),
35 - new DefaultTemplateParameterPropertySpec("5", "coord-long-deg").setStripMarkup(true),
36 - new DefaultTemplateParameterPropertySpec("6", "coord-long-min").setStripMarkup(true),
37 - new DefaultTemplateParameterPropertySpec("7", "coord-long-sec").setStripMarkup(true),
38 - new DefaultTemplateParameterPropertySpec("8", "coord-long-EW").setStripMarkup(true),
39 - new DefaultTemplateParameterPropertySpec("9", "coord-args").setStripMarkup(true)
40 - ) );
 147+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("(.*\\.coords?::)?[Cc]oord", 0, true),
 148+ new Positional8CoordinatePropertySpec( "position"),
 149+ new Positional6CoordinatePropertySpec( "position"),
 150+ new Positional4CoordinatePropertySpec( "position"),
 151+ new Positional2CoordinatePropertySpec( "position")
 152+ ) );
41153
42 - propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("(Geobox|Infobox_(.*_)?([Ss]ettlement|[Cc]ountry|[Ss]tate|[Ll]ocation|[Cc]ounty|[Ll]ake)|.*_constituency_infobox)", 0, true),
 154+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("(Infobox_.*|.*box)", 0, true),
43155 new DefaultTemplateParameterPropertySpec("name", "place-name").setStripMarkup(true),
44156 new DefaultTemplateParameterPropertySpec("native_name", "place-name").setStripMarkup(true),
45157 new DefaultTemplateParameterPropertySpec("common_name", "place-name").setStripMarkup(true),
@@ -50,6 +162,10 @@
51163 new DefaultTemplateParameterPropertySpec("area_sq_mi", "area-mi2").setStripMarkup(true),
52164 new DefaultTemplateParameterPropertySpec("area_total_km2", "area-km2").setStripMarkup(true),
53165 new DefaultTemplateParameterPropertySpec("area_total_sq_mi", "area-mi2").setStripMarkup(true),
 166+
 167+ new DefaultTemplateParameterPropertySpec("elevation", "elevation").setStripMarkup(true),
 168+ new DefaultTemplateParameterPropertySpec("elevation_m", "elevation-m").setStripMarkup(true),
 169+ new DefaultTemplateParameterPropertySpec("elevation_ft", "elevation-ft").setStripMarkup(true),
54170
55171 new DefaultTemplateParameterPropertySpec("timezone", "time-zone").setStripMarkup(true),
56172 new DefaultTemplateParameterPropertySpec("time_zone", "time-zone").setStripMarkup(true),
@@ -60,27 +176,10 @@
61177 new DefaultTemplateParameterPropertySpec("population_density_km2", "population-density-km2").setStripMarkup(true),
62178 new DefaultTemplateParameterPropertySpec("population_density_sq_mi", "population-density-mi2").setStripMarkup(true),
63179
64 - new DefaultTemplateParameterPropertySpec("latd", "coord-lat-deg").setStripMarkup(true),
65 - new DefaultTemplateParameterPropertySpec("latm", "coord-lat-min").setStripMarkup(true),
66 - new DefaultTemplateParameterPropertySpec("lats", "coord-lat-sec").setStripMarkup(true),
67 - new DefaultTemplateParameterPropertySpec("latNS", "coord-lat-NS").setStripMarkup(true),
68 - new DefaultTemplateParameterPropertySpec("longd", "coord-long-deg").setStripMarkup(true),
69 - new DefaultTemplateParameterPropertySpec("longm", "coord-long-min").setStripMarkup(true),
70 - new DefaultTemplateParameterPropertySpec("longs", "coord-long-sec").setStripMarkup(true),
71 - new DefaultTemplateParameterPropertySpec("longEW", "coord-long-EW").setStripMarkup(true),
72 -
73 - new DefaultTemplateParameterPropertySpec("lat_deg", "coord-lat-deg").setStripMarkup(true),
74 - new DefaultTemplateParameterPropertySpec("lat_min", "coord-lat-min").setStripMarkup(true),
75 - new DefaultTemplateParameterPropertySpec("lat_sec", "coord-lat-sec").setStripMarkup(true),
76 - new DefaultTemplateParameterPropertySpec("lat_NS", "coord-lat-NS").setStripMarkup(true),
77 - new DefaultTemplateParameterPropertySpec("long_deg", "coord-long-d").setStripMarkup(true),
78 - new DefaultTemplateParameterPropertySpec("long_min", "coord-long-m").setStripMarkup(true),
79 - new DefaultTemplateParameterPropertySpec("long_sec", "coord-long-s").setStripMarkup(true),
80 - new DefaultTemplateParameterPropertySpec("long_EW", "coord-long-EW").setStripMarkup(true),
81 - new DefaultTemplateParameterPropertySpec("lon_deg", "coord-long-d").setStripMarkup(true),
82 - new DefaultTemplateParameterPropertySpec("lon_min", "coord-long-m").setStripMarkup(true),
83 - new DefaultTemplateParameterPropertySpec("lon_sec", "coord-long-s").setStripMarkup(true),
84 - new DefaultTemplateParameterPropertySpec("lon_EW", "coord-long-EW").setStripMarkup(true)
 180+ new CoordinatePropertySpec( "coordinates", "latd", "latm", "lats", "latNS", "longd", "longm", "longs", "longEW"),
 181+ new CoordinatePropertySpec( "coordinates", "lat_d", "lat_m", "lat_s", "lat_NS", "long_d", "long_m", "long_s", "long_EW"),
 182+ new CoordinatePropertySpec( "coordinates", "lat_deg", "lat_min", "lat_sec", "lat_NS", "lon_deg", "lon_min", "lon_sec", "lon_EW"),
 183+ new CoordinatePropertySpec( "coordinates", "lat_deg", "lat_min", "lat_sec", "lat_NS", "long_deg", "long_min", "long_sec", "long_EW")
85184 ) );
86185
87186 conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PLACE, "area") );
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java
@@ -15,7 +15,6 @@
1616 import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor;
1717 import de.brightbyte.wikiword.analyzer.extractor.TitlePartExtractor;
1818 import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
19 -import de.brightbyte.wikiword.analyzer.mangler.TextArmor;
2019 import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher;
2120 import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher;
2221 import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor;
@@ -25,12 +24,9 @@
2625 import de.brightbyte.wikiword.analyzer.sensor.HasTemplateSensor;
2726 import de.brightbyte.wikiword.analyzer.sensor.TitleSensor;
2827 import de.brightbyte.wikiword.analyzer.template.AbstractTemplateParameterPropertySpec;
29 -import de.brightbyte.wikiword.analyzer.template.DeepTemplateExtractor;
3028 import de.brightbyte.wikiword.analyzer.template.DefaultTemplateParameterPropertySpec;
3129 import de.brightbyte.wikiword.analyzer.template.TemplateData;
32 -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor;
3330 import de.brightbyte.wikiword.analyzer.template.TemplateParameterPropertySpec;
34 -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor.Context;
3531 import de.brightbyte.wikiword.lifescience.LifeScienceConceptType;
3632
3733 public class WikiConfiguration_enwiki extends WikiConfiguration {
@@ -185,18 +181,9 @@
186182 }
187183
188184 public WikiConfiguration_enwiki() {
189 - super();
 185+ nestedTemplateFields.put("Protbox", "Codes");
 186+ nestedTemplateFields.put("Protbox", "Caption");
190187
191 - templateExtractorFactory= new TemplateExtractor.Factory() {
192 - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) {
193 - DeepTemplateExtractor extractor = new DeepTemplateExtractor(context, armor);
194 - extractor.addContainerField("Protbox", "Codes");
195 - extractor.addContainerField("Protbox", "Caption");
196 - //FIXME: this needs to accumulate!!!! //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME
197 - return extractor;
198 - }
199 - };
200 -
201188 //NOTE: apply template replacement only when stripping markup, but then before everything else
202189 stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ICD9", 1, true), " $2 ") );
203190 stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ICD10", 3, true), " $2$3.$4 ") ); //XXX: use all 5 params?

Status & tagging log