Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AbstractAnalyzer.java |
— | — | @@ -24,7 +24,8 @@ |
25 | 25 | if (manglers==null) return text; |
26 | 26 | |
27 | 27 | for (Mangler mangler : manglers) { |
28 | | - text = mangler.mangle(text); |
| 28 | + CharSequence t = mangler.mangle(text); |
| 29 | + text = t; |
29 | 30 | } |
30 | 31 | |
31 | 32 | return text; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java |
— | — | @@ -48,7 +48,9 @@ |
49 | 49 | import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler; |
50 | 50 | import de.brightbyte.wikiword.analyzer.mangler.TextArmor; |
51 | 51 | import de.brightbyte.wikiword.analyzer.sensor.Sensor; |
| 52 | +import de.brightbyte.wikiword.analyzer.template.DeepTemplateExtractor; |
52 | 53 | import de.brightbyte.wikiword.analyzer.template.DummyTemplateUser; |
| 54 | +import de.brightbyte.wikiword.analyzer.template.FlatTemplateExtractor; |
53 | 55 | import de.brightbyte.wikiword.analyzer.template.TemplateData; |
54 | 56 | import de.brightbyte.wikiword.analyzer.template.TemplateExtractor; |
55 | 57 | import de.brightbyte.wikiword.analyzer.template.TemplateUser; |
— | — | @@ -427,7 +429,7 @@ |
428 | 430 | return targetConceptPage; |
429 | 431 | } |
430 | 432 | } |
431 | | - |
| 433 | + |
432 | 434 | protected class Page implements WikiPage { |
433 | 435 | protected int namespace; |
434 | 436 | protected String title; |
— | — | @@ -529,8 +531,12 @@ |
530 | 532 | */ |
531 | 533 | public TemplateExtractor getTemplateExtractor() { |
532 | 534 | if (templateExtractor==null) { |
533 | | - templateExtractor = |
534 | | - WikiTextAnalyzer.this.config.templateExtractorFactory.newTemplateExtractor(WikiTextAnalyzer.this, armor); |
| 535 | + if ( WikiTextAnalyzer.this.config.nestedTemplateFields==null || WikiTextAnalyzer.this.config.nestedTemplateFields.isEmpty() ) { |
| 536 | + templateExtractor = new FlatTemplateExtractor(WikiTextAnalyzer.this, armor); |
| 537 | + } else { |
| 538 | + templateExtractor = new DeepTemplateExtractor(WikiTextAnalyzer.this, armor); |
| 539 | + ((DeepTemplateExtractor)templateExtractor).addContainerFields(WikiTextAnalyzer.this.config.nestedTemplateFields); |
| 540 | + } |
535 | 541 | } |
536 | 542 | |
537 | 543 | return templateExtractor; |
— | — | @@ -973,7 +979,7 @@ |
974 | 980 | private WikiTextSniffer sniffer = new WikiTextSniffer(); |
975 | 981 | private Map<String, String> languageNames; |
976 | 982 | private Map<String, Interwiki> interwikiMap; |
977 | | - |
| 983 | + |
978 | 984 | public WikiTextAnalyzer(PlainTextAnalyzer language) throws IOException { |
979 | 985 | this.language = language; |
980 | 986 | this.corpus = language.getCorpus(); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java |
— | — | @@ -10,6 +10,8 @@ |
11 | 11 | import java.util.Set; |
12 | 12 | import java.util.regex.Pattern; |
13 | 13 | |
| 14 | +import de.brightbyte.data.MultiMap; |
| 15 | +import de.brightbyte.data.ValueSetMultiMap; |
14 | 16 | import de.brightbyte.data.measure.Measure; |
15 | 17 | import de.brightbyte.util.StringUtils; |
16 | 18 | import de.brightbyte.wikiword.ConceptType; |
— | — | @@ -32,8 +34,6 @@ |
33 | 35 | import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor; |
34 | 36 | import de.brightbyte.wikiword.analyzer.sensor.HasTemplateLikeSensor; |
35 | 37 | import de.brightbyte.wikiword.analyzer.sensor.Sensor; |
36 | | -import de.brightbyte.wikiword.analyzer.template.FlatTemplateExtractor; |
37 | | -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor; |
38 | 38 | |
39 | 39 | /** |
40 | 40 | * A WikiConfiguration represents knowledge about language specific and project specific |
— | — | @@ -284,13 +284,13 @@ |
285 | 285 | /** Flag indicating whether extraction of definitions is supported for this wiki **/ |
286 | 286 | public boolean definitionsSupported = true; |
287 | 287 | |
288 | | - public TemplateExtractor.Factory templateExtractorFactory; |
289 | | - |
290 | 288 | protected WikiTextAnalyzer analyzer; |
291 | 289 | |
292 | 290 | protected String wikiName; |
293 | 291 | |
294 | 292 | public Set<Integer> conceptNamespacecs = new HashSet<Integer>(); |
| 293 | + |
| 294 | + protected MultiMap<CharSequence, CharSequence, Set<CharSequence>> nestedTemplateFields = new ValueSetMultiMap<CharSequence, CharSequence>(); |
295 | 295 | |
296 | 296 | protected WikiConfiguration() { |
297 | 297 | this(null); |
— | — | @@ -407,8 +407,6 @@ |
408 | 408 | useCategoryAliases = true; |
409 | 409 | mainArtikeMarkerPattern = Pattern.compile("^[- !_*$@#+~/%]?"); //use "category main articles" to resolve plural names |
410 | 410 | |
411 | | - this.templateExtractorFactory = FlatTemplateExtractor.factory; |
412 | | - |
413 | 411 | this.useSuffixAsCategory = false; |
414 | 412 | this.definitionsSupported = true; |
415 | 413 | this.flatTextSupported = true; |
— | — | @@ -497,10 +495,10 @@ |
498 | 496 | supplementedConceptExtractors.addAll(with.supplementedConceptExtractors); |
499 | 497 | |
500 | 498 | extraTemplatePatterns.addAll(with.extraTemplatePatterns); |
| 499 | + nestedTemplateFields.putAll(with.nestedTemplateFields); |
501 | 500 | |
502 | 501 | //if (with.language!=null) language = with.language; |
503 | 502 | |
504 | | - if (with.templateExtractorFactory!=null) templateExtractorFactory = with.templateExtractorFactory; |
505 | 503 | if (with.linkTrail!=null) linkTrail = with.linkTrail; |
506 | 504 | if (with.badLinkPattern!=null) badLinkPattern = with.badLinkPattern; |
507 | 505 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/DefaultTemplateParameterPropertySpec.java |
— | — | @@ -105,17 +105,17 @@ |
106 | 106 | return this; |
107 | 107 | } |
108 | 108 | |
109 | | - public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> values) { |
| 109 | + public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) { |
110 | 110 | CharSequence v = params.getParameter(parameter); |
111 | | - if (v==null) return values; |
112 | | - if (v.length()==0) return values; |
| 111 | + if (v==null) return intoValues; |
| 112 | + if (v.length()==0) return intoValues; |
113 | 113 | |
114 | 114 | if (clean!=null) { |
115 | 115 | for (Mangler m: clean) v = m.mangle(v); |
116 | 116 | } |
117 | 117 | |
118 | 118 | if (cond!=null) { |
119 | | - if (!cond.matches(v)) return values; |
| 119 | + if (!cond.matches(v)) return intoValues; |
120 | 120 | } |
121 | 121 | |
122 | 122 | if (split!=null) { |
— | — | @@ -139,7 +139,7 @@ |
140 | 140 | if (done) i = j; |
141 | 141 | else i = split.end(); |
142 | 142 | |
143 | | - values = addValue(w, page, values); |
| 143 | + intoValues = addValue(w, page, intoValues); |
144 | 144 | } |
145 | 145 | } |
146 | 146 | else if (find!=null) { |
— | — | @@ -147,14 +147,14 @@ |
148 | 148 | while (find.find()) { |
149 | 149 | CharSequence w = find.groupCount() > 0 ? find.group(1) : find.group(); |
150 | 150 | |
151 | | - values = addValue(w, page, values); |
| 151 | + intoValues = addValue(w, page, intoValues); |
152 | 152 | } |
153 | 153 | } |
154 | 154 | else if (split==null) { |
155 | | - values = addValue(v, page, values); |
| 155 | + intoValues = addValue(v, page, intoValues); |
156 | 156 | } |
157 | 157 | |
158 | | - return values; |
| 158 | + return intoValues; |
159 | 159 | } |
160 | 160 | |
161 | 161 | protected Set<CharSequence> addValue(CharSequence w, WikiPage page, Set<CharSequence> values) { |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/DeepTemplateExtractor.java |
— | — | @@ -21,12 +21,6 @@ |
22 | 22 | |
23 | 23 | public class DeepTemplateExtractor extends AbstractTemplateExtractor { |
24 | 24 | |
25 | | - public static final Factory factory = new Factory() { |
26 | | - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) { |
27 | | - return new DeepTemplateExtractor(context, armor); |
28 | | - } |
29 | | - }; |
30 | | - |
31 | 25 | public DeepTemplateExtractor(Context context, TextArmor armor) { |
32 | 26 | super(context, armor); |
33 | 27 | } |
— | — | @@ -251,11 +245,12 @@ |
252 | 246 | v = markerScanner.getText().subSequence(start, end); |
253 | 247 | |
254 | 248 | v = AnalyzerUtils.trim(v); |
255 | | - v = stripMarkup(v, true); |
256 | 249 | |
257 | | - v = HtmlEntities.decodeEntities(v); |
258 | | - |
259 | | - data.setParameter(n, v); |
| 250 | + if (v.length()>0) { |
| 251 | + v = stripMarkup(v, true); |
| 252 | + v = HtmlEntities.decodeEntities(v); |
| 253 | + data.setParameter(n, v); |
| 254 | + } |
260 | 255 | } |
261 | 256 | } |
262 | 257 | |
— | — | @@ -263,16 +258,29 @@ |
264 | 259 | |
265 | 260 | private String getPrefix(CharSequence template, CharSequence parameter) { /* Returns template, a dot, and parameter when the pair is registered as a container field; otherwise null. */ |
266 | 261 | if (containerFields==null) return null; /* nothing registered at all */ |
267 | | - if (!containerFields.contains(template, parameter)) return null; |
| 262 | + |
| 263 | + if (!containerFields.contains(template, parameter)) { |
| 264 | + if (containerFields.contains("", parameter)) { /* fallback: parameter registered under the empty template key */ |
| 265 | + return template + "." + parameter; |
| 266 | + } else { |
| 267 | + return null; |
| 268 | + } |
| 269 | + } |
268 | 270 | |
269 | 271 | return template + "." + parameter; /* exact (template, parameter) registration */ |
270 | 272 | } |
271 | 273 | |
272 | 274 | public void addContainerField(CharSequence template, CharSequence parameter) { /* Registers one (template, parameter) pair as a container field. */ |
| 275 | + if (template==null) template = ""; /* a null template is stored under the empty key, which getPrefix checks as a fallback for any template name */ |
273 | 276 | if (containerFields==null) containerFields = new ValueSetMultiMap<CharSequence, CharSequence>(); /* map is created lazily on first use */ |
274 | 277 | containerFields.put(template, parameter); |
275 | 278 | } |
276 | 279 | |
| 280 | + public void addContainerFields(MultiMap<CharSequence, CharSequence, Set<CharSequence>> nestedTemplateFields) { /* Bulk variant: copies every entry of the given multimap into containerFields. */ |
| 281 | + if (containerFields==null) containerFields = new ValueSetMultiMap<CharSequence, CharSequence>(); /* map is created lazily on first use */ |
| 282 | + containerFields.putAll(nestedTemplateFields); |
| 283 | + } |
| 284 | + |
277 | 285 | private void parseLink(String prefix) { |
278 | 286 | int m; |
279 | 287 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/TemplateExtractor.java |
— | — | @@ -13,9 +13,5 @@ |
14 | 14 | public boolean isRelevantTemplate(CharSequence name); |
15 | 15 | } |
16 | 16 | |
17 | | - public interface Factory { |
18 | | - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor); |
19 | | - } |
20 | | - |
21 | 17 | public MultiMap<String, TemplateData, List<TemplateData>> extractTemplates(CharSequence text); |
22 | 18 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/TemplateParameterPropertySpec.java |
— | — | @@ -11,7 +11,7 @@ |
12 | 12 | |
13 | 13 | /** determines a property value from a map of template parameters. |
14 | 14 | **/ |
15 | | - public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> values); |
| 15 | + public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues); |
16 | 16 | |
17 | 17 | public String getPropertyName(); |
18 | 18 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/template/FlatTemplateExtractor.java |
— | — | @@ -13,12 +13,6 @@ |
14 | 14 | |
15 | 15 | public class FlatTemplateExtractor extends AbstractTemplateExtractor { |
16 | 16 | |
17 | | - public static final Factory factory = new Factory() { |
18 | | - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) { |
19 | | - return new FlatTemplateExtractor(context, armor); |
20 | | - } |
21 | | - }; |
22 | | - |
23 | 17 | private Matcher templateMarkerMatcher = Pattern.compile("\\{\\{([^|]+?)(?=\\||\\}\\}|\\{\\{)|\\}\\}").matcher(""); |
24 | 18 | private Matcher templateParamMatcher = Pattern.compile("\\||\\{\\{!\\}\\}").matcher(""); |
25 | 19 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -96,7 +96,7 @@ |
97 | 97 | "Portal|Stub[-\\w]*|commons|Cong(Bio2?|Links)|" + |
98 | 98 | "Tnavbar|Navbox([ _]generic)?|redirect|pp-.*?|" + |
99 | 99 | "ambox|wikify|pov|cleanup|globalize|split|current|issue|merge|" + |
100 | | - "Coor([ _]\\w+)?|Coord|reflist|precision[-\\w\\d]+|nowrap[ _]begin|" + |
| 100 | + "reflist|precision[-\\w\\d]+|nowrap[ _]begin|" + |
101 | 101 | "Audio|\\w+[ _]icon|lang-\\w+|Flagicon|Flag|Flagcountry|" + |
102 | 102 | "Main|" + |
103 | 103 | "redirect" //maybe keep that? but we need this for the :'' stripping |
— | — | @@ -143,6 +143,9 @@ |
144 | 144 | //cruft regarding english/welsh census templates |
145 | 145 | stripClutterManglers.add( new RegularExpressionMangler("rank\\s*=\\s*\\[\\[List[ _]of[ _][-\\w\\d\\s]+?\\|\\s*Ranked\\s+\\{\\{[-\\w\\d\\s]+?counties\\s*\\|\\s*\\w+=[-\\w\\d\\s]+\\}\\}\\]\\]", "", 0)); |
146 | 146 | |
| 147 | + //strip coordinate boxes only after template processing |
| 148 | + stripBoxesManglers.add( new RegularExpressionMangler(templatePattern("Coor([ _]\\w+)?", 0, true), "")); |
| 149 | + |
147 | 150 | conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PLACE, |
148 | 151 | "^(NUTS|Geography_of|Places|Villages|Towns|Cities|Captials?|Constituencies|Counties|Countries|Municipalities|Settlements|States|Provinces|Territories|Federal_states|Islands|Regions|Domains|Communes|Districts|Locations)" + |
149 | 152 | "(_|$)|_(places|villages|towns|cities|capitals|constituencies(_.*)?|counties|countries|municipalities|settlements|states|provinces|territories|federal_states|islands|regions|domains|communes|districts|locations)$", 0)); |
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/geography/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -1,44 +1,156 @@ |
2 | 2 | package de.brightbyte.wikiword.geography.wikis; |
3 | 3 | |
| 4 | +import java.util.Set; |
| 5 | + |
4 | 6 | import de.brightbyte.wikiword.ConceptType; |
| 7 | +import de.brightbyte.wikiword.analyzer.AnalyzerUtils; |
5 | 8 | import de.brightbyte.wikiword.analyzer.WikiConfiguration; |
| 9 | +import de.brightbyte.wikiword.analyzer.WikiPage; |
6 | 10 | import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor; |
7 | | -import de.brightbyte.wikiword.analyzer.mangler.TextArmor; |
8 | | -import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher; |
9 | 11 | import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher; |
10 | 12 | import de.brightbyte.wikiword.analyzer.sensor.HasPropertySensor; |
11 | | -import de.brightbyte.wikiword.analyzer.template.DeepTemplateExtractor; |
12 | 13 | import de.brightbyte.wikiword.analyzer.template.DefaultTemplateParameterPropertySpec; |
13 | | -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor; |
14 | | -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor.Context; |
| 14 | +import de.brightbyte.wikiword.analyzer.template.TemplateData; |
| 15 | +import de.brightbyte.wikiword.analyzer.template.TemplateParameterPropertySpec; |
15 | 16 | |
16 | 17 | public class WikiConfiguration_enwiki extends WikiConfiguration { |
17 | 18 | |
| 19 | + protected static class CoordinatePropertySpec implements TemplateParameterPropertySpec { /* Property spec that joins latitude and longitude template parameters into one coordinate string value. */ |
| 20 | + protected String name; /* property name reported by getPropertyName() */ |
| 21 | + |
| 22 | + protected String lat_deg; /* the lat_* and long_* fields hold template parameter keys, not coordinate values */ |
| 23 | + protected String lat_min; |
| 24 | + protected String lat_sec; |
| 25 | + protected String lat_NS; |
| 26 | + |
| 27 | + protected String long_deg; |
| 28 | + protected String long_min; |
| 29 | + protected String long_sec; |
| 30 | + protected String long_EW; |
| 31 | + |
| 32 | + public CoordinatePropertySpec(String name, String lat_deg, String lat_min, String lat_sec, String lat_NS, String long_deg, String long_min, String long_sec, String long_EW) { /* Stores the property name and the eight parameter keys; a null key is skipped by addChunk. */ |
| 33 | + super(); |
| 34 | + this.name = name; |
| 35 | + this.lat_deg = lat_deg; |
| 36 | + this.lat_min = lat_min; |
| 37 | + this.lat_sec = lat_sec; |
| 38 | + this.lat_NS = lat_NS; |
| 39 | + this.long_deg = long_deg; |
| 40 | + this.long_min = long_min; |
| 41 | + this.long_sec = long_sec; |
| 42 | + this.long_EW = long_EW; |
| 43 | + } |
| 44 | + |
| 45 | + public String getPropertyName() { |
| 46 | + return name; |
| 47 | + } |
| 48 | + |
| 49 | + public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) { /* Builds the coordinate string from params and adds it to intoValues; returns the resulting set. */ |
| 50 | + StringBuilder b = new StringBuilder(); |
| 51 | + |
| 52 | + if (params.getParameter(lat_deg)==null) return intoValues; /* both degree parameters are required; otherwise the set is returned untouched */ |
| 53 | + if (params.getParameter(long_deg)==null) return intoValues; |
| 54 | + |
| 55 | + addChunk(params, lat_deg, "°", b); /* latitude part: degrees, minutes, seconds each followed by its unit sign, then the N/S parameter with no suffix */ |
| 56 | + addChunk(params, lat_min, "'", b); |
| 57 | + addChunk(params, lat_sec, "\"", b); |
| 58 | + addChunk(params, lat_NS, null, b); |
| 59 | + |
| 60 | + b.append(" "); /* single space separates latitude from longitude */ |
| 61 | + |
| 62 | + addChunk(params, long_deg, "°", b); /* longitude part, same layout as latitude */ |
| 63 | + addChunk(params, long_min, "'", b); |
| 64 | + addChunk(params, long_sec, "\"", b); |
| 65 | + addChunk(params, long_EW, null, b); |
| 66 | + |
| 67 | + intoValues = addValue(b, page, intoValues); |
| 68 | + return intoValues; |
| 69 | + } |
| 70 | + |
| 71 | + protected boolean addChunk(TemplateData params, CharSequence key, String unit, StringBuilder buffer) { /* Appends the value of parameter key, plus unit if given, to buffer; returns false and appends nothing when key is null or the parameter is absent. */ |
| 72 | + if (key==null) return false; |
| 73 | + |
| 74 | + CharSequence v = params.getParameter(key); |
| 75 | + if (v==null) return false; |
| 76 | + |
| 77 | + buffer.append(v); |
| 78 | + if (unit!=null) buffer.append(unit); |
| 79 | + |
| 80 | + return true; |
| 81 | + } |
| 82 | + |
| 83 | + protected Set<CharSequence> addValue(CharSequence w, WikiPage page, Set<CharSequence> intoValues) { /* Trims w and adds it via AnalyzerUtils.addToSet; null or empty values (before or after trimming) leave intoValues unchanged. The page argument is not used here. */ |
| 84 | + if (w==null || w.length()==0) return intoValues; |
| 85 | + |
| 86 | + w = AnalyzerUtils.trim(w); |
| 87 | + if (w.length()==0) return intoValues; |
| 88 | + |
| 89 | + intoValues = AnalyzerUtils.addToSet(intoValues, w); /* the returned set is used, not assumed to be the same instance that was passed in */ |
| 90 | + return intoValues; |
| 91 | + } |
| 92 | + |
| 93 | + } |
| 94 | + |
| 95 | + protected static class Positional8CoordinatePropertySpec extends CoordinatePropertySpec { /* Coordinate spec reading positional parameters 1 to 8: lat deg, min, sec, N/S, then long deg, min, sec, E/W. */ |
| 96 | + public Positional8CoordinatePropertySpec(String name) { |
| 97 | + super(name, "1", "2", "3", "4", "5", "6", "7", "8"); |
| 98 | + } |
| 99 | + |
| 100 | + public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) { |
| 101 | + if (params.getParameter("8")==null || params.getParameter("10")!=null) return intoValues; /* only applies when parameter 8 is present and parameter 10 is absent */ |
| 102 | + return super.getPropertyValues(page, params, intoValues); |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + protected static class Positional6CoordinatePropertySpec extends CoordinatePropertySpec { /* Coordinate spec reading positional parameters 1 to 6: lat deg, min, N/S, then long deg, min, E/W; no seconds keys. */ |
| 107 | + public Positional6CoordinatePropertySpec(String name) { |
| 108 | + super(name, "1", "2", null, "3", "4", "5", null, "6"); |
| 109 | + } |
| 110 | + |
| 111 | + public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) { |
| 112 | + if (params.getParameter("6")==null || params.getParameter("8")!=null) return intoValues; /* only applies when parameter 6 is present and parameter 8 is absent */ |
| 113 | + return super.getPropertyValues(page, params, intoValues); |
| 114 | + } |
| 115 | + } |
| 116 | + |
| 117 | + protected static class Positional4CoordinatePropertySpec extends CoordinatePropertySpec { /* Coordinate spec reading positional parameters 1 to 4: lat deg, N/S, long deg, E/W; no minutes or seconds keys. */ |
| 118 | + public Positional4CoordinatePropertySpec(String name) { |
| 119 | + super(name, "1", null, null, "2", "3", null, null, "4"); |
| 120 | + } |
| 121 | + |
| 122 | + public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) { |
| 123 | + if (params.getParameter("4")==null || params.getParameter("6")!=null) return intoValues; /* only applies when parameter 4 is present and parameter 6 is absent */ |
| 124 | + return super.getPropertyValues(page, params, intoValues); |
| 125 | + } |
| 126 | + } |
| 127 | + |
| 128 | + protected static class Positional2CoordinatePropertySpec extends CoordinatePropertySpec { /* Coordinate spec reading positional parameters 1 and 2 as latitude and longitude degrees; all other keys are null. */ |
| 129 | + public Positional2CoordinatePropertySpec(String name) { |
| 130 | + super(name, "1", null, null, null, "2", null, null, null); |
| 131 | + } |
| 132 | + |
| 133 | + public Set<CharSequence> getPropertyValues(WikiPage page, TemplateData params, Set<CharSequence> intoValues) { |
| 134 | + if (params.getParameter("2")==null || params.getParameter("4")!=null) return intoValues; /* only applies when parameter 2 is present and parameter 4 is absent */ |
| 135 | + return super.getPropertyValues(page, params, intoValues); |
| 136 | + } |
| 137 | + } |
| 138 | + |
18 | 139 | public WikiConfiguration_enwiki() { |
19 | 140 | super(); |
20 | 141 | |
21 | | - templateExtractorFactory= new TemplateExtractor.Factory() { |
22 | | - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) { |
23 | | - DeepTemplateExtractor extractor = new DeepTemplateExtractor(context, armor); |
24 | | - //FIXME: this needs to accumulate!!!! //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME |
25 | | - return extractor; |
26 | | - } |
27 | | - }; |
| 142 | + nestedTemplateFields.put("Protbox", "coord"); |
| 143 | + nestedTemplateFields.put("Protbox", "coords"); |
| 144 | + nestedTemplateFields.put("Protbox", "coordinates"); |
28 | 145 | |
29 | 146 | //XXX: coord may appear nested. check if it works. |
30 | | - propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Coord"), |
31 | | - new DefaultTemplateParameterPropertySpec("1", "coord-lat-deg").setStripMarkup(true), |
32 | | - new DefaultTemplateParameterPropertySpec("2", "coord-lat-min").setStripMarkup(true), |
33 | | - new DefaultTemplateParameterPropertySpec("3", "coord-lat-sec").setStripMarkup(true), |
34 | | - new DefaultTemplateParameterPropertySpec("4", "coord-lat-NS").setStripMarkup(true), |
35 | | - new DefaultTemplateParameterPropertySpec("5", "coord-long-deg").setStripMarkup(true), |
36 | | - new DefaultTemplateParameterPropertySpec("6", "coord-long-min").setStripMarkup(true), |
37 | | - new DefaultTemplateParameterPropertySpec("7", "coord-long-sec").setStripMarkup(true), |
38 | | - new DefaultTemplateParameterPropertySpec("8", "coord-long-EW").setStripMarkup(true), |
39 | | - new DefaultTemplateParameterPropertySpec("9", "coord-args").setStripMarkup(true) |
40 | | - ) ); |
| 147 | + propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("(.*\\.coords?::)?[Cc]oord", 0, true), |
| 148 | + new Positional8CoordinatePropertySpec( "position"), |
| 149 | + new Positional6CoordinatePropertySpec( "position"), |
| 150 | + new Positional4CoordinatePropertySpec( "position"), |
| 151 | + new Positional2CoordinatePropertySpec( "position") |
| 152 | + ) ); |
41 | 153 | |
42 | | - propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("(Geobox|Infobox_(.*_)?([Ss]ettlement|[Cc]ountry|[Ss]tate|[Ll]ocation|[Cc]ounty|[Ll]ake)|.*_constituency_infobox)", 0, true), |
| 154 | + propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("(Infobox_.*|.*box)", 0, true), |
43 | 155 | new DefaultTemplateParameterPropertySpec("name", "place-name").setStripMarkup(true), |
44 | 156 | new DefaultTemplateParameterPropertySpec("native_name", "place-name").setStripMarkup(true), |
45 | 157 | new DefaultTemplateParameterPropertySpec("common_name", "place-name").setStripMarkup(true), |
— | — | @@ -50,6 +162,10 @@ |
51 | 163 | new DefaultTemplateParameterPropertySpec("area_sq_mi", "area-mi2").setStripMarkup(true), |
52 | 164 | new DefaultTemplateParameterPropertySpec("area_total_km2", "area-km2").setStripMarkup(true), |
53 | 165 | new DefaultTemplateParameterPropertySpec("area_total_sq_mi", "area-mi2").setStripMarkup(true), |
| 166 | + |
| 167 | + new DefaultTemplateParameterPropertySpec("elevation", "elevation").setStripMarkup(true), |
| 168 | + new DefaultTemplateParameterPropertySpec("elevation_m", "elevation-m").setStripMarkup(true), |
| 169 | + new DefaultTemplateParameterPropertySpec("elevation_ft", "elevation-ft").setStripMarkup(true), |
54 | 170 | |
55 | 171 | new DefaultTemplateParameterPropertySpec("timezone", "time-zone").setStripMarkup(true), |
56 | 172 | new DefaultTemplateParameterPropertySpec("time_zone", "time-zone").setStripMarkup(true), |
— | — | @@ -60,27 +176,10 @@ |
61 | 177 | new DefaultTemplateParameterPropertySpec("population_density_km2", "population-density-km2").setStripMarkup(true), |
62 | 178 | new DefaultTemplateParameterPropertySpec("population_density_sq_mi", "population-density-mi2").setStripMarkup(true), |
63 | 179 | |
64 | | - new DefaultTemplateParameterPropertySpec("latd", "coord-lat-deg").setStripMarkup(true), |
65 | | - new DefaultTemplateParameterPropertySpec("latm", "coord-lat-min").setStripMarkup(true), |
66 | | - new DefaultTemplateParameterPropertySpec("lats", "coord-lat-sec").setStripMarkup(true), |
67 | | - new DefaultTemplateParameterPropertySpec("latNS", "coord-lat-NS").setStripMarkup(true), |
68 | | - new DefaultTemplateParameterPropertySpec("longd", "coord-long-deg").setStripMarkup(true), |
69 | | - new DefaultTemplateParameterPropertySpec("longm", "coord-long-min").setStripMarkup(true), |
70 | | - new DefaultTemplateParameterPropertySpec("longs", "coord-long-sec").setStripMarkup(true), |
71 | | - new DefaultTemplateParameterPropertySpec("longEW", "coord-long-EW").setStripMarkup(true), |
72 | | - |
73 | | - new DefaultTemplateParameterPropertySpec("lat_deg", "coord-lat-deg").setStripMarkup(true), |
74 | | - new DefaultTemplateParameterPropertySpec("lat_min", "coord-lat-min").setStripMarkup(true), |
75 | | - new DefaultTemplateParameterPropertySpec("lat_sec", "coord-lat-sec").setStripMarkup(true), |
76 | | - new DefaultTemplateParameterPropertySpec("lat_NS", "coord-lat-NS").setStripMarkup(true), |
77 | | - new DefaultTemplateParameterPropertySpec("long_deg", "coord-long-d").setStripMarkup(true), |
78 | | - new DefaultTemplateParameterPropertySpec("long_min", "coord-long-m").setStripMarkup(true), |
79 | | - new DefaultTemplateParameterPropertySpec("long_sec", "coord-long-s").setStripMarkup(true), |
80 | | - new DefaultTemplateParameterPropertySpec("long_EW", "coord-long-EW").setStripMarkup(true), |
81 | | - new DefaultTemplateParameterPropertySpec("lon_deg", "coord-long-d").setStripMarkup(true), |
82 | | - new DefaultTemplateParameterPropertySpec("lon_min", "coord-long-m").setStripMarkup(true), |
83 | | - new DefaultTemplateParameterPropertySpec("lon_sec", "coord-long-s").setStripMarkup(true), |
84 | | - new DefaultTemplateParameterPropertySpec("lon_EW", "coord-long-EW").setStripMarkup(true) |
| 180 | + new CoordinatePropertySpec( "coordinates", "latd", "latm", "lats", "latNS", "longd", "longm", "longs", "longEW"), |
| 181 | + new CoordinatePropertySpec( "coordinates", "lat_d", "lat_m", "lat_s", "lat_NS", "long_d", "long_m", "long_s", "long_EW"), |
| 182 | + new CoordinatePropertySpec( "coordinates", "lat_deg", "lat_min", "lat_sec", "lat_NS", "lon_deg", "lon_min", "lon_sec", "lon_EW"), |
| 183 | + new CoordinatePropertySpec( "coordinates", "lat_deg", "lat_min", "lat_sec", "lat_NS", "long_deg", "long_min", "long_sec", "long_EW") |
85 | 184 | ) ); |
86 | 185 | |
87 | 186 | conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PLACE, "area") ); |
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -15,7 +15,6 @@ |
16 | 16 | import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor; |
17 | 17 | import de.brightbyte.wikiword.analyzer.extractor.TitlePartExtractor; |
18 | 18 | import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler; |
19 | | -import de.brightbyte.wikiword.analyzer.mangler.TextArmor; |
20 | 19 | import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher; |
21 | 20 | import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher; |
22 | 21 | import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor; |
— | — | @@ -25,12 +24,9 @@ |
26 | 25 | import de.brightbyte.wikiword.analyzer.sensor.HasTemplateSensor; |
27 | 26 | import de.brightbyte.wikiword.analyzer.sensor.TitleSensor; |
28 | 27 | import de.brightbyte.wikiword.analyzer.template.AbstractTemplateParameterPropertySpec; |
29 | | -import de.brightbyte.wikiword.analyzer.template.DeepTemplateExtractor; |
30 | 28 | import de.brightbyte.wikiword.analyzer.template.DefaultTemplateParameterPropertySpec; |
31 | 29 | import de.brightbyte.wikiword.analyzer.template.TemplateData; |
32 | | -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor; |
33 | 30 | import de.brightbyte.wikiword.analyzer.template.TemplateParameterPropertySpec; |
34 | | -import de.brightbyte.wikiword.analyzer.template.TemplateExtractor.Context; |
35 | 31 | import de.brightbyte.wikiword.lifescience.LifeScienceConceptType; |
36 | 32 | |
37 | 33 | public class WikiConfiguration_enwiki extends WikiConfiguration { |
— | — | @@ -185,18 +181,9 @@ |
186 | 182 | } |
187 | 183 | |
188 | 184 | public WikiConfiguration_enwiki() { |
189 | | - super(); |
| 185 | + nestedTemplateFields.put("Protbox", "Codes"); |
| 186 | + nestedTemplateFields.put("Protbox", "Caption"); |
190 | 187 | |
191 | | - templateExtractorFactory= new TemplateExtractor.Factory() { |
192 | | - public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) { |
193 | | - DeepTemplateExtractor extractor = new DeepTemplateExtractor(context, armor); |
194 | | - extractor.addContainerField("Protbox", "Codes"); |
195 | | - extractor.addContainerField("Protbox", "Caption"); |
196 | | - //FIXME: this needs to accumulate!!!! //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME |
197 | | - return extractor; |
198 | | - } |
199 | | - }; |
200 | | - |
201 | 188 | //NOTE: apply template replacement only when stripping markup, but then before everything else |
202 | 189 | stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ICD9", 1, true), " $2 ") ); |
203 | 190 | stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ICD10", 3, true), " $2$3.$4 ") ); //XXX: use all 5 params? |