r48414 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r48413‎ | r48414 | r48415 >
Date:23:16, 14 March 2009
Author:daniel
Status:deferred
Tags:
Comment:
artist biography properties
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/debug-biography-tweaks.properties (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/TemplateExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/tweaks.properties (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder4Biography/src/main/java/de/brightbyte/wikiword/biography/wikis/WikiConfiguration_dewiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder4Biography/src/main/java/de/brightbyte/wikiword/biography/wikis/WikiConfiguration_enwiki.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder4Biography/src/main/java/de/brightbyte/wikiword/biography/wikis/WikiConfiguration_enwiki.java
@@ -13,9 +13,15 @@
1414
1515 stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("awd", 1, true), "$1")); //TODO: {{awd|award|year|title|role|name}}
1616
17 - propertyExtractors.add( new WikiTextAnalyzer.TemplateParameterExtractor(new WikiTextAnalyzer.ExactNameMatcher("Personendaten"),
18 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-sortname", "NAME").setStripMarkup(true),
19 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-name", "NAME").setStripMarkup(true),
 17+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("^(\\d+s?)_births$", "$1", 0, "person-birth-date") );
 18+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("^(\\d+s?)_deaths$", "$1", 0, "person-death-date") );
 19+
 20+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("^(.+)_(artists|painters|sculptors)$", "$1", 0, "artist-group") );
 21+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("(^|_)(painter|sculptor|photographer)s$", "$2", Pattern.CASE_INSENSITIVE, "artist-group") );
 22+
 23+ propertyExtractors.add( new WikiTextAnalyzer.TemplateParameterExtractor(new WikiTextAnalyzer.ExactNameMatcher("Persondata"),
 24+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("NAME", "person-sortname").setStripMarkup(true),
 25+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("NAME", "person-name").setStripMarkup(true),
2026 new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-name", "ALTERNATIV NAMENS").setStripMarkup(true)
2127 .setSplitPattern(Pattern.compile("\\s[;]\\s")).addNormalizer(Pattern.compile("\\(.*?\\)"),""),
2228 new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-occupation", "SHORT DESCRIPTION").setStripMarkup(true),
@@ -26,18 +32,18 @@
2733 ) );
2834
2935 propertyExtractors.add( new WikiTextAnalyzer.TemplateParameterExtractor(new WikiTextAnalyzer.ExactNameMatcher("Infobox_Artist"),
30 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-name", "name").setStripMarkup(true),
31 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-name", "birthname").setStripMarkup(true),
32 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-birth-date", "birthdate").setStripMarkup(true),
33 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-birth-place", "birthplace").setStripMarkup(true),
34 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-birth-place", "location").setStripMarkup(true),
35 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-death-date", "deathdate").setStripMarkup(true),
36 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-death-place", "deathplace").setStripMarkup(true),
37 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-nationality", "nationality").setStripMarkup(true),
38 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("artist-field", "field").setStripMarkup(true),
39 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("artist-movement", "movement").setStripMarkup(true),
40 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("artist-training", "training").setStripMarkup(true),
41 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("artist-award", "award").setStripMarkup(true)
 36+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("name", "person-name").setStripMarkup(true),
 37+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("birthname", "person-name").setStripMarkup(true),
 38+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("birthdate", "person-birth-date").setStripMarkup(true),
 39+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("birthplace", "person-birth-place").setStripMarkup(true),
 40+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("location", "person-birth-place").setStripMarkup(true),
 41+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("deathdate", "person-death-date").setStripMarkup(true),
 42+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("deathplace", "person-death-place").setStripMarkup(true),
 43+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("nationality", "person-nationality").setStripMarkup(true),
 44+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("field", "artist-group").setStripMarkup(true),
 45+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("movement", "artist-group").setStripMarkup(true),
 46+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("training", "artist-training").setStripMarkup(true),
 47+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("award", "artist-award").setStripMarkup(true)
4248 ) );
4349
4450 conceptTypeSensors.add( new WikiTextAnalyzer.HasTemplateLikeSensor(ConceptType.PERSON, "^(Infobox[ ]Artist)$", 0));
Index: trunk/WikiWord/WikiWordBuilder4Biography/src/main/java/de/brightbyte/wikiword/biography/wikis/WikiConfiguration_dewiki.java
@@ -10,16 +10,24 @@
1111 public WikiConfiguration_dewiki() {
1212 super();
1313
 14+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("^Geboren_(\\d+(_v\\._Chr\\.)?)$", "$1", 0, "person-birth-date") );
 15+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("^Gestorben_(\\d+(_v\\._Chr\\.)?)$", "$1", 0, "person-death-date") );
 16+
 17+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("^Maler_(der|des)_(.+)$", "$2", 0, "artist-group") );
 18+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("^(Maler|Bildhauer|Fotograf)(_|$)", "$2", 0, "artist-group") );
 19+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("[^_](maler|bildhauer|fotograf)$", "$2", 0, "artist-group") );
 20+ propertyExtractors.add( new WikiTextAnalyzer.CategoryPatternParameterExtractor("([-_\\wäöü]+)(maler|bildhauer|fotograf)$", "$1", 0, "artist-group") );
 21+
1422 propertyExtractors.add( new WikiTextAnalyzer.TemplateParameterExtractor(new WikiTextAnalyzer.ExactNameMatcher("Personendaten"),
15 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-sortname", "NAME").setStripMarkup(true),
16 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-name", "NAME").setStripMarkup(true),
17 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-name", "ALTERNATIVNAMEN").setStripMarkup(true)
 23+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("NAME", "person-sortname").setStripMarkup(true),
 24+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("NAME", "person-name").setStripMarkup(true),
 25+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("ALTERNATIVNAMEN", "person-name").setStripMarkup(true)
1826 .setSplitPattern(Pattern.compile("\\s[;]\\s")).addNormalizer(Pattern.compile("\\(.*?\\)"),""),
19 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-occupation", "KURZBESCHREIBUNG").setStripMarkup(true),
20 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-birth-date", "GEBURTSDATUM").setStripMarkup(true),
21 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-death-date", "STERBEDATUM").setStripMarkup(true),
22 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-birth-place", "GEBURTSORT").setStripMarkup(true),
23 - new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("person-death-place", "STERBEORT").setStripMarkup(true)
 27+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("KURZBESCHREIBUNG", "person-occupation").setStripMarkup(true),
 28+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("GEBURTSDATUM", "person-birth-date").setStripMarkup(true),
 29+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("STERBEDATUM", "person-death-date").setStripMarkup(true),
 30+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("GEBURTSORT", "person-birth-place").setStripMarkup(true),
 31+ new WikiTextAnalyzer.DefaultTemplateParameterPropertySpec("STERBEORT", "person-death-place").setStripMarkup(true)
2432 ) );
2533
2634 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/TemplateExtractor.java
@@ -4,7 +4,6 @@
55 import java.util.HashMap;
66 import java.util.List;
77 import java.util.Map;
8 -import java.util.regex.Pattern;
98
109 import de.brightbyte.data.MultiMap;
1110
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java
@@ -237,38 +237,38 @@
238238 }
239239 }
240240
241 - public interface NameMatcher {
 241+ public interface AttributeMatcher<T> extends Filter<T> {
 242+ public boolean matches(T s);
242243
243 - public boolean matches(CharSequence s);
 244+ public <V> Iterable<V> matches(Map<? extends T, V> m);
 245+ public <V, C extends Collection<V>> Iterable<C> matches(MultiMap<? extends T, V, C> m);
 246+ public <V extends T> Iterable<V> matches(Set<V> values);
 247+ }
 248+
 249+ public interface NameMatcher extends AttributeMatcher<CharSequence> {
244250 public boolean matchesLine(String lines);
245251
246252 public String getRegularExpression();
247 -
248 - public <V> Iterable<V> matches(Map<? extends CharSequence, V> m);
249 - public <V, C extends Collection<V>> Iterable<C> matches(MultiMap<? extends CharSequence, V, C> m);
250 - public <V extends CharSequence> Iterable<V> matches(Set<V> values);
251 -
252253 }
253254
254 - public static abstract class AbstractNameMatcher implements NameMatcher {
 255+ public static abstract class AbstractAttributeMatcher<T> implements AttributeMatcher<T> {
255256
256 - public <V, C extends Collection<V>> Iterable<C> matches(MultiMap<? extends CharSequence, V, C> m) {
 257+ public <V, C extends Collection<V>> Iterable<C> matches(MultiMap<? extends T, V, C> m) {
257258 final Iterator it = m.entrySet().iterator();
258259 return matches(it);
259260 }
260261
261 - public <V> Iterable<V> matches(Map<? extends CharSequence, V> m) {
 262+ public <V> Iterable<V> matches(Map<? extends T, V> m) {
262263 final Iterator it = m.entrySet().iterator();
263264 return matches(it);
264265 }
265266
266 - public <V extends CharSequence> Iterable<V> matches(Set<V> m) {
 267+ public <V extends T> Iterable<V> matches(Set<V> m) {
267268 final Iterator it = m.iterator();
268269 return matches(it);
269270 }
270271
271272 protected <V> Iterable<V> matches(final Iterator it) {
272 -
273273 return new Iterable<V>() {
274274 public Iterator<V> iterator() {
275275 return new Iterator<V>() {
@@ -282,7 +282,7 @@
283283 hasNext = false;
284284 while (it.hasNext()) {
285285 hasNext = true;
286 - Map.Entry<? extends CharSequence, V> e = (Map.Entry<? extends CharSequence, V>) it.next();
 286+ Map.Entry<? extends T, V> e = (Map.Entry<? extends T, V>) it.next();
287287 next = e.getValue();
288288 if (matches(e.getKey())) break;
289289 hasNext = false;
@@ -390,7 +390,7 @@
391391 }
392392 }
393393
394 - public static class PatternNameMatcher extends AbstractNameMatcher {
 394+ public static class PatternNameMatcher extends AbstractAttributeMatcher<CharSequence> implements NameMatcher {
395395 protected Matcher matcher;
396396 protected boolean anchored;
397397
@@ -1063,11 +1063,11 @@
10641064 protected String prefix = null;
10651065 protected String suffix = null;
10661066
1067 - public DefaultTemplateParameterPropertySpec(String name, String prop) {
1068 - if (name==null) throw new NullPointerException();
 1067+ public DefaultTemplateParameterPropertySpec(String param, String prop) {
 1068+ if (param==null) throw new NullPointerException();
10691069 if (prop==null) throw new NullPointerException();
10701070
1071 - this.parameter = name;
 1071+ this.parameter = param;
10721072 this.property = prop;
10731073 }
10741074
@@ -1225,8 +1225,60 @@
12261226 }
12271227
12281228 }
 1229+
 1230+ public static interface TemplateMatcher extends AttributeMatcher<TemplateExtractor.TemplateData> {
 1231+ }
12291232
 1233+ public static class TemplateNameMatcher extends AbstractAttributeMatcher<TemplateExtractor.TemplateData> implements TemplateMatcher {
 1234+ protected NameMatcher matcher;
 1235+
 1236+ public TemplateNameMatcher(NameMatcher matcher) {
 1237+ if(matcher==null) throw new NullPointerException();
 1238+ this.matcher = matcher;
 1239+ }
 1240+
 1241+ public boolean matches(TemplateData t) {
 1242+ return false;
 1243+ }
 1244+ }
 1245+
 1246+ public static class CategoryPatternParameterExtractor implements PropertyExtractor {
 1247+ protected String property;
 1248+ protected Matcher matcher;
 1249+ protected String replacement;
 1250+
 1251+ public CategoryPatternParameterExtractor(String pattern, String replacement, int flags, String property) {
 1252+ this(Pattern.compile(pattern, flags), replacement, property);
 1253+ }
 1254+
 1255+ public CategoryPatternParameterExtractor(Pattern pattern, String replacement, String property) {
 1256+ this(pattern.matcher(""), replacement, property);
 1257+ }
 1258+
 1259+ public CategoryPatternParameterExtractor(Matcher matcher, String replacement, String property) {
 1260+ this.property = property;
 1261+ this.matcher = matcher;
 1262+ this.replacement = replacement;
 1263+ }
 1264+
 1265+ public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) {
 1266+ for(CharSequence s: page.getCategories()) {
 1267+ matcher.reset(s);
 1268+ if (matcher.find()) {
 1269+ String v = matcher.group();
 1270+ v = matcher.replaceAll(replacement);
 1271+
 1272+ if (into==null) into = new ValueSetMultiMap<String, CharSequence>();
 1273+ into.put(property, v);
 1274+ }
 1275+ }
 1276+
 1277+ return into;
 1278+ }
 1279+ }
 1280+
12301281 public static class TemplateParameterExtractor implements PropertyExtractor, TemplateUser {
 1282+ //TODO: allow for matching templates by the parameters they contain, etc. use TemplateMatcher
12311283 protected NameMatcher template;
12321284 protected TemplateParameterPropertySpec[] properties;
12331285
@@ -1239,6 +1291,11 @@
12401292 }
12411293
12421294 public TemplateParameterExtractor(NameMatcher template, TemplateParameterPropertySpec... properties) {
 1295+/* this(new TemplateNameMatcher(template), properties);
 1296+ }
 1297+
 1298+ public TemplateParameterExtractor(TemplateMatcher template, TemplateParameterPropertySpec... properties) {
 1299+*/
12431300 if (template==null) throw new NullPointerException();
12441301 if (properties==null) throw new NullPointerException();
12451302
Index: trunk/WikiWord/WikiWordBuilder/tweaks.properties
@@ -96,5 +96,5 @@
9797 dbstore.traceSQL = false
9898
9999 ### Custom special purpose packages #################
100 -#wikiword.ConfigPackages=["de.brightbyte.wikiword.wikipro","de.brightbyte.wikiword.wikipro.wikis"]
 100+#wikiword.ConfigPackages=["de.brightbyte.wikiword.lifescience","de.brightbyte.wikiword.lifescience.wikis","de.brightbyte.wikiword.bibliography","de.brightbyte.wikiword.bibliography.wikis"]
101101
Index: trunk/WikiWord/WikiWordBuilder/debug-biography-tweaks.properties
@@ -0,0 +1,100 @@
 2+## System config
 3+console.encoding = "UTF-8"
 4+
 5+## language handling
 6+
 7+# treat "commons" as a language code
 8+languages.commonsAsLanguage = false
 9+
 10+# treat "simple" as a language code
 11+languages.simpleAsLanguage = true
 12+
 13+## RDF Export Config
 14+# dataset URI qualifier - should uniquely identify the entity creating
 15+# the datasets (that is, YOU). This ensures unique dataset URIs for
 16+# datasets created by different people.
 17+# The default is "*" which means "unknown, don't use this publicly".
 18+rdf.dataset.qualifier = "*"
 19+
 20+## Import Driver
 21+# Run import in a thread separate from the one reading and parsing the dump.
 22+# Disabling this by setting the queue size to 0 will slightly reduce overhead on single-core systems;
 23+# Using a queue will not have as big an impact if unzippers (bunzip/gunzip) are used
 24+#dumpdriver.pageImportQueue = 8
 25+dumpdriver.pageImportQueue = 0
 26+
 27+# external unzippers - may boost performance, especially
 28+# on multi-core systems. The name of the file to
 29+# unzip will be appended to the command given here. Spaces
 30+# before the last / are taken to be part of the path, spaces
 31+# after the last / separate parameters.
 32+dumpdriver.externalBunzip = null
 33+dumpdriver.externalGunzip = null
 34+#dumpdriver.externalBunzip = "/bin/bunzip2 -c"
 35+#dumpdriver.externalGunzip = "/bin/gunzip -c"
 36+
 37+### Importer Output and Persistence ############
 38+importer.progressInterval = 1000
 39+importer.safepointInterval = 30000
 40+#importer.safepointInterval = 1000
 41+
 42+### Database Performance #######################
 43+#dbstore.backgroundFlushQueue = 4
 44+dbstore.backgroundFlushQueue = 0
 45+dbstore.useEntityBuffer = false
 46+dbstore.useRelationBuffer = false
 47+#dbstore.useEntityBuffer = true
 48+#dbstore.useRelationBuffer = true
 49+#dbstore.insertionBufferFactor = 16
 50+dbstore.insertionBufferFactor = 64
 51+dbstore.engine = "MyISAM"
 52+#dbstore.engine = "InnoDB"
 53+
 54+#sql mode - see http://dev.mysql.com/doc/refman/5.1/en/server-sql-mode.html
 55+dbstore.sqlMode = "STRICT_ALL_TABLES"
 56+#dbstore.sqlMode = "STRICT_TRANS_TABLES"
 57+
 58+#NOTE: MySQL does not support 4-byte utf-8 codes. So turn everything into binary...
 59+dbstore.useBinaryText = true;
 60+
 61+#maximum size of sql statement, bytes! MySQL's default: a bit below 16M bytes (!)
 62+#if not specified, mysql is asked for the current value.
 63+#dbstore.maxStatementSize = 16776192;
 64+
 65+#chunk size to use when chunking large updates by id
 66+#default is 100000, set to 0 to disable all chunking
 67+dbstore.queryChunkSize = 100000
 68+dbstore.updateChunkSize = 100000
 69+
 70+### Property Cache Fields ###########################
 71+#dbstore.cacheReferenceSeparator = '\u001E'
 72+#dbstore.cacheReferenceFieldSeparator = '\u001F'
 73+dbstore.listBlobSize = 65025
 74+
 75+### ID manager ######################################
 76+# NOTE: when using this, allow for 116 bytes plus the average size of names per ID entry.
 77+# So if you have an average name length of 12 and expect 1 million entries,
 78+# allow for about 1.3 gigabyte RAM to be used for ID caching.
 79+dbstore.idManager=false
 80+#dbstore.auxFileDir defaults to system temp dir
 81+#dbstore.auxFileDir="/tmp"
 82+dbstore.idManager.bufferSize=16384
 83+
 84+### CycleFinder #####################################
 85+dbstore.CycleFinder.levelWarningThreshold=32
 86+dbstore.CycleFinder.degreeWarningThreshold=1024
 87+dbstore.CycleFinder.maxDepth=1024
 88+
 89+### Database Debug Output ######################
 90+#see java.util.logging.Level for codes to use with dbstore.logLevel
 91+dbstore.logLevel = 720
 92+dbstore.explainSQLThreashold = 0
 93+#dbstore.explainSQLThreashold = 1000000
 94+dbstore.slowSQLThreashold = 0
 95+#dbstore.slowSQLThreashold = 10
 96+#dbstore.slowSQLThreashold = 60
 97+dbstore.traceSQL = false
 98+
 99+### Custom special purpose packages #################
 100+wikiword.ConfigPackages=["de.brightbyte.wikiword.biography","de.brightbyte.wikiword.biography.wikis"]
 101+
Index: trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties
@@ -96,5 +96,5 @@
9797 dbstore.traceSQL = false
9898
9999 ### Custom special purpose packages #################
100 -#wikiword.ConfigPackages=["de.brightbyte.wikiword.wikipro","de.brightbyte.wikiword.wikipro.wikis"]
 100+wikiword.ConfigPackages=["de.brightbyte.wikiword.bibliography","de.brightbyte.wikiword.bibliography.wikis"]
101101

Status & tagging log