r53901 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r53900‎ | r53901 | r53902 >
Date:20:58, 28 July 2009
Author:daniel
Status:deferred
Tags:
Comment:
consolidate special property extraction code into WikiWordProperties
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder4Biography (deleted) (history)
  • /trunk/WikiWord/WikiWordBuilder4LifeScience/src/main/java/de/brightbyte/wikiword/lifescience (deleted) (history)
  • /trunk/WikiWord/WikiWordProperties (added) (history)
  • /trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience (added) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordProperties/.classpath
@@ -0,0 +1,10 @@
 2+<?xml version="1.0" encoding="UTF-8"?>
 3+<classpath>
 4+ <classpathentry kind="src" path="src/main/java"/>
 5+ <classpathentry kind="src" path="src/test/java"/>
 6+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
 7+ <classpathentry combineaccessrules="false" kind="src" path="/WikiWord"/>
 8+ <classpathentry combineaccessrules="false" kind="src" path="/WikiWordBuilder"/>
 9+ <classpathentry combineaccessrules="false" kind="src" path="/BrightByteUtil"/>
 10+ <classpathentry kind="output" path="bin"/>
 11+</classpath>
Index: trunk/WikiWord/WikiWordProperties/.project
@@ -0,0 +1,17 @@
 2+<?xml version="1.0" encoding="UTF-8"?>
 3+<projectDescription>
 4+ <name>WikiWordProperties</name>
 5+ <comment></comment>
 6+ <projects>
 7+ </projects>
 8+ <buildSpec>
 9+ <buildCommand>
 10+ <name>org.eclipse.jdt.core.javabuilder</name>
 11+ <arguments>
 12+ </arguments>
 13+ </buildCommand>
 14+ </buildSpec>
 15+ <natures>
 16+ <nature>org.eclipse.jdt.core.javanature</nature>
 17+ </natures>
 18+</projectDescription>
Index: trunk/WikiWord/WikiWordProperties/target/.svnignore
@@ -0,0 +1,2 @@
 2+*
 3+
Property changes on: trunk/WikiWord/WikiWordProperties/target
___________________________________________________________________
Name: svn:ignore
14 + *
Index: trunk/WikiWord/WikiWordProperties/src/main/assembly/src.xml
@@ -0,0 +1,24 @@
 2+<assembly>
 3+ <id>src</id>
 4+ <formats>
 5+ <format>tar.gz</format>
 6+ </formats>
 7+ <fileSets>
 8+ <fileSet>
 9+ <includes>
 10+ <include>*</include>
 11+ </includes>
 12+ <excludes>
 13+ <exclude>bin</exclude>
 14+ <exclude>target</exclude>
 15+ <exclude>local.*</exclude>
 16+ </excludes>
 17+ </fileSet>
 18+ <fileSet>
 19+ <directory>src</directory>
 20+ </fileSet>
 21+ <fileSet>
 22+ <directory>doc</directory>
 23+ </fileSet>
 24+ </fileSets>
 25+</assembly>
\ No newline at end of file
Index: trunk/WikiWord/WikiWordProperties/src/main/assembly/bin-dep.xml
@@ -0,0 +1,50 @@
 2+<assembly>
 3+ <id>bin-dep</id>
 4+ <formats>
 5+ <format>tar.gz</format>
 6+ </formats>
 7+ <fileSets>
 8+ <fileSet>
 9+ <includes>
 10+ <include>README*</include>
 11+ <include>LICENSE*</include>
 12+ <include>NOTICE*</include>
 13+ <include>*.properties</include>
 14+ <include>*.sh</include>
 15+ <include>*.bat</include>
 16+ </includes>
 17+ <excludes>
 18+ <exclude>debug.*</exclude>
 19+ <exclude>local.*</exclude>
 20+ </excludes>
 21+ </fileSet>
 22+ <fileSet>
 23+ <directory>src/main/</directory>
 24+ <outputDirectory></outputDirectory>
 25+ <includes>
 26+ <include>*.rdf</include>
 27+ </includes>
 28+ </fileSet>
 29+ <!--<fileSet>
 30+ <directory>doc</directory>
 31+ </fileSet>-->
 32+ <!--<fileSet>
 33+ <directory>target</directory>
 34+ <outputDirectory></outputDirectory>
 35+ <includes>
 36+ <include>*.jar</include>
 37+ </includes>
 38+ </fileSet>-->
 39+ </fileSets>
 40+ <dependencySets>
 41+ <dependencySet>
 42+ <outputDirectory>/lib</outputDirectory>
 43+ <unpack>false</unpack>
 44+ <scope>runtime</scope>
 45+ <excludes>
 46+ <exclude>junit:junit</exclude>
 47+ <exclude>org.apache.maven.wagon:wagon-ssh</exclude>
 48+ </excludes>
 49+ </dependencySet>
 50+ </dependencySets>
 51+</assembly>
\ No newline at end of file
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/biography/BiographyConceptType.java
@@ -0,0 +1,36 @@
 2+package de.brightbyte.wikiword.biography;
 3+
 4+import de.brightbyte.wikiword.ConceptType;
 5+import de.brightbyte.wikiword.ConceptTypeSet;
 6+
 7+/**
 8+ * Enumeration of concept types; each concept type represents a very broad category of concepts,
 9+ * which can be used to filter concepts identified in a corpus. The idea is at for some uses,
 10+ * some kinds of concepts are not usefull, or especially usefull. For example, people and polaces
 11+ * are not suitable for use in a general dictionary, but very useful for topic tracking.
 12+ * Each type is associated with a code (for internal use) and a URI (for external use).
 13+ * The URI is constructed based on {@link RdfEntities.conceptTypeBase}.
 14+ */
 15+public class BiographyConceptType extends ConceptType {
 16+
 17+ /**
 18+ * NamespaceSet for the canonical concept types. Loaded from the ConceptTypes.properties
 19+ * file in this package.
 20+ */
 21+ public static final ConceptTypeSet biographyConceptTypes;
 22+
 23+ static {
 24+ try {
 25+ biographyConceptTypes = getConceptTypes(null, "de.brightbyte.wikiword.biography"); //FIXME: make unmodifiable!
 26+
 27+ }
 28+ catch (NumberFormatException ex) {
 29+ throw new ExceptionInInitializerError(ex);
 30+ }
 31+ }
 32+
 33+ public BiographyConceptType(int code, String name) {
 34+ super(code, name);
 35+ }
 36+
 37+}
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/biography/wikis/WikiConfiguration_enwiki.java
@@ -0,0 +1,110 @@
 2+package de.brightbyte.wikiword.biography.wikis;
 3+
 4+import java.util.regex.Pattern;
 5+
 6+import de.brightbyte.wikiword.ConceptType;
 7+import de.brightbyte.wikiword.analyzer.WikiConfiguration;
 8+import de.brightbyte.wikiword.analyzer.extractor.CategoryPatternParameterExtractor;
 9+import de.brightbyte.wikiword.analyzer.extractor.PagePropertyValueExtractor;
 10+import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor;
 11+import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
 12+import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher;
 13+import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher;
 14+import de.brightbyte.wikiword.analyzer.sensor.HasPropertySensor;
 15+import de.brightbyte.wikiword.analyzer.template.DefaultTemplateParameterPropertySpec;
 16+
 17+public class WikiConfiguration_enwiki extends WikiConfiguration {
 18+
 19+ public WikiConfiguration_enwiki() {
 20+ super();
 21+
 22+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("awd", 1, true), "$1")); //TODO: {{awd|award|year|title|role|name}}
 23+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("(Birth|Death)(Date(AndAge)?|_date(_and_age)?)", 1, true), " $1") );
 24+
 25+ propertyExtractors.add( new CategoryPatternParameterExtractor("^(\\d+s?)_births$", "$1", 0, "person-birth-date") );
 26+ propertyExtractors.add( new CategoryPatternParameterExtractor("^(\\d+s?)_deaths$", "$1", 0, "person-death-date") );
 27+
 28+ propertyExtractors.add( new CategoryPatternParameterExtractor("^(.+)_(artists|painters|sculptors)$", "$1", 0, "artist-group") );
 29+ propertyExtractors.add( new CategoryPatternParameterExtractor("^.*(^|_)(painter|sculptor|photographer)s$", "$2", Pattern.CASE_INSENSITIVE, "artist-group") );
 30+
 31+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Persondata"),
 32+ new DefaultTemplateParameterPropertySpec("NAME", "person-sortname").setStripMarkup(true),
 33+ new DefaultTemplateParameterPropertySpec("NAME", "person-name").setStripMarkup(true),
 34+ new DefaultTemplateParameterPropertySpec("ALTERNATIV NAMENS", "person-name").setStripMarkup(true)
 35+ .setSplitPattern(Pattern.compile("\\s[;]\\s")).addNormalizer(Pattern.compile("\\(.*?\\)"),""),
 36+ new DefaultTemplateParameterPropertySpec("SHORT DESCRIPTION", "person-occupation").setStripMarkup(true),
 37+ new DefaultTemplateParameterPropertySpec("DATE OF BIRTH", "person-birth-date").setStripMarkup(true),
 38+ new DefaultTemplateParameterPropertySpec("PLACE OF BIRTH", "person-birth-place").setStripMarkup(true),
 39+ new DefaultTemplateParameterPropertySpec("DATE OF DEATH", "person-death-date").setStripMarkup(true),
 40+ new DefaultTemplateParameterPropertySpec("PLACE OF DEATH", "person-death-place").setStripMarkup(true)
 41+ ) );
 42+
 43+ Pattern defaultSplitPattern = Pattern.compile("[,;/]\\s+|<br\\s*/?>");
 44+
 45+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Infobox_Artist"),
 46+ new DefaultTemplateParameterPropertySpec("name", "person-name").setStripMarkup(true),
 47+ new DefaultTemplateParameterPropertySpec("birthname", "person-name").setStripMarkup(true),
 48+ new DefaultTemplateParameterPropertySpec("birthdate", "person-birth-date").setStripMarkup(true),
 49+ new DefaultTemplateParameterPropertySpec("birthplace", "person-birth-place").setStripMarkup(true),
 50+ new DefaultTemplateParameterPropertySpec("location", "person-birth-place").setStripMarkup(true),
 51+ new DefaultTemplateParameterPropertySpec("deathdate", "person-death-date").setStripMarkup(true),
 52+ new DefaultTemplateParameterPropertySpec("deathplace", "person-death-place").setStripMarkup(true),
 53+ new DefaultTemplateParameterPropertySpec("nationality", "person-nationality").setStripMarkup(true),
 54+ new DefaultTemplateParameterPropertySpec("field", "artist-group").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 55+ new DefaultTemplateParameterPropertySpec("movement", "artist-group").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 56+ new DefaultTemplateParameterPropertySpec("training", "artist-training").setStripMarkup(true),
 57+ new DefaultTemplateParameterPropertySpec("award", "artist-award").setStripMarkup(true).setSplitPattern(defaultSplitPattern)
 58+ ) );
 59+
 60+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Infobox_(((Medical|Military)_)?[Pp]erson|Actor|Astronaut|Criminal|Engineer|Musical_artist|Philosopher|Pope|ReligiousBio|Scientist)", 0, true),
 61+ new DefaultTemplateParameterPropertySpec("name", "person-name").setStripMarkup(true),
 62+ new DefaultTemplateParameterPropertySpec("other_names", "person-name").setStripMarkup(true),
 63+ new DefaultTemplateParameterPropertySpec("birth_date", "person-birth-date").setStripMarkup(true),
 64+ new DefaultTemplateParameterPropertySpec("birth_place", "person-birth-place").setStripMarkup(true),
 65+ new DefaultTemplateParameterPropertySpec("death_date", "person-death-date").setStripMarkup(true),
 66+ new DefaultTemplateParameterPropertySpec("death_place", "person-death-place").setStripMarkup(true),
 67+ new DefaultTemplateParameterPropertySpec("occupation", "person-occupation").setStripMarkup(true),
 68+ new DefaultTemplateParameterPropertySpec("known_for", "person-known-for").setStripMarkup(true),
 69+ new DefaultTemplateParameterPropertySpec("nationality", "person-nationality").setStripMarkup(true),
 70+ new DefaultTemplateParameterPropertySpec("residence", "person-nationality").setStripMarkup(true),
 71+ new DefaultTemplateParameterPropertySpec("citizenship", "person-nationality").setStripMarkup(true)
 72+ ) );
 73+
 74+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Birth_date|BrithDate", 0, true),
 75+ new DefaultTemplateParameterPropertySpec("1", "person-birth-date").setStripMarkup(true)
 76+ ) );
 77+
 78+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Death_date|DeathDate", 0, true),
 79+ new DefaultTemplateParameterPropertySpec("1", "person-death-date").setStripMarkup(true)
 80+ ) );
 81+
 82+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Death_date_and_age|DeathDateAndAge", 0, true),
 83+ new DefaultTemplateParameterPropertySpec("1", "person-death-date").setStripMarkup(true),
 84+ new DefaultTemplateParameterPropertySpec("1", "person-birth-date").setStripMarkup(true)
 85+ ) );
 86+
 87+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Infobox_Medical_Person"),
 88+ new DefaultTemplateParameterPropertySpec("profession", "person-occupation").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 89+ new DefaultTemplateParameterPropertySpec("profession", "expert-group").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 90+ new DefaultTemplateParameterPropertySpec("specialism", "expert-group").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 91+ new DefaultTemplateParameterPropertySpec("research_field", "expert-group").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 92+ new DefaultTemplateParameterPropertySpec("work_institutions", "person-affiliation").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 93+ new DefaultTemplateParameterPropertySpec("prizes", "expert-prize").setStripMarkup(true).setSplitPattern(defaultSplitPattern)
 94+ ) );
 95+
 96+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Infobox_Scientist"),
 97+ new DefaultTemplateParameterPropertySpec("fields", "expert-group").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 98+ new DefaultTemplateParameterPropertySpec("alma-mater", "person-education").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 99+ new DefaultTemplateParameterPropertySpec("workplaces", "person-affiliation").setStripMarkup(true).setSplitPattern(defaultSplitPattern),
 100+ new DefaultTemplateParameterPropertySpec("awards", "expert-prize").setStripMarkup(true).setSplitPattern(defaultSplitPattern)
 101+ ) );
 102+
 103+ pageTermExtractors.add( new PagePropertyValueExtractor("person-sortname") );
 104+ pageTermExtractors.add( new PagePropertyValueExtractor("person-name") );
 105+
 106+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "artist-group"));
 107+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "person-name"));
 108+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "person-birth-date"));
 109+ }
 110+
 111+}
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/biography/wikis/WikiConfiguration_dewiki.java
@@ -0,0 +1,55 @@
 2+package de.brightbyte.wikiword.biography.wikis;
 3+
 4+import java.util.regex.Pattern;
 5+
 6+import de.brightbyte.wikiword.ConceptType;
 7+import de.brightbyte.wikiword.analyzer.WikiConfiguration;
 8+import de.brightbyte.wikiword.analyzer.extractor.CategoryPatternParameterExtractor;
 9+import de.brightbyte.wikiword.analyzer.extractor.PagePropertyValueExtractor;
 10+import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor;
 11+import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher;
 12+import de.brightbyte.wikiword.analyzer.sensor.HasPropertySensor;
 13+import de.brightbyte.wikiword.analyzer.template.DefaultTemplateParameterPropertySpec;
 14+
 15+public class WikiConfiguration_dewiki extends WikiConfiguration {
 16+
 17+ public WikiConfiguration_dewiki() {
 18+ super();
 19+
 20+ propertyExtractors.add( new CategoryPatternParameterExtractor("^Geboren_(\\d+(_v\\._Chr\\.)?)$", "$1", 0, "person-birth-date") );
 21+ propertyExtractors.add( new CategoryPatternParameterExtractor("^Gestorben_(\\d+(_v\\._Chr\\.)?)$", "$1", 0, "person-death-date") );
 22+
 23+ propertyExtractors.add( new CategoryPatternParameterExtractor("^Maler_(der|des)_(.+)$", "$2", 0, "artist-group") );
 24+ propertyExtractors.add( new CategoryPatternParameterExtractor("^(Maler|Bildhauer|Fotograf)(_|$).*$", "$1", 0, "artist-group") );
 25+ propertyExtractors.add( new CategoryPatternParameterExtractor("^.*[^_](maler|bildhauer|fotograf)$", "$1", 0, "artist-group").setCapitalize(true) );
 26+ propertyExtractors.add( new CategoryPatternParameterExtractor("^.*?([-_\\wäöü]+)(maler|bildhauer|fotograf)$", "$2", 0, "artist-group") );
 27+
 28+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Personendaten"),
 29+ new DefaultTemplateParameterPropertySpec("NAME", "person-sortname").setStripMarkup(true),
 30+ new DefaultTemplateParameterPropertySpec("NAME", "person-name").setStripMarkup(true),
 31+ new DefaultTemplateParameterPropertySpec("ALTERNATIVNAMEN", "person-name").setStripMarkup(true)
 32+ .setSplitPattern(Pattern.compile("\\s[;]\\s")).addNormalizer(Pattern.compile("\\(.*?\\)"),""),
 33+ new DefaultTemplateParameterPropertySpec("KURZBESCHREIBUNG", "person-occupation").setStripMarkup(true),
 34+ new DefaultTemplateParameterPropertySpec("GEBURTSDATUM", "person-birth-date").setStripMarkup(true),
 35+ new DefaultTemplateParameterPropertySpec("STERBEDATUM", "person-death-date").setStripMarkup(true),
 36+ new DefaultTemplateParameterPropertySpec("GEBURTSORT", "person-birth-place").setStripMarkup(true),
 37+ new DefaultTemplateParameterPropertySpec("STERBEORT", "person-death-place").setStripMarkup(true)
 38+ ) );
 39+
 40+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("PND"),
 41+ new DefaultTemplateParameterPropertySpec("1", "ID-PND").setStripMarkup(true) ) );
 42+
 43+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("LeMO"),
 44+ new DefaultTemplateParameterPropertySpec("1", "ID-LeMO").setStripMarkup(true) ) );
 45+
 46+ //TODO: {{BAM|Kohl|Helmut}}
 47+
 48+ pageTermExtractors.add( new PagePropertyValueExtractor("person-sortname") );
 49+ pageTermExtractors.add( new PagePropertyValueExtractor("person-name") );
 50+
 51+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "person-name") );
 52+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "person-birth-date") );
 53+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "artist-group") );
 54+ }
 55+
 56+}
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/biography/ConceptTypes.properties
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience/wikis/WikiConfiguration_enwiki.java
@@ -0,0 +1,668 @@
 2+package de.brightbyte.wikiword.lifescience.wikis;
 3+
 4+import java.util.regex.Matcher;
 5+import java.util.regex.Pattern;
 6+
 7+import de.brightbyte.wikiword.ConceptType;
 8+import de.brightbyte.wikiword.Namespace;
 9+import de.brightbyte.wikiword.ResourceType;
 10+import de.brightbyte.wikiword.analyzer.WikiConfiguration;
 11+import de.brightbyte.wikiword.analyzer.WikiPage;
 12+import de.brightbyte.wikiword.analyzer.extractor.CategoryPatternParameterExtractor;
 13+import de.brightbyte.wikiword.analyzer.extractor.PagePropertyValueExtractor;
 14+import de.brightbyte.wikiword.analyzer.extractor.PropertyValueExtractor;
 15+import de.brightbyte.wikiword.analyzer.extractor.TemplateNamePatternParameterExtractor;
 16+import de.brightbyte.wikiword.analyzer.extractor.TemplateParameterExtractor;
 17+import de.brightbyte.wikiword.analyzer.extractor.TitlePartExtractor;
 18+import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
 19+import de.brightbyte.wikiword.analyzer.mangler.TextArmor;
 20+import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher;
 21+import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher;
 22+import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor;
 23+import de.brightbyte.wikiword.analyzer.sensor.HasCategorySensor;
 24+import de.brightbyte.wikiword.analyzer.sensor.HasPropertySensor;
 25+import de.brightbyte.wikiword.analyzer.sensor.HasTemplateLikeSensor;
 26+import de.brightbyte.wikiword.analyzer.sensor.HasTemplateSensor;
 27+import de.brightbyte.wikiword.analyzer.sensor.TitleSensor;
 28+import de.brightbyte.wikiword.analyzer.template.AbstractTemplateParameterPropertySpec;
 29+import de.brightbyte.wikiword.analyzer.template.DeepTemplateExtractor;
 30+import de.brightbyte.wikiword.analyzer.template.DefaultTemplateParameterPropertySpec;
 31+import de.brightbyte.wikiword.analyzer.template.TemplateData;
 32+import de.brightbyte.wikiword.analyzer.template.TemplateExtractor;
 33+import de.brightbyte.wikiword.analyzer.template.TemplateParameterPropertySpec;
 34+import de.brightbyte.wikiword.analyzer.template.TemplateExtractor.Context;
 35+import de.brightbyte.wikiword.lifescience.LifeScienceConceptType;
 36+
 37+public class WikiConfiguration_enwiki extends WikiConfiguration {
 38+
 39+ protected static String[] resolveSequence(String s, int max) {
 40+ int idx = s.indexOf('-');
 41+ if (idx<0) return new String[] { s };
 42+
 43+ return resolveSequence(s.substring(0, idx).trim(), s.substring(idx+1).trim(), max);
 44+ }
 45+
 46+ protected static String[] resolveSequence(String from, String to, int max) {
 47+ int i = 0;
 48+ int j = from.length()-1;
 49+ int k = to.length()-1;
 50+
 51+ while (i<from.length() && i<to.length() && from.charAt(i)==to.charAt(i)) {
 52+ i++;
 53+ }
 54+
 55+ while (j>=i && k>=i && from.charAt(j)==to.charAt(k)) {
 56+ j--;
 57+ k--;
 58+ }
 59+
 60+ if (j<i || k<i) return null;
 61+
 62+ String f = from.substring(i, j+1);
 63+ String t = to.substring(i, k+1);
 64+
 65+ String prefix = from.substring(0, i);
 66+ String suffix = from.substring(j+1);
 67+
 68+ int a;
 69+ int b;
 70+
 71+ try {
 72+ a = Integer.parseInt(f);
 73+ b = Integer.parseInt(t);
 74+ }
 75+ catch (NumberFormatException ex) {
 76+ return null;
 77+ }
 78+
 79+ int c = b-a +1;
 80+ if (c>max) return null;
 81+
 82+ String[] ss = new String[c];
 83+
 84+ for (int n=0; n<c; n++) {
 85+ ss[n] = prefix + (a+n) + suffix;
 86+ }
 87+
 88+ return ss;
 89+ }
 90+
 91+ //FIXME: for some, <br> resp. \n needs to be stripped!
 92+
 93+ protected static final String numericChars = "0-9";
 94+ protected static final String upperAlphabeticChars = "A-Z";
 95+ protected static final String alphabeticChars = upperAlphabeticChars+"a-z";
 96+ protected static final String upperAlphaNumericChars = upperAlphabeticChars+numericChars;
 97+ protected static final String alphaNumericChars = alphabeticChars+numericChars;
 98+ protected static final String dashChars = "-\u2212\uFE63\u2010-\u2014\uFE58\uFF0D";
 99+
 100+ protected static final Pattern identifierSeparatorPattern = Pattern.compile(",\\p{IsZ}+|[\\p{IsZ};]+|<br */?>", 0);
 101+ protected static final Pattern nameSeparatorPattern = Pattern.compile(",\\p{IsZ}+|[\r\n;]+|<br */?>", 0);
 102+ protected static final Pattern badStuffStripPattern = Pattern.compile("[\r\n]+", 0);
 103+ protected static final Pattern spaceStripPattern = Pattern.compile("\\p{IsZ}+", 0);
 104+ protected static final Pattern iupacCleanupPattern = Pattern.compile("(?<=["+dashChars+numericChars+"]|[0-9][a-z])\\p{IsZ}+", 0);
 105+ protected static final Pattern punctuationStripPattern = Pattern.compile("[\r\n,.;/]+", 0);
 106+ protected static final Pattern breakStripPattern = Pattern.compile("[\r\n\\p{IsZ}]+", 0);
 107+
 108+ private static final String uniProtChars = "["+upperAlphaNumericChars+"]{6,}";
 109+ private static final String pubChemChars = "["+numericChars+"]+";
 110+ private static final String pbbChars = "["+numericChars+"]+";
 111+ private static final String drugBankChars = "["+upperAlphabeticChars+"]{2,}["+numericChars+"]{4,}";
 112+ private static final String casChars = "["+numericChars+"]+(["+dashChars+"]["+numericChars+"]+)*";
 113+ private static final String smilesChars = "["+dashChars+"+="+alphaNumericChars+"/\\\\()@#:\\[\\]>.]+"; //FIXME: not greedy enough
 114+ private static final String atcChars = "["+upperAlphabeticChars+"]{6,}";
 115+ private static final String diseasesDbChars = "["+numericChars+"]+";
 116+ private static final String pagesChars = "["+numericChars+"]+(\\s*["+dashChars+",]\\s*["+numericChars+"]+)*";
 117+
 118+ private static final String icd10Chars = "["+upperAlphabeticChars+"]["+numericChars+"]+(\\.["+numericChars+"]*)?"; //FIXME: ranges!
 119+ private static final String icd9Chars = "["+numericChars+"]+(\\.["+numericChars+"]*)?"; //FIXME: ranges!
 120+ private static final String icdOChars = "M["+numericChars+"]+(/["+numericChars+"]*)?";
 121+ private static final String omimChars = "["+numericChars+"]{4,}";
 122+ private static final String medlinePlusChars = "["+numericChars+"]{6,}";
 123+ private static final String meshChars = "["+upperAlphabeticChars+"]?["+numericChars+"]+(\\.["+numericChars+"]+)*";
 124+ private static final String eMedicineChars = "["+alphabeticChars+"]+/["+numericChars+"]+";
 125+ private static final String chemAbbrevChars = "["+dashChars+alphaNumericChars+"(),]+";
 126+
 127+ private static final String inChIChars = "["+dashChars+"+"+alphabeticChars+"\\(\\),/]+";
 128+ private static final String einecsChars = "["+numericChars+"]+(["+dashChars+"]["+numericChars+"]+)*";
 129+ private static final String ecChemChars = "["+numericChars+"]+(["+dashChars+"]["+numericChars+"]+)*";
 130+ private static final String uncasnChars = "["+numericChars+"]{4,}";
 131+ private static final String rtecsChars = "["+upperAlphabeticChars+"]+["+numericChars+"]+";
 132+ private static final String keggChars = "["+upperAlphabeticChars+"]+["+numericChars+"]+";
 133+ private static final String chEbiChars = "["+numericChars+"]+";
 134+ private static final String gmelinChars = "["+numericChars+"]+";
 135+ private static final String beilsteinChars = "["+numericChars+"]+(["+dashChars+"]["+numericChars+"]+)*";
 136+ private static final String hgncChars = "["+numericChars+"]+";
 137+ private static final String hgiChars = "["+numericChars+"]+";
 138+ private static final String proteinSymbolChars = "["+alphaNumericChars+"]+(["+dashChars+"]["+alphaNumericChars+"]+)*(\\.["+numericChars+"]+)*";
 139+ private static final String entrezGeneChars = "["+numericChars+"]+";
 140+ private static final String refSeqChars = "["+upperAlphabeticChars+"]+_["+numericChars+"]+"; //NOTE: value may contain a decimal point, but we ignore that bit for better matching
 141+ private static final String pdbChars = "["+upperAlphaNumericChars+"]{4,}";
 142+
 143+ private static final String ecEnzymeChars = "["+numericChars+"](\\.["+numericChars+"]+)*";
 144+ private static final String homoloGeneChars = "["+numericChars+"]+";
 145+ private static final String mgiChars = "["+numericChars+"]{6,}";
 146+ private static final String ensemblChars = "["+upperAlphabeticChars+"]{2,}["+numericChars+"]{10,}";
 147+ private static final String icscChars = "["+numericChars+"]{4,}";
 148+ private static final String goCodeChars = "["+numericChars+"]{6,}";
 149+ //private static final String chemFormulaChars = "["+dashChars+"+,\\(\\)"+alphaNumericChars+"]{3,}";
 150+ private static final String chemSpiderChars = "["+numericChars+"]+";
 151+ private static final String threeDMetChars = "["+alphaNumericChars+"]{3,}";
 152+
 153+ private static final String dorlandsChars = "["+alphabeticChars+"]+/["+numericChars+"]+";
 154+ private static final String neuroNamesChars = "["+alphabeticChars+"]+-["+numericChars+"]+";
 155+
 156+ //TODO: exclude "Biography"...
 157+ public static final String lifeScienceJournalPattern = "(^|[ _])(Chem[a-z]*|Biol?[.a-z]*|Gen[eo][a-z]*|Med[a-z]*|Cell[a-z]*|DNA|RNA|Nucleic|EMBO|FEBS|Onco[a-z]*|Blood|Immono[a-z]*|Cancer|Virol[a-z]*|Med[a-z]*|Clin[a-z]*|Lancet|Nature|PLoS|Neuro[a-z]*|Zootaxa|JAMA|FASEB|Bacter[a-z]*|Mutat[a-z]*|Mol[a-z]*|Protein|Dermat[a-z]*|Pathol[a-z]*|Endocr[a-z]*|Microbio[a-z]*)($|[_ ])";
 158+
 159+
 160+ protected static DefaultTemplateParameterPropertySpec makeNamePropertySpec(String param, String prop, boolean multi, boolean space) {
 161+ DefaultTemplateParameterPropertySpec spec = new DefaultTemplateParameterPropertySpec(param, prop);
 162+
 163+ if (multi) {
 164+ if (space) spec.setSplitPattern(nameSeparatorPattern);
 165+ else spec.setSplitPattern(identifierSeparatorPattern);
 166+ }
 167+
 168+ if (space) spec.addNormalizer(badStuffStripPattern, "");
 169+ else spec.addNormalizer(spaceStripPattern, "");
 170+
 171+ return spec;
 172+ }
 173+
 174+ protected static DefaultTemplateParameterPropertySpec makeIdentifierPropertySpec(String param, String prop, String pattern) {
 175+ DefaultTemplateParameterPropertySpec spec = new DefaultTemplateParameterPropertySpec(param, prop);
 176+
 177+ pattern = "(?<=[^\\w\\d]|^)("+pattern+")(?=[^\\w\\d]|$)";
 178+
 179+ spec.setFindPattern(Pattern.compile(pattern));
 180+
 181+ if (pattern.indexOf('\u2212')>=0) { //XXX: hack for normalizing dashes
 182+ spec.addNormalizer(Pattern.compile("["+dashChars+"]"), "-");
 183+ }
 184+
 185+ return spec;
 186+ }
 187+
 188+ public WikiConfiguration_enwiki() {
 189+ super();
 190+
 191+ templateExtractorFactory= new TemplateExtractor.Factory() {
 192+ public TemplateExtractor newTemplateExtractor(Context context, TextArmor armor) {
 193+ DeepTemplateExtractor extractor = new DeepTemplateExtractor(context, armor);
 194+ extractor.addContainerField("Protbox", "Codes");
 195+ extractor.addContainerField("Protbox", "Caption");
 196+ //FIXME: this needs to accumulate!!!! //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME //FIXME
 197+ return extractor;
 198+ }
 199+ };
 200+
 201+ //NOTE: apply template replacement only when stripping markup, but then before everything else
 202+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ICD9", 1, true), " $2 ") );
 203+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ICD10", 3, true), " $2$3.$4 ") ); //XXX: use all 5 params?
 204+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ICDO", 2, true), " M$2/$3 ") );
 205+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("CAS", 1, true), " $2 ") );
 206+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("ATC", 2, true), " $2$3 ") );
 207+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("DiseasesDB2", 1, true), " $2 ") );
 208+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("OMIM\\d?", 1, true), " $2 ") );
 209+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("SMILES", 1, true), " $2 ") ); //FIXME: named param S= !
 210+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("eMedicine2", 2, true), " $2/$3 ") );
 211+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("MedlinePlus2", 1, true), " $2 ") );
 212+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("PDB", 1, true), " $2 ") );
 213+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("PDB2", 1, true), " $2 ") );
 214+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("PDB3", 1, true), " $2 ") );
 215+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("EC_number", 1, true), " $2 ") );
 216+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("OMIM", 1, true), " $2 ") );
 217+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("EntrezGene", 1, true), " $2 ") );
 218+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("UniProt", 1, true), " $2 ") );
 219+ stripMarkupManglers.add(0, new RegularExpressionMangler( templatePattern("RefSeq", 1, true), " $2 ") );
 220+
 221+ propertyExtractors.add( new TemplateParameterExtractor( new ExactNameMatcher("Cite_journal"),
 222+ new DefaultTemplateParameterPropertySpec("journal", "journal")
 223+ .addNormalizer(punctuationStripPattern, "")
 224+ .setCondition(lifeScienceJournalPattern, 0, false) ) );
 225+
 226+ TemplateParameterPropertySpec atcSpec = new AbstractTemplateParameterPropertySpec("ATC") { //yields the ATCCode parameter, or the concatenation of a prefix/suffix parameter pair
 227+ private Matcher validator = Pattern.compile("["+upperAlphaNumericChars+"]+").matcher(""); //single matcher, reset() against each candidate value below
 228+
 229+ @Override
 230+ public CharSequence getPropertyValue(WikiPage page, TemplateData params) {
 231+ CharSequence code= params.getParameter("ATCCode");
 232+ if (code!=null) { //a complete code, when present, is the only thing considered
 233+ if (code.length()==0) return null;
 234+ validator.reset(code);
 235+ if (!validator.matches()) return null; //invalid complete code: give up, no fallback to prefix/suffix
 236+ return code;
 237+ }
 238+
 239+ CharSequence pre= params.getParameter("ATC_prefix");
 240+ CharSequence suf= params.getParameter("ATC_suffix");
 241+
 242+ if (pre==null) pre = params.getParameter("ATCCode_prefix"); //alternative parameter names for the two parts
 243+ if (suf==null) suf = params.getParameter("ATCCode_suffix");
 244+
 245+ if (pre==null || suf==null) return null; //both parts are required
 246+ if (pre.length()==0 || suf.length()==0) return null;
 247+
 248+ validator.reset(pre);
 249+ if (!validator.matches()) return null;
 250+
 251+ validator.reset(suf);
 252+ if (!validator.matches()) return null;
 253+
 254+ return pre+""+suf; //plain concatenation, no separator
 255+ }
 256+ };
 257+
 258+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Drugbox"),
 259+ makeNamePropertySpec("IUPAC_name", "IUPAC", false, false).addCleanup(iupacCleanupPattern, ""),
 260+ makeNamePropertySpec("synonyms", "Name", true, true),
 261+
 262+ makeIdentifierPropertySpec("PubChem", "PubChem", pubChemChars),
 263+ makeIdentifierPropertySpec("DrugBank", "DrugBank", drugBankChars),
 264+ makeIdentifierPropertySpec("CAS_number", "CAS", casChars),
 265+
 266+ makeIdentifierPropertySpec("smiles", "SMILES", smilesChars).addCleanup(breakStripPattern, ""),
 267+ //makeIdentifierPropertySpec("chemical_formula", "Formula", true, false),
 268+
 269+ makeIdentifierPropertySpec("ATC_supplemental", "ATC", atcChars),
 270+ makeIdentifierPropertySpec("CAS_supplemental", "CAS", casChars),
 271+ atcSpec
 272+ ) );
 273+
 274+ TemplateParameterPropertySpec eMedSpec = new AbstractTemplateParameterPropertySpec("eMedicine") {
 275+ private Matcher subjectValidator = Pattern.compile("["+alphaNumericChars+"]+").matcher("");
 276+ private Matcher topicValidator = Pattern.compile("["+numericChars+"]+").matcher("");
 277+
 278+ @Override
 279+ public CharSequence getPropertyValue(WikiPage page, TemplateData params) {
 280+ CharSequence pre= params.getParameter("eMedicineSubj");
 281+ CharSequence suf= params.getParameter("eMedicineTopic");
 282+ if (pre==null || suf==null) return null;
 283+ if (pre.length()==0 || suf.length()==0) return null;
 284+
 285+ subjectValidator.reset(pre);
 286+ if (pre.equals("search")) return null;
 287+ if (!subjectValidator.matches()) return null;
 288+
 289+ topicValidator.reset(suf);
 290+ if (!topicValidator.matches()) return null;
 291+
 292+ return pre+"/"+suf;
 293+ }
 294+ };
 295+
 296+
 297+ TemplateParameterPropertySpec dorlandsSpec = new AbstractTemplateParameterPropertySpec("Dorlands") { //yields "<DorlandsPre>/<DorlandsSuf>" when both parameters are present and valid
 298+ private Matcher preValidator = Pattern.compile("["+alphabeticChars+"]_["+numericChars+"]+").matcher(""); //one character from alphabeticChars, an underscore, then one or more from numericChars
 299+ private Matcher sufValidator = Pattern.compile("["+numericChars+"]+").matcher("");
 300+
 301+ @Override
 302+ public CharSequence getPropertyValue(WikiPage page, TemplateData params) {
 303+ CharSequence pre= params.getParameter("DorlandsPre");
 304+ CharSequence suf= params.getParameter("DorlandsSuf");
 305+ if (pre==null || suf==null) return null; //both parts are required
 306+ if (pre.length()==0 || suf.length()==0) return null;
 307+
 308+ preValidator.reset(pre);
 309+ if (!preValidator.matches()) return null;
 310+
 311+ sufValidator.reset(suf);
 312+ if (!sufValidator.matches()) return null;
 313+
 314+ return pre+"/"+suf;
 315+ }
 316+ };
 317+
 318+ TemplateParameterPropertySpec neuroNamesSpec = new AbstractTemplateParameterPropertySpec("NeuroNames") {
 319+ private Matcher typeValidator = Pattern.compile("["+alphabeticChars+"]+").matcher("");
 320+ private Matcher numValidator = Pattern.compile("["+numericChars+"]+").matcher("");
 321+
 322+ @Override
 323+ public CharSequence getPropertyValue(WikiPage page, TemplateData params) {
 324+ CharSequence type= params.getParameter("BrainInfoType");
 325+ CharSequence num= params.getParameter("BrainInfoNumber");
 326+ if (type==null || num==null) return null;
 327+ if (type.length()==0 || num.length()==0) return null;
 328+
 329+ typeValidator.reset(type);
 330+ if (!typeValidator.matches()) return null;
 331+
 332+ typeValidator.reset(num);
 333+ if (!numValidator.matches()) return null;
 334+
 335+ return type+"-"+num;
 336+ }
 337+ };
 338+
 339+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Infobox_Disease|Infobox_Symptom|SignSymptom_infobox|DiseaseDisorder_infobox|Interventions_infobox", 0, true),
 340+ makeIdentifierPropertySpec("DiseasesDB", "DiseasesDB", diseasesDbChars),
 341+ makeIdentifierPropertySpec("ICD10", "ICD10", icd10Chars),
 342+ makeIdentifierPropertySpec("ICD9", "ICD9", icd9Chars),
 343+ makeIdentifierPropertySpec("ICDO", "ICDO", icdOChars),
 344+ makeIdentifierPropertySpec("OMIM", "OMIM", omimChars),
 345+ makeIdentifierPropertySpec("MedlinePlus", "MedlinePlus", medlinePlusChars),
 346+ makeIdentifierPropertySpec("MeshID", "MeSH", meshChars), //FIXME: UniqueId vs. TreeNumber
 347+ makeIdentifierPropertySpec("MeshNumber", "MeSH", meshChars), //FIXME: UniqueId vs. TreeNumber
 348+ makeNamePropertySpec("MeshName", "MeSHName", true, true),
 349+ makeIdentifierPropertySpec("OMIM_mult", "OMIM", omimChars),
 350+ makeIdentifierPropertySpec("DiseasesDB_mult", "DiseasesDB", diseasesDbChars),
 351+ makeIdentifierPropertySpec("MedlinePlus_mult", "MedlinePlus", medlinePlusChars),
 352+ makeIdentifierPropertySpec("eMedicine_mult", "eMedicine", eMedicineChars),
 353+ eMedSpec
 354+ ) );
 355+
 356+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Chembox_new"),
 357+ makeNamePropertySpec("IUPACName", "IUPAC", true, true).addCleanup(iupacCleanupPattern, ""),
 358+ makeNamePropertySpec("OtherNames", "Name", true, true) //FIXME: often spaced for auto-breaks and separated by <br>
 359+ ) );
 360+
 361+ //TODO: terms from names
 362+
 363+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Chembox_[Pp]harmacology", 0, true),
 364+ makeIdentifierPropertySpec("DrugBank", "DrugBank", drugBankChars),
 365+ atcSpec
 366+ ) );
 367+
 368+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Chembox_[Hh]azards", 0, true),
 369+ makeIdentifierPropertySpec("RTECS", "RTECS", rtecsChars)
 370+ ) );
 371+
 372+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Chembox_[Ii]dentifiers", 0, true),
 373+ makeIdentifierPropertySpec("Abbreviations", "ChemAbbrev", chemAbbrevChars),
 374+ makeIdentifierPropertySpec("CASNo", "CAS", casChars),
 375+ makeIdentifierPropertySpec("SMILES", "SMILES", smilesChars).addCleanup(breakStripPattern, ""),
 376+ makeIdentifierPropertySpec("FullSMILES", "SMILS", smilesChars).addCleanup(breakStripPattern, ""),
 377+ makeIdentifierPropertySpec("InChI", "InChI", inChIChars).addCleanup(breakStripPattern, ""),
 378+ makeIdentifierPropertySpec("DrugBank", "DrugBank", drugBankChars),
 379+ makeIdentifierPropertySpec("EINECS", "EINECS", einecsChars),
 380+ makeIdentifierPropertySpec("EC-number", "EC/chem", ecChemChars), //NOTE: replaces EINECS and ELINCS; not be confused with the Enzyme Commission EC number for enzymes. makeIdentifierPropertySpec("EINECSCASNO", "CAS", true, false),
 381+ makeIdentifierPropertySpec("UNNumber", "UNCASN", uncasnChars),
 382+ makeIdentifierPropertySpec("PubChem", "PubChem", pubChemChars),
 383+ makeIdentifierPropertySpec("RTECS", "RTECS", rtecsChars),
 384+ makeIdentifierPropertySpec("KEGG", "KEGG", keggChars),
 385+ makeNamePropertySpec("MeSHName", "MeSHName", true, true),
 386+ makeIdentifierPropertySpec("ChEBI", "ChEBI", chEbiChars),
 387+ makeIdentifierPropertySpec("Beilstein", "Beilstein", beilsteinChars),
 388+ makeIdentifierPropertySpec("Gmelin", "Gmelin", gmelinChars),
 389+ makeIdentifierPropertySpec("3DMet", "3DMet", threeDMetChars),
 390+ makeIdentifierPropertySpec("ChemSpiderID", "ChemSpider", chemSpiderChars),
 391+ atcSpec
 392+ ) );
 393+
 394+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("NatOrganicBox", 0, true),
 395+ makeNamePropertySpec("name", "IUPAC", false, false).addCleanup(iupacCleanupPattern, ""),
 396+ makeNamePropertySpec("synonyms", "Name", true, true),
 397+ makeIdentifierPropertySpec("abbreviations", "ChemAbbrev", chemAbbrevChars),
 398+ //makeIdentifierPropertySpec("chemical_formula", "Formula", chemFormulaChars),
 399+
 400+ makeIdentifierPropertySpec("CAS", "CAS", casChars),
 401+ makeIdentifierPropertySpec("DrugBank", "DrugBank", drugBankChars), //FIXME: getting "?"
 402+ makeIdentifierPropertySpec("SMILES", "SMILES", smilesChars).addCleanup(breakStripPattern, ""),
 403+ makeIdentifierPropertySpec("EINECS", "EINECS", einecsChars),
 404+ makeIdentifierPropertySpec("PubChem", "PubChem", pubChemChars),
 405+ atcSpec
 406+ ) );
 407+
 408+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Elementbox", 0, true),
 409+ makeNamePropertySpec("name", "Name", true, true),
 410+ makeIdentifierPropertySpec("number", "ElementNumber", "["+numericChars+"]"),
 411+ makeIdentifierPropertySpec("symbol", "ElementSymbol", "["+alphaNumericChars+"]"),
 412+
 413+ makeIdentifierPropertySpec("CAS number", "CAS", casChars),
 414+ atcSpec
 415+ ) );
 416+
 417+ //TODO: ...as terms
 418+
 419+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox"),
 420+ makeNamePropertySpec("Name", "ProteinName", false, true),
 421+ makeNamePropertySpec("Names", "ProteinName", true, true),
 422+
 423+ //makeIdentifierPropertySpec("Gene", "HGNC", hgncChars),
 424+ makeIdentifierPropertySpec("HGNCid", "HGNC", hgncChars),
 425+ makeIdentifierPropertySpec("MGIid", "MGI", hgiChars),
 426+ makeIdentifierPropertySpec("Symbol", "ProteinSymbol", proteinSymbolChars),
 427+ makeIdentifierPropertySpec("AltSymbols", "ProteinSymbol", proteinSymbolChars),
 428+
 429+ makeIdentifierPropertySpec("EntrezGene", "EntrezGene", entrezGeneChars),
 430+ makeIdentifierPropertySpec("OMIM", "OMIM", omimChars),
 431+ makeIdentifierPropertySpec("RefSeq", "RefSeq", refSeqChars),
 432+ makeIdentifierPropertySpec("UniProt", "UniProt", uniProtChars),
 433+ makeIdentifierPropertySpec("PDB", "PDB", pdbChars),
 434+ makeIdentifierPropertySpec("ECnumber", "EC/enzyme", ecEnzymeChars)
 435+ ) );
 436+
 437+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Taxobox"),
 438+ makeNamePropertySpec("name", "Name", false, true),
 439+ makeNamePropertySpec("regnum", "taxo-regnum", true, true),
 440+ makeNamePropertySpec("divisio", "taxo-divisio", true, true),
 441+ makeNamePropertySpec("classis", "taxo-classis", true, true),
 442+ makeNamePropertySpec("ordo", "taxo-ordo", true, true),
 443+ makeNamePropertySpec("familia", "taxo-familia", true, true),
 444+ makeNamePropertySpec("genus", "taxo-genus", true, true),
 445+ makeNamePropertySpec("species", "taxo-species", true, true)
 446+ ) );
 447+
 448+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Enzyme_(links|references)", 0, true),
 449+ makeIdentifierPropertySpec("EC_number", "EC/enzyme", ecEnzymeChars)
 450+ ) );
 451+
 452+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("GO_code_links"),
 453+ makeNamePropertySpec("name", "ProteinName", false, true),
 454+ makeIdentifierPropertySpec("GO_code", "GO_code", goCodeChars)
 455+ ) );
 456+
 457+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("CAS_registry"), //XXX: only as identifying element, or also in-context?
 458+ makeIdentifierPropertySpec("1", "CAS", casChars)
 459+ ) );
 460+
 461+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("MSW3_Groves"),
 462+ makeIdentifierPropertySpec("id", "GrovesId", numericChars),
 463+ makeIdentifierPropertySpec("pages", "GrovesPages", pagesChars)
 464+ ) );
 465+
 466+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Rfam"),
 467+ makeIdentifierPropertySpec("id", "RNA family", alphaNumericChars),
 468+ makeNamePropertySpec("name", "Name", false, true)
 469+ ) );
 470+
 471+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Rfam_box", 0, true),
 472+ makeIdentifierPropertySpec("acc", "RNA family", alphaNumericChars),
 473+ makeNamePropertySpec("description", "Name", true, true),
 474+ makeNamePropertySpec("abbreviation", "Name", true, true),
 475+ makeNamePropertySpec("type", "RNA type", true, true),
 476+ new DefaultTemplateParameterPropertySpec("journal", "journal")
 477+ .addNormalizer(punctuationStripPattern, "")
 478+ .setCondition(lifeScienceJournalPattern, 0, false)
 479+ ) );
 480+
 481+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Infobox_chemical_analysis", 0, true),
 482+ makeNamePropertySpec("name", "Name", true, true),
 483+ makeNamePropertySpec("acronym", "Name", true, true),
 484+ makeNamePropertySpec("classification", "AnalysisClass", true, true),
 485+ makeNamePropertySpec("analytes", "Analytes", true, true)
 486+ ) );
 487+
 488+ //Stuff from the container field Codes in Protbox:
 489+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Codes::OMIM"),
 490+ makeIdentifierPropertySpec("1", "OMIM", omimChars) ) );
 491+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Codes::OMIM2"),
 492+ makeIdentifierPropertySpec("1", "OMIM", omimChars) ) );
 493+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Codes::EntrezGene"),
 494+ makeIdentifierPropertySpec("1", "EntrezGene", entrezGeneChars) ) );
 495+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Codes::UniProt"),
 496+ makeIdentifierPropertySpec("1", "UniProt", uniProtChars) ) );
 497+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Codes::RefSeq"),
 498+ makeIdentifierPropertySpec("1", "RefSeq", refSeqChars) ) );
 499+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Codes::EC_number"),
 500+ makeIdentifierPropertySpec("1", "EC/enzyme", ecEnzymeChars) ) );
 501+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Codes::PDB"),
 502+ makeIdentifierPropertySpec("1", "PDB", pdbChars) ) );
 503+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protbox.Caption::PDB"),
 504+ makeIdentifierPropertySpec("1", "PDB", pdbChars) ) );
 505+
 506+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Protein"),
 507+ makeIdentifierPropertySpec("Symbol", "ProteinSymbol", proteinSymbolChars),
 508+ makeIdentifierPropertySpec("AltSymbols", "ProteinSymbol", proteinSymbolChars),
 509+ makeIdentifierPropertySpec("CAS_number", "CAS", casChars),
 510+ makeIdentifierPropertySpec("DrugBank", "DrugBank", drugBankChars), //FIXME: getting "?"
 511+ makeIdentifierPropertySpec("EntrezGene", "EntrezGene", entrezGeneChars),
 512+ makeIdentifierPropertySpec("HGNCid", "HGNC", hgncChars),
 513+ makeIdentifierPropertySpec("MGIid", "MGI", hgiChars),
 514+ makeIdentifierPropertySpec("OMIM", "OMIM", omimChars),
 515+ makeIdentifierPropertySpec("PDB", "PDB", pdbChars),
 516+ makeIdentifierPropertySpec("RefSeq", "RefSeq", refSeqChars),
 517+ makeIdentifierPropertySpec("UniProt", "UniProt", uniProtChars),
 518+ makeIdentifierPropertySpec("ECnumber", "EC/enzyme", ecEnzymeChars),
 519+ makeIdentifierPropertySpec("ATC_supplemental", "ATC", atcChars),
 520+ makeIdentifierPropertySpec("CAS_supplemental", "CAS", casChars),
 521+ atcSpec
 522+ ) );
 523+
 524+ //TODO: pull names and symbols as terms!
 525+
 526+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("GNF_Protein_box"), //parameter-to-property mappings for the GNF_Protein_box template
 527+ makeIdentifierPropertySpec("Symbol", "ProteinSymbol", proteinSymbolChars),
 528+ makeIdentifierPropertySpec("AltSymbols", "ProteinSymbol", proteinSymbolChars),
 529+ makeIdentifierPropertySpec("HGNCid", "HGNC", hgncChars),
 530+ makeIdentifierPropertySpec("MGIid", "MGI", hgiChars), //NOTE(review): MGIid is mapped again below with mgiChars; hgiChars here looks like a typo or leftover duplicate -- confirm
 531+ makeIdentifierPropertySpec("OMIM", "OMIM", omimChars),
 532+ makeIdentifierPropertySpec("PDB", "PDB", pdbChars),
 533+ makeIdentifierPropertySpec("ECnumber", "EC/enzyme", ecEnzymeChars),
 534+ makeIdentifierPropertySpec("Homologene", "HomoloGene", homoloGeneChars),
 535+ makeIdentifierPropertySpec("MGIid", "MGI", mgiChars)
 536+ ) );
 537+
 538+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("GNF_Ortholog_box"),
 539+ makeIdentifierPropertySpec("Hs_Uniprot", "UniProt", uniProtChars),
 540+ makeIdentifierPropertySpec("Mm_Uniprot", "UniProt", uniProtChars),
 541+ makeIdentifierPropertySpec("Hs_Ensembl", "Ensembl", ensemblChars),
 542+ makeIdentifierPropertySpec("Mm_Ensembl", "Ensembl", ensemblChars),
 543+ makeIdentifierPropertySpec("Hs_EntrezGene", "EntrezGene", entrezGeneChars),
 544+ makeIdentifierPropertySpec("Mm_EntrezGene", "EntrezGene", entrezGeneChars),
 545+ makeIdentifierPropertySpec("Hs_RefseqProtein", "RefSeq", refSeqChars),
 546+ makeIdentifierPropertySpec("Mm_RefseqProtein", "RefSeq", refSeqChars),
 547+ makeIdentifierPropertySpec("Hs_RefseqmRNA", "RefSeq", refSeqChars),
 548+ makeIdentifierPropertySpec("Mm_RefseqmRNA", "RefSeq", refSeqChars)
 549+ ) );
 550+
 551+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Infobox_neuron"),
 552+ makeNamePropertySpec("neuron_name", "Name", false, true),
 553+ makeNamePropertySpec("function", "Function", false, true),
 554+ makeNamePropertySpec("GraySubject", "GraySubject", true, true)
 555+ ) );
 556+
 557+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Infobox_neurotransmitter"),
 558+ makeNamePropertySpec("name", "Name", false, true),
 559+ makeNamePropertySpec("abbrev", "Name", false, true)
 560+ ) );
 561+
 562+ //TODO: {{MedlinePlus}}...?
 563+
 564+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Infobox_(Anatomy|Artery|Vein|Bone|Brain|Nerve|Muscle|Embryology|Ligament|Lymph)", 0, true),
 565+ makeNamePropertySpec("Name", "Name", true, true),
 566+ makeNamePropertySpec("Latin", "AnatomyLatin", true, true),
 567+ makeNamePropertySpec("GraySubject", "GraySubject", true, true),
 568+ makeNamePropertySpec("MeshName", "MeSHName", true, true),
 569+ makeIdentifierPropertySpec("MeshNumber", "MeSH", meshChars),
 570+ makeNamePropertySpec("DorlandsID", "DorlandsName", true, true),
 571+ makeIdentifierPropertySpec("Dorlands", "Dorlands", dorlandsChars),
 572+ dorlandsSpec,
 573+ neuroNamesSpec
 574+ ) );
 575+
 576+ //FIXME: URLDecode for MeshName, etc!
 577+
 578+ //TODO: Infobox_(Artery|Brain|Bone|...)
 579+ // GraySubject, MeSH name&number, DorlandsPre/DorlandsSuf (Elsevier )
 580+
 581+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("ICSC"),
 582+ makeIdentifierPropertySpec("1", "ICSC", icscChars)
 583+ ) );
 584+
 585+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("PubChem"),
 586+ makeIdentifierPropertySpec("1", "PubChem", pubChemChars)
 587+ ) );
 588+
 589+
 590+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("PBB"),
 591+ makeIdentifierPropertySpec("geneid", "_PBB_", pbbChars)
 592+ ) );
 593+
 594+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Infobox_abortion_method"),
 595+ makeNamePropertySpec("name", "Name", false, true),
 596+ makeNamePropertySpec("AKA/Abbreviation", "Name", true, true),
 597+ makeNamePropertySpec("Abortion_type", "AbortionType", false, true)
 598+ ) );
 599+
 600+ propertyExtractors.add( new TemplateParameterExtractor(new ExactNameMatcher("Infobox_Birth_control"),
 601+ makeNamePropertySpec("name", "Name", false, true) //not really interesting, just make the concept show up as relevant for LS
 602+ ) );
 603+
 604+ propertyExtractors.add( new TemplateParameterExtractor(new PatternNameMatcher("Infobox_(((Medical)_)?[Pp]erson|Scientist)", 0, true),
 605+ new DefaultTemplateParameterPropertySpec("name", "person-name").setStripMarkup(true),
 606+ new DefaultTemplateParameterPropertySpec("other_names", "person-name").setStripMarkup(true),
 607+ new DefaultTemplateParameterPropertySpec("birth_date", "person-birth-date").setStripMarkup(true),
 608+ new DefaultTemplateParameterPropertySpec("occupation", "person-occupation").setStripMarkup(true),
 609+ new DefaultTemplateParameterPropertySpec("known_for", "person-known-for").setStripMarkup(true).setSplitPattern(nameSeparatorPattern),
 610+ new DefaultTemplateParameterPropertySpec("nationality", "person-nationality").setStripMarkup(true)
 611+ ) );
 612+
 613+ propertyExtractors.add(new CategoryPatternParameterExtractor("(_|$)([Ff]oods|[Vv]egetables|[Ff]ruits)", null, 0, "food-group"));
 614+ propertyExtractors.add(new TemplateNamePatternParameterExtractor("((.+-)?(Med(ical)?|Treatment|Pathology|Anatomy|Antibiotic|Disease)(-.+)?)-stub", "$2", 0, "med-stub-group")); //TODO: no limits to this one
 615+
 616+ pageTermExtractors.add( new PagePropertyValueExtractor("IUPAC") );
 617+ pageTermExtractors.add( new PagePropertyValueExtractor("AnatomyLatin") );
 618+ pageTermExtractors.add( new PagePropertyValueExtractor("ProteinSymbol") );
 619+ pageTermExtractors.add( new PagePropertyValueExtractor("ProteinName") );
 620+ pageTermExtractors.add( new PagePropertyValueExtractor("MeSHName") );
 621+ pageTermExtractors.add( new PagePropertyValueExtractor("Name") );
 622+ pageTermExtractors.add( new PagePropertyValueExtractor("Symbol") );
 623+ pageTermExtractors.add( new PagePropertyValueExtractor("DorlandsName") );
 624+ pageTermExtractors.add( new PagePropertyValueExtractor("person-name") );
 625+
 626+ supplementSensors.add( new TitleSensor<ResourceType>(ResourceType.SUPPLEMENT, Namespace.TEMPLATE, "PBB/\\d+", 0));
 627+
 628+ supplementNameExtractors.add( new PropertyValueExtractor("_PBB_").setPrefix("Template:PBB/") );
 629+
 630+ supplementedConceptExtractors.add( new TitlePartExtractor(Namespace.MAIN, "(.*)_\\(data_page\\)", 0, "$1") );
 631+ supplementedConceptExtractors.add( new TitlePartExtractor(Namespace.TEMPLATE, "Infobox_(.*)", 0, "$1")
 632+ .addCondition( new HasCategorySensor<ResourceType>(ResourceType.SUPPLEMENT, "Periodic_table_infobox_templates") ) );
 633+
 634+ conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(LifeScienceConceptType.DRUG, "_(treatments|therapies)$", 0));
 635+ conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(LifeScienceConceptType.DRUG, "Drugbox"));
 636+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(LifeScienceConceptType.DRUG, "^Drugs_|^DrugsNav$", 0));
 637+ conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(LifeScienceConceptType.DRUG, "Major_Drug_Groups"));
 638+
 639+ conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(LifeScienceConceptType.PROTEIN, "EC_\\d+(\\.\\d+)*", 0)); //FIXME: too much meta-stuff!
 640+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(LifeScienceConceptType.PROTEIN, "^(Enzyme_links|PBB|Protein|GNF_.*_box)$", 0) );
 641+
 642+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(LifeScienceConceptType.CHEMICAL, "^Chembox|^NatOrganicBox$|^ICSC$|^Elementbox|^(Complex_)?Enzymatic_Reaction", 0));
 643+ conceptTypeSensors.add( new HasCategorySensor<ConceptType>(LifeScienceConceptType.CHEMICAL, "Chemical_elements"));
 644+
 645+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(LifeScienceConceptType.DISEASE, "^(Infobox_Disease|Infobox_Symptom|SignSymptom_infobox|DiseaseDisorder_infobox)$", 0));
 646+ conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(LifeScienceConceptType.DISEASE, "(_diseases|_disorders)$", 0, false));
 647+
 648+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(LifeScienceConceptType.ORGAN, "^Infobox_(Brain|Nerve|Muscle|Vein|Artery|Bone|Anatomy|Ligament|Lymph)$", 0));
 649+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(LifeScienceConceptType.ORGAN, "_glands$|^SUNYAnatomy|^(BUHistology|AnatomyAtlasesMicroscopic|Gray's|Anatomy-stub)$", 0));
 650+
 651+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(LifeScienceConceptType.FOOD, "Nutritional_value", 0));
 652+ conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(LifeScienceConceptType.FOOD, "(_|$)([Ff]oods|[Vv]egetables|[Ff]ruits)", 0, false));
 653+
 654+ conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.LIFEFORM, "Taxobox"));
 655+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.LIFEFORM, "GrovesId"));
 656+
 657+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "person-name"));
 658+ conceptTypeSensors.add( new HasPropertySensor<ConceptType>(ConceptType.PERSON, "person-birth-date"));
 659+
 660+ //TODO; LOTS of anatomy navigation boxes
 661+
 662+ //TODO: generic markers, such as {{MedlinePlus}}, {{MeshName}}, {{GPnotebook}}, {{Gene}}, etc, or [[Category:EC_.*]]
 663+
 664+ //TODO: terms from properties! (ids, latin name, box caption, etc)
 665+
 666+
 667+ }
 668+
 669+}
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience/LifeScienceConceptType.java
@@ -0,0 +1,52 @@
 2+package de.brightbyte.wikiword.lifescience;
 3+
 4+import de.brightbyte.wikiword.ConceptType;
 5+import de.brightbyte.wikiword.ConceptTypeSet;
 6+
 7+/**
 8+ * Life science concept types; each concept type represents a very broad category of concepts,
 9+ * which can be used to filter concepts identified in a corpus. The idea is that for some uses,
 10+ * some kinds of concepts are not useful, or especially useful. For example, people and places
 11+ * are not suitable for use in a general dictionary, but very useful for topic tracking.
 12+ * Each type is associated with a code (for internal use) and a URI (for external use).
 13+ * The URI is constructed based on {@link RdfEntities.conceptTypeBase}.
 14+ */
 15+public class LifeScienceConceptType extends ConceptType {
 16+
 17+ public static final ConceptType DISEASE;
 18+ public static final ConceptType DRUG;
 19+ public static final ConceptType CHEMICAL;
 20+ public static final ConceptType PROTEIN;
 21+ public static final ConceptType ORGAN;
 22+ public static final ConceptType FOOD;
 23+
 24+ /**
 25+ * ConceptTypeSet for the life science concept types. Obtained via getConceptTypes() for this
 26+ * package; presumably loaded from the ConceptTypes.properties file in this package.
 27+ */
 28+ public static final ConceptTypeSet lifeScienceConceptTypes;
 29+
 30+ static {
 31+ try {
 32+ lifeScienceConceptTypes = getConceptTypes(null, "de.brightbyte.wikiword.lifescience"); //FIXME: make unmodifiable!
 33+
 34+ DISEASE = lifeScienceConceptTypes.getType(1001);
 35+ //SYMPTOM = wikiProConceptTypes.getType(1002);
 36+ DRUG = lifeScienceConceptTypes.getType(1003);
 37+ //TREATMENT = wikiProConceptTypes.getType(1004);
 38+ CHEMICAL = lifeScienceConceptTypes.getType(1005);
 39+ PROTEIN = lifeScienceConceptTypes.getType(1006);
 40+ //GENE = wikiProConceptTypes.getType(1007);
 41+ ORGAN = lifeScienceConceptTypes.getType(1008);
 42+ FOOD = lifeScienceConceptTypes.getType(1009); //NOTE(review): ConceptTypes.properties as committed in this revision has no 1009 entry -- confirm this lookup resolves
 43+ }
 44+ catch (NumberFormatException ex) {
 45+ throw new ExceptionInInitializerError(ex);
 46+ }
 47+ }
 48+
 49+ public LifeScienceConceptType(int code, String name) {
 50+ super(code, name);
 51+ }
 52+
 53+}
Index: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience/ConceptTypes.properties
@@ -0,0 +1,5 @@
 2+1001=DISEASE
 3+1003=DRUG
 4+1005=CHEMICAL
 5+1006=PROTEIN
 6+1008=ORGAN
Property changes on: trunk/WikiWord/WikiWordProperties/src/main/java/de/brightbyte/wikiword/lifescience
___________________________________________________________________
Name: svn:mergeinfo
17 +
Index: trunk/WikiWord/WikiWordProperties/.svnignore
@@ -0,0 +1,2 @@
 2+*.pyc
 3+
Index: trunk/WikiWord/WikiWordProperties/COPYING
@@ -0,0 +1,33 @@
 2+WikiWord is a system for automatically extracting a thesaurus from
 3+Wikipedia. It was developed by Daniel Kinzler in 2007-2009.
 4+
 5+Development started in 2007 as part of a master's thesis at the
 6+University of Leipzig, see <http://brightbyte.de/page/WikiWord>.
 7+
 8+Development was supported since 2009 by Wikimedia Deutschland e.V.
 9+
 10+ NOTE: This software is not released as a product. It was designed
 11+ for Wikimedia Deutschland's own use, and is made public as is, in
 12+ the hope it may be useful. Wikimedia Deutschland may at any time
 13+ discontinue developing or supporting this software. There is no
 14+ guarantee any new versions or even fixes for security issues will
 15+ be released.
 16+
 17+WikiWord was originally licensed under the GPL, with support of the
 18+University of Leipzig, and was released under the LGPL by its author in
 19+2009, in coordination with Wikimedia Deutschland. If you want to use it
 20+under some other license or condition, please contact the author at
 21+<http://brightbyte.de>.
 22+
 23+ This program is free software: you can redistribute it and/or modify
 24+ it under the terms of the GNU General Public License as published by
 25+ the Free Software Foundation, either version 3 of the License, or
 26+ (at your option) any later version.
 27+
 28+ This program is distributed in the hope that it will be useful,
 29+ but WITHOUT ANY WARRANTY; without even the implied warranty of
 30+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 31+ GNU General Public License for more details.
 32+
 33+ You should have received a copy of the GNU General Public License
 34+ along with this program. If not, see <http://www.gnu.org/licenses/>.
Property changes on: trunk/WikiWord/WikiWordProperties/bin
___________________________________________________________________
Name: svn:ignore
135 + *
Index: trunk/WikiWord/WikiWordProperties/pom.xml
@@ -0,0 +1,105 @@
 2+<project xmlns="http://maven.apache.org/POM/4.0.0"
 3+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 5+
 6+ <modelVersion>4.0.0</modelVersion>
 7+ <groupId>de.wikimedia</groupId>
 8+ <artifactId>WikiWordBuilder4Biography</artifactId>
 9+ <version>0.3</version>
 10+
 11+ <dependencies>
 12+ <dependency>
 13+ <groupId>org.wikimedia</groupId>
 14+ <artifactId>mwdumper</artifactId>
 15+ <version>1.11</version>
 16+ <scope>compile</scope>
 17+ </dependency>
 18+ <dependency>
 19+ <groupId>de.brightbyte</groupId>
 20+ <artifactId>BrightByteUtil</artifactId>
 21+ <version>0.2</version>
 22+ <scope>compile</scope>
 23+ </dependency>
 24+ <dependency>
 25+ <groupId>de.brightbyte</groupId>
 26+ <artifactId>BrightByteDB</artifactId>
 27+ <version>0.2</version>
 28+ <scope>compile</scope>
 29+ </dependency>
 30+ <dependency>
 31+ <groupId>de.wikimedia</groupId>
 32+ <artifactId>WikiWord</artifactId>
 33+ <version>0.3</version>
 34+ <scope>compile</scope>
 35+ </dependency>
 36+ <dependency>
 37+ <groupId>de.wikimedia</groupId>
 38+ <artifactId>WikiWordBuilder</artifactId>
 39+ <version>0.3</version>
 40+ <scope>compile</scope>
 41+ </dependency>
 42+ <dependency>
 43+ <groupId>junit</groupId>
 44+ <artifactId>junit</artifactId>
 45+ <version>3.8</version>
 46+ <scope>test</scope>
 47+ </dependency>
 48+ </dependencies>
 49+
 50+ <build>
 51+ <plugins>
 52+ <plugin>
 53+ <groupId>org.apache.maven.plugins</groupId>
 54+ <artifactId>maven-compiler-plugin</artifactId>
 55+ <configuration>
 56+ <source>1.5</source>
 57+ <target>1.5</target>
 58+ </configuration>
 59+ </plugin>
 60+ <plugin>
 61+ <groupId>org.apache.maven.plugins</groupId>
 62+ <artifactId>maven-javadoc-plugin</artifactId>
 63+ <executions>
 64+ <execution>
 65+ <phase>package</phase>
 66+ </execution>
 67+ </executions>
 68+ </plugin>
 69+ <plugin>
 70+ <artifactId>maven-assembly-plugin</artifactId>
 71+ <configuration>
 72+ <descriptors>
 73+ <descriptor>src/main/assembly/bin-dep.xml</descriptor>
 74+ <descriptor>src/main/assembly/src.xml</descriptor>
 75+ </descriptors>
 76+ </configuration>
 77+ </plugin>
 78+ </plugins>
 79+
 80+ <resources>
 81+ <resource>
 82+ <filtering>false</filtering>
 83+ <directory>src/main/java</directory>
 84+ <includes>
 85+ <include>**</include>
 86+ </includes>
 87+ <excludes>
 88+ <exclude>**/*.java</exclude>
 89+ </excludes>
 90+ </resource>
 91+ </resources>
 92+ <testResources>
 93+ <testResource>
 94+ <filtering>false</filtering>
 95+ <directory>src/test/java</directory>
 96+ <includes>
 97+ <include>**</include>
 98+ </includes>
 99+ <excludes>
 100+ <exclude>**/*.java</exclude>
 101+ </excludes>
 102+ </testResource>
 103+ </testResources>
 104+ </build>
 105+
 106+</project>
Index: trunk/WikiWord/WikiWordProperties/LGPL
@@ -0,0 +1,165 @@
 2+ GNU LESSER GENERAL PUBLIC LICENSE
 3+ Version 3, 29 June 2007
 4+
 5+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
 6+ Everyone is permitted to copy and distribute verbatim copies
 7+ of this license document, but changing it is not allowed.
 8+
 9+
 10+ This version of the GNU Lesser General Public License incorporates
 11+the terms and conditions of version 3 of the GNU General Public
 12+License, supplemented by the additional permissions listed below.
 13+
 14+ 0. Additional Definitions.
 15+
 16+ As used herein, "this License" refers to version 3 of the GNU Lesser
 17+General Public License, and the "GNU GPL" refers to version 3 of the GNU
 18+General Public License.
 19+
 20+ "The Library" refers to a covered work governed by this License,
 21+other than an Application or a Combined Work as defined below.
 22+
 23+ An "Application" is any work that makes use of an interface provided
 24+by the Library, but which is not otherwise based on the Library.
 25+Defining a subclass of a class defined by the Library is deemed a mode
 26+of using an interface provided by the Library.
 27+
 28+ A "Combined Work" is a work produced by combining or linking an
 29+Application with the Library. The particular version of the Library
 30+with which the Combined Work was made is also called the "Linked
 31+Version".
 32+
 33+ The "Minimal Corresponding Source" for a Combined Work means the
 34+Corresponding Source for the Combined Work, excluding any source code
 35+for portions of the Combined Work that, considered in isolation, are
 36+based on the Application, and not on the Linked Version.
 37+
 38+ The "Corresponding Application Code" for a Combined Work means the
 39+object code and/or source code for the Application, including any data
 40+and utility programs needed for reproducing the Combined Work from the
 41+Application, but excluding the System Libraries of the Combined Work.
 42+
 43+ 1. Exception to Section 3 of the GNU GPL.
 44+
 45+ You may convey a covered work under sections 3 and 4 of this License
 46+without being bound by section 3 of the GNU GPL.
 47+
 48+ 2. Conveying Modified Versions.
 49+
 50+ If you modify a copy of the Library, and, in your modifications, a
 51+facility refers to a function or data to be supplied by an Application
 52+that uses the facility (other than as an argument passed when the
 53+facility is invoked), then you may convey a copy of the modified
 54+version:
 55+
 56+ a) under this License, provided that you make a good faith effort to
 57+ ensure that, in the event an Application does not supply the
 58+ function or data, the facility still operates, and performs
 59+ whatever part of its purpose remains meaningful, or
 60+
 61+ b) under the GNU GPL, with none of the additional permissions of
 62+ this License applicable to that copy.
 63+
 64+ 3. Object Code Incorporating Material from Library Header Files.
 65+
 66+ The object code form of an Application may incorporate material from
 67+a header file that is part of the Library. You may convey such object
 68+code under terms of your choice, provided that, if the incorporated
 69+material is not limited to numerical parameters, data structure
 70+layouts and accessors, or small macros, inline functions and templates
 71+(ten or fewer lines in length), you do both of the following:
 72+
 73+ a) Give prominent notice with each copy of the object code that the
 74+ Library is used in it and that the Library and its use are
 75+ covered by this License.
 76+
 77+ b) Accompany the object code with a copy of the GNU GPL and this license
 78+ document.
 79+
 80+ 4. Combined Works.
 81+
 82+ You may convey a Combined Work under terms of your choice that,
 83+taken together, effectively do not restrict modification of the
 84+portions of the Library contained in the Combined Work and reverse
 85+engineering for debugging such modifications, if you also do each of
 86+the following:
 87+
 88+ a) Give prominent notice with each copy of the Combined Work that
 89+ the Library is used in it and that the Library and its use are
 90+ covered by this License.
 91+
 92+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
 93+ document.
 94+
 95+ c) For a Combined Work that displays copyright notices during
 96+ execution, include the copyright notice for the Library among
 97+ these notices, as well as a reference directing the user to the
 98+ copies of the GNU GPL and this license document.
 99+
 100+ d) Do one of the following:
 101+
 102+ 0) Convey the Minimal Corresponding Source under the terms of this
 103+ License, and the Corresponding Application Code in a form
 104+ suitable for, and under terms that permit, the user to
 105+ recombine or relink the Application with a modified version of
 106+ the Linked Version to produce a modified Combined Work, in the
 107+ manner specified by section 6 of the GNU GPL for conveying
 108+ Corresponding Source.
 109+
 110+ 1) Use a suitable shared library mechanism for linking with the
 111+ Library. A suitable mechanism is one that (a) uses at run time
 112+ a copy of the Library already present on the user's computer
 113+ system, and (b) will operate properly with a modified version
 114+ of the Library that is interface-compatible with the Linked
 115+ Version.
 116+
 117+ e) Provide Installation Information, but only if you would otherwise
 118+ be required to provide such information under section 6 of the
 119+ GNU GPL, and only to the extent that such information is
 120+ necessary to install and execute a modified version of the
 121+ Combined Work produced by recombining or relinking the
 122+ Application with a modified version of the Linked Version. (If
 123+ you use option 4d0, the Installation Information must accompany
 124+ the Minimal Corresponding Source and Corresponding Application
 125+ Code. If you use option 4d1, you must provide the Installation
 126+ Information in the manner specified by section 6 of the GNU GPL
 127+ for conveying Corresponding Source.)
 128+
 129+ 5. Combined Libraries.
 130+
 131+ You may place library facilities that are a work based on the
 132+Library side by side in a single library together with other library
 133+facilities that are not Applications and are not covered by this
 134+License, and convey such a combined library under terms of your
 135+choice, if you do both of the following:
 136+
 137+ a) Accompany the combined library with a copy of the same work based
 138+ on the Library, uncombined with any other library facilities,
 139+ conveyed under the terms of this License.
 140+
 141+ b) Give prominent notice with the combined library that part of it
 142+ is a work based on the Library, and explaining where to find the
 143+ accompanying uncombined form of the same work.
 144+
 145+ 6. Revised Versions of the GNU Lesser General Public License.
 146+
 147+ The Free Software Foundation may publish revised and/or new versions
 148+of the GNU Lesser General Public License from time to time. Such new
 149+versions will be similar in spirit to the present version, but may
 150+differ in detail to address new problems or concerns.
 151+
 152+ Each version is given a distinguishing version number. If the
 153+Library as you received it specifies that a certain numbered version
 154+of the GNU Lesser General Public License "or any later version"
 155+applies to it, you have the option of following the terms and
 156+conditions either of that published version or of any later version
 157+published by the Free Software Foundation. If the Library as you
 158+received it does not specify a version number of the GNU Lesser
 159+General Public License, you may choose any version of the GNU Lesser
 160+General Public License ever published by the Free Software Foundation.
 161+
 162+ If the Library as you received it specifies that a proxy can decide
 163+whether future versions of the GNU Lesser General Public License shall
 164+apply, that proxy's public statement of acceptance of any version is
 165+permanent authorization for you to choose that version for the
 166+Library.
Index: trunk/WikiWord/WikiWordProperties/build.xml
@@ -0,0 +1,90 @@
 2+<?xml version="1.0" encoding="UTF-8"?>
 3+<project xmlns:artifact="antlib:org.apache.maven.artifact.ant"
 4+ name="WikiWordBuilder" default="dist" basedir=".">
 5+ <property name="src" location="src/main/java"/>
 6+ <property name="build" location="build"/>
 7+ <property name="dist" location="dist"/>
 8+
 9+ <artifact:pom id="maven.project" file="pom.xml">
 10+ </artifact:pom>
 11+
 12+ <artifact:dependencies pathId="compile.classpath" filesetId="compile.fileset" useScope="compile">
 13+ <artifact:pom refid="maven.project"/>
 14+ </artifact:dependencies>
 15+
 16+ <artifact:dependencies pathId="runtime.classpath" filesetId="runtime.fileset" useScope="runtime">
 17+ <artifact:pom refid="maven.project"/>
 18+ </artifact:dependencies>
 19+
 20+ <property name="versionedName" value="${maven.project.artifactId}-${maven.project.version}"/>
 21+
 22+ <target name="init">
 23+ <!-- Create the time stamp -->
 24+ <tstamp/>
 25+ <!-- Create the build directory structure used by compile -->
 26+ <mkdir dir="${build}"/>
 27+ </target>
 28+
 29+ <target name="compile" depends="init" description="compile the source " >
 30+ <!-- Compile the java code from ${src} into ${build} -->
 31+ <javac srcdir="${src}" destdir="${build}" source="1.5" target="1.5" encoding="UTF-8" debug="true" debuglevel="lines,source,vars">
 32+ <classpath refid="compile.classpath" />
 33+ </javac>
 34+ <copy todir="${build}">
 35+ <fileset dir="${src}">
 36+ <exclude name="**/*.java"/>
 37+ </fileset>
 38+ </copy>
 39+ </target>
 40+
 41+ <target name="dist" depends="compile"
 42+ description="generate the distribution" >
 43+ <!-- Create the distribution directory -->
 44+ <mkdir dir="${dist}/lib"/>
 45+ <!-- Put everything in ${build} into the MyProject-${DSTAMP}.jar file -->
 46+ <jar jarfile="${dist}/lib/${versionedName}.jar" basedir="${build}"/>
 47+ <copy todir="${dist}/lib">
 48+ <fileset refid="runtime.fileset"/>
 49+ <mapper type="flatten" />
 50+ </copy>
 51+ </target>
 52+
 53+ <target name="assemble" depends="dist" description="assemble depolyment package">
 54+ <mkdir dir="${dist}/unpacked"/>
 55+ <mkdir dir="${dist}/assembled"/>
 56+
 57+ <!-- build jar with all dependencies -->
 58+ <unjar dest="${dist}/unpacked">
 59+ <fileset dir="${dist}/lib"/>
 60+ </unjar>
 61+
 62+ <jar jarfile="${dist}/assembled/${versionedName}-with-deps.jar" basedir="${dist}/unpacked"/>
 63+
 64+
 65+ <copy todir="${dist}/assembled/bin">
 66+ <fileset dir="bin"/>
 67+ </copy>
 68+
 69+ <copy tofile="${dist}/assembled/bin/${maven.project.artifactId}.jar">
 70+ <fileset file="${dist}/assembled/${versionedName}-with-deps.jar"/>
 71+ </copy>
 72+
 73+ <input addproperty="keystorepass">Enter Keystore Password</input>
 74+
 75+ <signjar jar="${dist}/assembled/bin/${maven.project.artifactId}.jar"
 76+ alias="brightbyte" storepass="${keystorepass}" />
 77+ </target>
 78+
 79+ <target name="install" depends="dist" description="install to local maven repository">
 80+ <artifact:install file="dist/lib/${versionedName}.jar">
 81+ <pom refid="maven.project"/>
 82+ </artifact:install>
 83+ </target>
 84+
 85+ <target name="clean" description="clean up" >
 86+ <!-- Delete the ${build} and ${dist} directory trees -->
 87+ <delete dir="${build}"/>
 88+ <delete dir="${dist}"/>
 89+ </target>
 90+
 91+</project>
Property changes on: trunk/WikiWord/WikiWordProperties
___________________________________________________________________
Name: svn:mergeinfo
192 +
Name: svn:ignore
293 + *.pyc

Status & tagging log