r53332 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r53331 | r53332 | r53333 >
Date:21:12, 15 July 2009
Author:daniel
Status:deferred
Tags:
Comment:
prep: category processing, commons as pseudo-language
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Corpus.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/DatasetIdentifier.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Languages.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/GlobalConceptStoreSchema.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/LocalConceptStoreSchema.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/BuildThesaurus.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseGlobalConceptStoreBuilder.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseWikiWordConceptStoreBuilder.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_eswiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_frwiki.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTestBase.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerBenchmark.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTest.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTestBase.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/builder/PropertyDump.java (deleted) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/wikis/WikiTextAnalyzer_yywiki_Test.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/tweaks.properties.sample (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Languages.java
@@ -45,6 +45,12 @@
4646 ln.remove("simple");
4747 }
4848
 49+ if (tweaks.getTweak("languages.metaAsLanguage", true)) {
 50+ ln.put("meta", "Meta-Wiki");
 51+ } else {
 52+ ln.remove("meta");
 53+ }
 54+
4955 return Collections.unmodifiableMap(ln);
5056 }
5157 catch (IOException ex) {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Corpus.java
@@ -85,14 +85,10 @@
8686 * as supplied at http://download.wikimedia.org, this method should return a Corpus instance
8787 * appropriate for the wiki the dump was generated for.
8888 **/
89 - public static Corpus forFile(String collection, File f, String[] configPackages) {
90 - return forName(collection, guessCorpusName(f), configPackages);
 89+ public static Corpus forFile(String collection, File f, TweakSet tweaks) {
 90+ return forName(collection, guessCorpusName(f), tweaks);
9191 }
9292
93 - public static Corpus forName(String collection, String name, TweakSet tweaks) {
94 - return forName(collection, name, getConfigPackages(tweaks));
95 - }
96 -
9793 /**
9894 * returns a new Corpus instance corresponding to the wiki project
9995 * with the given name. The name may be given as a domain name following
@@ -103,27 +99,29 @@
104100 * the language code "de", the family "wikipedia", the domain "de.wikipedia.org"
105101 * and the URI "http://de.wikipedia.org".
106102 */
107 - public static Corpus forName(String collection, String name, String[] configPackages) {
 103+ public static Corpus forName(String collection, String name, TweakSet tweaks) {
 104+ String[] configPackages = getConfigPackages(tweaks);
 105+
108106 String domain = name;
109 - if (domain.indexOf('.')<0) domain = guessCorpusDomain(domain);
 107+ if (domain.indexOf('.')<0) domain = guessCorpusDomain(domain, tweaks);
110108
111109 String[] ss = domain.split("\\.");
112110 if (ss.length<2) throw new IllegalArgumentException("bad domain: "+domain);
113111
114 - String language = guessCorpusLanguage(ss[0]);
115 - String family = guessCorpusFamily(ss[1]);
 112+ String language = guessCorpusLanguage(ss[0], tweaks);
 113+ String family = guessCorpusFamily(ss[1], tweaks);
116114
117 - String classSuffix = family.equals("wikipedia") ? language + "wiki" : language + family;
 115+ String classSuffix = family.equals("wikipedia") || family.equals("wikimedia") ? ss[0] + "wiki" : ss[0] + family;
118116
119117 String dbPrefix = dbPrefix(collection, ss[0]);
120118
121119 //TODO: cache!
122120 //NOTE: force domain as name
123 - return new Corpus(collection, language, dbPrefix, domain, classSuffix, language, family, configPackages);
 121+ return new Corpus(collection, name, dbPrefix, domain, classSuffix, language, family, configPackages);
124122 }
125123
126 - public static Corpus forDataset(DatasetIdentifier dataset, String[] configPackages) {
127 - return forName(dataset.getCollection(), dataset.getName(), configPackages);
 124+ public static Corpus forDataset(DatasetIdentifier dataset, TweakSet tweaks) {
 125+ return forName(dataset.getCollection(), dataset.getName(), tweaks);
128126 }
129127
130128 protected static String[] getConfigPackages(TweakSet tweaks) {
@@ -131,12 +129,8 @@
132130 return pkg.toArray(new String[pkg.size()]);
133131 }
134132
135 - public static Corpus forDataset(DatasetIdentifier dataset, TweakSet tweaks) {
136 - return forDataset(dataset, getConfigPackages(tweaks));
137 - }
138 -
139133 /** guesses the wiki family from a name as used for dump files **/
140 - protected static String guessCorpusFamily(String n) {
 134+ protected static String guessCorpusFamily(String n, TweakSet tweaks) {
141135 if (n.matches(".*commonswiki$")) return "commons";
142136 else if (n.matches(".*meta(wiki)?$")) return "meta";
143137 else if (n.matches(".*wiki$")) return "wikipedia";
@@ -147,25 +141,26 @@
148142 }
149143
150144 /** guesses the wiki language from a name as used for dump files **/
151 - protected static String guessCorpusLanguage(String n) {
 145+ protected static String guessCorpusLanguage(String n, TweakSet tweaks) {
152146 String lang = n.replaceAll("^(.*?)(wiki|wikt).*$", "$1");
153147
154 - if (lang.equals("commons")) return "en";
155 - else if (lang.equals("meta")) return "en";
 148+ if (lang.equals("commons")) return tweaks.getTweak("languages.commonsAsLanguage", false) ? lang : "en";
 149+ else if (lang.equals("meta")) return tweaks.getTweak("languages.metaAsLanguage", false) ? lang : "en";
 150+ else if (lang.equals("simple")) return tweaks.getTweak("languages.simpleAsLanguage", true) ? lang : "en";
156151 else return lang;
157152 }
158153
159154 /** guesses the wiki subdomain from a name as used for dump files **/
160 - protected static String guessCorpusSubdomain(String n) {
 155+ protected static String guessCorpusSubdomain(String n, TweakSet tweaks) {
161156 String sd = n.replaceAll("^(.*?)(wiki|wikt).*$", "$1");
162157
163158 return sd;
164159 }
165160
166161 /** guesses the wiki domain from a name as used for dump files **/
167 - protected static String guessCorpusDomain(String n) {
168 - String sub = guessCorpusSubdomain(n);
169 - String fam = guessCorpusFamily(n);
 162+ protected static String guessCorpusDomain(String n, TweakSet tweaks) {
 163+ String sub = guessCorpusSubdomain(n, tweaks);
 164+ String fam = guessCorpusFamily(n, tweaks);
170165
171166 if (!fam.matches("^(wiki.*|wiktionary)$")) {
172167 sub = fam;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/LocalConceptStoreSchema.java
@@ -107,7 +107,7 @@
108108 aboutTable.addField( new ReferenceField(this, "resource_name", getTextType(255), null, true, KeyType.INDEX, "resource", "name", null ) );
109109 aboutTable.addField( new ReferenceField(this, "concept", "INT", null, false, KeyType.INDEX, "concept", "id", null ) );
110110 aboutTable.addField( new ReferenceField(this, "concept_name", getTextType(255), null, true, KeyType.INDEX, "concept", "name", null ) );
111 - aboutTable.addKey( new DatabaseKey(this, KeyType.PRIMARY, "about", new String[] {"resource", "concept"}) );
 111+ aboutTable.addKey( new DatabaseKey(this, KeyType.PRIMARY, "about", new String[] {"resource", "concept_name"}) );
112112 addTable(aboutTable);
113113
114114 meaningTable = new RelationTable(this, "meaning", getDefaultTableAttributes());
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/GlobalConceptStoreSchema.java
@@ -6,6 +6,7 @@
77 import java.util.ArrayList;
88 import java.util.Arrays;
99 import java.util.List;
 10+import java.util.Map;
1011
1112 import javax.sql.DataSource;
1213
@@ -18,6 +19,7 @@
1920 import de.brightbyte.wikiword.ConceptTypeSet;
2021 import de.brightbyte.wikiword.Corpus;
2122 import de.brightbyte.wikiword.DatasetIdentifier;
 23+import de.brightbyte.wikiword.Languages;
2224 import de.brightbyte.wikiword.TweakSet;
2325
2426 public class GlobalConceptStoreSchema extends WikiWordConceptStoreSchema {
@@ -188,6 +190,16 @@
189191 return cc;
190192 }
191193
 194+ private Map<String, String> languageNames;
 195+
 196+ protected Map<String, String> getLanguageNames() {
 197+ if (this.languageNames==null) {
 198+ this.languageNames = Languages.load(this.tweaks);
 199+ }
 200+
 201+ return this.languageNames;
 202+ }
 203+
192204 public Corpus[] getLanguages() throws SQLException {
193205 if (languages!=null) return languages;
194206
@@ -199,6 +211,10 @@
200212
201213 int i = 0;
202214 for (String l: ll) {
 215+ if (!getLanguageNames().containsKey(l)) {
 216+ throw new SQLException("database inconsistency: encountered bad corpus prefix: "+l+" is not a language name. Hint: check tweaks languages.*AsLanguage");
 217+ }
 218+
203219 cc[i++] = Corpus.forName(getCollectionName(), l, tweaks);
204220 }
205221
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/DatasetIdentifier.java
@@ -30,7 +30,7 @@
3131
3232 @Override
3333 public String toString() {
34 - return getName();
 34+ return getCollection()+":"+getName();
3535 }
3636
3737 @Override
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTestBase.java
@@ -20,8 +20,8 @@
2121 protected PlainTextAnalyzer analyzer;
2222
2323 public PlainTextAnalyzerTestBase(String wikiName) {
24 - corpus = Corpus.forName("TEST", wikiName, (String[])null);
2524 tweaks = new TweakSet();
 25+ corpus = Corpus.forName("TEST", wikiName, tweaks);
2626 }
2727
2828 @Override
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerBenchmark.java
@@ -15,14 +15,14 @@
1616 protected TweakSet tweaks;
1717
1818 public WikiTextAnalyzerBenchmark(String wikiName) throws InstantiationException {
19 - corpus = Corpus.forName("TEST", wikiName, (String[])null);
 19+ tweaks = new TweakSet();
 20+ corpus = Corpus.forName("TEST", wikiName, tweaks);
2021
2122 //site.Base = "http://"+corpus.getDomain()+"/wiki/";
2223 //site.Sitename = corpus.getFamily();
2324
2425 titleCase = true;
2526 namespaces = corpus.getNamespaces();
26 - tweaks = new TweakSet();
2727
2828 analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks);
2929 analyzer.initialize(namespaces, titleCase);
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTestBase.java
@@ -34,8 +34,8 @@
3535 protected TweakSet tweaks;
3636
3737 public WikiTextAnalyzerTestBase(String wikiName) {
38 - corpus = Corpus.forName("TEST", wikiName, (String[])null);
3938 tweaks = new TweakSet();
 39+ corpus = Corpus.forName("TEST", wikiName, tweaks);
4040
4141 //site.Base = "http://"+corpus.getDomain()+"/wiki/";
4242 //site.Sitename = corpus.getFamily();
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTest.java
@@ -203,7 +203,7 @@
204204 WikiPage page = testAnalyzer.makeTestPage("Foo", "#REDIREcT [[bar]][[Category:Orf]]");
205205 WikiTextAnalyzer.WikiLink link = extractRedirectLink(page);
206206
207 - assertEquals("Bar", link.getPage());
 207+ assertEquals("Bar", link.getTarget());
208208 }
209209
210210 public void testIsInterlanguagePrefix() {
@@ -354,9 +354,9 @@
355355 +"end\n";
356356
357357 List<WikiLink> exp = new ArrayList<WikiLink>();
358 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Bla", null, "bla", true, LinkMagic.NONE));
359 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Foxo", null, "foxo", true, LinkMagic.NONE));
360 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Quux", null, "quux", true, LinkMagic.NONE));
 358+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Bla", Namespace.MAIN, "Bla", null, "bla", true, LinkMagic.NONE));
 359+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Foxo", Namespace.MAIN, "Foxo", null, "foxo", true, LinkMagic.NONE));
 360+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Quux", Namespace.MAIN, "Quux", null, "quux", true, LinkMagic.NONE));
361361
362362 WikiPage page = testAnalyzer.makeTestPage("Foo", text);
363363 List<WikiLink> links = extractDisambigLinks(page.getTitle(), page.getCleanedText(true));
@@ -374,11 +374,11 @@
375375 text += "Foo [[bar]]s!\n";
376376 text += "check [[this|that]] out, [[simple thing]]\n";
377377 text += "this [[pipe | pipes]], this [[ blow|blows ]]\n";
378 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Bar", null, "bars", true, LinkMagic.NONE));
379 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "This", null, "that", false, LinkMagic.NONE));
380 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Simple_thing", null, "simple thing", true, LinkMagic.NONE));
381 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Pipe", null, "pipes", false, LinkMagic.NONE));
382 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Blow", null, "blows", false, LinkMagic.NONE));
 378+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Bar", Namespace.MAIN, "Bar", null, "bars", true, LinkMagic.NONE));
 379+ exp.add(new WikiTextAnalyzer.WikiLink(null, "This", Namespace.MAIN, "This", null, "that", false, LinkMagic.NONE));
 380+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Simple_thing", Namespace.MAIN, "Simple_thing", null, "simple thing", true, LinkMagic.NONE));
 381+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Pipe", Namespace.MAIN, "Pipe", null, "pipes", false, LinkMagic.NONE));
 382+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Blow", Namespace.MAIN, "Blow", null, "blows", false, LinkMagic.NONE));
383383 page = testAnalyzer.makeTestPage("Foo", text);
384384 links = extractLinks(page.getTitle(), page.getCleanedText(true));
385385 assertEquals(exp, links);
@@ -389,11 +389,11 @@
390390 text += "[[first]] and [[:last]]\n";
391391 text += "[[give me some space|some space| and time]]\n";
392392 text += "[[odd#|stuff>]]\n";
393 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Inch", null, "\"", false, LinkMagic.NONE));
394 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "First", null, "first", true, LinkMagic.NONE));
395 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Last", null, "last", true, LinkMagic.NONE));
396 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Give_me_some_space", null, "some space| and time", false, LinkMagic.NONE));
397 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Odd", null, "stuff>", false, LinkMagic.NONE));
 393+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Inch", Namespace.MAIN, "Inch", null, "\"", false, LinkMagic.NONE));
 394+ exp.add(new WikiTextAnalyzer.WikiLink(null, "First", Namespace.MAIN, "First", null, "first", true, LinkMagic.NONE));
 395+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Last", Namespace.MAIN, "Last", null, "last", true, LinkMagic.NONE));
 396+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Give_me_some_space", Namespace.MAIN, "Give_me_some_space", null, "some space| and time", false, LinkMagic.NONE));
 397+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Odd", Namespace.MAIN, "Odd", null, "stuff>", false, LinkMagic.NONE));
398398 page = testAnalyzer.makeTestPage("Foo", text);
399399 links = extractLinks(page.getTitle(), page.getCleanedText(true));
400400 assertEquals(exp, links);
@@ -412,9 +412,9 @@
413413 text += "[[this|''works'' {{too}}]]\n";
414414 text += "[[quite'ok']]\n";
415415 text += "[[section# link thing...]]\n";
416 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "This", null, "works {{too}}", false, LinkMagic.NONE));
417 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Quite'ok'", null, "quite'ok'", true, LinkMagic.NONE));
418 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Section", "link_thing...", "section# link thing...", true, LinkMagic.NONE));
 416+ exp.add(new WikiTextAnalyzer.WikiLink(null, "This", Namespace.MAIN, "This", null, "works {{too}}", false, LinkMagic.NONE));
 417+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Quite'ok'", Namespace.MAIN, "Quite'ok'", null, "quite'ok'", true, LinkMagic.NONE));
 418+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Section", Namespace.MAIN, "Section", "link_thing...", "section# link thing...", true, LinkMagic.NONE));
419419 page = testAnalyzer.makeTestPage("Foo", text);
420420 links = extractLinks(page.getTitle(), page.getCleanedText(true));
421421 assertEquals(exp, links);
@@ -426,11 +426,11 @@
427427 text += "[[URL%23Encoding]]\n"; //url-encoded link (yes the # may also be encoded, this does not act as an escape)
428428 text += "[[HTML&amp;entities]]\n"; //html-entities
429429 text += "[[no%special&stuff]]\n"; //no special stuff
430 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Section", "\u00c4.C.ASX.Y&.4", "section#.C3.84.C.ASX.Y.0B.26.05.4", true, LinkMagic.NONE));
431 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "\u00c4%C%ASX%Y&%4", null, "\u00c4%C%ASX%Y&%4", true, LinkMagic.NONE));
432 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "URL", "Encoding", "URL#Encoding", true, LinkMagic.NONE));
433 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "HTML&entities", null, "HTML&entities", true, LinkMagic.NONE));
434 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "No%special&stuff", null, "no%special&stuff", true, LinkMagic.NONE));
 430+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Section", Namespace.MAIN, "Section", "\u00c4.C.ASX.Y&.4", "section#.C3.84.C.ASX.Y.0B.26.05.4", true, LinkMagic.NONE));
 431+ exp.add(new WikiTextAnalyzer.WikiLink(null, "\u00c4%C%ASX%Y&%4", Namespace.MAIN, "\u00c4%C%ASX%Y&%4", null, "\u00c4%C%ASX%Y&%4", true, LinkMagic.NONE));
 432+ exp.add(new WikiTextAnalyzer.WikiLink(null, "URL", Namespace.MAIN, "URL", "Encoding", "URL#Encoding", true, LinkMagic.NONE));
 433+ exp.add(new WikiTextAnalyzer.WikiLink(null, "HTML&entities", Namespace.MAIN, "HTML&entities", null, "HTML&entities", true, LinkMagic.NONE));
 434+ exp.add(new WikiTextAnalyzer.WikiLink(null, "No%special&stuff", Namespace.MAIN, "No%special&stuff", null, "no%special&stuff", true, LinkMagic.NONE));
435435 page = testAnalyzer.makeTestPage("Foo", text);
436436 links = extractLinks(page.getTitle(), page.getCleanedText(true));
437437 assertEquals(exp, links);
@@ -438,10 +438,10 @@
439439 exp = new ArrayList<WikiLink>();
440440 text = "";
441441 text += "\nimage: [[Image:test.jpg]], [[Image:test.jpg|thumb]], [[Image:test.jpg|the [[test]] image]], [[Image:test.jpg|the {{test}} image]];"; //NOTE: stripped as clutter
442 - text += "namespace: [[User:foo]], [[User talk:foo|talk]], [[:User:foo]]bar;\n";
443 - exp.add(new WikiLink(null, Namespace.USER, "Foo", null, "User:foo", true, LinkMagic.NONE));
444 - exp.add(new WikiLink(null, Namespace.USER_TALK, "Foo", null, "talk", false, LinkMagic.NONE));
445 - exp.add(new WikiLink(null, Namespace.USER, "Foo", null, "User:foobar", true, LinkMagic.NONE));
 442+ text += "namespace: [[User:foo]], [[user talk :foo|talk]], [[:User:foo]]bar;\n";
 443+ exp.add(new WikiLink(null, "User:Foo", Namespace.USER, "Foo", null, "User:foo", true, LinkMagic.NONE));
 444+ exp.add(new WikiLink(null, "User_talk:Foo", Namespace.USER_TALK, "Foo", null, "talk", false, LinkMagic.NONE));
 445+ exp.add(new WikiLink(null, "User:Foo", Namespace.USER, "Foo", null, "User:foobar", true, LinkMagic.NONE));
446446 page = testAnalyzer.makeTestPage("Foo", text);
447447 links = extractLinks(page.getTitle(), page.getCleanedText(true));
448448 assertEquals(exp, links);
@@ -452,20 +452,20 @@
453453 text += "[[Category:Foo]]\n"; //category
454454 text += "[[:Category:Foo|Bar]]\n"; //category link
455455 text += "[[Category:Foo|Bar]]\n"; //category sortkey
456 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "", false, LinkMagic.CATEGORY));
457 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "Foo", true, LinkMagic.CATEGORY));
458 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.NONE));
459 - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.CATEGORY));
 456+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "", false, LinkMagic.CATEGORY));
 457+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "Foo", true, LinkMagic.CATEGORY));
 458+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.NONE));
 459+ exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.CATEGORY));
460460 page = testAnalyzer.makeTestPage("Foo", text);
461461 links = extractLinks(page.getTitle(), page.getCleanedText(true));
462462 assertEquals(exp, links);
463463
464464 exp = new ArrayList<WikiLink>();
465 - text = "category: [[Category: z]], [[Category: z|zz]], [[:Category: z]], [[:Category: z|z]];\n";
466 - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "Foo", true, LinkMagic.CATEGORY));
467 - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "zz", false, LinkMagic.CATEGORY));
468 - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "Category: z", true, LinkMagic.NONE));
469 - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "z", false, LinkMagic.NONE));
 465+ text = "category: [[Category: Z]], [[category: z|zz]], [[:Category: z]], [[:Category: z|z]];\n";
 466+ exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "Foo", true, LinkMagic.CATEGORY));
 467+ exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "zz", false, LinkMagic.CATEGORY));
 468+ exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "Category: z", true, LinkMagic.NONE));
 469+ exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "z", false, LinkMagic.NONE));
470470 page = testAnalyzer.makeTestPage("Foo", text);
471471 links = extractLinks(page.getTitle(), page.getCleanedText(true));
472472 assertEquals(exp, links);
@@ -476,35 +476,35 @@
477477 text += "[[xyz:zeug|zeug]]\n"; //interwiki
478478 text += "[[de:Zeug]]\n"; //interlanguage
479479 text += "[[:de:Zeug]]\n"; //interwiki
480 - exp.add(new WikiTextAnalyzer.WikiLink("xyz", Namespace.MAIN, "Zeug", null, "zeug", false, LinkMagic.NONE));
481 - exp.add(new WikiTextAnalyzer.WikiLink("de", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.LANGUAGE));
482 - exp.add(new WikiTextAnalyzer.WikiLink("de", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.NONE));
 480+ exp.add(new WikiTextAnalyzer.WikiLink("xyz", "Zeug", Namespace.MAIN, "Zeug", null, "zeug", false, LinkMagic.NONE));
 481+ exp.add(new WikiTextAnalyzer.WikiLink("de", "Zeug", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.LANGUAGE));
 482+ exp.add(new WikiTextAnalyzer.WikiLink("de", "Zeug", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.NONE));
483483 page = testAnalyzer.makeTestPage("Foo", text);
484484 links = extractLinks(page.getTitle(), page.getCleanedText(true));
485485 assertEquals(exp, links);
486486
487487 exp = new ArrayList<WikiLink>();
488488 text = "language: [[nl: z]], [[zh-yue: z|z]], [[:de: z|z]];\n";
489 - exp.add(new WikiLink("nl", Namespace.MAIN, "Z", null, "nl: z", true, LinkMagic.LANGUAGE));
490 - exp.add(new WikiLink("zh-yue", Namespace.MAIN, "Z", null, "z", false, LinkMagic.LANGUAGE));
491 - exp.add(new WikiLink("de", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE));
 489+ exp.add(new WikiLink("nl", "Z", Namespace.MAIN, "Z", null, "nl: z", true, LinkMagic.LANGUAGE));
 490+ exp.add(new WikiLink("zh-yue", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.LANGUAGE));
 491+ exp.add(new WikiLink("de", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE));
492492 page = testAnalyzer.makeTestPage("Foo", text);
493493 links = extractLinks(page.getTitle(), page.getCleanedText(true));
494494 assertEquals(exp, links);
495495
496496 exp = new ArrayList<WikiLink>();
497497 text = "interwiki: [[ixy: z]], [[ixy: z|z]], [[:ixy: z|z]];\n";
498 - exp.add(new WikiLink("ixy", Namespace.MAIN, "Z", null, "ixy: z", true, LinkMagic.NONE));
499 - exp.add(new WikiLink("ixy", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE));
500 - exp.add(new WikiLink("ixy", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE));
 498+ exp.add(new WikiLink("ixy", "Z", Namespace.MAIN, "Z", null, "ixy: z", true, LinkMagic.NONE));
 499+ exp.add(new WikiLink("ixy", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE));
 500+ exp.add(new WikiLink("ixy", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE));
501501 page = testAnalyzer.makeTestPage("Foo", text);
502502 links = extractLinks(page.getTitle(), page.getCleanedText(true));
503503 assertEquals(exp, links);
504504
505505 exp = new ArrayList<WikiLink>();
506506 text = "prefix: [[x y: z]], [[x y: z|z ]];\n";
507 - exp.add(new WikiLink(null, Namespace.MAIN, "X_y:_z", null, "x y: z", true, LinkMagic.NONE));
508 - exp.add(new WikiLink(null, Namespace.MAIN, "X_y:_z", null, "z", false, LinkMagic.NONE));
 507+ exp.add(new WikiLink(null, "X_y:_z", Namespace.MAIN, "X_y:_z", null, "x y: z", true, LinkMagic.NONE));
 508+ exp.add(new WikiLink(null, "X_y:_z", Namespace.MAIN, "X_y:_z", null, "z", false, LinkMagic.NONE));
509509 page = testAnalyzer.makeTestPage("Foo", text);
510510 links = extractLinks(page.getTitle(), page.getCleanedText(true));
511511 assertEquals(exp, links);
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/builder/PropertyDump.java
@@ -1,101 +0,0 @@
2 -package de.brightbyte.wikiword.builder;
3 -import java.io.File;
4 -import java.io.IOException;
5 -import java.net.URL;
6 -import java.net.URLEncoder;
7 -import java.util.Map;
8 -import java.util.Set;
9 -
10 -import de.brightbyte.data.MultiMap;
11 -import de.brightbyte.io.IOUtil;
12 -import de.brightbyte.wikiword.Corpus;
13 -import de.brightbyte.wikiword.Namespace;
14 -import de.brightbyte.wikiword.NamespaceSet;
15 -import de.brightbyte.wikiword.TweakSet;
16 -import de.brightbyte.wikiword.analyzer.WikiPage;
17 -import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
18 -
19 -
20 -public class PropertyDump {
21 - public static void main(String[] args) throws InstantiationException, IOException {
22 - String lang = args[0];
23 - String n = args[1];
24 -
25 - Corpus corpus = Corpus.forName("TEST", lang, new String[] {"de.brightbyte.wikiword.wikipro", "de.brightbyte.wikiword.wikipro.wikis"});
26 -
27 - URL u;
28 -
29 - if ( args.length>2 ) {
30 - u = new File(args[2]).toURI().toURL();
31 - }
32 - else {
33 - u = new URL("http://"+lang+".wikipedia.org/w/index.php?action=raw&title=" + URLEncoder.encode(n, "UTF-8"));
34 - }
35 -
36 - String p = n;
37 -
38 - TweakSet tweaks = new TweakSet();
39 - WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks);
40 -
41 - System.out.println("loading "+u+"...");
42 - String text = IOUtil.slurp(u, "UTF-8");
43 - System.out.println("loaded.");
44 -
45 - NamespaceSet namespaces = Namespace.getNamespaces(null);
46 - analyzer.initialize(namespaces, true);
47 -
48 - WikiTextAnalyzer.WikiLink t = analyzer.makeLink(p, p, null, null);
49 -
50 - WikiPage page = analyzer.makePage(t.getNamespace(), t.getTarget().toString(), text, true);
51 -
52 - System.out.println("Resource: " + page.getResourceName());
53 - System.out.println("Concept: " + page.getConceptName());
54 -
55 - System.out.println("ResourceType: " + page.getResourceType());
56 - System.out.println("ConceptType: " + page.getConceptType());
57 -
58 - System.out.println("Definition: " + page.getFirstSentence());
59 -
60 - System.out.println("Properties:");
61 - MultiMap<String, CharSequence, Set<CharSequence>> properties = page.getProperties();
62 - for (Map.Entry<String, Set<CharSequence>> e : properties.entrySet()) {
63 - System.out.print("\t");
64 - System.out.print(e.getKey());
65 - System.out.print(": ");
66 -
67 - boolean first = true;
68 - for (CharSequence v : e.getValue()) {
69 - if (first) first = false;
70 - else System.out.print(", ");
71 -
72 - System.out.print(v);
73 - }
74 - System.out.println();
75 - }
76 -
77 - System.out.println("Supplements:");
78 - Set<CharSequence> supplements = page.getSupplementLinks();
79 - for (CharSequence s : supplements) {
80 - System.out.println("\t"+s);
81 - }
82 -
83 - CharSequence supplementedConcept = page.getSupplementedConcept();
84 - if (supplementedConcept!=null) {
85 - System.out.println("Supplemented: ");
86 - System.out.println("\t"+supplementedConcept);
87 - }
88 -
89 - System.out.println("TitleTerms:");
90 - Set<CharSequence> titleTerms = page.getTitleTerms();
91 - for (CharSequence s : titleTerms) {
92 - System.out.println("\t"+s);
93 - }
94 -
95 - System.out.println("PageTerms:");
96 - Set<CharSequence> titlePage = page.getPageTerms();
97 - for (CharSequence s : titlePage) {
98 - System.out.println("\t"+s);
99 - }
100 - }
101 -
102 -}
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/wikis/WikiTextAnalyzer_yywiki_Test.java
@@ -29,11 +29,11 @@
3030 assertTestCase(page, "conceptType", ConceptType.OTHER);
3131
3232 List<WikiTextAnalyzer.WikiLink> links = new ArrayList<WikiTextAnalyzer.WikiLink>();
33 - links.add(analyzer.newLink(null, Namespace.MAIN, "Yar", null, "Yar", true, LinkMagic.NONE));
34 - links.add(analyzer.newLink(null, Namespace.CATEGORY, "Yoo", null, "*", false, LinkMagic.CATEGORY));
35 - links.add(analyzer.newLink(null, Namespace.CATEGORY, "Yofos", null, "Yoo", true, LinkMagic.CATEGORY));
36 - links.add(analyzer.newLink("xx", Namespace.MAIN, "Xo", null, "xx:Xo", true, LinkMagic.LANGUAGE));
37 - links.add(analyzer.newLink("zz", Namespace.MAIN, "Zoo", null, "zz:Zoo", true, LinkMagic.LANGUAGE));
 33+ links.add(analyzer.newLink(null, "Yar", Namespace.MAIN, "Yar", null, "Yar", true, LinkMagic.NONE));
 34+ links.add(analyzer.newLink(null, "Category:Yoo", Namespace.CATEGORY, "Yoo", null, "*", false, LinkMagic.CATEGORY));
 35+ links.add(analyzer.newLink(null, "Category:Yofos", Namespace.CATEGORY, "Yofos", null, "Yoo", true, LinkMagic.CATEGORY));
 36+ links.add(analyzer.newLink("xx", "Xo", Namespace.MAIN, "Xo", null, "xx:Xo", true, LinkMagic.LANGUAGE));
 37+ links.add(analyzer.newLink("zz", "Zoo", Namespace.MAIN, "Zoo", null, "zz:Zoo", true, LinkMagic.LANGUAGE));
3838 assertTestCase(page, "links", links);
3939
4040 Set<String> categories = new HashSet<String>();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java
@@ -212,7 +212,7 @@
213213 if (link.getInterwiki()!=null) continue;
214214
215215 storeReference(rcId, link.getText().toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK);
216 - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getPage().toString());
 216+ if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString());
217217 }
218218 }
219219 }
@@ -222,11 +222,11 @@
223223 WikiTextAnalyzer.LinkMagic m = link.getMagic();
224224
225225 if (m==WikiTextAnalyzer.LinkMagic.NONE) {
226 - if (link.getNamespace()!=Namespace.MAIN) continue;
 226+ if (link.getNamespace()!=Namespace.MAIN && link.getNamespace()!=Namespace.CATEGORY) continue;
227227 if (link.getInterwiki()!=null) continue;
228228
229229 storeLink(rcId, conceptId, conceptName, link.getText().toString(), link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK);
230 - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getPage().toString());
 230+ if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString());
231231 }
232232 }
233233 }
@@ -254,7 +254,7 @@
255255 String rcName = analyzerPage.getResourceName();
256256 String text = analyzerPage.getText().toString();
257257 //int namespace = analyzerPage.getNamespace();
258 - //String title = analyzerPage.getTitle().toString();
 258+ String title = analyzerPage.getTitle().toString();
259259
260260 //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update.
261261 int rcId = storeResource(rcName, rcType, timestamp);
@@ -280,7 +280,7 @@
281281
282282 if (m==WikiTextAnalyzer.LinkMagic.CATEGORY) {
283283 //FIXME: store this also as a reference to the categorie's concept under it's original title!
284 - storeConceptBroader(rcId, name, link.getPage().toString(), ExtractionRule.BROADER_FROM_CAT);
 284+ storeConceptBroader(rcId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT);
285285 }
286286
287287 if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) {
@@ -367,16 +367,15 @@
368368 if ( sortKey!=null && analyzer.isMainArticleMarker(sortKey) ) {
369369 if (analyzer.useCategoryAliases()) {
370370 //XXX: if there's more than one "main article", this breaks.
371 - String cat = link.getPage().toString();
372371
373 - if (!cat.equals(name) && analyzer.mayBeFormOf(link.getLenientPage(), analyzerPage.getTitleBaseName())) {
374 - Set<CharSequence> terms = analyzer.determineTitleTerms(link.getPage());
 372+ if (analyzer.mayBeFormOf(link.getLenientPage(), analyzerPage.getTitleBaseName())) {
 373+ Set<CharSequence> terms = analyzer.determineTitleTerms(link.getTitle());
375374 storePageTerms(rcId, terms, conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME);
376375
377376 //NOTE: the alias is preliminary: if a article with the name of the category
378377 // exists, the alias will be ignored. See DatabaseLocalConceptBuilder.finishBadLinks
379378
380 - storeConceptAlias(rcId, -1, cat, conceptId, name, AliasScope.CATEGORY);
 379+ storeConceptAlias(rcId, -1, link.getTarget().toString(), conceptId, name, AliasScope.CATEGORY);
381380 categorize = false;
382381 }
383382 }
@@ -390,13 +389,11 @@
391390 storeReference(rcId, sortKey, conceptId, name, ExtractionRule.TERM_FROM_SORTKEY); //sort key is a name for this page
392391 }
393392
394 - if ( !link.getPage().toString().equals(name) ) { //NOTE: need the toString, CharSequences doen't "equal" strings :(
395 - storeConceptBroader(rcId, conceptId, name, link.getPage().toString(), ExtractionRule.BROADER_FROM_CAT);
396 - }
 393+ storeConceptBroader(rcId, conceptId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT);
397394 }
398395 }
399396 else if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) {
400 - storeLanguageLink(rcId, conceptId, name, link.getInterwiki().toString(), link.getPage().toString()); //XXX: consider target? consider both??
 397+ storeLanguageLink(rcId, conceptId, name, link.getInterwiki().toString(), link.getTarget().toString()); //XXX: consider target? consider both??
401398 }
402399 }
403400
@@ -421,7 +418,7 @@
422419 if (link.getInterwiki()!=null) continue;
423420
424421 for (CharSequence term : terms) {
425 - storeReference(rcId, term.toString(), -1, link.getPage().toString(), ExtractionRule.TERM_FROM_DISAMBIG);
 422+ storeReference(rcId, term.toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_DISAMBIG);
426423 }
427424 }
428425 }
@@ -469,12 +466,12 @@
470467 }
471468 else if (link.getNamespace()!=analyzerPage.getNamespace()) {
472469 if (link.getNamespace()==Namespace.CATEGORY && analyzerPage.getNamespace()==Namespace.MAIN) {
473 - if ( StringUtils.equals(link.getPage(), analyzerPage.getTitle()) ) {
 470+ if ( StringUtils.equals(link.getTarget(), rcName) ) {
474471 out.debug("ignored redundant category redirect "+rcName+" -> "+link);
475472 } else {
476473 out.debug("processing redirect to category "+rcName+" -> "+link);
477 - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getPage().toString(), ExtractionRule.TERM_FROM_REDIRECT );
478 - String tgtConcept = link.getPage().toString();
 474+ storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT );
 475+ String tgtConcept = link.getTarget().toString();
479476
480477 if (!name.equals(tgtConcept)) {
481478 conceptId = store.storeAbout(rcId, rcName, name);
@@ -487,13 +484,13 @@
488485 warn(rcId, "bad redirect (inter-namespace)", rcName+" -> "+link, null);
489486 }
490487 }
491 - else if (name.equals(link.getPage().toString())) {
 488+ else if (rcName.equals(link.getTarget().toString())) {
492489 warn(rcId, "bad redirect (self-link)", "page "+name, null);
493490 }
494491 else {
495492 conceptId = store.storeAbout(rcId, rcName, name);
496 - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getPage().toString(), ExtractionRule.TERM_FROM_REDIRECT );
497 - storeConceptAlias(rcId, conceptId, name, -1, link.getPage().toString(), AliasScope.REDIRECT);
 493+ storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT );
 494+ storeConceptAlias(rcId, conceptId, name, -1, link.getTarget().toString(), AliasScope.REDIRECT);
498495
499496 //FIXME: redir to section!
500497 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/BuildThesaurus.java
@@ -66,7 +66,25 @@
6767 @Override
6868 protected void run() throws Exception {
6969 section("-- importConcepts --------------------------------------------------");
 70+ if (languages==null) {
 71+ String lang = args.getStringOption("languages", null);
 72+ if (lang!=null) {
 73+ String[] ll = lang.split("[,;/|\\s+]+");
 74+ languages = new Corpus[ll.length];
 75+
 76+ int i = 0;
 77+ for (String l: ll) {
 78+ languages[i++] = Corpus.forName(getConfiguredCollectionName(), l, tweaks);
 79+ }
 80+ }
 81+
 82+ if (languages==null) {
 83+ languages = ((GlobalConceptStoreBuilder)this.conceptStore).detectLanguages();
 84+ }
 85+ }
 86+
7087 info("Using languages: "+Arrays.toString(languages));
 88+ ((GlobalConceptStoreBuilder)this.conceptStore).setLanguages(languages);
7189
7290 if (agenda.beginTask("BuildThesaurus.run", "importConcepts")) {
7391 ((GlobalConceptStoreBuilder)this.conceptStore).importConcepts();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java
@@ -69,9 +69,9 @@
7070 stripClutterManglers.add( new RegularExpressionMangler("^"+templatePatternString("wrapper", 0, true), "{|", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE));
7171 stripClutterManglers.add( new RegularExpressionMangler("^"+templatePatternString("end|col-end", 0, true), "|}", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE));
7272
73 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons(-inline|[ _]left|show\\d)?", 1, true), "[[commons:$1]]"));
74 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _+]?cat(-inline|[ _]left|show\\d)?", 1, true), "[[commons:Category:$1]]"));
75 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("wikimedia", 1, true), "[[commons:$1]]")); //FIXME: named params: commons=
 73+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons(?:-inline|[ _]left|show\\d)?", 1, true), "[[commons:$2]]"));
 74+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _+]?cat(?:-inline|[ _]left|show\\d)?", 1, true), "[[commons:Category:$2]]"));
 75+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("wikimedia", 1, true), "[[commons:$2]]")); //FIXME: named params: commons=
7676 //FIXME: Commonscat-N, Commons_cat_multi...
7777 stripClutterManglers.add( new RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE));
7878
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java
@@ -32,8 +32,8 @@
3333 ")\\s*\\|(?:\\s*rtl\\s*\\|)?.*?\\|\\s*(.*?)\\s*\\}\\}", "$1", Pattern.DOTALL | Pattern.CASE_INSENSITIVE));
3434 */
3535
36 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$1]]"));
37 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$1]]"));
 36+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]"));
 37+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$2]]"));
3838 stripClutterManglers.add( new RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE));
3939
4040 stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Okina", 0, false), "\u02BB"));
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_eswiki.java
@@ -25,8 +25,8 @@
2626 stripClutterManglers.add( new RegularExpressionMangler( templatePattern("C", 0, true), "\u00a9"));
2727 stripClutterManglers.add( new RegularExpressionMangler( templatePattern("E", 1, true), "\u00d710^$2"));
2828
29 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$1]]"));
30 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$1]]"));
 29+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]"));
 30+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$2]]"));
3131
3232 //reduce to third param
3333 stripClutterManglers.add( new RegularExpressionMangler(
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_frwiki.java
@@ -6,7 +6,6 @@
77 import de.brightbyte.wikiword.ResourceType;
88 import de.brightbyte.wikiword.analyzer.WikiConfiguration;
99 import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
10 -import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher;
1110 import de.brightbyte.wikiword.analyzer.matcher.NameMatcher;
1211 import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher;
1312 import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor;
@@ -18,9 +17,9 @@
1918
2019 public WikiConfiguration_frwiki() {
2120 super();
22 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$1]]"));
23 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _]?cat?", 1, true), "[[commons:Category:$1]]"));
24 - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Autres[ _]projets", 1, true), "[[commons:$1]]")); //FIXME: named params: commons=
 21+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]"));
 22+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _]?cat?", 1, true), "[[commons:Category:$2]]"));
 23+ stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Autres[ _]projets", 1, true), "[[commons:$2]]")); //FIXME: named params: commons=
2524
2625 stripClutterManglers.add( new RegularExpressionMangler(templatePattern("er?|\u00e8?re|(?:mini[ _])?[IVXCM]+(?:e|re|er)?|\\d+r?er?|Mlle|Mme|elle", 0, true), "$1"));
2726 stripClutterManglers.add( new RegularExpressionMangler(templatePattern("romain|rom|rom-min|rom-maj|APIb|IPA", 1, true), "$2"));
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseGlobalConceptStoreBuilder.java
@@ -135,7 +135,8 @@
136136 //-------------------------------
137137 public Corpus[] detectLanguages() throws PersistenceException {
138138 try {
139 - return ((GlobalConceptStoreSchema)database).getLanguages();
 139+ Corpus[] languages = ((GlobalConceptStoreSchema)database).getLanguages();
 140+ return languages;
140141 } catch (SQLException e) {
141142 throw new PersistenceException(e);
142143 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseWikiWordConceptStoreBuilder.java
@@ -401,7 +401,7 @@
402402 String sql = "insert ignore into "+relationTable.getSQLName()+" (concept1, concept2, bilink)" +
403403 " select A.anchor, A.target, 1 from "+linkTable.getSQLName()+" as A " +
404404 " join "+linkTable.getSQLName()+" as B " +
405 - " force index (anchor_target) " + //NOTE: avoid table scan!
 405+ " force index (target_anchor) " + //NOTE: avoid table scan!
406406 " on A.anchor = B.target AND B.anchor = A.target ";
407407 String suffix = " on duplicate key update bilink = bilink + values(bilink)";
408408
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java
@@ -369,7 +369,8 @@
370370 this.maxWordFormDistance = 1.0/3.0;
371371
372372 this.badTitlePattern = Pattern.compile("^$|''|[|{}<>\\]\\[]|^\\w+://");
373 - this.badLinkPattern = Pattern.compile("^[^\\d]+:[^ _]|^\\.\\.?$");
 373+ //this.badLinkPattern = Pattern.compile("^[^\\d]+:[^ _]|^\\.\\.?$"); //disallow namespace/interlang
 374+ this.badLinkPattern = Pattern.compile("^\\.\\.?$");
374375 this.titleSuffixPattern = Pattern.compile("^(.*)[ _]\\((.*?)\\)$");
375376 this.titlePrefixPattern = Pattern.compile("^(.*?)#(.+)$");
376377 this.disambigStripSectionPattern = null;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java
@@ -263,7 +263,7 @@
264264 private CharSequence lenientSection;
265265 private CharSequence lenientText;
266266
267 - public WikiLink(CharSequence interwiki, int namespace, CharSequence page, CharSequence section, CharSequence text, boolean impliedText, LinkMagic magic) {
 267+ public WikiLink(CharSequence interwiki, CharSequence target, int namespace, CharSequence page, CharSequence section, CharSequence text, boolean impliedText, LinkMagic magic) {
268268 super();
269269 this.magic = magic;
270270 this.interwiki = interwiki;
@@ -272,8 +272,7 @@
273273 this.section = section;
274274 this.text = text;
275275 this.impliedText = impliedText;
276 - this.target = page;
277 - if (section!=null && section.length()>0) this.target = this.target + "#" + section;
 276+ this.target = target;
278277 }
279278
280279 public CharSequence getInterwiki() {
@@ -284,15 +283,25 @@
285284 return namespace;
286285 }
287286
288 - public CharSequence getPage() {
 287+ public CharSequence getTitle() {
289288 return page;
290289 }
291290
292 - @Deprecated
293291 public CharSequence getTarget() {
294292 return target;
295293 }
296294
 295+
 296+ public CharSequence getTargetPage() {
 297+ CharSequence t = getTarget();
 298+ if (section==null) return t;
 299+
 300+ int idx = StringUtils.indexOf('#', t);
 301+ if (idx<0) return t;
 302+
 303+ return t.subSequence(idx+1, t.length());
 304+ }
 305+
297306 public CharSequence getSection() {
298307 return section;
299308 }
@@ -403,7 +412,6 @@
404413 return false;
405414 return true;
406415 }
407 -
408416
409417 }
410418
@@ -690,7 +698,7 @@
691699
692700 for (WikiLink link : links) {
693701 if (link.getMagic() == LinkMagic.CATEGORY) {
694 - c.add(link.getPage().toString());
 702+ c.add(link.getTitle().toString());
695703 }
696704 }
697705 categories = Collections.unmodifiableSet( c );
@@ -1376,46 +1384,55 @@
13771385 LinkMagic magic = LinkMagic.NONE;
13781386 CharSequence interwiki = null;
13791387 int namespace = Namespace.MAIN;
1380 - CharSequence page = target;
13811388 CharSequence section = null;
13821389 boolean esc = false;
13831390
1384 - while (page.length()>0 && page.charAt(0)==':') {
1385 - page = page.subSequence(1, page.length());
 1391+ while (target.length()>0 && target.charAt(0)==':') {
 1392+ target = target.subSequence(1, target.length());
13861393 esc = true;
13871394 }
13881395
1389 - if (page.length()==0) return null;
 1396+ if (target.length()==0) return null;
13901397
 1398+ CharSequence title = target;
 1399+
13911400 //handle section links ------------------------
1392 - int idx = StringUtils.indexOf('#', page);
1393 - if (idx==page.length()-1) {
1394 - page = page.subSequence(0, page.length()-1);
 1401+ int idx = StringUtils.indexOf('#', title);
 1402+ if (idx==title.length()-1) {
 1403+ title = title.subSequence(0, title.length()-1);
 1404+ target = title;
13951405 section = null;
13961406 }
13971407 else if (idx==0) {
1398 - section = page.subSequence(1, page.length());
1399 - page = context;
 1408+ section = title.subSequence(1, title.length());
 1409+ title = context;
 1410+ target = null; //restored later
14001411 }
14011412 else if (idx>0) {
1402 - section = page.subSequence(idx+1, page.length());
1403 - page = target.subSequence(0, idx);
 1413+ section = title.subSequence(idx+1, title.length());
 1414+ title = target.subSequence(0, idx);
14041415 }
14051416
 1417+ //TODO: subpages starting with "/"...
 1418+
14061419 if (section!=null) { //handle special encoded chars in section ref
14071420 section = decodeSectionName(AnalyzerUtils.trim(section));
14081421 section = AnalyzerUtils.replaceSpaceByUnderscore(section);
 1422+ if (target==null) target = context + "#" + section;
14091423 }
14101424
14111425 //handle qualifiers ------------------------
1412 - idx = StringUtils.indexOf(':', page);
 1426+ boolean setTargetToTitle = false;
 1427+ idx = StringUtils.indexOf(':', title);
14131428 if (idx>=0) {
1414 - CharSequence pre = AnalyzerUtils.trim(page.subSequence(0, idx));
 1429+ CharSequence pre = AnalyzerUtils.trim(title.subSequence(0, idx));
14151430 pre = normalizeTitle(pre);
14161431 int ns = getNamespaceId(pre);
14171432 if (ns!=Namespace.NONE) {
14181433 namespace = ns;
1419 - page = page.subSequence(idx+1, page.length());
 1434+ title = title.subSequence(idx+1, title.length());
 1435+ target = target.subSequence(idx+1, target.length());
 1436+ target = getNamespaceName(ns) + ":" + normalizeTitle(target);
14201437
14211438 if (!esc) {
14221439 if (ns==Namespace.IMAGE) magic = LinkMagic.IMAGE;
@@ -1423,9 +1440,19 @@
14241441 }
14251442 }
14261443 else if (isInterwikiPrefix(pre)) {
1427 - page = page.subSequence(idx+1, page.length());
 1444+ if (target==title) setTargetToTitle = true;
 1445+ title = title.subSequence(idx+1, title.length());
 1446+
 1447+ if (!setTargetToTitle) {
 1448+ idx = StringUtils.indexOf(':', target);
 1449+ target = target.subSequence(idx+1, target.length());
 1450+ target = normalizeTitle(target);
 1451+ }
 1452+
 1453+ //FIXME: normalize target title *namespace*, so it can be joined against the about table!
 1454+
14281455 interwiki = AnalyzerUtils.toLowerCase(pre);
1429 -
 1456+
14301457 if (isInterlanguagePrefix(pre) && !esc) {
14311458 magic = LinkMagic.LANGUAGE;
14321459 }
@@ -1449,14 +1476,17 @@
14501477 }
14511478 }
14521479
1453 - if (tail!=null && magic != LinkMagic.CATEGORY) text = text.toString() + tail;
 1480+ if (tail!=null && magic == LinkMagic.NONE) text = text.toString() + tail;
14541481 if (!implied) text = stripMarkup(text); //XXX: this can get pretty expensive...
14551482 text = HtmlEntities.decodeEntities(text);
14561483
1457 - if (page.length()==0) return null;
 1484+ if (title.length()==0) return null;
14581485
1459 - page = normalizeTitle(page);
1460 - return new WikiLink(interwiki, namespace, page, section, text, implied, magic);
 1486+ title = normalizeTitle(title);
 1487+ if (setTargetToTitle)
 1488+ target = title;
 1489+
 1490+ return new WikiLink(interwiki, title, namespace, title, section, text, implied, magic);
14611491 }
14621492
14631493 public boolean isInterlanguagePrefix(CharSequence pre) {
@@ -1485,6 +1515,12 @@
14861516 return namespaces.getNumber(name.toString());
14871517 }
14881518
 1519+ public String getNamespaceName(int id) {
 1520+ if (id==0) return "";
 1521+
 1522+ return namespaces.getNamespace(id).getLocalName();
 1523+ }
 1524+
14891525 public CharSequence normalizeTitle(CharSequence title) {
14901526 return normalizeTitle(title, true);
14911527 }
@@ -1815,9 +1851,13 @@
18161852
18171853 linkMatcher.reset(text);
18181854 while (linkMatcher.find()) {
1819 - WikiLink link = makeLink(title, linkMatcher.group(1), linkMatcher.group(3), linkMatcher.group(4));
 1855+ String target = linkMatcher.group(1);
 1856+ String label = linkMatcher.group(3);
 1857+ String trail = linkMatcher.group(4);
 1858+
 1859+ WikiLink link = makeLink(title, target, label, trail);
18201860 if (link==null) continue;
1821 - if (isBadLinkTarget(link.getPage())) continue;
 1861+ if (isBadLinkTarget(link.getTarget())) continue;
18221862
18231863 links.add(link);
18241864 }
@@ -1873,8 +1913,8 @@
18741914 return d <= config.maxWordFormDistance;
18751915 }
18761916
1877 - public WikiLink newLink(String interwiki, int namespace, String page, String section, String text, boolean impliedText, LinkMagic magic) {
1878 - return new WikiLink(interwiki, namespace, page, section, text, impliedText, magic);
 1917+ public WikiLink newLink(String interwiki, String target, int namespace, String title, String section, String text, boolean impliedText, LinkMagic magic) {
 1918+ return new WikiLink(interwiki, target, namespace, title, section, text, impliedText, magic);
18791919 }
18801920
18811921 public static WikiTextAnalyzer getWikiTextAnalyzer(Corpus corpus, TweakSet tweaks) throws InstantiationException {
Index: trunk/WikiWord/WikiWordBuilder/tweaks.properties.sample
@@ -6,6 +6,9 @@
77 # treat "commons" as a language code
88 languages.commonsAsLanguage = false
99
 10+# treat "meta" as a language code
 11+languages.metaAsLanguage = false
 12+
1013 # treat "simple" as a language code
1114 languages.simpleAsLanguage = true
1215

Status & tagging log