Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Languages.java |
— | — | @@ -45,6 +45,12 @@ |
46 | 46 | ln.remove("simple"); |
47 | 47 | } |
48 | 48 | |
| 49 | + if (tweaks.getTweak("languages.metaAsLanguage", true)) { |
| 50 | + ln.put("meta", "Meta-Wiki"); |
| 51 | + } else { |
| 52 | + ln.remove("meta"); |
| 53 | + } |
| 54 | + |
49 | 55 | return Collections.unmodifiableMap(ln); |
50 | 56 | } |
51 | 57 | catch (IOException ex) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Corpus.java |
— | — | @@ -85,14 +85,10 @@ |
86 | 86 | * as supplied at http://download.wikimedia.org, this method should return a Corpus instance |
87 | 87 | * appropriate for the wiki the dump was generated for. |
88 | 88 | **/ |
89 | | - public static Corpus forFile(String collection, File f, String[] configPackages) { |
90 | | - return forName(collection, guessCorpusName(f), configPackages); |
| 89 | + public static Corpus forFile(String collection, File f, TweakSet tweaks) { |
| 90 | + return forName(collection, guessCorpusName(f), tweaks); |
91 | 91 | } |
92 | 92 | |
93 | | - public static Corpus forName(String collection, String name, TweakSet tweaks) { |
94 | | - return forName(collection, name, getConfigPackages(tweaks)); |
95 | | - } |
96 | | - |
97 | 93 | /** |
98 | 94 | * returns a new Corpus instance corresponding to the wiki project |
99 | 95 | * with the given name. The name may be given as a domain name following |
— | — | @@ -103,27 +99,29 @@ |
104 | 100 | * the language code "de", the family "wikipedia", the domain "de.wikipedia.org" |
105 | 101 | * and the URI "http://de.wikipedia.org". |
106 | 102 | */ |
107 | | - public static Corpus forName(String collection, String name, String[] configPackages) { |
| 103 | + public static Corpus forName(String collection, String name, TweakSet tweaks) { |
| 104 | + String[] configPackages = getConfigPackages(tweaks); |
| 105 | + |
108 | 106 | String domain = name; |
109 | | - if (domain.indexOf('.')<0) domain = guessCorpusDomain(domain); |
| 107 | + if (domain.indexOf('.')<0) domain = guessCorpusDomain(domain, tweaks); |
110 | 108 | |
111 | 109 | String[] ss = domain.split("\\."); |
112 | 110 | if (ss.length<2) throw new IllegalArgumentException("bad domain: "+domain); |
113 | 111 | |
114 | | - String language = guessCorpusLanguage(ss[0]); |
115 | | - String family = guessCorpusFamily(ss[1]); |
| 112 | + String language = guessCorpusLanguage(ss[0], tweaks); |
| 113 | + String family = guessCorpusFamily(ss[1], tweaks); |
116 | 114 | |
117 | | - String classSuffix = family.equals("wikipedia") ? language + "wiki" : language + family; |
| 115 | + String classSuffix = family.equals("wikipedia") || family.equals("wikimedia") ? ss[0] + "wiki" : ss[0] + family; |
118 | 116 | |
119 | 117 | String dbPrefix = dbPrefix(collection, ss[0]); |
120 | 118 | |
121 | 119 | //TODO: cache! |
122 | 120 | //NOTE: force domain as name |
123 | | - return new Corpus(collection, language, dbPrefix, domain, classSuffix, language, family, configPackages); |
| 121 | + return new Corpus(collection, name, dbPrefix, domain, classSuffix, language, family, configPackages); |
124 | 122 | } |
125 | 123 | |
126 | | - public static Corpus forDataset(DatasetIdentifier dataset, String[] configPackages) { |
127 | | - return forName(dataset.getCollection(), dataset.getName(), configPackages); |
| 124 | + public static Corpus forDataset(DatasetIdentifier dataset, TweakSet tweaks) { |
| 125 | + return forName(dataset.getCollection(), dataset.getName(), tweaks); |
128 | 126 | } |
129 | 127 | |
130 | 128 | protected static String[] getConfigPackages(TweakSet tweaks) { |
— | — | @@ -131,12 +129,8 @@ |
132 | 130 | return pkg.toArray(new String[pkg.size()]); |
133 | 131 | } |
134 | 132 | |
135 | | - public static Corpus forDataset(DatasetIdentifier dataset, TweakSet tweaks) { |
136 | | - return forDataset(dataset, getConfigPackages(tweaks)); |
137 | | - } |
138 | | - |
139 | 133 | /** guesses the wiki family from a name as used for dump files **/ |
140 | | - protected static String guessCorpusFamily(String n) { |
| 134 | + protected static String guessCorpusFamily(String n, TweakSet tweaks) { |
141 | 135 | if (n.matches(".*commonswiki$")) return "commons"; |
142 | 136 | else if (n.matches(".*meta(wiki)?$")) return "meta"; |
143 | 137 | else if (n.matches(".*wiki$")) return "wikipedia"; |
— | — | @@ -147,25 +141,26 @@ |
148 | 142 | } |
149 | 143 | |
150 | 144 | /** guesses the wiki language from a name as used for dump files **/ |
151 | | - protected static String guessCorpusLanguage(String n) { |
| 145 | + protected static String guessCorpusLanguage(String n, TweakSet tweaks) { |
152 | 146 | String lang = n.replaceAll("^(.*?)(wiki|wikt).*$", "$1"); |
153 | 147 | |
154 | | - if (lang.equals("commons")) return "en"; |
155 | | - else if (lang.equals("meta")) return "en"; |
| 148 | + if (lang.equals("commons")) return tweaks.getTweak("languages.commonsAsLanguage", false) ? lang : "en"; |
| 149 | + else if (lang.equals("meta")) return tweaks.getTweak("languages.metaAsLanguage", true) ? lang : "en"; |
| 150 | + else if (lang.equals("simple")) return tweaks.getTweak("languages.simpleAsLanguage", true) ? lang : "en"; |
156 | 151 | else return lang; |
157 | 152 | } |
158 | 153 | |
159 | 154 | /** guesses the wiki subdomain from a name as used for dump files **/ |
160 | | - protected static String guessCorpusSubdomain(String n) { |
| 155 | + protected static String guessCorpusSubdomain(String n, TweakSet tweaks) { |
161 | 156 | String sd = n.replaceAll("^(.*?)(wiki|wikt).*$", "$1"); |
162 | 157 | |
163 | 158 | return sd; |
164 | 159 | } |
165 | 160 | |
166 | 161 | /** guesses the wiki domain from a name as used for dump files **/ |
167 | | - protected static String guessCorpusDomain(String n) { |
168 | | - String sub = guessCorpusSubdomain(n); |
169 | | - String fam = guessCorpusFamily(n); |
| 162 | + protected static String guessCorpusDomain(String n, TweakSet tweaks) { |
| 163 | + String sub = guessCorpusSubdomain(n, tweaks); |
| 164 | + String fam = guessCorpusFamily(n, tweaks); |
170 | 165 | |
171 | 166 | if (!fam.matches("^(wiki.*|wiktionary)$")) { |
172 | 167 | sub = fam; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/LocalConceptStoreSchema.java |
— | — | @@ -107,7 +107,7 @@ |
108 | 108 | aboutTable.addField( new ReferenceField(this, "resource_name", getTextType(255), null, true, KeyType.INDEX, "resource", "name", null ) ); |
109 | 109 | aboutTable.addField( new ReferenceField(this, "concept", "INT", null, false, KeyType.INDEX, "concept", "id", null ) ); |
110 | 110 | aboutTable.addField( new ReferenceField(this, "concept_name", getTextType(255), null, true, KeyType.INDEX, "concept", "name", null ) ); |
111 | | - aboutTable.addKey( new DatabaseKey(this, KeyType.PRIMARY, "about", new String[] {"resource", "concept"}) ); |
| 111 | + aboutTable.addKey( new DatabaseKey(this, KeyType.PRIMARY, "about", new String[] {"resource", "concept_name"}) ); |
112 | 112 | addTable(aboutTable); |
113 | 113 | |
114 | 114 | meaningTable = new RelationTable(this, "meaning", getDefaultTableAttributes()); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/GlobalConceptStoreSchema.java |
— | — | @@ -6,6 +6,7 @@ |
7 | 7 | import java.util.ArrayList; |
8 | 8 | import java.util.Arrays; |
9 | 9 | import java.util.List; |
| 10 | +import java.util.Map; |
10 | 11 | |
11 | 12 | import javax.sql.DataSource; |
12 | 13 | |
— | — | @@ -18,6 +19,7 @@ |
19 | 20 | import de.brightbyte.wikiword.ConceptTypeSet; |
20 | 21 | import de.brightbyte.wikiword.Corpus; |
21 | 22 | import de.brightbyte.wikiword.DatasetIdentifier; |
| 23 | +import de.brightbyte.wikiword.Languages; |
22 | 24 | import de.brightbyte.wikiword.TweakSet; |
23 | 25 | |
24 | 26 | public class GlobalConceptStoreSchema extends WikiWordConceptStoreSchema { |
— | — | @@ -188,6 +190,16 @@ |
189 | 191 | return cc; |
190 | 192 | } |
191 | 193 | |
| 194 | + private Map<String, String> languageNames; |
| 195 | + |
| 196 | + protected Map<String, String> getLanguageNames() { |
| 197 | + if (this.languageNames==null) { |
| 198 | + this.languageNames = Languages.load(this.tweaks); |
| 199 | + } |
| 200 | + |
| 201 | + return this.languageNames; |
| 202 | + } |
| 203 | + |
192 | 204 | public Corpus[] getLanguages() throws SQLException { |
193 | 205 | if (languages!=null) return languages; |
194 | 206 | |
— | — | @@ -199,6 +211,10 @@ |
200 | 212 | |
201 | 213 | int i = 0; |
202 | 214 | for (String l: ll) { |
| 215 | + if (!getLanguageNames().containsKey(l)) { |
| 216 | + throw new SQLException("database inconsistency: encountered bad corpus prefix: "+l+" is not a language name. Hint: check tweaks languages.*AsLanguage"); |
| 217 | + } |
| 218 | + |
203 | 219 | cc[i++] = Corpus.forName(getCollectionName(), l, tweaks); |
204 | 220 | } |
205 | 221 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/DatasetIdentifier.java |
— | — | @@ -30,7 +30,7 @@ |
31 | 31 | |
32 | 32 | @Override |
33 | 33 | public String toString() { |
34 | | - return getName(); |
| 34 | + return getCollection()+":"+getName(); |
35 | 35 | } |
36 | 36 | |
37 | 37 | @Override |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTestBase.java |
— | — | @@ -20,8 +20,8 @@ |
21 | 21 | protected PlainTextAnalyzer analyzer; |
22 | 22 | |
23 | 23 | public PlainTextAnalyzerTestBase(String wikiName) { |
24 | | - corpus = Corpus.forName("TEST", wikiName, (String[])null); |
25 | 24 | tweaks = new TweakSet(); |
| 25 | + corpus = Corpus.forName("TEST", wikiName, tweaks); |
26 | 26 | } |
27 | 27 | |
28 | 28 | @Override |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerBenchmark.java |
— | — | @@ -15,14 +15,14 @@ |
16 | 16 | protected TweakSet tweaks; |
17 | 17 | |
18 | 18 | public WikiTextAnalyzerBenchmark(String wikiName) throws InstantiationException { |
19 | | - corpus = Corpus.forName("TEST", wikiName, (String[])null); |
| 19 | + tweaks = new TweakSet(); |
| 20 | + corpus = Corpus.forName("TEST", wikiName, tweaks); |
20 | 21 | |
21 | 22 | //site.Base = "http://"+corpus.getDomain()+"/wiki/"; |
22 | 23 | //site.Sitename = corpus.getFamily(); |
23 | 24 | |
24 | 25 | titleCase = true; |
25 | 26 | namespaces = corpus.getNamespaces(); |
26 | | - tweaks = new TweakSet(); |
27 | 27 | |
28 | 28 | analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks); |
29 | 29 | analyzer.initialize(namespaces, titleCase); |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTestBase.java |
— | — | @@ -34,8 +34,8 @@ |
35 | 35 | protected TweakSet tweaks; |
36 | 36 | |
37 | 37 | public WikiTextAnalyzerTestBase(String wikiName) { |
38 | | - corpus = Corpus.forName("TEST", wikiName, (String[])null); |
39 | 38 | tweaks = new TweakSet(); |
| 39 | + corpus = Corpus.forName("TEST", wikiName, tweaks); |
40 | 40 | |
41 | 41 | //site.Base = "http://"+corpus.getDomain()+"/wiki/"; |
42 | 42 | //site.Sitename = corpus.getFamily(); |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTest.java |
— | — | @@ -203,7 +203,7 @@ |
204 | 204 | WikiPage page = testAnalyzer.makeTestPage("Foo", "#REDIREcT [[bar]][[Category:Orf]]"); |
205 | 205 | WikiTextAnalyzer.WikiLink link = extractRedirectLink(page); |
206 | 206 | |
207 | | - assertEquals("Bar", link.getPage()); |
| 207 | + assertEquals("Bar", link.getTarget()); |
208 | 208 | } |
209 | 209 | |
210 | 210 | public void testIsInterlanguagePrefix() { |
— | — | @@ -354,9 +354,9 @@ |
355 | 355 | +"end\n"; |
356 | 356 | |
357 | 357 | List<WikiLink> exp = new ArrayList<WikiLink>(); |
358 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Bla", null, "bla", true, LinkMagic.NONE)); |
359 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Foxo", null, "foxo", true, LinkMagic.NONE)); |
360 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Quux", null, "quux", true, LinkMagic.NONE)); |
| 358 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Bla", Namespace.MAIN, "Bla", null, "bla", true, LinkMagic.NONE)); |
| 359 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Foxo", Namespace.MAIN, "Foxo", null, "foxo", true, LinkMagic.NONE)); |
| 360 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Quux", Namespace.MAIN, "Quux", null, "quux", true, LinkMagic.NONE)); |
361 | 361 | |
362 | 362 | WikiPage page = testAnalyzer.makeTestPage("Foo", text); |
363 | 363 | List<WikiLink> links = extractDisambigLinks(page.getTitle(), page.getCleanedText(true)); |
— | — | @@ -374,11 +374,11 @@ |
375 | 375 | text += "Foo [[bar]]s!\n"; |
376 | 376 | text += "check [[this|that]] out, [[simple thing]]\n"; |
377 | 377 | text += "this [[pipe | pipes]], this [[ blow|blows ]]\n"; |
378 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Bar", null, "bars", true, LinkMagic.NONE)); |
379 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "This", null, "that", false, LinkMagic.NONE)); |
380 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Simple_thing", null, "simple thing", true, LinkMagic.NONE)); |
381 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Pipe", null, "pipes", false, LinkMagic.NONE)); |
382 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Blow", null, "blows", false, LinkMagic.NONE)); |
| 378 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Bar", Namespace.MAIN, "Bar", null, "bars", true, LinkMagic.NONE)); |
| 379 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "This", Namespace.MAIN, "This", null, "that", false, LinkMagic.NONE)); |
| 380 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Simple_thing", Namespace.MAIN, "Simple_thing", null, "simple thing", true, LinkMagic.NONE)); |
| 381 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Pipe", Namespace.MAIN, "Pipe", null, "pipes", false, LinkMagic.NONE)); |
| 382 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Blow", Namespace.MAIN, "Blow", null, "blows", false, LinkMagic.NONE)); |
383 | 383 | page = testAnalyzer.makeTestPage("Foo", text); |
384 | 384 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
385 | 385 | assertEquals(exp, links); |
— | — | @@ -389,11 +389,11 @@ |
390 | 390 | text += "[[first]] and [[:last]]\n"; |
391 | 391 | text += "[[give me some space|some space| and time]]\n"; |
392 | 392 | text += "[[odd#|stuff>]]\n"; |
393 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Inch", null, "\"", false, LinkMagic.NONE)); |
394 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "First", null, "first", true, LinkMagic.NONE)); |
395 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Last", null, "last", true, LinkMagic.NONE)); |
396 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Give_me_some_space", null, "some space| and time", false, LinkMagic.NONE)); |
397 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Odd", null, "stuff>", false, LinkMagic.NONE)); |
| 393 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Inch", Namespace.MAIN, "Inch", null, "\"", false, LinkMagic.NONE)); |
| 394 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "First", Namespace.MAIN, "First", null, "first", true, LinkMagic.NONE)); |
| 395 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Last", Namespace.MAIN, "Last", null, "last", true, LinkMagic.NONE)); |
| 396 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Give_me_some_space", Namespace.MAIN, "Give_me_some_space", null, "some space| and time", false, LinkMagic.NONE)); |
| 397 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Odd", Namespace.MAIN, "Odd", null, "stuff>", false, LinkMagic.NONE)); |
398 | 398 | page = testAnalyzer.makeTestPage("Foo", text); |
399 | 399 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
400 | 400 | assertEquals(exp, links); |
— | — | @@ -412,9 +412,9 @@ |
413 | 413 | text += "[[this|''works'' {{too}}]]\n"; |
414 | 414 | text += "[[quite'ok']]\n"; |
415 | 415 | text += "[[section# link thing...]]\n"; |
416 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "This", null, "works {{too}}", false, LinkMagic.NONE)); |
417 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Quite'ok'", null, "quite'ok'", true, LinkMagic.NONE)); |
418 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Section", "link_thing...", "section# link thing...", true, LinkMagic.NONE)); |
| 416 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "This", Namespace.MAIN, "This", null, "works {{too}}", false, LinkMagic.NONE)); |
| 417 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Quite'ok'", Namespace.MAIN, "Quite'ok'", null, "quite'ok'", true, LinkMagic.NONE)); |
| 418 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Section", Namespace.MAIN, "Section", "link_thing...", "section# link thing...", true, LinkMagic.NONE)); |
419 | 419 | page = testAnalyzer.makeTestPage("Foo", text); |
420 | 420 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
421 | 421 | assertEquals(exp, links); |
— | — | @@ -426,11 +426,11 @@ |
427 | 427 | text += "[[URL%23Encoding]]\n"; //url-encoded link (yes the # may also be encoded, this does not act as an escape) |
428 | 428 | text += "[[HTML&entities]]\n"; //html-entities |
429 | 429 | text += "[[no%special&stuff]]\n"; //no special stuff |
430 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "Section", "\u00c4.C.ASX.Y&.4", "section#.C3.84.C.ASX.Y.0B.26.05.4", true, LinkMagic.NONE)); |
431 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "\u00c4%C%ASX%Y&%4", null, "\u00c4%C%ASX%Y&%4", true, LinkMagic.NONE)); |
432 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "URL", "Encoding", "URL#Encoding", true, LinkMagic.NONE)); |
433 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "HTML&entities", null, "HTML&entities", true, LinkMagic.NONE)); |
434 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.MAIN, "No%special&stuff", null, "no%special&stuff", true, LinkMagic.NONE)); |
| 430 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Section", Namespace.MAIN, "Section", "\u00c4.C.ASX.Y&.4", "section#.C3.84.C.ASX.Y.0B.26.05.4", true, LinkMagic.NONE)); |
| 431 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "\u00c4%C%ASX%Y&%4", Namespace.MAIN, "\u00c4%C%ASX%Y&%4", null, "\u00c4%C%ASX%Y&%4", true, LinkMagic.NONE)); |
| 432 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "URL", Namespace.MAIN, "URL", "Encoding", "URL#Encoding", true, LinkMagic.NONE)); |
| 433 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "HTML&entities", Namespace.MAIN, "HTML&entities", null, "HTML&entities", true, LinkMagic.NONE)); |
| 434 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "No%special&stuff", Namespace.MAIN, "No%special&stuff", null, "no%special&stuff", true, LinkMagic.NONE)); |
435 | 435 | page = testAnalyzer.makeTestPage("Foo", text); |
436 | 436 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
437 | 437 | assertEquals(exp, links); |
— | — | @@ -438,10 +438,10 @@ |
439 | 439 | exp = new ArrayList<WikiLink>(); |
440 | 440 | text = ""; |
441 | 441 | text += "\nimage: [[Image:test.jpg]], [[Image:test.jpg|thumb]], [[Image:test.jpg|the [[test]] image]], [[Image:test.jpg|the {{test}} image]];"; //NOTE: stripped as clutter |
442 | | - text += "namespace: [[User:foo]], [[User talk:foo|talk]], [[:User:foo]]bar;\n"; |
443 | | - exp.add(new WikiLink(null, Namespace.USER, "Foo", null, "User:foo", true, LinkMagic.NONE)); |
444 | | - exp.add(new WikiLink(null, Namespace.USER_TALK, "Foo", null, "talk", false, LinkMagic.NONE)); |
445 | | - exp.add(new WikiLink(null, Namespace.USER, "Foo", null, "User:foobar", true, LinkMagic.NONE)); |
| 442 | + text += "namespace: [[User:foo]], [[user talk :foo|talk]], [[:User:foo]]bar;\n"; |
| 443 | + exp.add(new WikiLink(null, "User:Foo", Namespace.USER, "Foo", null, "User:foo", true, LinkMagic.NONE)); |
| 444 | + exp.add(new WikiLink(null, "User_talk:Foo", Namespace.USER_TALK, "Foo", null, "talk", false, LinkMagic.NONE)); |
| 445 | + exp.add(new WikiLink(null, "User:Foo", Namespace.USER, "Foo", null, "User:foobar", true, LinkMagic.NONE)); |
446 | 446 | page = testAnalyzer.makeTestPage("Foo", text); |
447 | 447 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
448 | 448 | assertEquals(exp, links); |
— | — | @@ -452,20 +452,20 @@ |
453 | 453 | text += "[[Category:Foo]]\n"; //category |
454 | 454 | text += "[[:Category:Foo|Bar]]\n"; //category link |
455 | 455 | text += "[[Category:Foo|Bar]]\n"; //category sortkey |
456 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "", false, LinkMagic.CATEGORY)); |
457 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "Foo", true, LinkMagic.CATEGORY)); |
458 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.NONE)); |
459 | | - exp.add(new WikiTextAnalyzer.WikiLink(null, Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.CATEGORY)); |
| 456 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "", false, LinkMagic.CATEGORY)); |
| 457 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "Foo", true, LinkMagic.CATEGORY)); |
| 458 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.NONE)); |
| 459 | + exp.add(new WikiTextAnalyzer.WikiLink(null, "Category:Foo", Namespace.CATEGORY, "Foo", null, "Bar", false, LinkMagic.CATEGORY)); |
460 | 460 | page = testAnalyzer.makeTestPage("Foo", text); |
461 | 461 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
462 | 462 | assertEquals(exp, links); |
463 | 463 | |
464 | 464 | exp = new ArrayList<WikiLink>(); |
465 | | - text = "category: [[Category: z]], [[Category: z|zz]], [[:Category: z]], [[:Category: z|z]];\n"; |
466 | | - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "Foo", true, LinkMagic.CATEGORY)); |
467 | | - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "zz", false, LinkMagic.CATEGORY)); |
468 | | - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "Category: z", true, LinkMagic.NONE)); |
469 | | - exp.add(new WikiLink(null, Namespace.CATEGORY, "Z", null, "z", false, LinkMagic.NONE)); |
| 465 | + text = "category: [[Category: Z]], [[category: z|zz]], [[:Category: z]], [[:Category: z|z]];\n"; |
| 466 | + exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "Foo", true, LinkMagic.CATEGORY)); |
| 467 | + exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "zz", false, LinkMagic.CATEGORY)); |
| 468 | + exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "Category: z", true, LinkMagic.NONE)); |
| 469 | + exp.add(new WikiLink(null, "Category:Z", Namespace.CATEGORY, "Z", null, "z", false, LinkMagic.NONE)); |
470 | 470 | page = testAnalyzer.makeTestPage("Foo", text); |
471 | 471 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
472 | 472 | assertEquals(exp, links); |
— | — | @@ -476,35 +476,35 @@ |
477 | 477 | text += "[[xyz:zeug|zeug]]\n"; //interwiki |
478 | 478 | text += "[[de:Zeug]]\n"; //interlanguage |
479 | 479 | text += "[[:de:Zeug]]\n"; //interwiki |
480 | | - exp.add(new WikiTextAnalyzer.WikiLink("xyz", Namespace.MAIN, "Zeug", null, "zeug", false, LinkMagic.NONE)); |
481 | | - exp.add(new WikiTextAnalyzer.WikiLink("de", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.LANGUAGE)); |
482 | | - exp.add(new WikiTextAnalyzer.WikiLink("de", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.NONE)); |
| 480 | + exp.add(new WikiTextAnalyzer.WikiLink("xyz", "Zeug", Namespace.MAIN, "Zeug", null, "zeug", false, LinkMagic.NONE)); |
| 481 | + exp.add(new WikiTextAnalyzer.WikiLink("de", "Zeug", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.LANGUAGE)); |
| 482 | + exp.add(new WikiTextAnalyzer.WikiLink("de", "Zeug", Namespace.MAIN, "Zeug", null, "de:Zeug", true, LinkMagic.NONE)); |
483 | 483 | page = testAnalyzer.makeTestPage("Foo", text); |
484 | 484 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
485 | 485 | assertEquals(exp, links); |
486 | 486 | |
487 | 487 | exp = new ArrayList<WikiLink>(); |
488 | 488 | text = "language: [[nl: z]], [[zh-yue: z|z]], [[:de: z|z]];\n"; |
489 | | - exp.add(new WikiLink("nl", Namespace.MAIN, "Z", null, "nl: z", true, LinkMagic.LANGUAGE)); |
490 | | - exp.add(new WikiLink("zh-yue", Namespace.MAIN, "Z", null, "z", false, LinkMagic.LANGUAGE)); |
491 | | - exp.add(new WikiLink("de", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE)); |
| 489 | + exp.add(new WikiLink("nl", "Z", Namespace.MAIN, "Z", null, "nl: z", true, LinkMagic.LANGUAGE)); |
| 490 | + exp.add(new WikiLink("zh-yue", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.LANGUAGE)); |
| 491 | + exp.add(new WikiLink("de", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE)); |
492 | 492 | page = testAnalyzer.makeTestPage("Foo", text); |
493 | 493 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
494 | 494 | assertEquals(exp, links); |
495 | 495 | |
496 | 496 | exp = new ArrayList<WikiLink>(); |
497 | 497 | text = "interwiki: [[ixy: z]], [[ixy: z|z]], [[:ixy: z|z]];\n"; |
498 | | - exp.add(new WikiLink("ixy", Namespace.MAIN, "Z", null, "ixy: z", true, LinkMagic.NONE)); |
499 | | - exp.add(new WikiLink("ixy", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE)); |
500 | | - exp.add(new WikiLink("ixy", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE)); |
| 498 | + exp.add(new WikiLink("ixy", "Z", Namespace.MAIN, "Z", null, "ixy: z", true, LinkMagic.NONE)); |
| 499 | + exp.add(new WikiLink("ixy", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE)); |
| 500 | + exp.add(new WikiLink("ixy", "Z", Namespace.MAIN, "Z", null, "z", false, LinkMagic.NONE)); |
501 | 501 | page = testAnalyzer.makeTestPage("Foo", text); |
502 | 502 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
503 | 503 | assertEquals(exp, links); |
504 | 504 | |
505 | 505 | exp = new ArrayList<WikiLink>(); |
506 | 506 | text = "prefix: [[x y: z]], [[x y: z|z ]];\n"; |
507 | | - exp.add(new WikiLink(null, Namespace.MAIN, "X_y:_z", null, "x y: z", true, LinkMagic.NONE)); |
508 | | - exp.add(new WikiLink(null, Namespace.MAIN, "X_y:_z", null, "z", false, LinkMagic.NONE)); |
| 507 | + exp.add(new WikiLink(null, "X_y:_z", Namespace.MAIN, "X_y:_z", null, "x y: z", true, LinkMagic.NONE)); |
| 508 | + exp.add(new WikiLink(null, "X_y:_z", Namespace.MAIN, "X_y:_z", null, "z", false, LinkMagic.NONE)); |
509 | 509 | page = testAnalyzer.makeTestPage("Foo", text); |
510 | 510 | links = extractLinks(page.getTitle(), page.getCleanedText(true)); |
511 | 511 | assertEquals(exp, links); |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/builder/PropertyDump.java |
— | — | @@ -1,101 +0,0 @@ |
2 | | -package de.brightbyte.wikiword.builder; |
3 | | -import java.io.File; |
4 | | -import java.io.IOException; |
5 | | -import java.net.URL; |
6 | | -import java.net.URLEncoder; |
7 | | -import java.util.Map; |
8 | | -import java.util.Set; |
9 | | - |
10 | | -import de.brightbyte.data.MultiMap; |
11 | | -import de.brightbyte.io.IOUtil; |
12 | | -import de.brightbyte.wikiword.Corpus; |
13 | | -import de.brightbyte.wikiword.Namespace; |
14 | | -import de.brightbyte.wikiword.NamespaceSet; |
15 | | -import de.brightbyte.wikiword.TweakSet; |
16 | | -import de.brightbyte.wikiword.analyzer.WikiPage; |
17 | | -import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
18 | | - |
19 | | - |
20 | | -public class PropertyDump { |
21 | | - public static void main(String[] args) throws InstantiationException, IOException { |
22 | | - String lang = args[0]; |
23 | | - String n = args[1]; |
24 | | - |
25 | | - Corpus corpus = Corpus.forName("TEST", lang, new String[] {"de.brightbyte.wikiword.wikipro", "de.brightbyte.wikiword.wikipro.wikis"}); |
26 | | - |
27 | | - URL u; |
28 | | - |
29 | | - if ( args.length>2 ) { |
30 | | - u = new File(args[2]).toURI().toURL(); |
31 | | - } |
32 | | - else { |
33 | | - u = new URL("http://"+lang+".wikipedia.org/w/index.php?action=raw&title=" + URLEncoder.encode(n, "UTF-8")); |
34 | | - } |
35 | | - |
36 | | - String p = n; |
37 | | - |
38 | | - TweakSet tweaks = new TweakSet(); |
39 | | - WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks); |
40 | | - |
41 | | - System.out.println("loading "+u+"..."); |
42 | | - String text = IOUtil.slurp(u, "UTF-8"); |
43 | | - System.out.println("loaded."); |
44 | | - |
45 | | - NamespaceSet namespaces = Namespace.getNamespaces(null); |
46 | | - analyzer.initialize(namespaces, true); |
47 | | - |
48 | | - WikiTextAnalyzer.WikiLink t = analyzer.makeLink(p, p, null, null); |
49 | | - |
50 | | - WikiPage page = analyzer.makePage(t.getNamespace(), t.getTarget().toString(), text, true); |
51 | | - |
52 | | - System.out.println("Resource: " + page.getResourceName()); |
53 | | - System.out.println("Concept: " + page.getConceptName()); |
54 | | - |
55 | | - System.out.println("ResourceType: " + page.getResourceType()); |
56 | | - System.out.println("ConceptType: " + page.getConceptType()); |
57 | | - |
58 | | - System.out.println("Definition: " + page.getFirstSentence()); |
59 | | - |
60 | | - System.out.println("Properties:"); |
61 | | - MultiMap<String, CharSequence, Set<CharSequence>> properties = page.getProperties(); |
62 | | - for (Map.Entry<String, Set<CharSequence>> e : properties.entrySet()) { |
63 | | - System.out.print("\t"); |
64 | | - System.out.print(e.getKey()); |
65 | | - System.out.print(": "); |
66 | | - |
67 | | - boolean first = true; |
68 | | - for (CharSequence v : e.getValue()) { |
69 | | - if (first) first = false; |
70 | | - else System.out.print(", "); |
71 | | - |
72 | | - System.out.print(v); |
73 | | - } |
74 | | - System.out.println(); |
75 | | - } |
76 | | - |
77 | | - System.out.println("Supplements:"); |
78 | | - Set<CharSequence> supplements = page.getSupplementLinks(); |
79 | | - for (CharSequence s : supplements) { |
80 | | - System.out.println("\t"+s); |
81 | | - } |
82 | | - |
83 | | - CharSequence supplementedConcept = page.getSupplementedConcept(); |
84 | | - if (supplementedConcept!=null) { |
85 | | - System.out.println("Supplemented: "); |
86 | | - System.out.println("\t"+supplementedConcept); |
87 | | - } |
88 | | - |
89 | | - System.out.println("TitleTerms:"); |
90 | | - Set<CharSequence> titleTerms = page.getTitleTerms(); |
91 | | - for (CharSequence s : titleTerms) { |
92 | | - System.out.println("\t"+s); |
93 | | - } |
94 | | - |
95 | | - System.out.println("PageTerms:"); |
96 | | - Set<CharSequence> titlePage = page.getPageTerms(); |
97 | | - for (CharSequence s : titlePage) { |
98 | | - System.out.println("\t"+s); |
99 | | - } |
100 | | - } |
101 | | - |
102 | | -} |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/wikis/WikiTextAnalyzer_yywiki_Test.java |
— | — | @@ -29,11 +29,11 @@ |
30 | 30 | assertTestCase(page, "conceptType", ConceptType.OTHER); |
31 | 31 | |
32 | 32 | List<WikiTextAnalyzer.WikiLink> links = new ArrayList<WikiTextAnalyzer.WikiLink>(); |
33 | | - links.add(analyzer.newLink(null, Namespace.MAIN, "Yar", null, "Yar", true, LinkMagic.NONE)); |
34 | | - links.add(analyzer.newLink(null, Namespace.CATEGORY, "Yoo", null, "*", false, LinkMagic.CATEGORY)); |
35 | | - links.add(analyzer.newLink(null, Namespace.CATEGORY, "Yofos", null, "Yoo", true, LinkMagic.CATEGORY)); |
36 | | - links.add(analyzer.newLink("xx", Namespace.MAIN, "Xo", null, "xx:Xo", true, LinkMagic.LANGUAGE)); |
37 | | - links.add(analyzer.newLink("zz", Namespace.MAIN, "Zoo", null, "zz:Zoo", true, LinkMagic.LANGUAGE)); |
| 33 | + links.add(analyzer.newLink(null, "Yar", Namespace.MAIN, "Yar", null, "Yar", true, LinkMagic.NONE)); |
| 34 | + links.add(analyzer.newLink(null, "Category:Yoo", Namespace.CATEGORY, "Yoo", null, "*", false, LinkMagic.CATEGORY)); |
| 35 | + links.add(analyzer.newLink(null, "Category:Yofos", Namespace.CATEGORY, "Yofos", null, "Yoo", true, LinkMagic.CATEGORY)); |
| 36 | + links.add(analyzer.newLink("xx", "Xo", Namespace.MAIN, "Xo", null, "xx:Xo", true, LinkMagic.LANGUAGE)); |
| 37 | + links.add(analyzer.newLink("zz", "Zoo", Namespace.MAIN, "Zoo", null, "zz:Zoo", true, LinkMagic.LANGUAGE)); |
38 | 38 | assertTestCase(page, "links", links); |
39 | 39 | |
40 | 40 | Set<String> categories = new HashSet<String>(); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java |
— | — | @@ -212,7 +212,7 @@ |
213 | 213 | if (link.getInterwiki()!=null) continue; |
214 | 214 | |
215 | 215 | storeReference(rcId, link.getText().toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK); |
216 | | - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getPage().toString()); |
| 216 | + if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString()); |
217 | 217 | } |
218 | 218 | } |
219 | 219 | } |
— | — | @@ -222,11 +222,11 @@ |
223 | 223 | WikiTextAnalyzer.LinkMagic m = link.getMagic(); |
224 | 224 | |
225 | 225 | if (m==WikiTextAnalyzer.LinkMagic.NONE) { |
226 | | - if (link.getNamespace()!=Namespace.MAIN) continue; |
| 226 | + if (link.getNamespace()!=Namespace.MAIN && link.getNamespace()!=Namespace.CATEGORY) continue; |
227 | 227 | if (link.getInterwiki()!=null) continue; |
228 | 228 | |
229 | 229 | storeLink(rcId, conceptId, conceptName, link.getText().toString(), link.getTarget().toString(), ExtractionRule.TERM_FROM_LINK); |
230 | | - if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getPage().toString()); |
| 230 | + if (link.getSection()!=null) storeSection(rcId, link.getTarget().toString(), link.getTargetPage().toString()); |
231 | 231 | } |
232 | 232 | } |
233 | 233 | } |
— | — | @@ -254,7 +254,7 @@ |
255 | 255 | String rcName = analyzerPage.getResourceName(); |
256 | 256 | String text = analyzerPage.getText().toString(); |
257 | 257 | //int namespace = analyzerPage.getNamespace(); |
258 | | - //String title = analyzerPage.getTitle().toString(); |
| 258 | + String title = analyzerPage.getTitle().toString(); |
259 | 259 | |
260 | 260 | //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update. |
261 | 261 | int rcId = storeResource(rcName, rcType, timestamp); |
— | — | @@ -280,7 +280,7 @@ |
281 | 281 | |
282 | 282 | if (m==WikiTextAnalyzer.LinkMagic.CATEGORY) { |
283 | 283 | //FIXME: store this also as a reference to the categorie's concept under it's original title! |
284 | | - storeConceptBroader(rcId, name, link.getPage().toString(), ExtractionRule.BROADER_FROM_CAT); |
| 284 | + storeConceptBroader(rcId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT); |
285 | 285 | } |
286 | 286 | |
287 | 287 | if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) { |
— | — | @@ -367,16 +367,15 @@ |
368 | 368 | if ( sortKey!=null && analyzer.isMainArticleMarker(sortKey) ) { |
369 | 369 | if (analyzer.useCategoryAliases()) { |
370 | 370 | //XXX: if there's more than one "main article", this breaks. |
371 | | - String cat = link.getPage().toString(); |
372 | 371 | |
373 | | - if (!cat.equals(name) && analyzer.mayBeFormOf(link.getLenientPage(), analyzerPage.getTitleBaseName())) { |
374 | | - Set<CharSequence> terms = analyzer.determineTitleTerms(link.getPage()); |
| 372 | + if (analyzer.mayBeFormOf(link.getLenientPage(), analyzerPage.getTitleBaseName())) { |
| 373 | + Set<CharSequence> terms = analyzer.determineTitleTerms(link.getTitle()); |
375 | 374 | storePageTerms(rcId, terms, conceptId, name, ExtractionRule.TERM_FROM_CAT_NAME); |
376 | 375 | |
377 | 376 | //NOTE: the alias is preliminary: if a article with the name of the category |
378 | 377 | // exists, the alias will be ignored. See DatabaseLocalConceptBuilder.finishBadLinks |
379 | 378 | |
380 | | - storeConceptAlias(rcId, -1, cat, conceptId, name, AliasScope.CATEGORY); |
| 379 | + storeConceptAlias(rcId, -1, link.getTarget().toString(), conceptId, name, AliasScope.CATEGORY); |
381 | 380 | categorize = false; |
382 | 381 | } |
383 | 382 | } |
— | — | @@ -390,13 +389,11 @@ |
391 | 390 | storeReference(rcId, sortKey, conceptId, name, ExtractionRule.TERM_FROM_SORTKEY); //sort key is a name for this page |
392 | 391 | } |
393 | 392 | |
394 | | - if ( !link.getPage().toString().equals(name) ) { //NOTE: need the toString, CharSequences doen't "equal" strings :( |
395 | | - storeConceptBroader(rcId, conceptId, name, link.getPage().toString(), ExtractionRule.BROADER_FROM_CAT); |
396 | | - } |
| 393 | + storeConceptBroader(rcId, conceptId, name, link.getTarget().toString(), ExtractionRule.BROADER_FROM_CAT); |
397 | 394 | } |
398 | 395 | } |
399 | 396 | else if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) { |
400 | | - storeLanguageLink(rcId, conceptId, name, link.getInterwiki().toString(), link.getPage().toString()); //XXX: consider target? consider both?? |
| 397 | + storeLanguageLink(rcId, conceptId, name, link.getInterwiki().toString(), link.getTarget().toString()); //XXX: consider target? consider both?? |
401 | 398 | } |
402 | 399 | } |
403 | 400 | |
— | — | @@ -421,7 +418,7 @@ |
422 | 419 | if (link.getInterwiki()!=null) continue; |
423 | 420 | |
424 | 421 | for (CharSequence term : terms) { |
425 | | - storeReference(rcId, term.toString(), -1, link.getPage().toString(), ExtractionRule.TERM_FROM_DISAMBIG); |
| 422 | + storeReference(rcId, term.toString(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_DISAMBIG); |
426 | 423 | } |
427 | 424 | } |
428 | 425 | } |
— | — | @@ -469,12 +466,12 @@ |
470 | 467 | } |
471 | 468 | else if (link.getNamespace()!=analyzerPage.getNamespace()) { |
472 | 469 | if (link.getNamespace()==Namespace.CATEGORY && analyzerPage.getNamespace()==Namespace.MAIN) { |
473 | | - if ( StringUtils.equals(link.getPage(), analyzerPage.getTitle()) ) { |
| 470 | + if ( StringUtils.equals(link.getTarget(), rcName) ) { |
474 | 471 | out.debug("ignored redundant category redirect "+rcName+" -> "+link); |
475 | 472 | } else { |
476 | 473 | out.debug("processing redirect to category "+rcName+" -> "+link); |
477 | | - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getPage().toString(), ExtractionRule.TERM_FROM_REDIRECT ); |
478 | | - String tgtConcept = link.getPage().toString(); |
| 474 | + storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT ); |
| 475 | + String tgtConcept = link.getTarget().toString(); |
479 | 476 | |
480 | 477 | if (!name.equals(tgtConcept)) { |
481 | 478 | conceptId = store.storeAbout(rcId, rcName, name); |
— | — | @@ -487,13 +484,13 @@ |
488 | 485 | warn(rcId, "bad redirect (inter-namespace)", rcName+" -> "+link, null); |
489 | 486 | } |
490 | 487 | } |
491 | | - else if (name.equals(link.getPage().toString())) { |
| 488 | + else if (rcName.equals(link.getTarget().toString())) { |
492 | 489 | warn(rcId, "bad redirect (self-link)", "page "+name, null); |
493 | 490 | } |
494 | 491 | else { |
495 | 492 | conceptId = store.storeAbout(rcId, rcName, name); |
496 | | - storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getPage().toString(), ExtractionRule.TERM_FROM_REDIRECT ); |
497 | | - storeConceptAlias(rcId, conceptId, name, -1, link.getPage().toString(), AliasScope.REDIRECT); |
| 493 | + storePageTerms(rcId, analyzerPage.getTitleTerms(), -1, link.getTarget().toString(), ExtractionRule.TERM_FROM_REDIRECT ); |
| 494 | + storeConceptAlias(rcId, conceptId, name, -1, link.getTarget().toString(), AliasScope.REDIRECT); |
498 | 495 | |
499 | 496 | //FIXME: redir to section! |
500 | 497 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/BuildThesaurus.java |
— | — | @@ -66,7 +66,25 @@ |
67 | 67 | @Override |
68 | 68 | protected void run() throws Exception { |
69 | 69 | section("-- importConcepts --------------------------------------------------"); |
| 70 | + if (languages==null) { |
| 71 | + String lang = args.getStringOption("languages", null); |
| 72 | + if (lang!=null) { |
| 73 | + String[] ll = lang.split("[,;/|\\s+]+"); |
| 74 | + languages = new Corpus[ll.length]; |
| 75 | + |
| 76 | + int i = 0; |
| 77 | + for (String l: ll) { |
| 78 | + languages[i++] = Corpus.forName(getConfiguredCollectionName(), l, tweaks); |
| 79 | + } |
| 80 | + } |
| 81 | + |
| 82 | + if (languages==null) { |
| 83 | + languages = ((GlobalConceptStoreBuilder)this.conceptStore).detectLanguages(); |
| 84 | + } |
| 85 | + } |
| 86 | + |
70 | 87 | info("Using languages: "+Arrays.toString(languages)); |
| 88 | + ((GlobalConceptStoreBuilder)this.conceptStore).setLanguages(languages); |
71 | 89 | |
72 | 90 | if (agenda.beginTask("BuildThesaurus.run", "importConcepts")) { |
73 | 91 | ((GlobalConceptStoreBuilder)this.conceptStore).importConcepts(); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -69,9 +69,9 @@ |
70 | 70 | stripClutterManglers.add( new RegularExpressionMangler("^"+templatePatternString("wrapper", 0, true), "{|", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE)); |
71 | 71 | stripClutterManglers.add( new RegularExpressionMangler("^"+templatePatternString("end|col-end", 0, true), "|}", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE)); |
72 | 72 | |
73 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons(-inline|[ _]left|show\\d)?", 1, true), "[[commons:$1]]")); |
74 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _+]?cat(-inline|[ _]left|show\\d)?", 1, true), "[[commons:Category:$1]]")); |
75 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("wikimedia", 1, true), "[[commons:$1]]")); //FIXME: named params: commons= |
| 73 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons(?:-inline|[ _]left|show\\d)?", 1, true), "[[commons:$2]]")); |
| 74 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _+]?cat(?:-inline|[ _]left|show\\d)?", 1, true), "[[commons:Category:$2]]")); |
| 75 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("wikimedia", 1, true), "[[commons:$2]]")); //FIXME: named params: commons= |
76 | 76 | //FIXME: Commonscat-N, Commons_cat_multi... |
77 | 77 | stripClutterManglers.add( new RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE)); |
78 | 78 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java |
— | — | @@ -32,8 +32,8 @@ |
33 | 33 | ")\\s*\\|(?:\\s*rtl\\s*\\|)?.*?\\|\\s*(.*?)\\s*\\}\\}", "$1", Pattern.DOTALL | Pattern.CASE_INSENSITIVE)); |
34 | 34 | */ |
35 | 35 | |
36 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$1]]")); |
37 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$1]]")); |
| 36 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]")); |
| 37 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$2]]")); |
38 | 38 | stripClutterManglers.add( new RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE)); |
39 | 39 | |
40 | 40 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Okina", 0, false), "\u02BB")); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_eswiki.java |
— | — | @@ -25,8 +25,8 @@ |
26 | 26 | stripClutterManglers.add( new RegularExpressionMangler( templatePattern("C", 0, true), "\u00a9")); |
27 | 27 | stripClutterManglers.add( new RegularExpressionMangler( templatePattern("E", 1, true), "\u00d710^$2")); |
28 | 28 | |
29 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$1]]")); |
30 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$1]]")); |
| 29 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]")); |
| 30 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$2]]")); |
31 | 31 | |
32 | 32 | //reduce to third param |
33 | 33 | stripClutterManglers.add( new RegularExpressionMangler( |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_frwiki.java |
— | — | @@ -6,7 +6,6 @@ |
7 | 7 | import de.brightbyte.wikiword.ResourceType; |
8 | 8 | import de.brightbyte.wikiword.analyzer.WikiConfiguration; |
9 | 9 | import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler; |
10 | | -import de.brightbyte.wikiword.analyzer.matcher.ExactNameMatcher; |
11 | 10 | import de.brightbyte.wikiword.analyzer.matcher.NameMatcher; |
12 | 11 | import de.brightbyte.wikiword.analyzer.matcher.PatternNameMatcher; |
13 | 12 | import de.brightbyte.wikiword.analyzer.sensor.HasCategoryLikeSensor; |
— | — | @@ -18,9 +17,9 @@ |
19 | 18 | |
20 | 19 | public WikiConfiguration_frwiki() { |
21 | 20 | super(); |
22 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$1]]")); |
23 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _]?cat?", 1, true), "[[commons:Category:$1]]")); |
24 | | - stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Autres[ _]projets", 1, true), "[[commons:$1]]")); //FIXME: named params: commons= |
| 21 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]")); |
| 22 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("commons[ _]?cat?", 1, true), "[[commons:Category:$2]]")); |
| 23 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Autres[ _]projets", 1, true), "[[commons:$2]]")); //FIXME: named params: commons= |
25 | 24 | |
26 | 25 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("er?|\u00e8?re|(?:mini[ _])?[IVXCM]+(?:e|re|er)?|\\d+r?er?|Mlle|Mme|elle", 0, true), "$1")); |
27 | 26 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("romain|rom|rom-min|rom-maj|APIb|IPA", 1, true), "$2")); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseGlobalConceptStoreBuilder.java |
— | — | @@ -135,7 +135,8 @@ |
136 | 136 | //------------------------------- |
137 | 137 | public Corpus[] detectLanguages() throws PersistenceException { |
138 | 138 | try { |
139 | | - return ((GlobalConceptStoreSchema)database).getLanguages(); |
| 139 | + Corpus[] languages = ((GlobalConceptStoreSchema)database).getLanguages(); |
| 140 | + return languages; |
140 | 141 | } catch (SQLException e) { |
141 | 142 | throw new PersistenceException(e); |
142 | 143 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseWikiWordConceptStoreBuilder.java |
— | — | @@ -401,7 +401,7 @@ |
402 | 402 | String sql = "insert ignore into "+relationTable.getSQLName()+" (concept1, concept2, bilink)" + |
403 | 403 | " select A.anchor, A.target, 1 from "+linkTable.getSQLName()+" as A " + |
404 | 404 | " join "+linkTable.getSQLName()+" as B " + |
405 | | - " force index (anchor_target) " + //NOTE: avoid table scan! |
| 405 | + " force index (target_anchor) " + //NOTE: avoid table scan! |
406 | 406 | " on A.anchor = B.target AND B.anchor = A.target "; |
407 | 407 | String suffix = " on duplicate key update bilink = bilink + values(bilink)"; |
408 | 408 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiConfiguration.java |
— | — | @@ -369,7 +369,8 @@ |
370 | 370 | this.maxWordFormDistance = 1.0/3.0; |
371 | 371 | |
372 | 372 | this.badTitlePattern = Pattern.compile("^$|''|[|{}<>\\]\\[]|^\\w+://"); |
373 | | - this.badLinkPattern = Pattern.compile("^[^\\d]+:[^ _]|^\\.\\.?$"); |
| 373 | + //this.badLinkPattern = Pattern.compile("^[^\\d]+:[^ _]|^\\.\\.?$"); //disallow namespace/interlang |
| 374 | + this.badLinkPattern = Pattern.compile("^\\.\\.?$"); |
374 | 375 | this.titleSuffixPattern = Pattern.compile("^(.*)[ _]\\((.*?)\\)$"); |
375 | 376 | this.titlePrefixPattern = Pattern.compile("^(.*?)#(.+)$"); |
376 | 377 | this.disambigStripSectionPattern = null; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java |
— | — | @@ -263,7 +263,7 @@ |
264 | 264 | private CharSequence lenientSection; |
265 | 265 | private CharSequence lenientText; |
266 | 266 | |
267 | | - public WikiLink(CharSequence interwiki, int namespace, CharSequence page, CharSequence section, CharSequence text, boolean impliedText, LinkMagic magic) { |
| 267 | + public WikiLink(CharSequence interwiki, CharSequence target, int namespace, CharSequence page, CharSequence section, CharSequence text, boolean impliedText, LinkMagic magic) { |
268 | 268 | super(); |
269 | 269 | this.magic = magic; |
270 | 270 | this.interwiki = interwiki; |
— | — | @@ -272,8 +272,7 @@ |
273 | 273 | this.section = section; |
274 | 274 | this.text = text; |
275 | 275 | this.impliedText = impliedText; |
276 | | - this.target = page; |
277 | | - if (section!=null && section.length()>0) this.target = this.target + "#" + section; |
| 276 | + this.target = target; |
278 | 277 | } |
279 | 278 | |
280 | 279 | public CharSequence getInterwiki() { |
— | — | @@ -284,15 +283,25 @@ |
285 | 284 | return namespace; |
286 | 285 | } |
287 | 286 | |
288 | | - public CharSequence getPage() { |
| 287 | + public CharSequence getTitle() { |
289 | 288 | return page; |
290 | 289 | } |
291 | 290 | |
292 | | - @Deprecated |
293 | 291 | public CharSequence getTarget() { |
294 | 292 | return target; |
295 | 293 | } |
296 | 294 | |
| 295 | + |
| 296 | + public CharSequence getTargetPage() { |
| 297 | + CharSequence t = getTarget(); |
| 298 | + if (section==null) return t; |
| 299 | + |
| 300 | + int idx = StringUtils.indexOf('#', t); |
| 301 | + if (idx<0) return t; |
| 302 | + |
| 303 | + return t.subSequence(0, idx);
| 304 | + } |
| 305 | + |
297 | 306 | public CharSequence getSection() { |
298 | 307 | return section; |
299 | 308 | } |
— | — | @@ -403,7 +412,6 @@ |
404 | 413 | return false; |
405 | 414 | return true; |
406 | 415 | } |
407 | | - |
408 | 416 | |
409 | 417 | } |
410 | 418 | |
— | — | @@ -690,7 +698,7 @@ |
691 | 699 | |
692 | 700 | for (WikiLink link : links) { |
693 | 701 | if (link.getMagic() == LinkMagic.CATEGORY) { |
694 | | - c.add(link.getPage().toString()); |
| 702 | + c.add(link.getTitle().toString()); |
695 | 703 | } |
696 | 704 | } |
697 | 705 | categories = Collections.unmodifiableSet( c ); |
— | — | @@ -1376,46 +1384,55 @@ |
1377 | 1385 | LinkMagic magic = LinkMagic.NONE; |
1378 | 1386 | CharSequence interwiki = null; |
1379 | 1387 | int namespace = Namespace.MAIN; |
1380 | | - CharSequence page = target; |
1381 | 1388 | CharSequence section = null; |
1382 | 1389 | boolean esc = false; |
1383 | 1390 | |
1384 | | - while (page.length()>0 && page.charAt(0)==':') { |
1385 | | - page = page.subSequence(1, page.length()); |
| 1391 | + while (target.length()>0 && target.charAt(0)==':') { |
| 1392 | + target = target.subSequence(1, target.length()); |
1386 | 1393 | esc = true; |
1387 | 1394 | } |
1388 | 1395 | |
1389 | | - if (page.length()==0) return null; |
| 1396 | + if (target.length()==0) return null; |
1390 | 1397 | |
| 1398 | + CharSequence title = target; |
| 1399 | + |
1391 | 1400 | //handle section links ------------------------ |
1392 | | - int idx = StringUtils.indexOf('#', page); |
1393 | | - if (idx==page.length()-1) { |
1394 | | - page = page.subSequence(0, page.length()-1); |
| 1401 | + int idx = StringUtils.indexOf('#', title); |
| 1402 | + if (idx==title.length()-1) { |
| 1403 | + title = title.subSequence(0, title.length()-1); |
| 1404 | + target = title; |
1395 | 1405 | section = null; |
1396 | 1406 | } |
1397 | 1407 | else if (idx==0) { |
1398 | | - section = page.subSequence(1, page.length()); |
1399 | | - page = context; |
| 1408 | + section = title.subSequence(1, title.length()); |
| 1409 | + title = context; |
| 1410 | + target = null; //restored later |
1400 | 1411 | } |
1401 | 1412 | else if (idx>0) { |
1402 | | - section = page.subSequence(idx+1, page.length()); |
1403 | | - page = target.subSequence(0, idx); |
| 1413 | + section = title.subSequence(idx+1, title.length()); |
| 1414 | + title = target.subSequence(0, idx); |
1404 | 1415 | } |
1405 | 1416 | |
| 1417 | + //TODO: subpages starting with "/"... |
| 1418 | + |
1406 | 1419 | if (section!=null) { //handle special encoded chars in section ref |
1407 | 1420 | section = decodeSectionName(AnalyzerUtils.trim(section)); |
1408 | 1421 | section = AnalyzerUtils.replaceSpaceByUnderscore(section); |
| 1422 | + if (target==null) target = context + "#" + section; |
1409 | 1423 | } |
1410 | 1424 | |
1411 | 1425 | //handle qualifiers ------------------------ |
1412 | | - idx = StringUtils.indexOf(':', page); |
| 1426 | + boolean setTargetToTitle = false; |
| 1427 | + idx = StringUtils.indexOf(':', title); |
1413 | 1428 | if (idx>=0) { |
1414 | | - CharSequence pre = AnalyzerUtils.trim(page.subSequence(0, idx)); |
| 1429 | + CharSequence pre = AnalyzerUtils.trim(title.subSequence(0, idx)); |
1415 | 1430 | pre = normalizeTitle(pre); |
1416 | 1431 | int ns = getNamespaceId(pre); |
1417 | 1432 | if (ns!=Namespace.NONE) { |
1418 | 1433 | namespace = ns; |
1419 | | - page = page.subSequence(idx+1, page.length()); |
| 1434 | + title = title.subSequence(idx+1, title.length()); |
| 1435 | + target = target.subSequence(idx+1, target.length()); |
| 1436 | + target = getNamespaceName(ns) + ":" + normalizeTitle(target); |
1420 | 1437 | |
1421 | 1438 | if (!esc) { |
1422 | 1439 | if (ns==Namespace.IMAGE) magic = LinkMagic.IMAGE; |
— | — | @@ -1423,9 +1440,19 @@ |
1424 | 1441 | } |
1425 | 1442 | } |
1426 | 1443 | else if (isInterwikiPrefix(pre)) { |
1427 | | - page = page.subSequence(idx+1, page.length()); |
| 1444 | + if (target==title) setTargetToTitle = true; |
| 1445 | + title = title.subSequence(idx+1, title.length()); |
| 1446 | + |
| 1447 | + if (!setTargetToTitle) { |
| 1448 | + idx = StringUtils.indexOf(':', target); |
| 1449 | + target = target.subSequence(idx+1, target.length()); |
| 1450 | + target = normalizeTitle(target); |
| 1451 | + } |
| 1452 | + |
| 1453 | + //FIXME: normalize target title *namespace*, so it can be joined against the about table! |
| 1454 | + |
1428 | 1455 | interwiki = AnalyzerUtils.toLowerCase(pre); |
1429 | | - |
| 1456 | + |
1430 | 1457 | if (isInterlanguagePrefix(pre) && !esc) { |
1431 | 1458 | magic = LinkMagic.LANGUAGE; |
1432 | 1459 | } |
— | — | @@ -1449,14 +1476,17 @@ |
1450 | 1477 | } |
1451 | 1478 | } |
1452 | 1479 | |
1453 | | - if (tail!=null && magic != LinkMagic.CATEGORY) text = text.toString() + tail; |
| 1480 | + if (tail!=null && magic == LinkMagic.NONE) text = text.toString() + tail; |
1454 | 1481 | if (!implied) text = stripMarkup(text); //XXX: this can get pretty expensive... |
1455 | 1482 | text = HtmlEntities.decodeEntities(text); |
1456 | 1483 | |
1457 | | - if (page.length()==0) return null; |
| 1484 | + if (title.length()==0) return null; |
1458 | 1485 | |
1459 | | - page = normalizeTitle(page); |
1460 | | - return new WikiLink(interwiki, namespace, page, section, text, implied, magic); |
| 1486 | + title = normalizeTitle(title); |
| 1487 | + if (setTargetToTitle) |
| 1488 | + target = title; |
| 1489 | + |
| 1490 | + return new WikiLink(interwiki, title, namespace, title, section, text, implied, magic); |
1461 | 1491 | } |
1462 | 1492 | |
1463 | 1493 | public boolean isInterlanguagePrefix(CharSequence pre) { |
— | — | @@ -1485,6 +1515,12 @@ |
1486 | 1516 | return namespaces.getNumber(name.toString()); |
1487 | 1517 | } |
1488 | 1518 | |
| 1519 | + public String getNamespaceName(int id) { |
| 1520 | + if (id==0) return ""; |
| 1521 | + |
| 1522 | + return namespaces.getNamespace(id).getLocalName(); |
| 1523 | + } |
| 1524 | + |
1489 | 1525 | public CharSequence normalizeTitle(CharSequence title) { |
1490 | 1526 | return normalizeTitle(title, true); |
1491 | 1527 | } |
— | — | @@ -1815,9 +1851,13 @@ |
1816 | 1852 | |
1817 | 1853 | linkMatcher.reset(text); |
1818 | 1854 | while (linkMatcher.find()) { |
1819 | | - WikiLink link = makeLink(title, linkMatcher.group(1), linkMatcher.group(3), linkMatcher.group(4)); |
| 1855 | + String target = linkMatcher.group(1); |
| 1856 | + String label = linkMatcher.group(3); |
| 1857 | + String trail = linkMatcher.group(4); |
| 1858 | + |
| 1859 | + WikiLink link = makeLink(title, target, label, trail); |
1820 | 1860 | if (link==null) continue; |
1821 | | - if (isBadLinkTarget(link.getPage())) continue; |
| 1861 | + if (isBadLinkTarget(link.getTarget())) continue; |
1822 | 1862 | |
1823 | 1863 | links.add(link); |
1824 | 1864 | } |
— | — | @@ -1873,8 +1913,8 @@ |
1874 | 1914 | return d <= config.maxWordFormDistance; |
1875 | 1915 | } |
1876 | 1916 | |
1877 | | - public WikiLink newLink(String interwiki, int namespace, String page, String section, String text, boolean impliedText, LinkMagic magic) { |
1878 | | - return new WikiLink(interwiki, namespace, page, section, text, impliedText, magic); |
| 1917 | + public WikiLink newLink(String interwiki, String target, int namespace, String title, String section, String text, boolean impliedText, LinkMagic magic) { |
| 1918 | + return new WikiLink(interwiki, target, namespace, title, section, text, impliedText, magic); |
1879 | 1919 | } |
1880 | 1920 | |
1881 | 1921 | public static WikiTextAnalyzer getWikiTextAnalyzer(Corpus corpus, TweakSet tweaks) throws InstantiationException { |
Index: trunk/WikiWord/WikiWordBuilder/tweaks.properties.sample |
— | — | @@ -6,6 +6,9 @@ |
7 | 7 | # treat "commons" as a language code |
8 | 8 | languages.commonsAsLanguage = false |
9 | 9 | |
| 10 | +# treat "meta" as a language code |
| 11 | +languages.metaAsLanguage = false |
| 12 | + |
10 | 13 | # treat "simple" as a language code |
11 | 14 | languages.simpleAsLanguage = true |
12 | 15 | |