Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Languages.java |
— | — | @@ -16,9 +16,8 @@ |
17 | 17 | * It is loaded from the Languages.properties located in the de.brightbyte.wikiword package. |
18 | 18 | */ |
19 | 19 | public class Languages { |
20 | | - public static final Map<String, String> names; |
21 | 20 | |
22 | | - static { |
| 21 | + public static Map<String, String> load(TweakSet tweaks){ |
23 | 22 | try { |
24 | 23 | InputStream in = Languages.class.getResourceAsStream("Languages.properties"); |
25 | 24 | if (in == null) throw new ExceptionInInitializerError("missing resource Languages.properties"); |
— | — | @@ -34,10 +33,22 @@ |
35 | 34 | ln.put(k, v); |
36 | 35 | } |
37 | 36 | |
38 | | - names = Collections.unmodifiableMap(ln); |
| 37 | + if (tweaks.getTweak("languages.commonsAsLanguage", false)) { |
| 38 | + ln.put("commons", "Commons"); |
| 39 | + } else { |
| 40 | + ln.remove("commons"); |
| 41 | + } |
| 42 | + |
| 43 | + if (tweaks.getTweak("languages.simpleAsLanguage", true)) { |
| 44 | + ln.put("simple", "Simple English"); |
| 45 | + } else { |
| 46 | + ln.remove("simple"); |
| 47 | + } |
| 48 | + |
| 49 | + return Collections.unmodifiableMap(ln); |
39 | 50 | } |
40 | 51 | catch (IOException ex) { |
41 | | - throw new ExceptionInInitializerError(ex); |
| 52 | + throw new RuntimeException("failed to load Languages.properties via ClassLoader", ex); |
42 | 53 | } |
43 | 54 | } |
44 | 55 | |
Index: trunk/WikiWord/WikiWordBuilder4LifeScience/src/test/java/de/brightbyte/wikiword/wikipro/PropertyDump.java |
— | — | @@ -11,6 +11,7 @@ |
12 | 12 | import de.brightbyte.wikiword.Corpus; |
13 | 13 | import de.brightbyte.wikiword.Namespace; |
14 | 14 | import de.brightbyte.wikiword.NamespaceSet; |
| 15 | +import de.brightbyte.wikiword.TweakSet; |
15 | 16 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
16 | 17 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiLink; |
17 | 18 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiPage; |
— | — | @@ -34,7 +35,8 @@ |
35 | 36 | |
36 | 37 | String p = n; |
37 | 38 | |
38 | | - WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus); |
| 39 | + TweakSet tweaks = new TweakSet(); |
| 40 | + WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks); |
39 | 41 | |
40 | 42 | System.out.println("loading "+u+"..."); |
41 | 43 | String text = IOUtil.slurp(u, "UTF-8"); |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTestBase.java |
— | — | @@ -7,6 +7,7 @@ |
8 | 8 | |
9 | 9 | import junit.framework.TestCase; |
10 | 10 | import de.brightbyte.wikiword.Corpus; |
| 11 | +import de.brightbyte.wikiword.TweakSet; |
11 | 12 | import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer; |
12 | 13 | |
13 | 14 | /** |
— | — | @@ -15,15 +16,17 @@ |
16 | 17 | public abstract class PlainTextAnalyzerTestBase extends TestCase { |
17 | 18 | |
18 | 19 | protected Corpus corpus; |
| 20 | + protected TweakSet tweaks; |
19 | 21 | protected PlainTextAnalyzer analyzer; |
20 | 22 | |
21 | 23 | public PlainTextAnalyzerTestBase(String wikiName) { |
22 | 24 | corpus = Corpus.forName("TEST", wikiName, (String[])null); |
| 25 | + tweaks = new TweakSet(); |
23 | 26 | } |
24 | 27 | |
25 | 28 | @Override |
26 | 29 | public void setUp() throws Exception { |
27 | | - analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(corpus); |
| 30 | + analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(corpus, tweaks); |
28 | 31 | analyzer.initialize(); |
29 | 32 | } |
30 | 33 | |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerBenchmark.java |
— | — | @@ -4,6 +4,7 @@ |
5 | 5 | |
6 | 6 | import de.brightbyte.wikiword.Corpus; |
7 | 7 | import de.brightbyte.wikiword.NamespaceSet; |
| 8 | +import de.brightbyte.wikiword.TweakSet; |
8 | 9 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
9 | 10 | |
10 | 11 | public class WikiTextAnalyzerBenchmark { |
— | — | @@ -11,6 +12,7 @@ |
12 | 13 | protected NamespaceSet namespaces; |
13 | 14 | protected Corpus corpus; |
14 | 15 | protected WikiTextAnalyzer analyzer; |
| 16 | + protected TweakSet tweaks; |
15 | 17 | |
16 | 18 | public WikiTextAnalyzerBenchmark(String wikiName) throws InstantiationException { |
17 | 19 | corpus = Corpus.forName("TEST", wikiName, (String[])null); |
— | — | @@ -19,9 +21,10 @@ |
20 | 22 | //site.Sitename = corpus.getFamily(); |
21 | 23 | |
22 | 24 | titleCase = true; |
23 | | - namespaces = corpus.getNamespaces(); |
| 25 | + namespaces = corpus.getNamespaces(); |
| 26 | + tweaks = new TweakSet(); |
24 | 27 | |
25 | | - analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus); |
| 28 | + analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks); |
26 | 29 | analyzer.initialize(namespaces, titleCase); |
27 | 30 | } |
28 | 31 | |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTestBase.java |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | import junit.framework.TestCase; |
21 | 21 | import de.brightbyte.wikiword.Corpus; |
22 | 22 | import de.brightbyte.wikiword.NamespaceSet; |
| 23 | +import de.brightbyte.wikiword.TweakSet; |
23 | 24 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
24 | 25 | |
25 | 26 | /** |
— | — | @@ -30,9 +31,11 @@ |
31 | 32 | protected NamespaceSet namespaces; |
32 | 33 | protected Corpus corpus; |
33 | 34 | protected WikiTextAnalyzer analyzer; |
| 35 | + protected TweakSet tweaks; |
34 | 36 | |
35 | 37 | public WikiTextAnalyzerTestBase(String wikiName) { |
36 | 38 | corpus = Corpus.forName("TEST", wikiName, (String[])null); |
| 39 | + tweaks = new TweakSet(); |
37 | 40 | |
38 | 41 | //site.Base = "http://"+corpus.getDomain()+"/wiki/"; |
39 | 42 | //site.Sitename = corpus.getFamily(); |
— | — | @@ -43,7 +46,7 @@ |
44 | 47 | |
45 | 48 | @Override |
46 | 49 | public void setUp() throws Exception { |
47 | | - analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus); |
| 50 | + analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks); |
48 | 51 | analyzer.initialize(namespaces, titleCase); |
49 | 52 | } |
50 | 53 | |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java |
— | — | @@ -84,7 +84,7 @@ |
85 | 85 | |
86 | 86 | corpus = new Corpus("TEST", "generic", "generic", "generic", "generic", "xx", "generic", null); |
87 | 87 | testAnalyzer = new TestPlainTextAnalyzer(corpus); |
88 | | - testAnalyzer.configure(config); |
| 88 | + testAnalyzer.configure(config, tweaks); |
89 | 89 | testAnalyzer.initialize(); |
90 | 90 | |
91 | 91 | analyzer = testAnalyzer; |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzerTest.java |
— | — | @@ -643,7 +643,7 @@ |
644 | 644 | |
645 | 645 | corpus = new Corpus("TEST", "generic", "generic", "generic", "generic", "xx", "generic", null); |
646 | 646 | PlainTextAnalyzer language = new PlainTextAnalyzer(corpus); |
647 | | - language.configure(lconfig); |
| 647 | + language.configure(lconfig, tweaks); |
648 | 648 | language.initialize(); |
649 | 649 | |
650 | 650 | WikiConfiguration config = new WikiConfiguration(); |
— | — | @@ -662,7 +662,7 @@ |
663 | 663 | |
664 | 664 | testAnalyzer = new TestWikiTextAnalyzer(language); |
665 | 665 | testAnalyzer.addExtraTemplateUser(Pattern.compile(".*"), true); |
666 | | - testAnalyzer.configure(config); |
| 666 | + testAnalyzer.configure(config, tweaks); |
667 | 667 | testAnalyzer.initialize(namespaces, titleCase); |
668 | 668 | |
669 | 669 | analyzer = testAnalyzer; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AbstractAnalyzer.java |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | |
14 | 14 | import de.brightbyte.util.StringUtils; |
15 | 15 | import de.brightbyte.wikiword.Corpus; |
| 16 | +import de.brightbyte.wikiword.TweakSet; |
16 | 17 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.ArmorEntry; |
17 | 18 | |
18 | 19 | /** |
— | — | @@ -22,6 +23,8 @@ |
23 | 24 | */ |
24 | 25 | public class AbstractAnalyzer { |
25 | 26 | |
| 27 | + protected TweakSet tweaks; |
| 28 | + |
26 | 29 | /** |
27 | 30 | * A Mangler changes text in some way. |
28 | 31 | */ |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java |
— | — | @@ -7,6 +7,7 @@ |
8 | 8 | import java.util.regex.Matcher; |
9 | 9 | |
10 | 10 | import de.brightbyte.wikiword.Corpus; |
| 11 | +import de.brightbyte.wikiword.TweakSet; |
11 | 12 | |
12 | 13 | public class PlainTextAnalyzer extends AbstractAnalyzer { |
13 | 14 | private LanguageConfiguration config; |
— | — | @@ -25,7 +26,7 @@ |
26 | 27 | config.defaults(); |
27 | 28 | } |
28 | 29 | |
29 | | - public static PlainTextAnalyzer getPlainTextAnalyzer(Corpus corpus) throws InstantiationException { |
| 30 | + public static PlainTextAnalyzer getPlainTextAnalyzer(Corpus corpus, TweakSet tweaks) throws InstantiationException { |
30 | 31 | Class[] acc = getSpecializedClasses(corpus, PlainTextAnalyzer.class, "PlainTextAnalyzer"); |
31 | 32 | Class[] ccc = getSpecializedClasses(corpus, LanguageConfiguration.class, "LanguageConfiguration", corpus.getConfigPackages()); |
32 | 33 | |
— | — | @@ -36,7 +37,7 @@ |
37 | 38 | for (int i = ccc.length-1; i >= 0; i--) { //NOTE: most specific last, because last write wins. |
38 | 39 | ctor = ccc[i].getConstructor(new Class[] { }); |
39 | 40 | LanguageConfiguration conf = (LanguageConfiguration)ctor.newInstance(new Object[] { } ); |
40 | | - analyzer.configure(conf); |
| 41 | + analyzer.configure(conf, tweaks); |
41 | 42 | } |
42 | 43 | |
43 | 44 | return analyzer; |
— | — | @@ -53,7 +54,10 @@ |
54 | 55 | } |
55 | 56 | } |
56 | 57 | |
57 | | - public void configure(LanguageConfiguration config) { |
| 58 | + public void configure(LanguageConfiguration config, TweakSet tweaks) { |
| 59 | + if (tweaks==null) throw new NullPointerException(); |
| 60 | + |
| 61 | + this.tweaks = tweaks; |
58 | 62 | this.config.merge(config); |
59 | 63 | } |
60 | 64 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java |
— | — | @@ -38,6 +38,7 @@ |
39 | 39 | import de.brightbyte.wikiword.Namespace; |
40 | 40 | import de.brightbyte.wikiword.NamespaceSet; |
41 | 41 | import de.brightbyte.wikiword.ResourceType; |
| 42 | +import de.brightbyte.wikiword.TweakSet; |
42 | 43 | import de.brightbyte.wikiword.analyzer.TemplateExtractor.TemplateData; |
43 | 44 | import de.brightbyte.xml.HtmlEntities; |
44 | 45 | |
— | — | @@ -1980,7 +1981,8 @@ |
1981 | 1982 | private Matcher relevantTemplateMatcher; |
1982 | 1983 | private List<TemplateUser> extraTemplateUsers = new ArrayList<TemplateUser>(); |
1983 | 1984 | |
1984 | | - private WikiTextSniffer sniffer = new WikiTextSniffer(); |
| 1985 | + private WikiTextSniffer sniffer = new WikiTextSniffer(); |
| 1986 | + private Map<String, String> languageNames; |
1985 | 1987 | |
1986 | 1988 | public WikiTextAnalyzer(PlainTextAnalyzer language) { |
1987 | 1989 | this.language = language; |
— | — | @@ -2008,8 +2010,11 @@ |
2009 | 2011 | return initialized; |
2010 | 2012 | } |
2011 | 2013 | |
2012 | | - public void configure(WikiConfiguration config) { |
| 2014 | + public void configure(WikiConfiguration config, TweakSet tweaks) { |
2013 | 2015 | if (isInitialized()) throw new IllegalStateException("already initialized"); |
| 2016 | + if (tweaks==null) throw new NullPointerException(); |
| 2017 | + |
| 2018 | + this.tweaks = tweaks; |
2014 | 2019 | this.config.merge(config); |
2015 | 2020 | } |
2016 | 2021 | |
— | — | @@ -2504,8 +2509,16 @@ |
2505 | 2510 | |
2506 | 2511 | public boolean isInterlanguagePrefix(CharSequence pre) { |
2507 | 2512 | pre = trimAndLower(pre); |
2508 | | - return Languages.names.containsKey(pre); |
| 2513 | + return getLanguageNames().containsKey(pre); |
2509 | 2514 | } |
| 2515 | + |
| 2516 | + protected Map<String, String> getLanguageNames() { |
| 2517 | + if (this.languageNames==null) { |
| 2518 | + this.languageNames = Languages.load(this.tweaks); |
| 2519 | + } |
| 2520 | + |
| 2521 | + return this.languageNames; |
| 2522 | + } |
2510 | 2523 | |
2511 | 2524 | public boolean isInterwikiPrefix(CharSequence pre) { |
2512 | 2525 | interwikiMatcher.reset(pre); |
— | — | @@ -2925,8 +2938,8 @@ |
2926 | 2939 | return new WikiLink(interwiki, namespace, page, section, text, impliedText, magic); |
2927 | 2940 | } |
2928 | 2941 | |
2929 | | - public static WikiTextAnalyzer getWikiTextAnalyzer(Corpus corpus) throws InstantiationException { |
2930 | | - PlainTextAnalyzer language = PlainTextAnalyzer.getPlainTextAnalyzer(corpus); |
| 2942 | + public static WikiTextAnalyzer getWikiTextAnalyzer(Corpus corpus, TweakSet tweaks) throws InstantiationException { |
| 2943 | + PlainTextAnalyzer language = PlainTextAnalyzer.getPlainTextAnalyzer(corpus, tweaks); |
2931 | 2944 | language.initialize(); |
2932 | 2945 | |
2933 | 2946 | return getWikiTextAnalyzer(language); |
— | — | @@ -2951,7 +2964,7 @@ |
2952 | 2965 | |
2953 | 2966 | ctor = ccc[i].getConstructor(new Class[] { }); |
2954 | 2967 | WikiConfiguration conf = (WikiConfiguration)ctor.newInstance(new Object[] { } ); |
2955 | | - analyzer.configure(conf); |
| 2968 | + analyzer.configure(conf, language.tweaks); |
2956 | 2969 | } |
2957 | 2970 | |
2958 | 2971 | return analyzer; |
— | — | @@ -2991,8 +3004,10 @@ |
2992 | 3005 | |
2993 | 3006 | String text = IOUtil.slurp(new File(file), "UTF-8"); |
2994 | 3007 | |
| 3008 | + TweakSet tweaks = new TweakSet(); |
| 3009 | + |
2995 | 3010 | Corpus corpus = Corpus.forName("TEST", lang, (String[])null); |
2996 | | - WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus); |
| 3011 | + WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(corpus, tweaks); |
2997 | 3012 | |
2998 | 3013 | NamespaceSet namespaces = Namespace.getNamespaces(null); |
2999 | 3014 | analyzer.initialize(namespaces, true); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ImportDump.java |
— | — | @@ -61,7 +61,7 @@ |
62 | 62 | } |
63 | 63 | */ |
64 | 64 | |
65 | | - WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(getCorpus()); |
| 65 | + WikiTextAnalyzer analyzer = WikiTextAnalyzer.getWikiTextAnalyzer(getCorpus(), tweaks); |
66 | 66 | WikiWordImporter importer = newImporter(analyzer, store, tweaks); |
67 | 67 | importer.setLogOutput(getLogOutput()); |
68 | 68 | importer.configure(args); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_commons.java |
— | — | @@ -0,0 +1,6 @@ |
| 2 | +package de.brightbyte.wikiword.wikis; |
| 3 | + |
| 4 | + |
| 5 | +public class LanguageConfiguration_commons extends LanguageConfiguration_en { |
| 6 | + //noop |
| 7 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_commonswiki.java |
— | — | @@ -0,0 +1,9 @@ |
| 2 | +package de.brightbyte.wikiword.wikis; |
| 3 | + |
| 4 | +public class WikiConfiguration_commonswiki extends WikiConfiguration_enwiki { |
| 5 | + |
| 6 | + public WikiConfiguration_commonswiki() { |
| 7 | + //noop |
| 8 | + } |
| 9 | + |
| 10 | +} |