Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -64,6 +64,12 @@ |
65 | 65 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("^"+templatePatternString("wrapper", 0, true), "{|", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE)); |
66 | 66 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("^"+templatePatternString("end|col-end", 0, true), "|}", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE)); |
67 | 67 | |
| 68 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("commons(?:-inline|[ _]left|show\\d)?", 1, true), "[[commons:$2]]")); //name suffix is non-capturing so group numbers stay stable; $2 = first template parameter, as in the other one-parameter templatePattern manglers ($1 appears to be the template name -- TODO confirm against templatePattern) |
| 69 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("commons[ _+]?cat(?:-inline|[ _]left|show\\d)?", 1, true), "[[commons:Category:$2]]")); //same group layout as the commons mangler: $2 = first template parameter, used as the category name |
| 70 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("wikimedia", 1, true), "[[commons:$2]]")); //FIXME: named params: commons= |
| 71 | + //FIXME: Commonscat-N, Commons_cat_multi... |
| 72 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE)); //rewrite explicit [[:commons:...]] links (leading colon) to the same [[commons:...]] form the manglers above produce |
| 73 | + |
68 | 74 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("Okina", 0, false), "\u02BB")); |
69 | 75 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("\u00b7|moddot|dot", 0, false), "\u00b7")); |
70 | 76 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("spaces", 1, true), " ")); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_frwiki.java |
— | — | @@ -7,12 +7,15 @@ |
8 | 8 | import de.brightbyte.wikiword.ResourceType; |
9 | 9 | import de.brightbyte.wikiword.analyzer.WikiConfiguration; |
10 | 10 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
11 | | -import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.NameMatcher; |
12 | 11 | |
13 | 12 | public class WikiConfiguration_frwiki extends WikiConfiguration { |
14 | 13 | |
15 | 14 | public WikiConfiguration_frwiki() { |
16 | 15 | super(); |
| 16 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]")); //$2 = first template parameter, as in the other one-parameter templatePattern manglers in this constructor ($1 is used there as the template name) |
| 17 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("commons[ _]?cat?", 1, true), "[[commons:Category:$2]]")); //$2 = first template parameter, used as the category name. NOTE(review): "cat?" makes only the final "t" optional -- confirm that is intended |
| 18 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("Autres[ _]projets", 1, true), "[[commons:$2]]")); //FIXME: named params: commons= |
| 19 | + |
17 | 20 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("er?|\u00e8?re|(?:mini[ _])?[IVXCM]+(?:e|re|er)?|\\d+r?er?|Mlle|Mme|elle", 0, true), "$1")); |
18 | 21 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("romain|rom|rom-min|rom-maj|APIb|IPA", 1, true), "$2")); |
19 | 22 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("avjc", 0, false), "av. J.-C.")); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java |
— | — | @@ -1,5 +1,7 @@ |
2 | 2 | package de.brightbyte.wikiword.wikis; |
3 | 3 | |
| 4 | +import java.util.regex.Pattern; |
| 5 | + |
4 | 6 | import de.brightbyte.wikiword.ConceptType; |
5 | 7 | import de.brightbyte.wikiword.ResourceType; |
6 | 8 | import de.brightbyte.wikiword.analyzer.WikiConfiguration; |
— | — | @@ -26,9 +28,13 @@ |
27 | 29 | ")\\s*\\|(?:\\s*rtl\\s*\\|)?.*?\\|\\s*(.*?)\\s*\\}\\}", "$1", Pattern.DOTALL | Pattern.CASE_INSENSITIVE)); |
28 | 30 | */ |
29 | 31 | |
| 32 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("commons", 1, true), "[[commons:$2]]")); //$2 = first template parameter, as in the other one-parameter templatePattern manglers in this constructor ($1 appears to be the template name -- TODO confirm against templatePattern) |
| 33 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("commonscat", 1, true), "[[commons:Category:$2]]")); //$2 = first template parameter, used as the category name |
| 34 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE)); //rewrite explicit [[:commons:...]] links (leading colon) to the same [[commons:...]] form the manglers above produce |
| 35 | + |
30 | 36 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("Okina", 0, false), "\u02BB")); |
31 | 37 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("IPA(?:-Text)|IAST|Unicode|Musik", 1, true), "$2")); |
32 | | - stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("SWD|Commons|Wiktionary", 0, true), "")); |
| 38 | + stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("SWD|Wiktionary", 0, true), "")); |
33 | 39 | |
34 | 40 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("en|it|fr|ar|Polytonisch", 1, true), "$2")); |
35 | 41 | stripClutterManglers.add( new WikiTextAnalyzer.RegularExpressionMangler(templatePattern("lang", 2, true), "$3")); |