Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_ndswiki.java |
— | — | @@ -14,30 +14,30 @@ |
15 | 15 | |
16 | 16 | public WikiConfiguration_ndswiki() { |
17 | 17 | super(); |
18 | | - conceptTypeSensors.add( new HasTemplateLikeSensor(ConceptType.PLACE, "^[Ll]\u00e4nner_in_.*", 0)); |
19 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.PLACE, "Oort")); |
20 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.PLACE, "Land")); |
| 18 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PLACE, "^[Ll]\u00e4nner_in_.*", 0)); |
| 19 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.PLACE, "Oort")); |
| 20 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.PLACE, "Land")); |
21 | 21 | |
22 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.PERSON, "Mann")); |
23 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.PERSON, "Fru")); |
| 22 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.PERSON, "Mann")); |
| 23 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.PERSON, "Fru")); |
24 | 24 | |
25 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "V\u00f6rnaam_f\u00f6r_Deerns")); |
26 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "V\u00f6rnaam_f\u00f6r_Jungs")); |
27 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "Familiennaam")); |
| 25 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "V\u00f6rnaam_f\u00f6r_Deerns")); |
| 26 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "V\u00f6rnaam_f\u00f6r_Jungs")); |
| 27 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "Familiennaam")); |
28 | 28 | |
29 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.TIME, "Johr")); |
30 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.TIME, "Dag")); |
31 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.TIME, "Johrhunnert")); |
| 29 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.TIME, "Johr")); |
| 30 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.TIME, "Dag")); |
| 31 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.TIME, "Johrhunnert")); |
32 | 32 | |
33 | | - conceptTypeSensors.add( new HasTemplateSensor(ConceptType.LIFEFORM, "Taxobox")); |
| 33 | + conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.LIFEFORM, "Taxobox")); |
34 | 34 | //TODO: cooperations & organizations |
35 | 35 | |
36 | | - resourceTypeSensors.add( new HasTemplateSensor(ResourceType.BAD, "Delete")); |
37 | | - resourceTypeSensors.add( new HasTemplateSensor(ResourceType.BAD, "Gauweg")); |
38 | | - resourceTypeSensors.add( new HasTemplateSensor(ResourceType.BAD, "Wegsmieten")); |
| 36 | + resourceTypeSensors.add( new HasTemplateSensor<ResourceType>(ResourceType.BAD, "Delete")); |
| 37 | + resourceTypeSensors.add( new HasTemplateSensor<ResourceType>(ResourceType.BAD, "Gauweg")); |
| 38 | + resourceTypeSensors.add( new HasTemplateSensor<ResourceType>(ResourceType.BAD, "Wegsmieten")); |
39 | 39 | |
40 | 40 | //resourceTypeSensors.add( new HasTemplateSensor(ResourceType.DISAMBIG, "Mehrd\u00fcdig_Begreep") ); |
41 | | - resourceTypeSensors.add( new TitleSensor(ResourceType.LIST, "Lieste?_(van|mit).*", 0)); |
| 41 | + resourceTypeSensors.add( new TitleSensor<ResourceType>(ResourceType.LIST, "Lieste?_(van|mit).*", 0)); |
42 | 42 | |
43 | 43 | disambigStripSectionPattern = sectionPattern("Kiek ok( bi)?:?", 0); //FIXME: often not as a section, but plain text! |
44 | 44 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -79,6 +79,7 @@ |
80 | 80 | stripClutterManglers.add( new RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE)); |
81 | 81 | |
82 | 82 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Okina", 0, false), "\u02BB")); |
| 83 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("0", 0, true), " ")); |
83 | 84 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("\u00b7|moddot|dot", 0, false), "\u00b7")); |
84 | 85 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("spaces", 1, true), " ")); |
85 | 86 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_nlwiki.java |
— | — | @@ -19,35 +19,36 @@ |
20 | 20 | super(); |
21 | 21 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("wrapper", 0, true), "{|")); |
22 | 22 | |
| 23 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("0", 0, true), " ")); |
23 | 24 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("e", 0, false), "$1")); |
24 | 25 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("unicode", 1, true), "$2")); |
25 | 26 | |
26 | | - conceptTypeSensors.add( new HasTemplateLikeSensor(ConceptType.PLACE, "^(Landtabel|Gemeente|Plaats)($|_)|(^|_)plaats$", 0)); |
27 | | - conceptTypeSensors.add( new HasCategoryLikeSensor(ConceptType.PLACE, "^(Gemeente|Stad|Land|Plaats)(_|$)", 0)); |
| 27 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PLACE, "^(Landtabel|Gemeente|Plaats)($|_)|(^|_)plaats$", 0)); |
| 28 | + conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PLACE, "^(Gemeente|Stad|Land|Plaats)(_|$)", 0)); |
28 | 29 | |
29 | | - conceptTypeSensors.add( new HasTemplateSensor(ConceptType.TIME, "Jaarbox")); |
30 | | - conceptTypeSensors.add( new HasTemplateSensor(ConceptType.TIME, "Kalenders")); |
31 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.TIME, "Datum")); |
32 | | - conceptTypeSensors.add( new TitleSensor(ConceptType.TIME, "(\\d{1,4}|\\d{1,2}e_eeuw)(_v\\._Chr\\.)?", 0)); |
| 30 | + conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.TIME, "Jaarbox")); |
| 31 | + conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.TIME, "Kalenders")); |
| 32 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.TIME, "Datum")); |
| 33 | + conceptTypeSensors.add( new TitleSensor<ConceptType>(ConceptType.TIME, "(\\d{1,4}|\\d{1,2}e_eeuw)(_v\\._Chr\\.)?", 0)); |
33 | 34 | |
34 | | - conceptTypeSensors.add( new HasCategoryLikeSensor(ConceptType.PERSON, "(^|_)persoon(_|$)|(.*schapper|.*oloog|.*icus)$", 0)); |
35 | | - conceptTypeSensors.add( new HasTemplateLikeSensor(ConceptType.PERSON, "^(Infobox_(artiest|Auteur|acteur|Comedian|.*speler|Presentator|regisseur)|Winnaars_.*)$", Pattern.CASE_INSENSITIVE)); |
36 | | - conceptTypeSensors.add( new HasTemplateLikeSensor(ConceptType.PERSON, "^(Infobox_.*|.*cus|.*eur|.*ler|.*schapper)$", Pattern.CASE_INSENSITIVE, new String[] {"geboren"})); |
37 | | - conceptTypeSensors.add( new HasSectionLikeSensor(ConceptType.PERSON, "^((.* )?leven|Carri\u00e8re|Stamvader|Levensloop|Filmografie|Bibliografie|publicaties|(Eigen )?Biografie|Priv\u00e9|.*Loopbaan.*|Jeugd|Kinderen|Familie|Familieachtergrond)$", Pattern.CASE_INSENSITIVE) ); |
| 35 | + conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PERSON, "(^|_)persoon(_|$)|(.*schapper|.*oloog|.*icus)$", 0)); |
| 36 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PERSON, "^(Infobox_(artiest|Auteur|acteur|Comedian|.*speler|Presentator|regisseur)|Winnaars_.*)$", Pattern.CASE_INSENSITIVE)); |
| 37 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PERSON, "^(Infobox_.*|.*cus|.*eur|.*ler|.*schapper)$", Pattern.CASE_INSENSITIVE, new String[] {"geboren"})); |
| 38 | + conceptTypeSensors.add( new HasSectionLikeSensor<ConceptType>(ConceptType.PERSON, "^((.* )?leven|Carri\u00e8re|Stamvader|Levensloop|Filmografie|Bibliografie|publicaties|(Eigen )?Biografie|Priv\u00e9|.*Loopbaan.*|Jeugd|Kinderen|Familie|Familieachtergrond)$", Pattern.CASE_INSENSITIVE) ); |
38 | 39 | |
39 | 40 | //conceptTypeSensors.add( new WikiTextAnalyzer.TitleSensor(".*_\\(voornaam\\)", 0)); |
40 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "Jongensnaam")); |
41 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "Meisjesnaam")); |
42 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "Achternaam")); |
| 41 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "Jongensnaam")); |
| 42 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "Meisjesnaam")); |
| 43 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "Achternaam")); |
43 | 44 | |
44 | | - conceptTypeSensors.add( new HasTemplateSensor(ConceptType.LIFEFORM, "Taxobox_end")); |
| 45 | + conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.LIFEFORM, "Taxobox_end")); |
45 | 46 | //TODO: cooperations & organizations |
46 | 47 | |
47 | | - resourceTypeSensors.add( new HasTemplateLikeSensor(ResourceType.BAD, "^(Weg|Ne)$|weg$", 0)); |
48 | | - resourceTypeSensors.add( new HasTemplateSensor(ResourceType.DISAMBIG, "Dp") ); |
| 48 | + resourceTypeSensors.add( new HasTemplateLikeSensor<ResourceType>(ResourceType.BAD, "^(Weg|Ne)$|weg$", 0)); |
| 49 | + resourceTypeSensors.add( new HasTemplateSensor<ResourceType>(ResourceType.DISAMBIG, "Dp") ); |
49 | 50 | //resourceTypeSensors.add( new TitleSensor(ResourceType.DISAMBIG, ".*\\(doorverwijspagina\\)", 0) ); |
50 | | - resourceTypeSensors.add( new HasCategoryLikeSensor(ResourceType.LIST, "^Lijsten_|lijsten$", 0) ); |
51 | | - resourceTypeSensors.add( new TitleSensor(ResourceType.LIST, "Lijst_.*|.*lijst", 0) ); |
| 51 | + resourceTypeSensors.add( new HasCategoryLikeSensor<ResourceType>(ResourceType.LIST, "^Lijsten_|lijsten$", 0) ); |
| 52 | + resourceTypeSensors.add( new TitleSensor<ResourceType>(ResourceType.LIST, "Lijst_.*|.*lijst", 0) ); |
52 | 53 | //resourceTypeSensors.add( new WikiTextAnalyzer.RegularExpressionTitleSensor("^Lijst_", 0) ); //NOTE: too broad. some concrete concepts have a name matching this. |
53 | 54 | |
54 | 55 | disambigStripSectionPattern = sectionPattern("Zie ook", 0); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_dewiki.java |
— | — | @@ -39,6 +39,7 @@ |
40 | 40 | stripClutterManglers.add( new RegularExpressionMangler("\\[\\[:commons:", "[[commons:", Pattern.CASE_INSENSITIVE)); |
41 | 41 | |
42 | 42 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("Okina", 0, false), "\u02BB")); |
| 43 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("0", 0, true), " ")); |
43 | 44 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("IPA(?:-Text)|IAST|Unicode|Musik", 1, true), "$2")); |
44 | 45 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("SWD|Wiktionary", 0, true), "")); |
45 | 46 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_frwiki.java |
— | — | @@ -23,6 +23,7 @@ |
24 | 24 | |
25 | 25 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("er?|\u00e8?re|(?:mini[ _])?[IVXCM]+(?:e|re|er)?|\\d+r?er?|Mlle|Mme|elle", 0, true), "$1")); |
26 | 26 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("romain|rom|rom-min|rom-maj|APIb|IPA", 1, true), "$2")); |
| 27 | + stripClutterManglers.add( new RegularExpressionMangler(templatePattern("0", 0, true), " ")); |
27 | 28 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("avjc", 0, false), "av. J.-C.")); |
28 | 29 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("[XVI]+es", 0, false), "$1")); |
29 | 30 | stripClutterManglers.add( new RegularExpressionMangler(templatePattern("formatnum", 1, true), "$2")); |
— | — | @@ -43,24 +44,24 @@ |
44 | 45 | stripClutterManglers.add( new RegularExpressionMangler( |
45 | 46 | templatePattern("lang(?:\\s*\\|\\s*(?:rtl|ltr)\\s*)?", 2, true), "$3")); |
46 | 47 | |
47 | | - conceptTypeSensors.add( new HasCategoryLikeSensor(ConceptType.PLACE, "^(Pays|Territoire|R\u00e9publique|Subdivision|Ville|Municipalit\u00e9s|Ocean)(_|$)", 0)); |
| 48 | + conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PLACE, "^(Pays|Territoire|R\u00e9publique|Subdivision|Ville|Municipalit\u00e9s|Ocean)(_|$)", 0)); |
48 | 49 | conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.PLACE, "ODP", new HashMap<String, NameMatcher>() { { put("1", new PatternNameMatcher("Regional/.*", 0, true)); } })); |
49 | | - conceptTypeSensors.add( new HasTemplateLikeSensor(ConceptType.PLACE, "^(Infobox_)?(Pays|Continent|Commune_)(_|$)", 0)); |
| 50 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PLACE, "^(Infobox_)?(Pays|Continent|Commune_)(_|$)", 0)); |
50 | 51 | |
51 | | - conceptTypeSensors.add( new HasCategoryLikeSensor(ConceptType.PERSON, "(^Homme$|^Femme$|^Naissance_en|D\u00e9c\u00e8s_en)", 0)); |
| 52 | + conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PERSON, "(^Homme$|^Femme$|^Naissance_en|D\u00e9c\u00e8s_en)", 0)); |
52 | 53 | |
53 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "Pr\u00e9nom")); |
54 | | - conceptTypeSensors.add( new HasCategorySensor(ConceptType.NAME, "Patronyme")); |
| 54 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "Pr\u00e9nom")); |
| 55 | + conceptTypeSensors.add( new HasCategorySensor<ConceptType>(ConceptType.NAME, "Patronyme")); |
55 | 56 | |
56 | | - conceptTypeSensors.add( new HasTemplateLikeSensor(ConceptType.TIME, "^(Ann\u00e9es|Portail_ann\u00e9es_\\d+|Portails_?I+er?_mill\u00e9naire(_av\\._J\\.-C\\.)?|Portails_d\u00e9cennies)$", 0)); |
| 57 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.TIME, "^(Ann\u00e9es|Portail_ann\u00e9es_\\d+|Portails_?I+er?_mill\u00e9naire(_av\\._J\\.-C\\.)?|Portails_d\u00e9cennies)$", 0)); |
57 | 58 | |
58 | | - conceptTypeSensors.add( new HasTemplateLikeSensor(ConceptType.LIFEFORM, "^Taxobox_", 0)); |
| 59 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.LIFEFORM, "^Taxobox_", 0)); |
59 | 60 | //TODO: cooperations & organizations |
60 | 61 | |
61 | | - resourceTypeSensors.add( new HasTemplateLikeSensor(ResourceType.BAD, "^Suppression[ _/]", 0)); |
| 62 | + resourceTypeSensors.add( new HasTemplateLikeSensor<ResourceType>(ResourceType.BAD, "^Suppression[ _/]", 0)); |
62 | 63 | |
63 | 64 | //resourceTypeSensors.add( new HasTemplateLikeSensor(ResourceType.DISAMBIG, "^Homonymie(_|$)|_homonymes$|^Paronymie$|^Patronyme$|^Internationalisation$", 0) ); |
64 | | - resourceTypeSensors.add( new HasCategoryLikeSensor(ResourceType.LIST, "^Liste(_|$)", 0)); |
| 65 | + resourceTypeSensors.add( new HasCategoryLikeSensor<ResourceType>(ResourceType.LIST, "^Liste(_|$)", 0)); |
65 | 66 | |
66 | 67 | disambigStripSectionPattern = sectionPattern("^(Voir aussi|Liens internes)$", 0); |
67 | 68 | |
Index: trunk/WikiWord/pom.xml |
— | — | @@ -12,8 +12,7 @@ |
13 | 13 | <module>WikiWord</module>
|
14 | 14 | <module>WikiWordBuilder</module>
|
15 | 15 | <module>WikiWordIntegrator</module>
|
16 | | - <module>WikiWordBuilder4LifeScience</module>
|
17 | | - <module>WikiWordBuilder4Biography</module>
|
| 16 | + <module>WikiWordProperties</module>
|
18 | 17 | </modules>
|
19 | 18 |
|
20 | 19 | <!--
|