Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/extractor/CategoryPatternParameterExtractor.java |
— | — | @@ -9,12 +9,14 @@ |
10 | 10 | |
11 | 11 | import de.brightbyte.data.MultiMap; |
12 | 12 | import de.brightbyte.data.ValueSetMultiMap; |
| 13 | +import de.brightbyte.wikiword.analyzer.AnalyzerUtils; |
13 | 14 | import de.brightbyte.wikiword.analyzer.WikiPage; |
14 | 15 | |
15 | 16 | public class CategoryPatternParameterExtractor implements PropertyExtractor { |
16 | 17 | protected String property; |
17 | 18 | protected Matcher matcher; |
18 | 19 | protected String replacement; |
| 20 | + private boolean capitalize = false; |
19 | 21 | |
20 | 22 | public CategoryPatternParameterExtractor(String pattern, String replacement, int flags, String property) { |
21 | 23 | this(Pattern.compile(pattern, flags), replacement, property); |
— | — | @@ -33,10 +35,15 @@ |
34 | 36 | public MultiMap<String, CharSequence, Set<CharSequence>> extract(WikiPage page, MultiMap<String, CharSequence, Set<CharSequence>> into) { |
35 | 37 | for(CharSequence s: page.getCategories()) { |
36 | 38 | matcher.reset(s); |
37 | | - if (matcher.find()) { |
38 | | - String v = matcher.group(); |
| 39 | + if (matcher.matches()) { |
| 40 | + CharSequence v = matcher.group(); |
39 | 41 | v = matcher.replaceAll(replacement); |
| 42 | + v = AnalyzerUtils.replaceUnderscoreBySpace(v); |
| 43 | + v = AnalyzerUtils.trim(v); |
40 | 44 | |
| 45 | + if (capitalize) |
| 46 | + v = AnalyzerUtils.titleCase(v); |
| 47 | + |
41 | 48 | if (into==null) into = new ValueSetMultiMap<String, CharSequence>(); |
42 | 49 | into.put(property, v); |
43 | 50 | } |
— | — | @@ -44,4 +51,9 @@ |
45 | 52 | |
46 | 53 | return into; |
47 | 54 | } |
| 55 | + |
| 56 | + public PropertyExtractor setCapitalize(boolean capitalize) { |
| 57 | + this.capitalize = capitalize; |
| 58 | + return this; |
| 59 | + } |
48 | 60 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/WikiTextAnalyzer.java |
— | — | @@ -1456,33 +1456,8 @@ |
1457 | 1457 | |
1458 | 1458 | title = AnalyzerUtils.replaceSpaceByUnderscore(title); |
1459 | 1459 | |
1460 | | - if (titleCase && forceCase && title.charAt(0)>'Z') { //fast check for ascii caps first |
1461 | | - int ch = Character.codePointAt(title, 0); |
1462 | | - if (Character.isLowerCase(ch)) { |
1463 | | - int uch = Character.toUpperCase(ch); |
1464 | | - if (uch!=ch) { |
1465 | | - int w = Character.charCount(ch); |
1466 | | - int len= title.length(); |
1467 | | - |
1468 | | - char[] uchars = Character.toChars(uch); |
1469 | | - char[] chars = new char[uchars.length + len - w]; |
1470 | | - System.arraycopy(uchars, 0, chars, 0, uchars.length); |
1471 | | - |
1472 | | - if (title instanceof String) { |
1473 | | - ((String)title).getChars(w, len, chars, uchars.length); |
1474 | | - } |
1475 | | - else if (title instanceof StringBuilder) { |
1476 | | - ((StringBuilder)title).getChars(w, len, chars, uchars.length); |
1477 | | - } |
1478 | | - else { |
1479 | | - for (int i = 0; i<len-w; i++) { |
1480 | | - chars[i+uchars.length] = title.charAt(i); |
1481 | | - } |
1482 | | - } |
1483 | | - |
1484 | | - title = new String(chars); |
1485 | | - } |
1486 | | - } |
| 1460 | + if (titleCase && forceCase) { |
| 1461 | + title = AnalyzerUtils.titleCase(title); |
1487 | 1462 | } |
1488 | 1463 | |
1489 | 1464 | return title; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerUtils.java |
— | — | @@ -118,4 +118,38 @@ |
119 | 119 | return toLowerCase(trim(s)); |
120 | 120 | } |
121 | 121 | |
| 122 | + public static CharSequence titleCase(CharSequence s) { |
| 123 | + if (s.length()==0) return s; |
| 124 | + |
| 125 | + if (s.charAt(0)<='Z') return s; //fast check for ascii caps first |
| 126 | + |
| 127 | + int ch = Character.codePointAt(s, 0); |
| 128 | + if (Character.isLowerCase(ch)) { |
| 129 | + int uch = Character.toUpperCase(ch); |
| 130 | + if (uch!=ch) { |
| 131 | + int w = Character.charCount(ch); |
| 132 | + int len= s.length(); |
| 133 | + |
| 134 | + char[] uchars = Character.toChars(uch); |
| 135 | + char[] chars = new char[uchars.length + len - w]; |
| 136 | + System.arraycopy(uchars, 0, chars, 0, uchars.length); |
| 137 | + |
| 138 | + if (s instanceof String) { |
| 139 | + ((String)s).getChars(w, len, chars, uchars.length); |
| 140 | + } |
| 141 | + else if (s instanceof StringBuilder) { |
| 142 | + ((StringBuilder)s).getChars(w, len, chars, uchars.length); |
| 143 | + } |
| 144 | + else { |
| 145 | + for (int i = 0; i<len-w; i++) { |
| 146 | + chars[i+uchars.length] = s.charAt(i); |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + s = new String(chars); |
| 151 | + } |
| 152 | + } |
| 153 | + |
| 154 | + return s; |
| 155 | + } |
122 | 156 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/sensor/HasPropertySensor.java |
— | — | @@ -25,6 +25,7 @@ |
/**
 * Creates a sensor from a value, a name and a name matcher.
 *
 * @param value   handed unchanged to the superclass constructor
 * @param name    stored in this sensor's {@code name} field
 * @param matcher stored in this sensor's {@code matcher} field
 */
public HasPropertySensor(V value, String name, NameMatcher matcher) {
    super(value);
    this.name = name;
    this.matcher = matcher;
}
30 | 31 | |
31 | 32 | @Override |