Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/FeatureSetValueSplitter.java |
— | — | @@ -1,12 +1,15 @@ |
2 | 2 | package de.brightbyte.wikiword.integrator.data; |
3 | 3 | |
| 4 | +import java.text.ParseException; |
4 | 5 | import java.util.ArrayList; |
5 | 6 | import java.util.List; |
6 | 7 | import java.util.Map; |
7 | | -import java.util.regex.Matcher; |
8 | 8 | import java.util.regex.Pattern; |
9 | 9 | |
| 10 | +import de.brightbyte.text.Chunker; |
| 11 | +import de.brightbyte.text.RegularExpressionChunker; |
10 | 12 | |
| 13 | + |
11 | 14 | public class FeatureSetValueSplitter implements FeatureSetMangler { |
12 | 15 | |
13 | 16 | public static FeatureSetMultiMangler multi(FeatureSetValueSplitter... splitters) { |
— | — | @@ -44,15 +47,19 @@ |
45 | 48 | } |
46 | 49 | |
47 | 50 | protected String field; |
48 | | - protected Matcher splitter; |
| 51 | + protected Chunker chunker; |
49 | 52 | |
50 | 53 | public FeatureSetValueSplitter(String field, String regex, int flags) { |
51 | 54 | this(field, Pattern.compile(regex, flags)); |
52 | 55 | } |
53 | 56 | |
54 | | - public FeatureSetValueSplitter(String field, Pattern splitter) { |
| 57 | + public FeatureSetValueSplitter(String field, Pattern pattern) { |
| 58 | + this(field, new RegularExpressionChunker(pattern)); |
| 59 | + } |
| 60 | + |
| 61 | + public FeatureSetValueSplitter(String field, Chunker chunker) { |
55 | 62 | this.field = field; |
56 | | - this.splitter = splitter.matcher(""); |
| 63 | + this.chunker = chunker; |
57 | 64 | } |
58 | 65 | |
59 | 66 | @Override |
— | — | @@ -60,7 +67,7 @@ |
61 | 68 | final int PRIME = 31; |
62 | 69 | int result = 1; |
63 | 70 | result = PRIME * result + ((field == null) ? 0 : field.hashCode()); |
64 | | - result = PRIME * result + ((splitter == null) ? 0 : splitter.pattern().hashCode()); |
| 71 | + result = PRIME * result + ((chunker == null) ? 0 : chunker.hashCode()); |
65 | 72 | return result; |
66 | 73 | } |
67 | 74 | |
— | — | @@ -78,10 +85,10 @@ |
79 | 86 | return false; |
80 | 87 | } else if (!field.equals(other.field)) |
81 | 88 | return false; |
82 | | - if (splitter == null) { |
83 | | - if (other.splitter != null) |
| 89 | + if (chunker == null) { |
| 90 | + if (other.chunker != null) |
84 | 91 | return false; |
85 | | - } else if (!splitter.pattern().equals(other.splitter.pattern())) |
| 92 | + } else if (!chunker.equals(other.chunker)) |
86 | 93 | return false; |
87 | 94 | return true; |
88 | 95 | } |
— | — | @@ -95,26 +102,26 @@ |
96 | 103 | int c = 0; |
97 | 104 | for (Object obj: v) { |
98 | 105 | String s = obj.toString(); |
99 | | - splitter.reset(s); |
100 | 106 | |
101 | | - int i = 0; |
102 | | - while (splitter.find()) { |
103 | | - if (w==null) { |
104 | | - w = new ArrayList<Object>(v.size()*2); |
105 | | - if (c>0) w.addAll(v.subList(0, c)); |
106 | | - } |
107 | | - |
108 | | - String t = s.substring(i, splitter.start()); |
109 | | - if (t.length()>0) w.add(t); |
110 | | - |
111 | | - i = splitter.end(); |
| 107 | + try { |
| 108 | + Iterable<String> vv = chunker.chunk(s); |
| 109 | + |
| 110 | + //TODO: detecting trivial items might speed things up. |
| 111 | + //if (w!=null || vv.size()!=1 || !vv.get(0).equals(s)) { |
| 112 | + if (w==null) { |
| 113 | + w = new ArrayList<Object>(v.size()*2); |
| 114 | + if (c>0) w.addAll(v.subList(0, c)); |
| 115 | + } |
| 116 | + |
| 117 | + for (String a: vv) |
| 118 | + w.add(a); |
| 119 | + //} |
| 120 | + } catch (ParseException e) { |
| 121 | + //split failed, so keep value as-is |
| 122 | + //XXX: somehow report? |
| 123 | + if (w!=null) w.add(s); |
112 | 124 | } |
113 | 125 | |
114 | | - if (i<s.length() && w!=null) { |
115 | | - String t = s.substring(i); |
116 | | - if (t.length()>0) w.add(t); |
117 | | - } |
118 | | - |
119 | 126 | c++; |
120 | 127 | } |
121 | 128 | |