r51352 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r51351 | r51352 | r51353 >
Date:14:25, 2 June 2009
Author:daniel
Status:deferred
Tags:
Comment:
use generic string chunker
Modified paths:
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/FeatureSetValueSplitter.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/FeatureSetValueSplitter.java
@@ -1,12 +1,15 @@
22 package de.brightbyte.wikiword.integrator.data;
33
 4+import java.text.ParseException;
45 import java.util.ArrayList;
56 import java.util.List;
67 import java.util.Map;
7 -import java.util.regex.Matcher;
88 import java.util.regex.Pattern;
99
 10+import de.brightbyte.text.Chunker;
 11+import de.brightbyte.text.RegularExpressionChunker;
1012
 13+
1114 public class FeatureSetValueSplitter implements FeatureSetMangler {
1215
1316 public static FeatureSetMultiMangler multi(FeatureSetValueSplitter... splitters) {
@@ -44,15 +47,19 @@
4548 }
4649
4750 protected String field;
48 - protected Matcher splitter;
 51+ protected Chunker chunker;
4952
5053 public FeatureSetValueSplitter(String field, String regex, int flags) {
5154 this(field, Pattern.compile(regex, flags));
5255 }
5356
54 - public FeatureSetValueSplitter(String field, Pattern splitter) {
 57+ public FeatureSetValueSplitter(String field, Pattern pattern) {
 58+ this(field, new RegularExpressionChunker(pattern));
 59+ }
 60+
 61+ public FeatureSetValueSplitter(String field, Chunker chunker) {
5562 this.field = field;
56 - this.splitter = splitter.matcher("");
 63+ this.chunker = chunker;
5764 }
5865
5966 @Override
@@ -60,7 +67,7 @@
6168 final int PRIME = 31;
6269 int result = 1;
6370 result = PRIME * result + ((field == null) ? 0 : field.hashCode());
64 - result = PRIME * result + ((splitter == null) ? 0 : splitter.pattern().hashCode());
 71+ result = PRIME * result + ((chunker == null) ? 0 : chunker.hashCode());
6572 return result;
6673 }
6774
@@ -78,10 +85,10 @@
7986 return false;
8087 } else if (!field.equals(other.field))
8188 return false;
82 - if (splitter == null) {
83 - if (other.splitter != null)
 89+ if (chunker == null) {
 90+ if (other.chunker != null)
8491 return false;
85 - } else if (!splitter.pattern().equals(other.splitter.pattern()))
 92+ } else if (!chunker.equals(chunker))
8693 return false;
8794 return true;
8895 }
@@ -95,26 +102,26 @@
96103 int c = 0;
97104 for (Object obj: v) {
98105 String s = obj.toString();
99 - splitter.reset(s);
100106
101 - int i = 0;
102 - while (splitter.find()) {
103 - if (w==null) {
104 - w = new ArrayList<Object>(v.size()*2);
105 - if (c>0) w.addAll(v.subList(0, c));
106 - }
107 -
108 - String t = s.substring(i, splitter.start());
109 - if (t.length()>0) w.add(t);
110 -
111 - i = splitter.end();
 107+ try {
 108+ Iterable<String> vv = chunker.chunk(s);
 109+
 110+ //TODO: detecting trivial items might speed things up.
 111+ //if (w!=null || vv.size()!=1 || !vv.get(0).equals(s)) {
 112+ if (w==null) {
 113+ w = new ArrayList<Object>(v.size()*2);
 114+ if (c>0) w.addAll(v.subList(0, c));
 115+ }
 116+
 117+ for (String a: vv)
 118+ w.add(a);
 119+ //}
 120+ } catch (ParseException e) {
 121+ //split failed, so keep value as-is
 122+ //XXX: somehow report?
 123+ if (w!=null) w.add(s);
112124 }
113125
114 - if (i<s.length() && w!=null) {
115 - String t = s.substring(i);
116 - if (t.length()>0) w.add(t);
117 - }
118 -
119126 c++;
120127 }
121128

Status & tagging log