Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -3,10 +3,8 @@ |
4 | 4 | import java.util.ArrayList; |
5 | 5 | import java.util.Collection; |
6 | 6 | import java.util.Collections; |
7 | | -import java.util.HashSet; |
8 | 7 | import java.util.List; |
9 | 8 | import java.util.Map; |
10 | | -import java.util.Set; |
11 | 9 | |
12 | 10 | import de.brightbyte.io.Output; |
13 | 11 | import de.brightbyte.util.PersistenceException; |
— | — | @@ -17,46 +15,6 @@ |
18 | 16 | |
19 | 17 | public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> { |
20 | 18 | |
21 | | - public interface NodeListener<T extends TermReference> { |
22 | | - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal); |
23 | | - } |
24 | | - |
25 | | - public static class SequenceSetBuilder <T extends TermReference> implements NodeListener<T> { |
26 | | - protected List<List<T>> seqencees; |
27 | | - |
28 | | - public SequenceSetBuilder() { |
29 | | - seqencees = new ArrayList<List<T>>(); |
30 | | - } |
31 | | - |
32 | | - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) { |
33 | | - if (terminal) { |
34 | | - List<T> p = new ArrayList<T>(seqence); //clone |
35 | | - seqencees.add(p); |
36 | | - } |
37 | | - } |
38 | | - |
39 | | - public List<List<T>> getSequences() { |
40 | | - return seqencees; |
41 | | - } |
42 | | - } |
43 | | - |
44 | | - public static class TermSetBuilder <T extends TermReference> implements NodeListener<T> { |
45 | | - protected Set<T> terms; |
46 | | - |
47 | | - public TermSetBuilder() { |
48 | | - terms = new HashSet<T>(); |
49 | | - } |
50 | | - |
51 | | - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) { |
52 | | - T t = node.getTermReference(); |
53 | | - if (t.getTerm().length()>0) terms.add(t); |
54 | | - } |
55 | | - |
56 | | - public Collection<T> getTerms() { |
57 | | - return terms; |
58 | | - } |
59 | | - } |
60 | | - |
61 | 19 | private MeaningFetcher<C> meaningFetcher; |
62 | 20 | |
63 | 21 | private Output trace; |
— | — | @@ -78,18 +36,6 @@ |
79 | 37 | this.meaningOverrides = overrideMap; |
80 | 38 | } |
81 | 39 | |
82 | | - protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) { |
83 | | - TermSetBuilder<X> builder = new TermSetBuilder<X>(); |
84 | | - walk(root, null, builder, depth); |
85 | | - return builder.getTerms(); |
86 | | - } |
87 | | - |
88 | | - protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root, int depth) { |
89 | | - SequenceSetBuilder<X> builder = new SequenceSetBuilder<X>(); |
90 | | - walk(root, null, builder, depth); |
91 | | - return builder.getSequences(); |
92 | | - } |
93 | | - |
94 | 40 | protected <X extends T>PhraseNode<X> getLastNode(PhraseNode<X> root, List<X> sequence) { |
95 | 41 | PhraseNode<X> n = findLastNode(root, sequence); |
96 | 42 | if (n==null) throw new IllegalArgumentException("sequence does not match node structure: "+sequence); |
— | — | @@ -126,31 +72,19 @@ |
127 | 73 | return root; |
128 | 74 | } |
129 | 75 | |
130 | | - protected <X extends T>void walk(PhraseNode<X> root, List<X> seqence, NodeListener<? super X> nodeListener, int depth) { |
131 | | - if (depth<1) return; |
132 | | - if (seqence == null) seqence = new ArrayList<X>(); |
133 | | - |
134 | | - X t = root.getTermReference(); |
135 | | - if (t.getTerm().length()>0) seqence.add(t); //push |
136 | | - else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes. |
137 | | - |
138 | | - boolean terminal = (depth<=1); |
139 | | - |
140 | | - Collection<? extends PhraseNode<X>> successors = terminal ? null : root.getSuccessors(); |
141 | | - if (successors==null || successors.isEmpty()) terminal = true; |
142 | | - |
143 | | - if (nodeListener!=null) |
144 | | - nodeListener.onNode(root, seqence, terminal); |
145 | | - |
146 | | - if (!terminal) { |
147 | | - for (PhraseNode<X> n: successors) { |
148 | | - walk(n, seqence, nodeListener, depth-1); |
149 | | - } |
150 | | - } |
151 | | - |
152 | | - if (t.getTerm().length()>0) seqence.remove(t); //pop |
| 76 | + protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) { |
| 77 | + PhraseNode.TermSetBuilder<X> builder = new PhraseNode.TermSetBuilder<X>(); |
| 78 | + builder.walk(root, 0, null, depth, Double.MAX_VALUE); |
| 79 | + return builder.getTerms(); |
153 | 80 | } |
154 | 81 | |
| 82 | + protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root, int depth) { |
| 83 | + PhraseNode.SequenceSetBuilder<X> builder = new PhraseNode.SequenceSetBuilder<X>(); |
| 84 | + builder.walk(root, 0, null, depth, Double.MAX_VALUE); |
| 85 | + return builder.getSequences(); |
| 86 | + } |
| 87 | + |
| 88 | + |
155 | 89 | protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException { |
156 | 90 | Collection<X> terms = getTerms(root, Integer.MAX_VALUE); |
157 | 91 | return getMeanings(terms); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Interwiki.java |
— | — | @@ -78,7 +78,9 @@ |
79 | 79 | |
80 | 80 | return interwikis; |
81 | 81 | } catch (IOException e) { |
82 | | - throw new RuntimeException("failed to load interwiki map from "+n); |
| 82 | + throw new RuntimeException("failed to load interwiki map from "+n, e); |
| 83 | + } catch (IllegalArgumentException e) { //NOTE: a malformed \\u-encoding in the properties file triggers this unchecked exception rather than an IOException. |
| 84 | + throw new RuntimeException("failed to load interwiki map from "+n, e); |
83 | 85 | } |
84 | 86 | |
85 | 87 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java |
— | — | @@ -13,10 +13,6 @@ |
14 | 14 | |
15 | 15 | public PhraseOccurance(String phrase, int weight, int offset, int length) { |
16 | 16 | if (length < 0) throw new IllegalArgumentException("bad length: "+length); |
17 | | - if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string"); |
18 | | - //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string"); |
19 | | - if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string"); |
20 | | - if (length < phrase.length()) phrase = phrase.substring(offset, offset+length); |
21 | 17 | |
22 | 18 | this.phrase = phrase; |
23 | 19 | this.weight = weight; |
— | — | @@ -49,7 +45,7 @@ |
50 | 46 | } |
51 | 47 | |
52 | 48 | public String toString() { |
53 | | - return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]"; |
| 49 | + return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]#"+weight; |
54 | 50 | } |
55 | 51 | |
56 | 52 | @Override |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java |
— | — | @@ -1,6 +1,5 @@ |
2 | 2 | package de.brightbyte.wikiword.model; |
3 | 3 | |
4 | | -import java.util.AbstractList; |
5 | 4 | import java.util.ArrayList; |
6 | 5 | import java.util.Collection; |
7 | 6 | import java.util.Collections; |
— | — | @@ -8,11 +7,49 @@ |
9 | 8 | import java.util.Iterator; |
10 | 9 | import java.util.List; |
11 | 10 | import java.util.ListIterator; |
12 | | -import java.util.RandomAccess; |
13 | 11 | import java.util.Set; |
| 12 | +import java.util.regex.Matcher; |
14 | 13 | |
15 | | -public class PhraseOccuranceSet extends AbstractList<PhraseOccurance> implements RandomAccess { |
| 14 | +public class PhraseOccuranceSet implements List<PhraseOccurance> { |
16 | 15 | |
| 16 | + public static class AggregatePhraseBuilder extends PhraseNode.Walker<PhraseOccurance> { |
| 17 | + protected Collection<PhraseOccurance> aggregated; |
| 18 | + protected double minWeight; |
| 19 | + protected double maxWeight; |
| 20 | + protected Matcher phraseBreak; |
| 21 | + |
| 22 | + public AggregatePhraseBuilder( double minWeight, double maxWeight, Matcher phraseBreak ) { |
| 23 | + aggregated = new HashSet<PhraseOccurance>(); |
| 24 | + this.minWeight = minWeight; |
| 25 | + this.maxWeight = maxWeight ; |
| 26 | + this.phraseBreak = phraseBreak; |
| 27 | + } |
| 28 | + |
| 29 | + public boolean onNode(PhraseNode<? extends PhraseOccurance> node, List<? extends PhraseOccurance> sequence, double weight, boolean terminal) { |
| 30 | + if (weight>=minWeight && !sequence.isEmpty()) { |
| 31 | + PhraseOccurance p = aggregatePhrase( sequence, minWeight, maxWeight ); |
| 32 | + if (p!=null) aggregated.add(p); |
| 33 | + |
| 34 | + PhraseOccurance last = sequence.get( sequence.size()-1); |
| 35 | + |
| 36 | + if (phraseBreak!=null) { |
| 37 | + phraseBreak.reset(last.getTerm()); |
| 38 | + if (phraseBreak.matches()) |
| 39 | + return false; //phrase terminates here, don't dig deeper. |
| 40 | + } |
| 41 | + |
| 42 | + if (p==null) return weight <= maxWeight; //XXX: something is wrong here |
| 43 | + else return p.getWeight() <= maxWeight; //XXX: can we do that? |
| 44 | + } else { |
| 45 | + return weight <= maxWeight; //XXX: not sure... |
| 46 | + } |
| 47 | + } |
| 48 | + |
| 49 | + public Collection<PhraseOccurance> getAggregatedPhrases() { |
| 50 | + return aggregated; |
| 51 | + } |
| 52 | + } |
| 53 | + |
17 | 54 | protected class Node implements PhraseNode<PhraseOccurance> { |
18 | 55 | protected PhraseOccurance phrase; |
19 | 56 | |
— | — | @@ -57,8 +94,6 @@ |
58 | 95 | return false; |
59 | 96 | return true; |
60 | 97 | } |
61 | | - |
62 | | - |
63 | 98 | } |
64 | 99 | |
65 | 100 | protected List<PhraseOccurance> phrases; |
— | — | @@ -71,12 +106,51 @@ |
72 | 107 | Collections.sort(this.phrases); //essential! |
73 | 108 | } |
74 | 109 | |
75 | | - @Override |
| 110 | + private static PhraseOccurance aggregatePhrase(List<? extends PhraseOccurance> sequence, double minWeight, double maxWeight) { |
| 111 | + if (sequence.isEmpty()) return null; |
| 112 | + |
| 113 | + int i = 0; |
| 114 | + while ( i<sequence.size() && sequence.get(i).getWeight() < minWeight ) i++; |
| 115 | + |
| 116 | + int j = sequence.size()-1; |
| 117 | + while ( j>i && sequence.get(j).getWeight() < minWeight ) j--; |
| 118 | + |
| 119 | + if ( j<i ) return null; |
| 120 | + |
| 121 | + double weight = 0; |
| 122 | + int ofs = -1; |
| 123 | + int start = -1; |
| 124 | + StringBuilder s = new StringBuilder(); |
| 125 | + |
| 126 | + for (int n=i; n<=j; n++) { |
| 127 | + PhraseOccurance p = sequence.get(n); |
| 128 | + |
| 129 | + double w = p.getWeight(); |
| 130 | + if (w<0) w = 0; |
| 131 | + if (weight+w > maxWeight) break; |
| 132 | + |
| 133 | + if ( start < 0 ) { |
| 134 | + start = p.getOffset(); |
| 135 | + ofs = p.getOffset(); |
| 136 | + } else { |
| 137 | + if (p.getOffset()>ofs) s.append(" "); |
| 138 | + } |
| 139 | + |
| 140 | + ofs = p.getEndOffset(); |
| 141 | + |
| 142 | + weight += w; |
| 143 | + s.append(p.getTerm()); |
| 144 | + } |
| 145 | + |
| 146 | + if (start < 0) return null; |
| 147 | + |
| 148 | + return new PhraseOccurance(s.toString(), (int)weight, start, ofs - start); |
| 149 | + } |
| 150 | + |
76 | 151 | public PhraseOccurance get(int index) { |
77 | 152 | return phrases.get(index); |
78 | 153 | } |
79 | 154 | |
80 | | - @Override |
81 | 155 | public int size() { |
82 | 156 | return phrases.size(); |
83 | 157 | } |
— | — | @@ -179,6 +253,15 @@ |
180 | 254 | return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first. |
181 | 255 | } |
182 | 256 | |
| 257 | + public boolean hasPhrasesAt(int at) { |
| 258 | + for ( PhraseOccurance p: phrases ) { |
| 259 | + if ( p.getOffset() == at) return true; |
| 260 | + else if ( p.getOffset() > at) return false; |
| 261 | + } |
| 262 | + |
| 263 | + return false; |
| 264 | + } |
| 265 | + |
183 | 266 | public List<PhraseOccurance> getPhrasesFrom(int offs) { |
184 | 267 | int i = 0; |
185 | 268 | while (i<size()) { |
— | — | @@ -262,7 +345,96 @@ |
263 | 346 | public <T> T[] toArray(T[] a) { |
264 | 347 | return phrases.toArray(a); |
265 | 348 | } |
| 349 | + |
| 350 | + public void prune( double minWeight ) { |
| 351 | + Iterator<PhraseOccurance> it = phrases.iterator(); |
| 352 | + while (it.hasNext()) { |
| 353 | + PhraseOccurance t = it.next(); |
| 354 | + if ( t.getWeight() < minWeight ) it.remove(); |
| 355 | + } |
| 356 | + } |
266 | 357 | |
| 358 | + public void buildAggregatePhrases( int start, double minWeight, double maxWeight, Matcher phraseBreak ) { |
| 359 | + AggregatePhraseBuilder builder = new AggregatePhraseBuilder( minWeight, maxWeight, phraseBreak ); |
| 360 | + |
| 361 | + if (isEmpty()) return; |
| 362 | + PhraseOccurance last = phrases.get(phrases.size()-1); |
| 363 | + int end = last.getEndOffset(); |
| 364 | + |
| 365 | + for (int i=start; i<end; i++) { |
| 366 | + if (hasPhrasesAt(i)) { |
| 367 | + builder.walk(getRootNodeAt(i), 0, null, Integer.MAX_VALUE, maxWeight); |
| 368 | + } |
| 369 | + } |
| 370 | + |
| 371 | + Collection<PhraseOccurance> phrases = builder.getAggregatedPhrases(); |
| 372 | + addAll( phrases ); |
| 373 | + } |
| 374 | + |
| 375 | + public String toString() { |
| 376 | + return phrases.toString(); |
| 377 | + } |
| 378 | + |
| 379 | + public Collection<PhraseOccurance> getTerms(PhraseNode<PhraseOccurance> root, int depth) { |
| 380 | + PhraseNode.TermSetBuilder<PhraseOccurance> builder = new PhraseNode.TermSetBuilder<PhraseOccurance>(); |
| 381 | + builder.walk(root, 0, null, depth, Double.MAX_VALUE); |
| 382 | + return builder.getTerms(); |
| 383 | + } |
| 384 | + |
| 385 | + public Collection<List<PhraseOccurance>> getSequences(PhraseNode<PhraseOccurance> root, int depth) { |
| 386 | + PhraseNode.SequenceSetBuilder<PhraseOccurance> builder = new PhraseNode.SequenceSetBuilder<PhraseOccurance>(); |
| 387 | + builder.walk(root, 0, null, depth, Double.MAX_VALUE); |
| 388 | + return builder.getSequences(); |
| 389 | + } |
| 390 | + |
| 391 | + public void add(int index, PhraseOccurance element) { |
| 392 | + add(element); |
| 393 | + } |
| 394 | + |
| 395 | + public boolean add(PhraseOccurance e) { |
| 396 | + int i = Collections.binarySearch(phrases, e); |
| 397 | + |
| 398 | + if (i<0) i = -i-1; |
| 399 | + else { |
| 400 | + PhraseOccurance old = get(i); |
| 401 | + if (old.equals(e)) return false; |
| 402 | + } |
| 403 | + |
| 404 | + phrases.add(i, e); |
| 405 | + |
| 406 | + return true; |
| 407 | + } |
| 408 | + |
| 409 | + public boolean addAll(Collection<? extends PhraseOccurance> c) { |
| 410 | + int count = 0; |
| 411 | + for (PhraseOccurance p: c) { |
| 412 | + if ( add(p) ) count++; |
| 413 | + } |
| 414 | + |
| 415 | + return count>0; |
| 416 | + } |
| 417 | + |
| 418 | + public boolean addAll(int index, Collection<? extends PhraseOccurance> c) { |
| 419 | + return addAll(c); |
| 420 | + } |
| 421 | + |
| 422 | + public int hashCode() { |
| 423 | + return phrases.hashCode(); |
| 424 | + } |
| 425 | + |
| 426 | + public int lastIndexOf(Object o) { |
| 427 | + return phrases.lastIndexOf(o); |
| 428 | + } |
| 429 | + |
| 430 | + public PhraseOccurance set(int index, PhraseOccurance element) { |
| 431 | + throw new UnsupportedOperationException(); |
| 432 | + } |
| 433 | + |
| 434 | + public PhraseOccuranceSet subList(int fromIndex, int toIndex) { |
| 435 | + return new PhraseOccuranceSet(text, phrases.subList(fromIndex, toIndex)); |
| 436 | + } |
| 437 | + |
| 438 | + |
267 | 439 | /* |
268 | 440 | public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) { |
269 | 441 | List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java |
— | — | @@ -1,9 +1,90 @@ |
2 | 2 | package de.brightbyte.wikiword.model; |
3 | 3 | |
| 4 | +import java.util.ArrayList; |
4 | 5 | import java.util.Collection; |
| 6 | +import java.util.HashSet; |
| 7 | +import java.util.List; |
| 8 | +import java.util.Set; |
5 | 9 | |
6 | 10 | public interface PhraseNode<T extends TermReference> { |
| 11 | + public abstract class Walker<T extends TermReference> { |
| 12 | + public abstract boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal); |
| 13 | + |
| 14 | + public void walk(PhraseNode<T> root, int depth) { |
| 15 | + walk(root, 0, null, depth, Double.MAX_VALUE); |
| 16 | + } |
7 | 17 | |
| 18 | + public void walk(PhraseNode<T> root, double baseWeight, List<T> intoSeqence, int depth, double maxWeight) { |
| 19 | + if (depth<1) return; |
| 20 | + if (intoSeqence == null) intoSeqence = new ArrayList<T>(); |
| 21 | + |
| 22 | + T t = root.getTermReference(); |
| 23 | + if (t.getTerm().length()>0) intoSeqence.add(t); //push |
| 24 | + else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes. |
| 25 | + |
| 26 | + boolean terminal = (depth<=1) || (baseWeight>=maxWeight); |
| 27 | + |
| 28 | + Collection<? extends PhraseNode<T>> successors = terminal ? null : root.getSuccessors(); |
| 29 | + if (successors==null || successors.isEmpty()) terminal = true; |
| 30 | + |
| 31 | + double w = root.getTermReference().getWeight(); |
| 32 | + if (w<0) w = 0; |
| 33 | + if ( !onNode(root, intoSeqence, w, terminal) ) terminal = true; |
| 34 | + |
| 35 | + //System.out.println( " - walk: "+intoSeqence+" " ); |
| 36 | + |
| 37 | + if (!terminal) { |
| 38 | + for (PhraseNode<T> n: successors) { |
| 39 | + w = n.getTermReference().getWeight(); |
| 40 | + if (w<0) w = 0; |
| 41 | + walk(n, baseWeight + w, intoSeqence, depth-1, maxWeight); |
| 42 | + } |
| 43 | + } |
| 44 | + |
| 45 | + if (t.getTerm().length()>0) intoSeqence.remove(t); //pop |
| 46 | + } |
| 47 | + } |
| 48 | + |
| 49 | + public static class SequenceSetBuilder <T extends TermReference> extends Walker<T> { |
| 50 | + protected List<List<T>> sequences; |
| 51 | + |
| 52 | + public SequenceSetBuilder() { |
| 53 | + sequences = new ArrayList<List<T>>(); |
| 54 | + } |
| 55 | + |
| 56 | + public boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal) { |
| 57 | + if (terminal) { |
| 58 | + List<T> p = new ArrayList<T>(sequence); //clone |
| 59 | + sequences.add(p); |
| 60 | + } |
| 61 | + |
| 62 | + return !terminal; |
| 63 | + } |
| 64 | + |
| 65 | + public List<List<T>> getSequences() { |
| 66 | + return sequences; |
| 67 | + } |
| 68 | + } |
| 69 | + |
| 70 | + public static class TermSetBuilder <T extends TermReference> extends Walker<T> { |
| 71 | + protected Set<T> terms; |
| 72 | + |
| 73 | + public TermSetBuilder() { |
| 74 | + terms = new HashSet<T>(); |
| 75 | + } |
| 76 | + |
| 77 | + public boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal) { |
| 78 | + T t = node.getTermReference(); |
| 79 | + if (t.getTerm().length()>0) terms.add(t); |
| 80 | + |
| 81 | + return !terminal; |
| 82 | + } |
| 83 | + |
| 84 | + public Collection<T> getTerms() { |
| 85 | + return terms; |
| 86 | + } |
| 87 | + } |
| 88 | + |
8 | 89 | public T getTermReference(); |
9 | 90 | |
10 | 91 | public Collection<? extends PhraseNode<T>> getSuccessors(); |
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java |
— | — | @@ -2,14 +2,16 @@ |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.net.URISyntaxException; |
| 6 | +import java.util.ArrayList; |
6 | 7 | import java.util.Arrays; |
| 8 | +import java.util.Collections; |
7 | 9 | import java.util.HashSet; |
8 | 10 | import java.util.List; |
9 | 11 | import java.util.Set; |
10 | 12 | |
11 | 13 | import de.brightbyte.wikiword.Corpus; |
12 | | -import de.brightbyte.wikiword.analyzer.LanguageConfiguration; |
13 | | -import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer; |
| 14 | +import de.brightbyte.wikiword.model.PhraseOccurance; |
| 15 | +import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
14 | 16 | |
15 | 17 | /** |
16 | 18 | * Unit tests for PlainTextAnalyzer |
— | — | @@ -60,8 +62,8 @@ |
61 | 63 | words = extractWords("foo-bar"); |
62 | 64 | assertEquals(theList( "foo-bar" ), words); |
63 | 65 | |
64 | | - words = extractWords("harald's 'schaaand"); |
65 | | - assertEquals(theList( "harald's", "'schaaand" ), words); |
| 66 | + words = extractWords("harald's 'schlaaand"); |
| 67 | + assertEquals(theList( "harald's", "'schlaaand" ), words); |
66 | 68 | |
67 | 69 | words = extractWords("23-42"); |
68 | 70 | assertEquals(theList( "23-42" ), words); |
— | — | @@ -69,6 +71,92 @@ |
70 | 72 | words = extractWords("23foo42"); |
71 | 73 | assertEquals(theList( "23", "foo", "42" ), words); |
72 | 74 | } |
| 75 | + |
| 76 | + public void testExtractPhrases() { |
| 77 | + PhraseOccuranceSet phrases = extractPhrases("", 3); |
| 78 | + assertEquals(0, phrases.size()); |
| 79 | + assertEquals(theList(), getWordList(phrases.getPhrasesAt(0))); |
| 80 | + |
| 81 | + phrases = extractPhrases("foo", 3); |
| 82 | + assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(0))); |
| 83 | + |
| 84 | + phrases = extractPhrases(" foo ", 3); |
| 85 | + assertEquals(theList(), getWordList(phrases.getPhrasesAt(0))); |
| 86 | + assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(1))); |
| 87 | + assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesFrom(0))); |
| 88 | + } |
| 89 | + |
| 90 | + public void testExtractPhrases2() { |
| 91 | + PhraseOccuranceSet phrases = extractPhrases("red green blue yellow black", 3); |
| 92 | + assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0))); |
| 93 | + assertEquals(theList( "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4))); |
| 94 | + |
| 95 | + phrases = extractPhrases("red green blue yellow black", 5); |
| 96 | + assertEquals(theList( "red green blue yellow black", "red green blue yellow", "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0))); |
| 97 | + assertEquals(theList( "green blue yellow black", "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4))); |
| 98 | + |
| 99 | + phrases = extractPhrases("and red and green and blue and yellow", 3); |
| 100 | + assertEquals(theList( "and red and green and blue", |
| 101 | + "and red and green and", |
| 102 | + "and red and green", |
| 103 | + "and red and", |
| 104 | + "and red" |
| 105 | + ), |
| 106 | + getWordList(phrases.getPhrasesAt(0))); |
| 107 | + assertEquals(theList( "red and green and blue", |
| 108 | + "red and green and", |
| 109 | + "red and green", |
| 110 | + "red and", |
| 111 | + "red" |
| 112 | + ), |
| 113 | + getWordList(phrases.getPhrasesAt(4))); |
| 114 | + |
| 115 | + phrases = extractPhrases("red green blue. yellow black", 5); |
| 116 | + assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0))); |
| 117 | + assertEquals(theList( "blue" ), getWordList(phrases.getPhrasesAt(10))); |
| 118 | + assertEquals(theList( "yellow black", "yellow" ), getWordList(phrases.getPhrasesAt(16))); |
| 119 | + } |
| 120 | + |
| 121 | + public void testExtractPhrases3() { |
| 122 | + PhraseOccuranceSet phrases = extractPhrases("Krababbel: l'Foo-Bar", 3); |
| 123 | + assertEquals(theList( "Krababbel"), getWordList(phrases.getPhrasesAt(0))); |
| 124 | + |
| 125 | + assertEquals(theList( "l'Foo-Bar", |
| 126 | + "l'Foo" |
| 127 | + ), |
| 128 | + getWordList(phrases.getPhrasesAt(11))); |
| 129 | + |
| 130 | + assertEquals(theList( "Foo-Bar", |
| 131 | + "Foo" |
| 132 | + ), |
| 133 | + getWordList(phrases.getPhrasesAt(13))); |
| 134 | + |
| 135 | + assertEquals(theList( "Bar"), |
| 136 | + getWordList(phrases.getPhrasesAt(17))); |
| 137 | + |
| 138 | + phrases = extractPhrases("harald's 'schlaaand", 3); |
| 139 | + assertEquals(theList( "harald's 'schlaaand", |
| 140 | + "harald's", |
| 141 | + "harald" |
| 142 | + ), |
| 143 | + getWordList(phrases.getPhrasesAt(0))); |
| 144 | + |
| 145 | + assertEquals(theList( "'schlaaand"), getWordList(phrases.getPhrasesAt(9))); |
| 146 | + assertEquals(theList("schlaaand"), getWordList(phrases.getPhrasesAt(10))); |
| 147 | + } |
| 148 | + |
| 149 | + private List<String> getWordList(List<PhraseOccurance> phrases) { |
| 150 | + if (phrases==null) return Collections.emptyList(); |
| 151 | + |
| 152 | + List<String> words = new ArrayList<String>(phrases.size()); |
| 153 | + |
| 154 | + for (PhraseOccurance phrase: phrases) { |
| 155 | + String w = phrase.getTerm(); |
| 156 | + words.add(w); |
| 157 | + } |
| 158 | + |
| 159 | + return words; |
| 160 | + } |
73 | 161 | |
74 | 162 | } |
75 | 163 | |
— | — | @@ -86,7 +174,7 @@ |
87 | 175 | public void setUp() throws URISyntaxException, IOException { |
88 | 176 | LanguageConfiguration config = new LanguageConfiguration(); |
89 | 177 | |
90 | | - corpus = new Corpus("TEST", "generic", "generic", "generic", "generic", "xx", "generic", null); |
| 178 | + corpus = new Corpus("TEST", "en", "en", "en", "en", "en", "en", null); |
91 | 179 | testAnalyzer = new TestPlainTextAnalyzer(corpus); |
92 | 180 | testAnalyzer.configure(config, tweaks); |
93 | 181 | testAnalyzer.initialize(); |
— | — | @@ -102,6 +190,18 @@ |
103 | 191 | testAnalyzer.testExtractWords(); |
104 | 192 | } |
105 | 193 | |
| 194 | + public void testExtractPhrases() { |
| 195 | + testAnalyzer.testExtractPhrases(); |
| 196 | + } |
| 197 | + |
| 198 | + public void testExtractPhrases2() { |
| 199 | + testAnalyzer.testExtractPhrases2(); |
| 200 | + } |
| 201 | + |
| 202 | + public void testExtractPhrases3() { |
| 203 | + testAnalyzer.testExtractPhrases3(); |
| 204 | + } |
| 205 | + |
106 | 206 | public static void main(String[] args) { |
107 | 207 | run(PlainTextAnalyzerTest.class, args); |
108 | 208 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java |
— | — | @@ -1,113 +0,0 @@ |
2 | | -package de.brightbyte.wikiword.analyzer; |
3 | | - |
4 | | -import java.util.ArrayList; |
5 | | -import java.util.Collection; |
6 | | -import java.util.Iterator; |
7 | | -import java.util.regex.Matcher; |
8 | | - |
9 | | -import de.brightbyte.wikiword.model.PhraseOccurance; |
10 | | - |
11 | | -public class PhraseAggregator { |
12 | | - public class PhraseBuilder { |
13 | | - protected StringBuilder phrase; |
14 | | - protected int weight; |
15 | | - protected int lastWeight; |
16 | | - protected int offset; |
17 | | - |
18 | | - public PhraseBuilder(int offset) { |
19 | | - this.phrase = new StringBuilder(); |
20 | | - this.weight = 0; |
21 | | - this.offset = offset; |
22 | | - } |
23 | | - |
24 | | - public int getLength() { |
25 | | - return phrase.length(); |
26 | | - } |
27 | | - |
28 | | - public int getOffset() { |
29 | | - return offset; |
30 | | - } |
31 | | - |
32 | | - public int getEndOffset() { |
33 | | - return getOffset() + getLength(); |
34 | | - } |
35 | | - |
36 | | - public String getPhrase() { |
37 | | - return phrase.toString(); |
38 | | - } |
39 | | - |
40 | | - public int getWeight() { |
41 | | - return weight; |
42 | | - } |
43 | | - |
44 | | - public int getLastWeight() { |
45 | | - return lastWeight; |
46 | | - } |
47 | | - |
48 | | - public PhraseOccurance toPhraseOccurance() { |
49 | | - return new PhraseOccurance(getPhrase(), getWeight(), getOffset(), getLength()); |
50 | | - } |
51 | | - |
52 | | - public String toString() { |
53 | | - return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]"; |
54 | | - } |
55 | | - |
56 | | - public void push(CharSequence w, int weight) { |
57 | | - phrase.append(w); |
58 | | - if (weight>0) this.weight+= weight; |
59 | | - this.lastWeight = weight; |
60 | | - } |
61 | | - } |
62 | | - |
63 | | - private int offset = 0; |
64 | | - private int maxWeight = 0; |
65 | | - |
66 | | - private Matcher phraseBreakeMatcher; |
67 | | - private ArrayList<PhraseBuilder> phrases = new ArrayList<PhraseBuilder>(); |
68 | | - |
69 | | - public PhraseAggregator(Matcher phraseBreakeMatcher) { |
70 | | - super(); |
71 | | - this.phraseBreakeMatcher = phraseBreakeMatcher; |
72 | | - } |
73 | | - |
74 | | - public void reset(int offset, int maxWeight) { |
75 | | - this.offset = offset; |
76 | | - this.maxWeight = maxWeight; |
77 | | - clear(); |
78 | | - } |
79 | | - |
80 | | - public void clear() { |
81 | | - phrases.clear(); |
82 | | - } |
83 | | - |
84 | | - public void update(int index, CharSequence word, int weight, Collection<PhraseOccurance> into) { |
85 | | - if (weight<0) { |
86 | | - phraseBreakeMatcher.reset(word); |
87 | | - if (phraseBreakeMatcher.matches()) { |
88 | | - this.clear(); |
89 | | - return; |
90 | | - } |
91 | | - } |
92 | | - |
93 | | - this.push(index, word, weight); |
94 | | - this.commit(into); |
95 | | - } |
96 | | - |
97 | | - public void push(int index, CharSequence word, int weight) { |
98 | | - if (weight >= 0) phrases.add(new PhraseBuilder(index+offset)); |
99 | | - |
100 | | - Iterator<PhraseBuilder> it = phrases.iterator(); |
101 | | - while (it.hasNext()) { |
102 | | - PhraseBuilder b = it.next(); |
103 | | - b.push(word, weight); |
104 | | - if (b.getWeight() > maxWeight) it.remove(); |
105 | | - } |
106 | | - } |
107 | | - |
108 | | - public void commit(Collection<PhraseOccurance> into) { |
109 | | - for (PhraseBuilder b: phrases) { |
110 | | - if (b.getWeight() > 0 && b.getLastWeight() > 0) into.add(b.toPhraseOccurance()); |
111 | | - } |
112 | | - } |
113 | | - |
114 | | -} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java |
— | — | @@ -60,10 +60,18 @@ |
61 | 61 | |
62 | 62 | /** |
63 | 63 | * A pattern matching individual words, for splitting a string into words. This is usually |
64 | | - * set to match any sequence of letters but not numbers, whitespace or punctuation. |
| 64 | + * set to match any sequence of letters but not numbers, whitespace or punctuation, except |
| 65 | + * those that may occur as part of a word, such as an apostrophe or hyphen. |
65 | 66 | */ |
66 | 67 | public Pattern wordPattern; |
67 | 68 | |
| 69 | + /** |
| 70 | + * A pattern matching individual word parts, for splitting words into components. This is usually |
| 71 | + * set to match any sequence of letters but not numbers, whitespace or punctuation, nor |
| 72 | + * those that may occur as part of a word, such as an apostrophe or hyphen. |
| 73 | + */ |
| 74 | + public Pattern wordPartPattern; |
| 75 | + |
68 | 76 | protected String languageName; |
69 | 77 | |
70 | 78 | /** |
— | — | @@ -98,7 +106,8 @@ |
99 | 107 | } |
100 | 108 | |
101 | 109 | public void defaults() throws IOException { |
102 | | - if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}'']+(?:[\\p{Pc}\\p{Pd}][\\p{L}'']+)*|\\p{Nd}+(?:.\\p{Nd}+)?"); |
| 110 | + if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}']+(?:[\\p{Pc}\\p{Pd}][\\p{L}']+)*|\\p{Nd}+(?:.\\p{Nd}+)?"); |
| 111 | + if (this.wordPartPattern==null) this.wordPartPattern = Pattern.compile("[\\p{L}]+|\\p{Nd}+"); |
103 | 112 | |
104 | 113 | this.sentenceManglers.add( new RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks |
105 | 114 | this.sentenceManglers.add( new RegularExpressionMangler("^([^\\p{L}]*(\\r\\n|\\r|\\n))+[^\\p{L}0-9]*\\s*", "", 0) ); //strip leading cruft (lines without any characters) |
— | — | @@ -110,7 +119,7 @@ |
111 | 120 | List<String> stop = AuxilliaryWikiProperties.loadList("Stopwords", languageName); |
112 | 121 | if (stop!=null) this.stopwords.addAll(stop); |
113 | 122 | |
114 | | - this.phraseBreakerPattern = Pattern.compile("[,;:]\\s|\""); |
| 123 | + this.phraseBreakerPattern = Pattern.compile("[,;:\".!?]\\s*"); |
115 | 124 | this.parentacies = new ArrayList<Pair<String, String>>(); |
116 | 125 | this.parentacies.add( new Pair<String, String>("(", ")") ); |
117 | 126 | this.parentacies.add( new Pair<String, String>("[", "]") ); |
— | — | @@ -126,6 +135,7 @@ |
127 | 136 | if (with.sentenceManglers!=null) sentenceManglers.addAll(with.sentenceManglers); |
128 | 137 | |
129 | 138 | if (with.wordPattern!=null) wordPattern = with.wordPattern; |
| 139 | + if (with.wordPartPattern!=null) wordPartPattern = with.wordPartPattern; |
130 | 140 | if (with.phraseBreakerPattern!=null) phraseBreakerPattern = with.phraseBreakerPattern; |
131 | 141 | |
132 | 142 | if (with.stopwords!=null) stopwords.addAll(with.stopwords); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java |
— | — | @@ -13,8 +13,8 @@ |
14 | 14 | |
15 | 15 | import de.brightbyte.application.Arguments; |
16 | 16 | import de.brightbyte.audit.DebugUtil; |
| 17 | +import de.brightbyte.data.Lookup; |
17 | 18 | import de.brightbyte.data.MapLookup; |
18 | | -import de.brightbyte.data.Lookup; |
19 | 19 | import de.brightbyte.data.filter.Filter; |
20 | 20 | import de.brightbyte.data.filter.FixedSetFilter; |
21 | 21 | import de.brightbyte.io.ConsoleIO; |
— | — | @@ -31,6 +31,7 @@ |
32 | 32 | private Matcher sentenceTailGlueMatcher; |
33 | 33 | private Matcher sentenceFollowGlueMatcher; |
34 | 34 | private Matcher wordMatcher; |
| 35 | + private Matcher wordPartMatcher; |
35 | 36 | |
36 | 37 | protected Filter<String> stopwordFilter; |
37 | 38 | protected Matcher phraseBreakeMatcher; |
— | — | @@ -94,6 +95,7 @@ |
95 | 96 | sentenceTailGlueMatcher = config.sentenceTailGluePattern.matcher(""); |
96 | 97 | sentenceFollowGlueMatcher = config.sentenceFollowGluePattern.matcher(""); |
97 | 98 | wordMatcher = config.wordPattern.matcher(""); |
| 99 | + wordPartMatcher = config.wordPartPattern.matcher(""); |
98 | 100 | |
99 | 101 | phraseBreakeMatcher = config.phraseBreakerPattern.matcher(""); |
100 | 102 | stopwordFilter = new FixedSetFilter<String>(config.stopwords); |
— | — | @@ -191,6 +193,7 @@ |
192 | 194 | return corpus; |
193 | 195 | } |
194 | 196 | |
| 197 | + /* |
195 | 198 | public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) { |
196 | 199 | ArrayList<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
197 | 200 | |
— | — | @@ -231,13 +234,121 @@ |
232 | 235 | |
233 | 236 | if (stopwordFilter.matches(w)) weight = 0; |
234 | 237 | buildPhrasesAggregator.update(wordMatcher.start(), w, weight, into); |
| 238 | + |
| 239 | + //after adding the word, now register word parts |
| 240 | + int j = 0; |
| 241 | + wordPartMatcher.reset(w); |
| 242 | + while (wordPartMatcher.find()) { |
| 243 | + if (wordPartMatcher.start() == 0 && wordPartMatcher.end() == w.length()) { |
| 244 | + break; //full word matched as a single part. no need to register parts. |
| 245 | + } |
| 246 | + |
| 247 | + if (j != wordPartMatcher.start()) { |
| 248 | + CharSequence glue = w.subSequence(j, wordPartMatcher.start()); |
| 249 | + buildPhrasesAggregator.update(i, glue, -1, into); |
| 250 | + } |
| 251 | + |
| 252 | + j = wordPartMatcher.end(); |
| 253 | + weight = 1; |
| 254 | + String p; |
| 255 | + |
| 256 | + if (wordPartMatcher.groupCount()>0) p = wordPartMatcher.group(1); |
| 257 | + else p = wordPartMatcher.group(0); |
| 258 | + |
| 259 | + if (stopwordFilter.matches(p)) weight = 0; |
| 260 | + buildPhrasesAggregator.update(i+wordPartMatcher.start(), p, weight, into); |
| 261 | + } |
| 262 | + |
| 263 | + if (j>0 && j < w.length()) { |
| 264 | + CharSequence glue = text.subSequence(j, w.length()); |
| 265 | + buildPhrasesAggregator.update(j, glue, -1, into); |
| 266 | + } |
235 | 267 | } |
236 | 268 | |
237 | 269 | if (i < text.length()) { |
238 | 270 | CharSequence space = text.subSequence(i, text.length()); |
239 | | - buildPhrasesAggregator.update(i, space, 0, into); |
| 271 | + buildPhrasesAggregator.update(i, space, -1, into); |
240 | 272 | } |
| 273 | + } */ |
| 274 | + |
| 275 | + public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) { |
| 276 | + PhraseOccuranceSet phrases = new PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>()); |
| 277 | + |
| 278 | + text = applyManglers(config.sentenceManglers, text); |
| 279 | + |
| 280 | + ParsePosition pos = new ParsePosition(0); |
| 281 | + while (pos.getIndex() < text.length()) { |
| 282 | + int ofs = pos.getIndex(); |
| 283 | + CharSequence s = extractNextSentence(text, pos, false); |
| 284 | + if (s==null || s.length()==0) break; |
| 285 | + |
| 286 | + buildPhrases(s, ofs, phrases, maxWeight); |
| 287 | + if (phrases.isEmpty()) continue; |
| 288 | + |
| 289 | + phrases.buildAggregatePhrases(ofs, 0, maxWeight, phraseBreakeMatcher); |
| 290 | + } |
| 291 | + |
| 292 | + if (phrases.isEmpty()) return phrases; |
| 293 | + |
| 294 | + phrases.prune(1); |
| 295 | + return phrases; |
241 | 296 | } |
| 297 | + |
| 298 | + private void buildPhrases(CharSequence text, int offset, PhraseOccuranceSet into, int maxWeight) { |
| 299 | + int i = 0; |
| 300 | + wordMatcher.reset(text); |
| 301 | + while (wordMatcher.find()) { |
| 302 | + if (i != wordMatcher.start()) { |
| 303 | + CharSequence space = text.subSequence(i, wordMatcher.start()); |
| 304 | + into.add( new PhraseOccurance(space.toString(), -1, offset+i, space.length()) ); |
| 305 | + } |
| 306 | + |
| 307 | + i = wordMatcher.end(); |
| 308 | + String w; |
| 309 | + int weight = 1; |
| 310 | + |
| 311 | + if (wordMatcher.groupCount()>0) w = wordMatcher.group(1); |
| 312 | + else w = wordMatcher.group(0); |
| 313 | + |
| 314 | + if (stopwordFilter.matches(w)) weight = 0; |
| 315 | + into.add( new PhraseOccurance(w, weight, offset+wordMatcher.start(), w.length()) ); |
| 316 | + |
| 317 | + //after adding the word, now register word parts |
| 318 | + int j = 0; |
| 319 | + int b = wordMatcher.start(); |
| 320 | + wordPartMatcher.reset(w); |
| 321 | + while (wordPartMatcher.find()) { |
| 322 | + if (wordPartMatcher.start() == 0 && wordPartMatcher.end() == w.length()) { |
| 323 | + break; //full word matched as a single part. no need to register parts. |
| 324 | + } |
| 325 | + |
| 326 | + if (j != wordPartMatcher.start()) { |
| 327 | + CharSequence glue = w.subSequence(j, wordPartMatcher.start()); |
| 328 | + into.add( new PhraseOccurance(glue.toString(), -1, offset+b+j, glue.length()) ); |
| 329 | + } |
| 330 | + |
| 331 | + j = wordPartMatcher.end(); |
| 332 | + weight = 1; |
| 333 | + String p; |
| 334 | + |
| 335 | + if (wordPartMatcher.groupCount()>0) p = wordPartMatcher.group(1); |
| 336 | + else p = wordPartMatcher.group(0); |
| 337 | + |
| 338 | + if (stopwordFilter.matches(p)) weight = 0; |
| 339 | + into.add( new PhraseOccurance(p, weight, offset+b+wordPartMatcher.start(), p.length()) ); |
| 340 | + } |
| 341 | + |
| 342 | + if (j>0 && j < w.length()) { |
| 343 | + CharSequence glue = text.subSequence(j, w.length()); |
| 344 | + into.add( new PhraseOccurance(glue.toString(), -1, offset+b+j, glue.length()) ); |
| 345 | + } |
| 346 | + } |
| 347 | + |
| 348 | + if (i < text.length()) { |
| 349 | + CharSequence space = text.subSequence(i, text.length()); |
| 350 | + into.add( new PhraseOccurance(space.toString(), -1, offset+i, space.length()) ); |
| 351 | + } |
| 352 | + } |
242 | 353 | |
243 | 354 | public static void main(String[] argv) throws IOException, InstantiationException { |
244 | 355 | Arguments args = new Arguments(); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en_int.java |
— | — | @@ -0,0 +1,10 @@ |
| 2 | +package de.brightbyte.wikiword.wikis; |
| 3 | + |
| 4 | +public class LanguageConfiguration_en_int extends LanguageConfiguration_en { |
| 5 | + |
| 6 | + //TODO: list of abbreviations |
| 7 | + public LanguageConfiguration_en_int() { |
| 8 | + super(); |
| 9 | + } |
| 10 | + |
| 11 | +} |