r68117 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r68116‎ | r68117 | r68118 >
Date:12:12, 16 June 2010
Author:daniel
Status:deferred
Tags:
Comment:
improved phrase detection: allow for overlapping alternative word chunks
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Interwiki.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java (deleted) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en_int.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -3,10 +3,8 @@
44 import java.util.ArrayList;
55 import java.util.Collection;
66 import java.util.Collections;
7 -import java.util.HashSet;
87 import java.util.List;
98 import java.util.Map;
10 -import java.util.Set;
119
1210 import de.brightbyte.io.Output;
1311 import de.brightbyte.util.PersistenceException;
@@ -17,46 +15,6 @@
1816
1917 public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> {
2018
21 - public interface NodeListener<T extends TermReference> {
22 - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal);
23 - }
24 -
25 - public static class SequenceSetBuilder <T extends TermReference> implements NodeListener<T> {
26 - protected List<List<T>> seqencees;
27 -
28 - public SequenceSetBuilder() {
29 - seqencees = new ArrayList<List<T>>();
30 - }
31 -
32 - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) {
33 - if (terminal) {
34 - List<T> p = new ArrayList<T>(seqence); //clone
35 - seqencees.add(p);
36 - }
37 - }
38 -
39 - public List<List<T>> getSequences() {
40 - return seqencees;
41 - }
42 - }
43 -
44 - public static class TermSetBuilder <T extends TermReference> implements NodeListener<T> {
45 - protected Set<T> terms;
46 -
47 - public TermSetBuilder() {
48 - terms = new HashSet<T>();
49 - }
50 -
51 - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) {
52 - T t = node.getTermReference();
53 - if (t.getTerm().length()>0) terms.add(t);
54 - }
55 -
56 - public Collection<T> getTerms() {
57 - return terms;
58 - }
59 - }
60 -
6119 private MeaningFetcher<C> meaningFetcher;
6220
6321 private Output trace;
@@ -78,18 +36,6 @@
7937 this.meaningOverrides = overrideMap;
8038 }
8139
82 - protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) {
83 - TermSetBuilder<X> builder = new TermSetBuilder<X>();
84 - walk(root, null, builder, depth);
85 - return builder.getTerms();
86 - }
87 -
88 - protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root, int depth) {
89 - SequenceSetBuilder<X> builder = new SequenceSetBuilder<X>();
90 - walk(root, null, builder, depth);
91 - return builder.getSequences();
92 - }
93 -
9440 protected <X extends T>PhraseNode<X> getLastNode(PhraseNode<X> root, List<X> sequence) {
9541 PhraseNode<X> n = findLastNode(root, sequence);
9642 if (n==null) throw new IllegalArgumentException("sequence does not match node structure: "+sequence);
@@ -126,31 +72,19 @@
12773 return root;
12874 }
12975
130 - protected <X extends T>void walk(PhraseNode<X> root, List<X> seqence, NodeListener<? super X> nodeListener, int depth) {
131 - if (depth<1) return;
132 - if (seqence == null) seqence = new ArrayList<X>();
133 -
134 - X t = root.getTermReference();
135 - if (t.getTerm().length()>0) seqence.add(t); //push
136 - else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes.
137 -
138 - boolean terminal = (depth<=1);
139 -
140 - Collection<? extends PhraseNode<X>> successors = terminal ? null : root.getSuccessors();
141 - if (successors==null || successors.isEmpty()) terminal = true;
142 -
143 - if (nodeListener!=null)
144 - nodeListener.onNode(root, seqence, terminal);
145 -
146 - if (!terminal) {
147 - for (PhraseNode<X> n: successors) {
148 - walk(n, seqence, nodeListener, depth-1);
149 - }
150 - }
151 -
152 - if (t.getTerm().length()>0) seqence.remove(t); //pop
 76+ protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) {
 77+ PhraseNode.TermSetBuilder<X> builder = new PhraseNode.TermSetBuilder<X>();
 78+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
 79+ return builder.getTerms();
15380 }
15481
 82+ protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root, int depth) {
 83+ PhraseNode.SequenceSetBuilder<X> builder = new PhraseNode.SequenceSetBuilder<X>();
 84+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
 85+ return builder.getSequences();
 86+ }
 87+
 88+
15589 protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException {
15690 Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
15791 return getMeanings(terms);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Interwiki.java
@@ -78,7 +78,9 @@
7979
8080 return interwikis;
8181 } catch (IOException e) {
82 - throw new RuntimeException("failed to load interwiki map from "+n);
 82+ throw new RuntimeException("failed to load interwiki map from "+n, e);
 83+ } catch (IllegalArgumentException e) { //NOTE: malformed \\u-encoding triggers this. wtf? why not a *real* exception?...
 84+ throw new RuntimeException("failed to load interwiki map from "+n, e);
8385 }
8486
8587 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java
@@ -13,10 +13,6 @@
1414
1515 public PhraseOccurance(String phrase, int weight, int offset, int length) {
1616 if (length < 0) throw new IllegalArgumentException("bad length: "+length);
17 - if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string");
18 - //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string");
19 - if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string");
20 - if (length < phrase.length()) phrase = phrase.substring(offset, offset+length);
2117
2218 this.phrase = phrase;
2319 this.weight = weight;
@@ -49,7 +45,7 @@
5046 }
5147
5248 public String toString() {
53 - return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";
 49+ return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]#"+weight;
5450 }
5551
5652 @Override
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
@@ -1,6 +1,5 @@
22 package de.brightbyte.wikiword.model;
33
4 -import java.util.AbstractList;
54 import java.util.ArrayList;
65 import java.util.Collection;
76 import java.util.Collections;
@@ -8,11 +7,49 @@
98 import java.util.Iterator;
109 import java.util.List;
1110 import java.util.ListIterator;
12 -import java.util.RandomAccess;
1311 import java.util.Set;
 12+import java.util.regex.Matcher;
1413
15 -public class PhraseOccuranceSet extends AbstractList<PhraseOccurance> implements RandomAccess {
 14+public class PhraseOccuranceSet implements List<PhraseOccurance> {
1615
 16+ public static class AggregatePhraseBuilder extends PhraseNode.Walker<PhraseOccurance> {
 17+ protected Collection<PhraseOccurance> aggregated;
 18+ protected double minWeight;
 19+ protected double maxWeight;
 20+ protected Matcher phraseBreak;
 21+
 22+ public AggregatePhraseBuilder( double minWeight, double maxWeight, Matcher phraseBreak ) {
 23+ aggregated = new HashSet<PhraseOccurance>();
 24+ this.minWeight = minWeight;
 25+ this.maxWeight = maxWeight ;
 26+ this.phraseBreak = phraseBreak;
 27+ }
 28+
 29+ public boolean onNode(PhraseNode<? extends PhraseOccurance> node, List<? extends PhraseOccurance> sequence, double weight, boolean terminal) {
 30+ if (weight>=minWeight && !sequence.isEmpty()) {
 31+ PhraseOccurance p = aggregatePhrase( sequence, minWeight, maxWeight );
 32+ if (p!=null) aggregated.add(p);
 33+
 34+ PhraseOccurance last = sequence.get( sequence.size()-1);
 35+
 36+ if (phraseBreak!=null) {
 37+ phraseBreak.reset(last.getTerm());
 38+ if (phraseBreak.matches())
 39+ return false; //phrase terminates here, don't dig deeper.
 40+ }
 41+
 42+ if (p==null) return weight <= maxWeight; //XXX: something is wrong here
 43+ else return p.getWeight() <= maxWeight; //XXX: can we do that?
 44+ } else {
 45+ return weight <= maxWeight; //XXX: not sure...
 46+ }
 47+ }
 48+
 49+ public Collection<PhraseOccurance> getAggregatedPhrases() {
 50+ return aggregated;
 51+ }
 52+ }
 53+
1754 protected class Node implements PhraseNode<PhraseOccurance> {
1855 protected PhraseOccurance phrase;
1956
@@ -57,8 +94,6 @@
5895 return false;
5996 return true;
6097 }
61 -
62 -
6398 }
6499
65100 protected List<PhraseOccurance> phrases;
@@ -71,12 +106,51 @@
72107 Collections.sort(this.phrases); //essential!
73108 }
74109
75 - @Override
 110+ private static PhraseOccurance aggregatePhrase(List<? extends PhraseOccurance> sequence, double minWeight, double maxWeight) {
 111+ if (sequence.isEmpty()) return null;
 112+
 113+ int i = 0;
 114+ while ( i<sequence.size() && sequence.get(i).getWeight() < minWeight ) i++;
 115+
 116+ int j = sequence.size()-1;
 117+ while ( j>i && sequence.get(j).getWeight() < minWeight ) j--;
 118+
 119+ if ( j<i ) return null;
 120+
 121+ double weight = 0;
 122+ int ofs = -1;
 123+ int start = -1;
 124+ StringBuilder s = new StringBuilder();
 125+
 126+ for (int n=i; n<=j; n++) {
 127+ PhraseOccurance p = sequence.get(n);
 128+
 129+ double w = p.getWeight();
 130+ if (w<0) w = 0;
 131+ if (weight+w > maxWeight) break;
 132+
 133+ if ( start < 0 ) {
 134+ start = p.getOffset();
 135+ ofs = p.getOffset();
 136+ } else {
 137+ if (p.getOffset()>ofs) s.append(" ");
 138+ }
 139+
 140+ ofs = p.getEndOffset();
 141+
 142+ weight += w;
 143+ s.append(p.getTerm());
 144+ }
 145+
 146+ if (start < 0) return null;
 147+
 148+ return new PhraseOccurance(s.toString(), (int)weight, start, ofs - start);
 149+ }
 150+
76151 public PhraseOccurance get(int index) {
77152 return phrases.get(index);
78153 }
79154
80 - @Override
81155 public int size() {
82156 return phrases.size();
83157 }
@@ -179,6 +253,15 @@
180254 return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first.
181255 }
182256
 257+ public boolean hasPhrasesAt(int at) {
 258+ for ( PhraseOccurance p: phrases ) {
 259+ if ( p.getOffset() == at) return true;
 260+ else if ( p.getOffset() > at) return false;
 261+ }
 262+
 263+ return false;
 264+ }
 265+
183266 public List<PhraseOccurance> getPhrasesFrom(int offs) {
184267 int i = 0;
185268 while (i<size()) {
@@ -262,7 +345,96 @@
263346 public <T> T[] toArray(T[] a) {
264347 return phrases.toArray(a);
265348 }
 349+
 350+ public void prune( double minWeight ) {
 351+ Iterator<PhraseOccurance> it = phrases.iterator();
 352+ while (it.hasNext()) {
 353+ PhraseOccurance t = it.next();
 354+ if ( t.getWeight() < minWeight ) it.remove();
 355+ }
 356+ }
266357
 358+ public void buildAggregatePhrases( int start, double minWeight, double maxWeight, Matcher phraseBreak ) {
 359+ AggregatePhraseBuilder builder = new AggregatePhraseBuilder( minWeight, maxWeight, phraseBreak );
 360+
 361+ if (isEmpty()) return;
 362+ PhraseOccurance last = phrases.get(phrases.size()-1);
 363+ int end = last.getEndOffset();
 364+
 365+ for (int i=start; i<end; i++) {
 366+ if (hasPhrasesAt(i)) {
 367+ builder.walk(getRootNodeAt(i), 0, null, Integer.MAX_VALUE, maxWeight);
 368+ }
 369+ }
 370+
 371+ Collection<PhraseOccurance> phrases = builder.getAggregatedPhrases();
 372+ addAll( phrases );
 373+ }
 374+
 375+ public String toString() {
 376+ return phrases.toString();
 377+ }
 378+
 379+ public Collection<PhraseOccurance> getTerms(PhraseNode<PhraseOccurance> root, int depth) {
 380+ PhraseNode.TermSetBuilder<PhraseOccurance> builder = new PhraseNode.TermSetBuilder<PhraseOccurance>();
 381+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
 382+ return builder.getTerms();
 383+ }
 384+
 385+ public Collection<List<PhraseOccurance>> getSequences(PhraseNode<PhraseOccurance> root, int depth) {
 386+ PhraseNode.SequenceSetBuilder<PhraseOccurance> builder = new PhraseNode.SequenceSetBuilder<PhraseOccurance>();
 387+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
 388+ return builder.getSequences();
 389+ }
 390+
 391+ public void add(int index, PhraseOccurance element) {
 392+ add(element);
 393+ }
 394+
 395+ public boolean add(PhraseOccurance e) {
 396+ int i = Collections.binarySearch(phrases, e);
 397+
 398+ if (i<0) i = -i-1;
 399+ else {
 400+ PhraseOccurance old = get(i);
 401+ if (old.equals(e)) return false;
 402+ }
 403+
 404+ phrases.add(i, e);
 405+
 406+ return true;
 407+ }
 408+
 409+ public boolean addAll(Collection<? extends PhraseOccurance> c) {
 410+ int count = 0;
 411+ for (PhraseOccurance p: c) {
 412+ if ( add(p) ) count++;
 413+ }
 414+
 415+ return count>0;
 416+ }
 417+
 418+ public boolean addAll(int index, Collection<? extends PhraseOccurance> c) {
 419+ return addAll(c);
 420+ }
 421+
 422+ public int hashCode() {
 423+ return phrases.hashCode();
 424+ }
 425+
 426+ public int lastIndexOf(Object o) {
 427+ return phrases.lastIndexOf(o);
 428+ }
 429+
 430+ public PhraseOccurance set(int index, PhraseOccurance element) {
 431+ throw new UnsupportedOperationException();
 432+ }
 433+
 434+ public PhraseOccuranceSet subList(int fromIndex, int toIndex) {
 435+ return new PhraseOccuranceSet(text, phrases.subList(fromIndex, toIndex));
 436+ }
 437+
 438+
267439 /*
268440 public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) {
269441 List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java
@@ -1,9 +1,90 @@
22 package de.brightbyte.wikiword.model;
33
 4+import java.util.ArrayList;
45 import java.util.Collection;
 6+import java.util.HashSet;
 7+import java.util.List;
 8+import java.util.Set;
59
610 public interface PhraseNode<T extends TermReference> {
 11+ public abstract class Walker<T extends TermReference> {
 12+ public abstract boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal);
 13+
 14+ public void walk(PhraseNode<T> root, int depth) {
 15+ walk(root, 0, null, depth, Double.MAX_VALUE);
 16+ }
717
 18+ public void walk(PhraseNode<T> root, double baseWeight, List<T> intoSeqence, int depth, double maxWeight) {
 19+ if (depth<1) return;
 20+ if (intoSeqence == null) intoSeqence = new ArrayList<T>();
 21+
 22+ T t = root.getTermReference();
 23+ if (t.getTerm().length()>0) intoSeqence.add(t); //push
 24+ else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes.
 25+
 26+ boolean terminal = (depth<=1) || (baseWeight>=maxWeight);
 27+
 28+ Collection<? extends PhraseNode<T>> successors = terminal ? null : root.getSuccessors();
 29+ if (successors==null || successors.isEmpty()) terminal = true;
 30+
 31+ double w = root.getTermReference().getWeight();
 32+ if (w<0) w = 0;
 33+ if ( !onNode(root, intoSeqence, w, terminal) ) terminal = true;
 34+
 35+ //System.out.println( " - walk: "+intoSeqence+" " );
 36+
 37+ if (!terminal) {
 38+ for (PhraseNode<T> n: successors) {
 39+ w = n.getTermReference().getWeight();
 40+ if (w<0) w = 0;
 41+ walk(n, baseWeight + w, intoSeqence, depth-1, maxWeight);
 42+ }
 43+ }
 44+
 45+ if (t.getTerm().length()>0) intoSeqence.remove(t); //pop
 46+ }
 47+ }
 48+
 49+ public static class SequenceSetBuilder <T extends TermReference> extends Walker<T> {
 50+ protected List<List<T>> sequences;
 51+
 52+ public SequenceSetBuilder() {
 53+ sequences = new ArrayList<List<T>>();
 54+ }
 55+
 56+ public boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal) {
 57+ if (terminal) {
 58+ List<T> p = new ArrayList<T>(sequence); //clone
 59+ sequences.add(p);
 60+ }
 61+
 62+ return !terminal;
 63+ }
 64+
 65+ public List<List<T>> getSequences() {
 66+ return sequences;
 67+ }
 68+ }
 69+
 70+ public static class TermSetBuilder <T extends TermReference> extends Walker<T> {
 71+ protected Set<T> terms;
 72+
 73+ public TermSetBuilder() {
 74+ terms = new HashSet<T>();
 75+ }
 76+
 77+ public boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal) {
 78+ T t = node.getTermReference();
 79+ if (t.getTerm().length()>0) terms.add(t);
 80+
 81+ return !terminal;
 82+ }
 83+
 84+ public Collection<T> getTerms() {
 85+ return terms;
 86+ }
 87+ }
 88+
889 public T getTermReference();
990
1091 public Collection<? extends PhraseNode<T>> getSuccessors();
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
@@ -2,14 +2,16 @@
33
44 import java.io.IOException;
55 import java.net.URISyntaxException;
 6+import java.util.ArrayList;
67 import java.util.Arrays;
 8+import java.util.Collections;
79 import java.util.HashSet;
810 import java.util.List;
911 import java.util.Set;
1012
1113 import de.brightbyte.wikiword.Corpus;
12 -import de.brightbyte.wikiword.analyzer.LanguageConfiguration;
13 -import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer;
 14+import de.brightbyte.wikiword.model.PhraseOccurance;
 15+import de.brightbyte.wikiword.model.PhraseOccuranceSet;
1416
1517 /**
1618 * Unit tests for PlainTextAnalyzer
@@ -60,8 +62,8 @@
6163 words = extractWords("foo-bar");
6264 assertEquals(theList( "foo-bar" ), words);
6365
64 - words = extractWords("harald's 'schaaand");
65 - assertEquals(theList( "harald's", "'schaaand" ), words);
 66+ words = extractWords("harald's 'schlaaand");
 67+ assertEquals(theList( "harald's", "'schlaaand" ), words);
6668
6769 words = extractWords("23-42");
6870 assertEquals(theList( "23-42" ), words);
@@ -69,6 +71,92 @@
7072 words = extractWords("23foo42");
7173 assertEquals(theList( "23", "foo", "42" ), words);
7274 }
 75+
 76+ public void testExtractPhrases() {
 77+ PhraseOccuranceSet phrases = extractPhrases("", 3);
 78+ assertEquals(0, phrases.size());
 79+ assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));
 80+
 81+ phrases = extractPhrases("foo", 3);
 82+ assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(0)));
 83+
 84+ phrases = extractPhrases(" foo ", 3);
 85+ assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));
 86+ assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(1)));
 87+ assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesFrom(0)));
 88+ }
 89+
 90+ public void testExtractPhrases2() {
 91+ PhraseOccuranceSet phrases = extractPhrases("red green blue yellow black", 3);
 92+ assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
 93+ assertEquals(theList( "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
 94+
 95+ phrases = extractPhrases("red green blue yellow black", 5);
 96+ assertEquals(theList( "red green blue yellow black", "red green blue yellow", "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
 97+ assertEquals(theList( "green blue yellow black", "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
 98+
 99+ phrases = extractPhrases("and red and green and blue and yellow", 3);
 100+ assertEquals(theList( "and red and green and blue",
 101+ "and red and green and",
 102+ "and red and green",
 103+ "and red and",
 104+ "and red"
 105+ ),
 106+ getWordList(phrases.getPhrasesAt(0)));
 107+ assertEquals(theList( "red and green and blue",
 108+ "red and green and",
 109+ "red and green",
 110+ "red and",
 111+ "red"
 112+ ),
 113+ getWordList(phrases.getPhrasesAt(4)));
 114+
 115+ phrases = extractPhrases("red green blue. yellow black", 5);
 116+ assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
 117+ assertEquals(theList( "blue" ), getWordList(phrases.getPhrasesAt(10)));
 118+ assertEquals(theList( "yellow black", "yellow" ), getWordList(phrases.getPhrasesAt(16)));
 119+ }
 120+
 121+ public void testExtractPhrases3() {
 122+ PhraseOccuranceSet phrases = extractPhrases("Krababbel: l'Foo-Bar", 3);
 123+ assertEquals(theList( "Krababbel"), getWordList(phrases.getPhrasesAt(0)));
 124+
 125+ assertEquals(theList( "l'Foo-Bar",
 126+ "l'Foo"
 127+ ),
 128+ getWordList(phrases.getPhrasesAt(11)));
 129+
 130+ assertEquals(theList( "Foo-Bar",
 131+ "Foo"
 132+ ),
 133+ getWordList(phrases.getPhrasesAt(13)));
 134+
 135+ assertEquals(theList( "Bar"),
 136+ getWordList(phrases.getPhrasesAt(17)));
 137+
 138+ phrases = extractPhrases("harald's 'schlaaand", 3);
 139+ assertEquals(theList( "harald's 'schlaaand",
 140+ "harald's",
 141+ "harald"
 142+ ),
 143+ getWordList(phrases.getPhrasesAt(0)));
 144+
 145+ assertEquals(theList( "'schlaaand"), getWordList(phrases.getPhrasesAt(9)));
 146+ assertEquals(theList("schlaaand"), getWordList(phrases.getPhrasesAt(10)));
 147+ }
 148+
 149+ private List<String> getWordList(List<PhraseOccurance> phrases) {
 150+ if (phrases==null) return Collections.emptyList();
 151+
 152+ List<String> words = new ArrayList<String>(phrases.size());
 153+
 154+ for (PhraseOccurance phrase: phrases) {
 155+ String w = phrase.getTerm();
 156+ words.add(w);
 157+ }
 158+
 159+ return words;
 160+ }
73161
74162 }
75163
@@ -86,7 +174,7 @@
87175 public void setUp() throws URISyntaxException, IOException {
88176 LanguageConfiguration config = new LanguageConfiguration();
89177
90 - corpus = new Corpus("TEST", "generic", "generic", "generic", "generic", "xx", "generic", null);
 178+ corpus = new Corpus("TEST", "en", "en", "en", "en", "en", "en", null);
91179 testAnalyzer = new TestPlainTextAnalyzer(corpus);
92180 testAnalyzer.configure(config, tweaks);
93181 testAnalyzer.initialize();
@@ -102,6 +190,18 @@
103191 testAnalyzer.testExtractWords();
104192 }
105193
 194+ public void testExtractPhrases() {
 195+ testAnalyzer.testExtractPhrases();
 196+ }
 197+
 198+ public void testExtractPhrases2() {
 199+ testAnalyzer.testExtractPhrases2();
 200+ }
 201+
 202+ public void testExtractPhrases3() {
 203+ testAnalyzer.testExtractPhrases3();
 204+ }
 205+
106206 public static void main(String[] args) {
107207 run(PlainTextAnalyzerTest.class, args);
108208 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java
@@ -1,113 +0,0 @@
2 -package de.brightbyte.wikiword.analyzer;
3 -
4 -import java.util.ArrayList;
5 -import java.util.Collection;
6 -import java.util.Iterator;
7 -import java.util.regex.Matcher;
8 -
9 -import de.brightbyte.wikiword.model.PhraseOccurance;
10 -
11 -public class PhraseAggregator {
12 - public class PhraseBuilder {
13 - protected StringBuilder phrase;
14 - protected int weight;
15 - protected int lastWeight;
16 - protected int offset;
17 -
18 - public PhraseBuilder(int offset) {
19 - this.phrase = new StringBuilder();
20 - this.weight = 0;
21 - this.offset = offset;
22 - }
23 -
24 - public int getLength() {
25 - return phrase.length();
26 - }
27 -
28 - public int getOffset() {
29 - return offset;
30 - }
31 -
32 - public int getEndOffset() {
33 - return getOffset() + getLength();
34 - }
35 -
36 - public String getPhrase() {
37 - return phrase.toString();
38 - }
39 -
40 - public int getWeight() {
41 - return weight;
42 - }
43 -
44 - public int getLastWeight() {
45 - return lastWeight;
46 - }
47 -
48 - public PhraseOccurance toPhraseOccurance() {
49 - return new PhraseOccurance(getPhrase(), getWeight(), getOffset(), getLength());
50 - }
51 -
52 - public String toString() {
53 - return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";
54 - }
55 -
56 - public void push(CharSequence w, int weight) {
57 - phrase.append(w);
58 - if (weight>0) this.weight+= weight;
59 - this.lastWeight = weight;
60 - }
61 - }
62 -
63 - private int offset = 0;
64 - private int maxWeight = 0;
65 -
66 - private Matcher phraseBreakeMatcher;
67 - private ArrayList<PhraseBuilder> phrases = new ArrayList<PhraseBuilder>();
68 -
69 - public PhraseAggregator(Matcher phraseBreakeMatcher) {
70 - super();
71 - this.phraseBreakeMatcher = phraseBreakeMatcher;
72 - }
73 -
74 - public void reset(int offset, int maxWeight) {
75 - this.offset = offset;
76 - this.maxWeight = maxWeight;
77 - clear();
78 - }
79 -
80 - public void clear() {
81 - phrases.clear();
82 - }
83 -
84 - public void update(int index, CharSequence word, int weight, Collection<PhraseOccurance> into) {
85 - if (weight<0) {
86 - phraseBreakeMatcher.reset(word);
87 - if (phraseBreakeMatcher.matches()) {
88 - this.clear();
89 - return;
90 - }
91 - }
92 -
93 - this.push(index, word, weight);
94 - this.commit(into);
95 - }
96 -
97 - public void push(int index, CharSequence word, int weight) {
98 - if (weight >= 0) phrases.add(new PhraseBuilder(index+offset));
99 -
100 - Iterator<PhraseBuilder> it = phrases.iterator();
101 - while (it.hasNext()) {
102 - PhraseBuilder b = it.next();
103 - b.push(word, weight);
104 - if (b.getWeight() > maxWeight) it.remove();
105 - }
106 - }
107 -
108 - public void commit(Collection<PhraseOccurance> into) {
109 - for (PhraseBuilder b: phrases) {
110 - if (b.getWeight() > 0 && b.getLastWeight() > 0) into.add(b.toPhraseOccurance());
111 - }
112 - }
113 -
114 -}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
@@ -60,10 +60,18 @@
6161
6262 /**
6363 * A pattern matching individual words, for splitting a string into words. This is usually
64 - * set to match any sequence of letters but not numbers, whitespace or punctuation.
 64+ * set to match any sequence of letters but not numbers, whitespace or punctuation, except
 65+ * those that may occur as part of a word, such as an apostrophe or hyphen.
6566 */
6667 public Pattern wordPattern;
6768
 69+ /**
 70+ * A pattern matching individual word parts, for splitting a word into components. This is usually
 71+ * set to match any sequence of letters but not numbers, whitespace or punctuation, nor
 72+ * those that may occur as part of a word, such as an apostrophe or hyphen.
 73+ */
 74+ public Pattern wordPartPattern;
 75+
6876 protected String languageName;
6977
7078 /**
@@ -98,7 +106,8 @@
99107 }
100108
101109 public void defaults() throws IOException {
102 - if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}'']+(?:[\\p{Pc}\\p{Pd}][\\p{L}'']+)*|\\p{Nd}+(?:.\\p{Nd}+)?");
 110+ if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}']+(?:[\\p{Pc}\\p{Pd}][\\p{L}']+)*|\\p{Nd}+(?:.\\p{Nd}+)?");
 111+ if (this.wordPartPattern==null) this.wordPartPattern = Pattern.compile("[\\p{L}]+|\\p{Nd}+");
103112
104113 this.sentenceManglers.add( new RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks
105114 this.sentenceManglers.add( new RegularExpressionMangler("^([^\\p{L}]*(\\r\\n|\\r|\\n))+[^\\p{L}0-9]*\\s*", "", 0) ); //strip leading cruft (lines without any characters)
@@ -110,7 +119,7 @@
111120 List<String> stop = AuxilliaryWikiProperties.loadList("Stopwords", languageName);
112121 if (stop!=null) this.stopwords.addAll(stop);
113122
114 - this.phraseBreakerPattern = Pattern.compile("[,;:]\\s|\"");
 123+ this.phraseBreakerPattern = Pattern.compile("[,;:\".!?]\\s*");
115124 this.parentacies = new ArrayList<Pair<String, String>>();
116125 this.parentacies.add( new Pair<String, String>("(", ")") );
117126 this.parentacies.add( new Pair<String, String>("[", "]") );
@@ -126,6 +135,7 @@
127136 if (with.sentenceManglers!=null) sentenceManglers.addAll(with.sentenceManglers);
128137
129138 if (with.wordPattern!=null) wordPattern = with.wordPattern;
 139+ if (with.wordPartPattern!=null) wordPartPattern = with.wordPartPattern;
130140 if (with.phraseBreakerPattern!=null) phraseBreakerPattern = with.phraseBreakerPattern;
131141
132142 if (with.stopwords!=null) stopwords.addAll(with.stopwords);
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
@@ -13,8 +13,8 @@
1414
1515 import de.brightbyte.application.Arguments;
1616 import de.brightbyte.audit.DebugUtil;
 17+import de.brightbyte.data.Lookup;
1718 import de.brightbyte.data.MapLookup;
18 -import de.brightbyte.data.Lookup;
1919 import de.brightbyte.data.filter.Filter;
2020 import de.brightbyte.data.filter.FixedSetFilter;
2121 import de.brightbyte.io.ConsoleIO;
@@ -31,6 +31,7 @@
3232 private Matcher sentenceTailGlueMatcher;
3333 private Matcher sentenceFollowGlueMatcher;
3434 private Matcher wordMatcher;
 35+ private Matcher wordPartMatcher;
3536
3637 protected Filter<String> stopwordFilter;
3738 protected Matcher phraseBreakeMatcher;
@@ -94,6 +95,7 @@
9596 sentenceTailGlueMatcher = config.sentenceTailGluePattern.matcher("");
9697 sentenceFollowGlueMatcher = config.sentenceFollowGluePattern.matcher("");
9798 wordMatcher = config.wordPattern.matcher("");
 99+ wordPartMatcher = config.wordPartPattern.matcher("");
98100
99101 phraseBreakeMatcher = config.phraseBreakerPattern.matcher("");
100102 stopwordFilter = new FixedSetFilter<String>(config.stopwords);
@@ -191,6 +193,7 @@
192194 return corpus;
193195 }
194196
 197+ /*
195198 public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) {
196199 ArrayList<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
197200
@@ -231,13 +234,121 @@
232235
233236 if (stopwordFilter.matches(w)) weight = 0;
234237 buildPhrasesAggregator.update(wordMatcher.start(), w, weight, into);
 238+
 239+ //after adding the word, now register word parts
 240+ int j = 0;
 241+ wordPartMatcher.reset(w);
 242+ while (wordPartMatcher.find()) {
 243+ if (wordPartMatcher.start() == 0 && wordPartMatcher.end() == w.length()) {
 244+ break; //full word matched as a single part. no need to register parts.
 245+ }
 246+
 247+ if (j != wordPartMatcher.start()) {
 248+ CharSequence glue = w.subSequence(j, wordPartMatcher.start());
 249+ buildPhrasesAggregator.update(i, glue, -1, into);
 250+ }
 251+
 252+ j = wordPartMatcher.end();
 253+ weight = 1;
 254+ String p;
 255+
 256+ if (wordPartMatcher.groupCount()>0) p = wordPartMatcher.group(1);
 257+ else p = wordPartMatcher.group(0);
 258+
 259+ if (stopwordFilter.matches(p)) weight = 0;
 260+ buildPhrasesAggregator.update(i+wordPartMatcher.start(), p, weight, into);
 261+ }
 262+
 263+ if (j>0 && j < w.length()) {
 264+ CharSequence glue = text.subSequence(j, w.length());
 265+ buildPhrasesAggregator.update(j, glue, -1, into);
 266+ }
235267 }
236268
237269 if (i < text.length()) {
238270 CharSequence space = text.subSequence(i, text.length());
239 - buildPhrasesAggregator.update(i, space, 0, into);
 271+ buildPhrasesAggregator.update(i, space, -1, into);
240272 }
 273+ } */
 274+
 275+ public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) {
 276+ PhraseOccuranceSet phrases = new PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
 277+
 278+ text = applyManglers(config.sentenceManglers, text);
 279+
 280+ ParsePosition pos = new ParsePosition(0);
 281+ while (pos.getIndex() < text.length()) {
 282+ int ofs = pos.getIndex();
 283+ CharSequence s = extractNextSentence(text, pos, false);
 284+ if (s==null || s.length()==0) break;
 285+
 286+ buildPhrases(s, ofs, phrases, maxWeight);
 287+ if (phrases.isEmpty()) continue;
 288+
 289+ phrases.buildAggregatePhrases(ofs, 0, maxWeight, phraseBreakeMatcher);
 290+ }
 291+
 292+ if (phrases.isEmpty()) return phrases;
 293+
 294+ phrases.prune(1);
 295+ return phrases;
241296 }
 297+
 298+ private void buildPhrases(CharSequence text, int offset, PhraseOccuranceSet into, int maxWeight) {
 299+ int i = 0;
 300+ wordMatcher.reset(text);
 301+ while (wordMatcher.find()) {
 302+ if (i != wordMatcher.start()) {
 303+ CharSequence space = text.subSequence(i, wordMatcher.start());
 304+ into.add( new PhraseOccurance(space.toString(), -1, offset+i, space.length()) );
 305+ }
 306+
 307+ i = wordMatcher.end();
 308+ String w;
 309+ int weight = 1;
 310+
 311+ if (wordMatcher.groupCount()>0) w = wordMatcher.group(1);
 312+ else w = wordMatcher.group(0);
 313+
 314+ if (stopwordFilter.matches(w)) weight = 0;
 315+ into.add( new PhraseOccurance(w, weight, offset+wordMatcher.start(), w.length()) );
 316+
 317+ //after adding the word, now register word parts
 318+ int j = 0;
 319+ int b = wordMatcher.start();
 320+ wordPartMatcher.reset(w);
 321+ while (wordPartMatcher.find()) {
 322+ if (wordPartMatcher.start() == 0 && wordPartMatcher.end() == w.length()) {
 323+ break; //full word matched as a single part. no need to register parts.
 324+ }
 325+
 326+ if (j != wordPartMatcher.start()) {
 327+ CharSequence glue = w.subSequence(j, wordPartMatcher.start());
 328+ into.add( new PhraseOccurance(glue.toString(), -1, offset+b+j, glue.length()) );
 329+ }
 330+
 331+ j = wordPartMatcher.end();
 332+ weight = 1;
 333+ String p;
 334+
 335+ if (wordPartMatcher.groupCount()>0) p = wordPartMatcher.group(1);
 336+ else p = wordPartMatcher.group(0);
 337+
 338+ if (stopwordFilter.matches(p)) weight = 0;
 339+ into.add( new PhraseOccurance(p, weight, offset+b+wordPartMatcher.start(), p.length()) );
 340+ }
 341+
 342+ if (j>0 && j < w.length()) {
 343+ CharSequence glue = text.subSequence(j, w.length());
 344+ into.add( new PhraseOccurance(glue.toString(), -1, offset+b+j, glue.length()) );
 345+ }
 346+ }
 347+
 348+ if (i < text.length()) {
 349+ CharSequence space = text.subSequence(i, text.length());
 350+ into.add( new PhraseOccurance(space.toString(), -1, offset+i, space.length()) );
 351+ }
 352+ }
242353
243354 public static void main(String[] argv) throws IOException, InstantiationException {
244355 Arguments args = new Arguments();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en_int.java
@@ -0,0 +1,10 @@
 2+package de.brightbyte.wikiword.wikis;
 3+
 4+public class LanguageConfiguration_en_int extends LanguageConfiguration_en {
 5+
 6+ //TODO: list of abbreviations
 7+ public LanguageConfiguration_en_int() {
 8+ super();
 9+ }
 10+
 11+}

Status & tagging log