r65495 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r65494‎ | r65495 | r65496 >
Date:21:01, 23 April 2010
Author:daniel
Status:deferred
Tags:
Comment:
add support for phrase detection to disambiguator package. incomplete, does not compile.
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningCache.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningFetcher.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java (deleted) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermListNode.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningFetcher.java
@@ -1,5 +1,6 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.Collection;
45 import java.util.List;
56 import java.util.Map;
67
@@ -10,5 +11,5 @@
1112 public interface MeaningFetcher<C extends WikiWordConcept> {
1213 public List<? extends C> getMeanings(String term) throws PersistenceException;
1314
14 - public <X extends TermReference>Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException;
 15+ public <X extends TermReference>Map<X, List<? extends C>> getMeanings(Collection<X> terms) throws PersistenceException;
1516 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java
@@ -16,6 +16,7 @@
1717 import de.brightbyte.data.measure.Similarity;
1818 import de.brightbyte.util.PersistenceException;
1919 import de.brightbyte.wikiword.model.LocalConcept;
 20+import de.brightbyte.wikiword.model.PhraseNode;
2021 import de.brightbyte.wikiword.model.TermReference;
2122 import de.brightbyte.wikiword.model.WikiWordConcept;
2223
@@ -39,14 +40,14 @@
4041 /* (non-Javadoc)
4142 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
4243 */
43 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 44+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
4445 if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings");
4546
4647 int sz = Math.min(terms.size(), meanings.size());
4748 if (context!=null) sz += context.size();
4849
4950 if (window < 2 || sz<2) {
50 - return popularityDisambiguator.disambiguate(terms, meanings, context);
 51+ return popularityDisambiguator.disambiguate(root, terms, meanings, context);
5152 }
5253
5354 pruneMeanings(meanings);
@@ -55,7 +56,7 @@
5657 if (context!=null) sz += context.size();
5758
5859 if (sz<2) {
59 - return popularityDisambiguator.disambiguate(terms, meanings, context);
 60+ return popularityDisambiguator.disambiguate(root, terms, meanings, context);
6061 }
6162
6263 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
@@ -77,7 +78,7 @@
7879 Result r ;
7980
8081 if (to-from < 2) {
81 - r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings, context);
 82+ r = popularityDisambiguator.disambiguate(root..., terms.subList(from, to), meanings, context);
8283 } else {
8384 List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings);
8485 r = getBestInterpretation(terms, meanings, context, interpretations, similarities, features);
@@ -124,7 +125,7 @@
125126 mset.put(t, m);
126127 }
127128
128 - return getInterpretations(terms.subList(from, to), mset);
 129+ return getSequenceInterpretations(terms.subList(from, to), mset);
129130 }
130131
131132 public boolean getRunningStart() {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
@@ -1,5 +1,6 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.Collection;
45 import java.util.HashMap;
56 import java.util.List;
67 import java.util.Map;
@@ -34,7 +35,7 @@
3536 return m.load();
3637 }
3738
38 - public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings(List<X> terms) throws PersistenceException {
 39+ public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings(Collection<X> terms) throws PersistenceException {
3940 Map<X, List<? extends LocalConcept>> meanings = new HashMap<X, List<? extends LocalConcept>>();
4041
4142 for (X t: terms) {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningCache.java
@@ -1,6 +1,7 @@
22 package de.brightbyte.wikiword.disambig;
33
44 import java.util.ArrayList;
 5+import java.util.Collection;
56 import java.util.HashMap;
67 import java.util.List;
78 import java.util.Map;
@@ -79,7 +80,7 @@
8081 }
8182
8283
83 - public <X extends TermReference> Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException {
 84+ public <X extends TermReference> Map<X, List<? extends C>> getMeanings(Collection<X> terms) throws PersistenceException {
8485 Map<X, List<? extends C>> meanings= new HashMap<X, List<? extends C>>();
8586 List<X> todo = new ArrayList<X>(terms.size());
8687
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -9,6 +9,7 @@
1010 import de.brightbyte.data.measure.Measure;
1111 import de.brightbyte.data.measure.Measure.Comparator;
1212 import de.brightbyte.wikiword.model.LocalConcept;
 13+import de.brightbyte.wikiword.model.PhraseNode;
1314 import de.brightbyte.wikiword.model.TermReference;
1415 import de.brightbyte.wikiword.model.WikiWordConcept;
1516
@@ -36,7 +37,7 @@
3738 this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true);
3839 }
3940
40 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) {
 41+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) {
4142 if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings");
4243
4344 Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>();
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -22,6 +22,7 @@
2323 import de.brightbyte.util.PersistenceException;
2424 import de.brightbyte.wikiword.model.ConceptFeatures;
2525 import de.brightbyte.wikiword.model.LocalConcept;
 26+import de.brightbyte.wikiword.model.PhraseNode;
2627 import de.brightbyte.wikiword.model.TermReference;
2728 import de.brightbyte.wikiword.model.WikiWordConcept;
2829
@@ -180,14 +181,14 @@
181182 /* (non-Javadoc)
182183 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
183184 */
184 - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 185+ public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
185186 if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings");
186187
187188 int sz = Math.min(terms.size(), meanings.size());
188189 if (context!=null) sz += context.size();
189190
190191 if (sz<2) {
191 - return popularityDisambiguator.disambiguate(terms, meanings, context);
 192+ return popularityDisambiguator.disambiguate(root, terms, meanings, context);
192193 }
193194
194195 pruneMeanings(meanings);
@@ -195,7 +196,7 @@
196197 sz = Math.min(terms.size(), meanings.size());
197198 if (context!=null) sz += context.size();
198199 if (sz <2) {
199 - return popularityDisambiguator.disambiguate(terms, meanings, context);
 200+ return popularityDisambiguator.disambiguate(root, terms, meanings, context);
200201 }
201202
202203 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
@@ -203,9 +204,10 @@
204205 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
205206 FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context);
206207
207 - List<Map<X, LocalConcept>> interpretations = getInterpretations(terms, meanings);
 208+ Collection<List<X>> sequences = getSequences(root);
 209+ List<Map<X, LocalConcept>> interpretations = getInterpretations(sequences, meanings);
208210
209 - return getBestInterpretation(terms, meanings, context, interpretations, similarities, features);
 211+ return getBestInterpretation(root, terms, meanings, context, interpretations, similarities, features);
210212 }
211213
212214 protected void pruneMeanings(Map<? extends TermReference, List<? extends LocalConcept>> meanings) {
@@ -235,7 +237,7 @@
236238 }
237239 }
238240
239 - protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(List<X> terms, Map<X, List<? extends LocalConcept>> meanings,
 241+ protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings,
240242 Collection<LocalConcept> context, List<Map<X, LocalConcept>> interpretations,
241243 LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
242244
@@ -254,7 +256,7 @@
255257 }
256258
257259 if (rankings.size()==0) {
258 - return popularityDisambiguator.disambiguate(terms, meanings, context);
 260+ return popularityDisambiguator.disambiguate(root, terms, meanings, context);
259261 }
260262
261263 Collections.sort(rankings);
@@ -265,7 +267,17 @@
266268 return r;
267269 }
268270
269 - protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) {
 271+ protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(Collection<List<X>> sequences, Map<X, List<? extends LocalConcept>> meanings) {
 272+ List<Map<X, LocalConcept>> interpretations = new ArrayList<Map<X, LocalConcept>>();
 273+ for (List<X> sq: sequences) {
 274+ List<Map<X, LocalConcept>> sqint = getSequenceInterpretations(sq, meanings);
 275+ interpretations.addAll(sqint);
 276+ }
 277+
 278+ return interpretations;
 279+ }
 280+
 281+ protected <X extends TermReference>List<Map<X, LocalConcept>> getSequenceInterpretations(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) {
270282 if (terms.size()==0) {
271283 return Collections.singletonList(Collections.<X, LocalConcept>emptyMap());
272284 }
@@ -273,7 +285,7 @@
274286 X t = terms.get(0);
275287 List<? extends LocalConcept> m = meanings.get(t);
276288
277 - List<Map<X, LocalConcept>> base = getInterpretations(terms.subList(1, terms.size()), meanings);
 289+ List<Map<X, LocalConcept>> base = getSequenceInterpretations(terms.subList(1, terms.size()), meanings);
278290
279291 if (m==null || m.size()==0) return base;
280292
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java
@@ -6,6 +6,7 @@
77
88 import de.brightbyte.io.Output;
99 import de.brightbyte.util.PersistenceException;
 10+import de.brightbyte.wikiword.model.PhraseNode;
1011 import de.brightbyte.wikiword.model.TermReference;
1112 import de.brightbyte.wikiword.model.WikiWordConcept;
1213
@@ -54,5 +55,6 @@
5556 public void setTrace(Output trace);
5657
5758 public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException;
 59+ public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException;
5860
5961 }
\ No newline at end of file
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -8,11 +8,53 @@
99
1010 import de.brightbyte.io.Output;
1111 import de.brightbyte.util.PersistenceException;
 12+import de.brightbyte.wikiword.model.PhraseNode;
 13+import de.brightbyte.wikiword.model.TermListNode;
1214 import de.brightbyte.wikiword.model.TermReference;
1315 import de.brightbyte.wikiword.model.WikiWordConcept;
1416
1517 public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> {
1618
 19+ public interface NodeListener<T extends TermReference> { //callback used by walk(): receives every node visited during traversal of a phrase graph
 20+ public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence); //node: the node being visited; seqence: the non-empty terms on the path from the root up to and including this node (the list is reused by walk(), so copy it if it must be kept)
 21+ }
 22+
 23+ public static class SequenceSetBuilder <T extends TermReference> implements NodeListener<T> { //listener that records one complete term sequence per leaf node reached
 24+ protected List<List<T>> seqencees; //all sequences collected so far, in visiting order
 25+
 26+ public SequenceSetBuilder() {
 27+ seqencees = new ArrayList<List<T>>();
 28+ }
 29+
 30+ public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) {
 31+ if (node.getSuccessors().isEmpty()) { //is leaf: a node without successors ends a path
 32+ List<T> p = new ArrayList<T>(seqence); //clone, because the caller keeps mutating the list it passed in
 33+ seqencees.add(p);
 34+ }
 35+ }
 36+
 37+ public List<List<T>> getSequences() { //returns the internal list itself, not a copy
 38+ return seqencees;
 39+ }
 40+ }
 41+
 42+ public static class TermSetBuilder <T extends TermReference> implements NodeListener<T> { //listener that collects the term reference of every visited node
 43+ protected List<T> terms; //collected terms in visiting order; no de-duplication is done here, so a node visited twice contributes twice
 44+
 45+ public TermSetBuilder() {
 46+ terms = new ArrayList<T>();
 47+ }
 48+
 49+ public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) { //seqence is not used by this listener
 50+ T t = node.getTermReference();
 51+ if (t.getTerm().length()>0) terms.add(t); //nodes whose term text is empty are skipped
 52+ }
 53+
 54+ public List<T> getTerms() { //returns the internal list itself, not a copy
 55+ return terms;
 56+ }
 57+ }
 58+
1759 private MeaningCache.Manager<C> meaningCacheManager;
1860
1961 private Output trace;
@@ -28,9 +70,44 @@
2971 this.meaningOverrides = overrideMap;
3072 }
3173
32 - protected <X extends T>Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException {
33 - List<X> todo = terms;
 74+ protected <X extends T>Collection<X> getTerms(PhraseNode<X> root) {
 75+ TermSetBuilder<X> builder = new TermSetBuilder<X>();
 76+ walk(root, null, builder);
 77+ return builder.getTerms();
 78+ }
 79+
 80+ protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root) {
 81+ SequenceSetBuilder<X> builder = new SequenceSetBuilder<X>();
 82+ walk(root, null, builder);
 83+ return builder.getSequences();
 84+ }
 85+
 86+ protected <X extends T>void walk(PhraseNode<X> root, List<X> seqence, NodeListener<? super X> nodeListener) {
 87+ if (seqence == null) seqence = new ArrayList<X>();
3488
 89+ X t = root.getTermReference();
 90+ if (t.getTerm().length()>0) seqence.add(t); //push
 91+
 92+ if (nodeListener!=null)
 93+ nodeListener.onNode(root, seqence);
 94+
 95+ List<? extends PhraseNode<X>> successors = root.getSuccessors();
 96+
 97+ for (PhraseNode<X> n: successors) {
 98+ walk(n, seqence, nodeListener);
 99+ }
 100+
 101+ if (t.getTerm().length()>0) seqence.remove(t); //pop
 102+ }
 103+
 104+ protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException {
 105+ Collection<X> terms = getTerms(root);
 106+ return getMeanings(terms);
 107+ }
 108+
 109+ protected <X extends T>Map<X, List<? extends C>> getMeanings(Collection<X> terms) throws PersistenceException {
 110+ Collection<X> todo = terms;
 111+
35112 if (meaningOverrides!=null) {
36113 todo = new ArrayList<X>();
37114 for (X t: terms) {
@@ -52,11 +129,16 @@
53130 }
54131
55132 public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException {
 133+ return this.<X>disambiguate(new TermListNode<X>(terms, 0), context);
 134+ }
 135+
 136+ public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException {
 137+ Collection<X> terms = getTerms(root);
56138 Map<X, List<? extends C>> meanings = getMeanings(terms);
57 - return disambiguate(terms, meanings, context);
 139+ return disambiguate(root, terms, meanings, context);
58140 }
59141
60 - public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException;
 142+ public abstract <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException;
61143
62144 public Output getTrace() {
63145 return trace;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java
@@ -1,82 +0,0 @@
2 -package de.brightbyte.wikiword.model;
3 -
4 -import java.util.AbstractList;
5 -import java.util.ArrayList;
6 -import java.util.Collections;
7 -import java.util.List;
8 -import java.util.RandomAccess;
9 -
10 -import de.brightbyte.data.filter.Filter;
11 -
12 -public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess {
13 -
14 - protected List<PhraseOccurance> phrases;
15 - protected String text;
16 -
17 - public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) {
18 - this.text = text;
19 -
20 - this.phrases = phrases;
21 - Collections.sort(this.phrases); //essential!
22 - }
23 -
24 - @Override
25 - public PhraseOccurance get(int index) {
26 - return phrases.get(index);
27 - }
28 -
29 - @Override
30 - public int size() {
31 - return phrases.size();
32 - }
33 -
34 - public String getText() {
35 - return text;
36 - }
37 -
38 - public List<PhraseOccurance> getPhrasesAt(int offs) {
39 - int i = 0;
40 - while (i<size()) {
41 - PhraseOccurance p = get(i);
42 - if (p.getOffset() >= offs) {
43 - offs = p.getOffset();
44 - break;
45 - }
46 -
47 - i++;
48 - }
49 -
50 - if (i>=size()) return null;
51 -
52 - int j = i;
53 - while (j<size()) {
54 - PhraseOccurance p = get(j);
55 - if (p.getOffset() > offs) break;
56 - j++;
57 - }
58 -
59 - return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first.
60 - }
61 -
62 - public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) {
63 - List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
64 -
65 - int i = 0;
66 -
67 - outer:
68 - while (i<size()) {
69 - List<PhraseOccurance> candidates = getPhrasesAt(i);
70 - if (candidates == null) break;
71 -
72 - for (PhraseOccurance p: candidates) {
73 - i = p.getEndOffset();
74 - if (filter==null || filter.matches(p.getPhrase())) {
75 - phrases.add(p);
76 - continue outer;
77 - }
78 - }
79 - }
80 -
81 - return phrases;
82 - }
83 -}
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
@@ -0,0 +1,136 @@
 2+package de.brightbyte.wikiword.model;
 3+
 4+import java.util.AbstractList;
 5+import java.util.ArrayList;
 6+import java.util.Collections;
 7+import java.util.List;
 8+import java.util.RandomAccess;
 9+
 10+import de.brightbyte.wikiword.disambig.Term;
 11+
 12+public class PhraseOccuranceSet extends AbstractList<PhraseOccurance> implements RandomAccess {
 13+
 14+ protected class Node implements PhraseNode<PhraseOccurance> {
 15+ protected PhraseOccurance phrase;
 16+
 17+ public Node(PhraseOccurance phrase) {
 18+ super();
 19+ this.phrase = phrase;
 20+ }
 21+
 22+ public List<? extends PhraseNode<PhraseOccurance>> getSuccessors() {
 23+ return PhraseOccuranceSet.this.getPhraseNodesAt(phrase.getEndOffset());
 24+ }
 25+
 26+ public PhraseOccurance getTermReference() {
 27+ return phrase;
 28+ }
 29+
 30+ public String toString() {
 31+ return phrase.toString();
 32+ }
 33+ }
 34+
 35+ protected List<PhraseOccurance> phrases;
 36+ protected String text;
 37+
 38+ public PhraseOccuranceSet(String text, List<PhraseOccurance> phrases) {
 39+ this.text = text;
 40+
 41+ this.phrases = phrases;
 42+ Collections.sort(this.phrases); //essential!
 43+ }
 44+
 45+ @Override
 46+ public PhraseOccurance get(int index) {
 47+ return phrases.get(index);
 48+ }
 49+
 50+ @Override
 51+ public int size() {
 52+ return phrases.size();
 53+ }
 54+
 55+ public String getText() {
 56+ return text;
 57+ }
 58+
 59+ public PhraseNode<PhraseOccurance> getRootNode() {
 60+ return getRootNodeAt(0);
 61+ }
 62+
 63+ public PhraseNode<PhraseOccurance> getRootNodeAt(final int ofs) {
 64+ return new PhraseNode<PhraseOccurance>(){
 65+ public PhraseOccurance getTermReference() {
 66+ return new PhraseOccurance("", 0, ofs, 0);
 67+ }
 68+
 69+ public List<? extends PhraseNode<PhraseOccurance>> getSuccessors() {
 70+ return getPhraseNodesAt(ofs);
 71+ }
 72+
 73+ public String toString() {
 74+ return "(root#"+ofs+")";
 75+ }
 76+ };
 77+ }
 78+
 79+ public List<? extends PhraseNode<PhraseOccurance>> getPhraseNodesAt(int offs) {
 80+ List<PhraseOccurance> phrases = getPhrasesAt(offs);
 81+ List<Node> nodes = new ArrayList<Node>(phrases.size());
 82+
 83+ for (PhraseOccurance p: phrases) {
 84+ nodes.add(new Node(p));
 85+ }
 86+
 87+ return nodes;
 88+ }
 89+
 90+ public List<PhraseOccurance> getPhrasesAt(int offs) { //returns all phrases starting at the first phrase offset that is >= offs, or null if there is none
 91+ int i = 0;
 92+ while (i<size()) { //linear scan for the first phrase starting at or after offs; relies on the list having been sorted in the constructor
 93+ PhraseOccurance p = get(i);
 94+ if (p.getOffset() >= offs) {
 95+ offs = p.getOffset(); //snap to the actual start offset of that phrase
 96+ break;
 97+ }
 98+
 99+ i++;
 100+ }
 101+
 102+ if (i>=size()) return null; //nothing starts at or after offs; callers must handle null
 103+
 104+ int j = i;
 105+ while (j<size()) { //extend the range over every phrase sharing the same start offset
 106+ PhraseOccurance p = get(j);
 107+ if (p.getOffset() > offs) break;
 108+ j++;
 109+ }
 110+
 111+ return subList(i, j); //NOTE: PhraseOccurance.compareTo assures that longest phrases come first.
 112+ }
 113+
 114+ /*
 115+ public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) {
 116+ List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
 117+
 118+ int i = 0;
 119+
 120+ outer:
 121+ while (i<size()) {
 122+ List<PhraseOccurance> candidates = getPhrasesAt(i);
 123+ if (candidates == null) break;
 124+
 125+ for (PhraseOccurance p: candidates) {
 126+ i = p.getEndOffset();
 127+ if (filter==null || filter.matches(p.getPhrase())) {
 128+ phrases.add(p);
 129+ continue outer;
 130+ }
 131+ }
 132+ }
 133+
 134+ return phrases;
 135+ }
 136+ */
 137+}
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
___________________________________________________________________
Name: svn:mergeinfo
1138 +
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java
@@ -1,6 +1,7 @@
22 package de.brightbyte.wikiword.model;
33
44 import java.io.Serializable;
 5+import java.util.List;
56
67 public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance>, TermReference {
78
@@ -23,7 +24,7 @@
2425 this.offset = offset;
2526 this.length = length;
2627 }
27 -
 28+
2829 public int getLength() {
2930 return length;
3031 }
@@ -99,4 +100,5 @@
100101
101102 return 0;
102103 }
 104+
103105 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java
@@ -0,0 +1,11 @@
 2+package de.brightbyte.wikiword.model;
 3+
 4+import java.util.List;
 5+
 6+public interface PhraseNode<T extends TermReference> {
 7+
 8+ public T getTermReference();
 9+
 10+ public List<? extends PhraseNode<T>> getSuccessors();
 11+
 12+}
\ No newline at end of file
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermListNode.java
@@ -0,0 +1,35 @@
 2+package de.brightbyte.wikiword.model;
 3+
 4+import java.util.Collections;
 5+import java.util.List;
 6+import java.util.NoSuchElementException;
 7+
 8+public class TermListNode<T extends TermReference> implements PhraseNode<T> {
 9+
 10+ protected List<T> terms;
 11+ protected int index;
 12+
 13+ protected List<TermListNode<T>> successors;
 14+
 15+ public TermListNode(List<T> terms, int index) {
 16+ if (terms==null) throw new NullPointerException();
 17+ if (index<0 || index>=terms.size()) throw new NoSuchElementException("index out of range");
 18+
 19+ this.terms = terms;
 20+ this.index = index;
 21+ }
 22+
 23+ public T getTermReference() {
 24+ return terms.get(index);
 25+ }
 26+
 27+ public List<TermListNode<T>> getSuccessors() {
 28+ if (successors == null) {
 29+ if (index+1>=terms.size()) successors = Collections.emptyList();
 30+ else Collections.singletonList(new TermListNode<T>(terms, index+1));
 31+ }
 32+
 33+ return successors;
 34+ }
 35+
 36+}
\ No newline at end of file
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
@@ -21,7 +21,7 @@
2222 import de.brightbyte.wikiword.Corpus;
2323 import de.brightbyte.wikiword.TweakSet;
2424 import de.brightbyte.wikiword.model.PhraseOccurance;
25 -import de.brightbyte.wikiword.model.PhraseOccuranceSequence;
 25+import de.brightbyte.wikiword.model.PhraseOccuranceSet;
2626
2727 public class PlainTextAnalyzer extends AbstractAnalyzer {
2828 private LanguageConfiguration config;
@@ -190,7 +190,7 @@
191191 return corpus;
192192 }
193193
194 - public PhraseOccuranceSequence extractPhrases(CharSequence text, int maxWeight) {
 194+ public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) {
195195 ArrayList<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
196196
197197 text = applyManglers(config.sentenceManglers, text);
@@ -204,7 +204,7 @@
205205 buildPhrases(s, ofs, phrases, maxWeight);
206206 }
207207
208 - return new PhraseOccuranceSequence(text.toString(), phrases);
 208+ return new PhraseOccuranceSet(text.toString(), phrases);
209209 }
210210
211211 private PhraseAggregator buildPhrasesAggregator = null;
@@ -261,7 +261,7 @@
262262 BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
263263 String s ;
264264 while ( (s = in.readLine()) != null ) {
265 - PhraseOccuranceSequence phrases = analyzer.extractPhrases(s, 6);
 265+ PhraseOccuranceSet phrases = analyzer.extractPhrases(s, 6);
266266 DebugUtil.dump("", phrases, ConsoleIO.output);
267267 }
268268 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java
@@ -110,9 +110,9 @@
111111
112112 @Override
113113 protected void process(String line) throws PersistenceException, ParseException {
114 - //TODO: logic for handling overlapping phrases in a PhraseOccuranceSequence
 114+ //TODO: logic for handling overlapping phrases in a PhraseOccuranceSet
115115 /*
116 - PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first.
 116+ PhraseOccuranceSet sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first.
117117 List<PhraseOccurance> phrases = sequence.getDisjointPhraseSequence(null);
118118 Disambiguator.Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(phrases);
119119 return result.toString(); //FIXME: annotate!

Status & tagging log