Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningFetcher.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.Collection; |
4 | 5 | import java.util.List; |
5 | 6 | import java.util.Map; |
6 | 7 | |
— | — | @@ -10,5 +11,5 @@ |
11 | 12 | public interface MeaningFetcher<C extends WikiWordConcept> { |
12 | 13 | public List<? extends C> getMeanings(String term) throws PersistenceException; |
13 | 14 | |
14 | | - public <X extends TermReference>Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException; |
| 15 | + public <X extends TermReference>Map<X, List<? extends C>> getMeanings(Collection<X> terms) throws PersistenceException; |
15 | 16 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -16,6 +16,7 @@ |
17 | 17 | import de.brightbyte.data.measure.Similarity; |
18 | 18 | import de.brightbyte.util.PersistenceException; |
19 | 19 | import de.brightbyte.wikiword.model.LocalConcept; |
| 20 | +import de.brightbyte.wikiword.model.PhraseNode; |
20 | 21 | import de.brightbyte.wikiword.model.TermReference; |
21 | 22 | import de.brightbyte.wikiword.model.WikiWordConcept; |
22 | 23 | |
— | — | @@ -39,14 +40,14 @@ |
40 | 41 | /* (non-Javadoc) |
41 | 42 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
42 | 43 | */ |
43 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 44 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
44 | 45 | if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings"); |
45 | 46 | |
46 | 47 | int sz = Math.min(terms.size(), meanings.size()); |
47 | 48 | if (context!=null) sz += context.size(); |
48 | 49 | |
49 | 50 | if (window < 2 || sz<2) { |
50 | | - return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 51 | + return popularityDisambiguator.disambiguate(root, terms, meanings, context); |
51 | 52 | } |
52 | 53 | |
53 | 54 | pruneMeanings(meanings); |
— | — | @@ -55,7 +56,7 @@ |
56 | 57 | if (context!=null) sz += context.size(); |
57 | 58 | |
58 | 59 | if (sz<2) { |
59 | | - return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 60 | + return popularityDisambiguator.disambiguate(root, terms, meanings, context); |
60 | 61 | } |
61 | 62 | |
62 | 63 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
— | — | @@ -77,7 +78,7 @@ |
78 | 79 | Result r ; |
79 | 80 | |
80 | 81 | if (to-from < 2) { |
81 | | - r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings, context); |
| 82 | + r = popularityDisambiguator.disambiguate(root..., terms.subList(from, to), meanings, context); |
82 | 83 | } else { |
83 | 84 | List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings); |
84 | 85 | r = getBestInterpretation(terms, meanings, context, interpretations, similarities, features); |
— | — | @@ -124,7 +125,7 @@ |
125 | 126 | mset.put(t, m); |
126 | 127 | } |
127 | 128 | |
128 | | - return getInterpretations(terms.subList(from, to), mset); |
| 129 | + return getSequenceInterpretations(terms.subList(from, to), mset); |
129 | 130 | } |
130 | 131 | |
131 | 132 | public boolean getRunningStart() { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.Collection; |
4 | 5 | import java.util.HashMap; |
5 | 6 | import java.util.List; |
6 | 7 | import java.util.Map; |
— | — | @@ -34,7 +35,7 @@ |
35 | 36 | return m.load(); |
36 | 37 | } |
37 | 38 | |
38 | | - public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings(List<X> terms) throws PersistenceException { |
| 39 | + public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings(Collection<X> terms) throws PersistenceException { |
39 | 40 | Map<X, List<? extends LocalConcept>> meanings = new HashMap<X, List<? extends LocalConcept>>(); |
40 | 41 | |
41 | 42 | for (X t: terms) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningCache.java |
— | — | @@ -1,6 +1,7 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
4 | 4 | import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
5 | 6 | import java.util.HashMap; |
6 | 7 | import java.util.List; |
7 | 8 | import java.util.Map; |
— | — | @@ -79,7 +80,7 @@ |
80 | 81 | } |
81 | 82 | |
82 | 83 | |
83 | | - public <X extends TermReference> Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException { |
| 84 | + public <X extends TermReference> Map<X, List<? extends C>> getMeanings(Collection<X> terms) throws PersistenceException { |
84 | 85 | Map<X, List<? extends C>> meanings= new HashMap<X, List<? extends C>>(); |
85 | 86 | List<X> todo = new ArrayList<X>(terms.size()); |
86 | 87 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -9,6 +9,7 @@ |
10 | 10 | import de.brightbyte.data.measure.Measure; |
11 | 11 | import de.brightbyte.data.measure.Measure.Comparator; |
12 | 12 | import de.brightbyte.wikiword.model.LocalConcept; |
| 13 | +import de.brightbyte.wikiword.model.PhraseNode; |
13 | 14 | import de.brightbyte.wikiword.model.TermReference; |
14 | 15 | import de.brightbyte.wikiword.model.WikiWordConcept; |
15 | 16 | |
— | — | @@ -36,7 +37,7 @@ |
37 | 38 | this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true); |
38 | 39 | } |
39 | 40 | |
40 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) { |
| 41 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) { |
41 | 42 | if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings"); |
42 | 43 | |
43 | 44 | Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -22,6 +22,7 @@ |
23 | 23 | import de.brightbyte.util.PersistenceException; |
24 | 24 | import de.brightbyte.wikiword.model.ConceptFeatures; |
25 | 25 | import de.brightbyte.wikiword.model.LocalConcept; |
| 26 | +import de.brightbyte.wikiword.model.PhraseNode; |
26 | 27 | import de.brightbyte.wikiword.model.TermReference; |
27 | 28 | import de.brightbyte.wikiword.model.WikiWordConcept; |
28 | 29 | |
— | — | @@ -180,14 +181,14 @@ |
181 | 182 | /* (non-Javadoc) |
182 | 183 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
183 | 184 | */ |
184 | | - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 185 | + public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
185 | 186 | if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings"); |
186 | 187 | |
187 | 188 | int sz = Math.min(terms.size(), meanings.size()); |
188 | 189 | if (context!=null) sz += context.size(); |
189 | 190 | |
190 | 191 | if (sz<2) { |
191 | | - return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 192 | + return popularityDisambiguator.disambiguate(root, terms, meanings, context); |
192 | 193 | } |
193 | 194 | |
194 | 195 | pruneMeanings(meanings); |
— | — | @@ -195,7 +196,7 @@ |
196 | 197 | sz = Math.min(terms.size(), meanings.size()); |
197 | 198 | if (context!=null) sz += context.size(); |
198 | 199 | if (sz <2) { |
199 | | - return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 200 | + return popularityDisambiguator.disambiguate(root, terms, meanings, context); |
200 | 201 | } |
201 | 202 | |
202 | 203 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
— | — | @@ -203,9 +204,10 @@ |
204 | 205 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
205 | 206 | FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context); |
206 | 207 | |
207 | | - List<Map<X, LocalConcept>> interpretations = getInterpretations(terms, meanings); |
| 208 | + Collection<List<X>> sequences = getSequences(root); |
| 209 | + List<Map<X, LocalConcept>> interpretations = getInterpretations(sequences, meanings); |
208 | 210 | |
209 | | - return getBestInterpretation(terms, meanings, context, interpretations, similarities, features); |
| 211 | + return getBestInterpretation(root, terms, meanings, context, interpretations, similarities, features); |
210 | 212 | } |
211 | 213 | |
212 | 214 | protected void pruneMeanings(Map<? extends TermReference, List<? extends LocalConcept>> meanings) { |
— | — | @@ -235,7 +237,7 @@ |
236 | 238 | } |
237 | 239 | } |
238 | 240 | |
239 | | - protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, |
| 241 | + protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, |
240 | 242 | Collection<LocalConcept> context, List<Map<X, LocalConcept>> interpretations, |
241 | 243 | LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
242 | 244 | |
— | — | @@ -254,7 +256,7 @@ |
255 | 257 | } |
256 | 258 | |
257 | 259 | if (rankings.size()==0) { |
258 | | - return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 260 | + return popularityDisambiguator.disambiguate(root, terms, meanings, context); |
259 | 261 | } |
260 | 262 | |
261 | 263 | Collections.sort(rankings); |
— | — | @@ -265,7 +267,17 @@ |
266 | 268 | return r; |
267 | 269 | } |
268 | 270 | |
269 | | - protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) { |
| 271 | + protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(Collection<List<X>> sequences, Map<X, List<? extends LocalConcept>> meanings) { //builds the interpretations of every given term sequence and returns them as one flat list |
| 272 | + List<Map<X, LocalConcept>> interpretations = new ArrayList<Map<X, LocalConcept>>(); |
| 273 | + for (List<X> sq: sequences) { |
| 274 | + List<Map<X, LocalConcept>> sqint = getSequenceInterpretations(sq, meanings); //all interpretations of this one sequence |
| 275 | + interpretations.addAll(sqint); //appended as-is; no de-duplication across sequences is done here |
| 276 | + } |
| 277 | + |
| 278 | + return interpretations; //empty if sequences is empty |
| 279 | + } |
| 280 | + |
| 281 | + protected <X extends TermReference>List<Map<X, LocalConcept>> getSequenceInterpretations(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) { |
270 | 282 | if (terms.size()==0) { |
271 | 283 | return Collections.singletonList(Collections.<X, LocalConcept>emptyMap()); |
272 | 284 | } |
— | — | @@ -273,7 +285,7 @@ |
274 | 286 | X t = terms.get(0); |
275 | 287 | List<? extends LocalConcept> m = meanings.get(t); |
276 | 288 | |
277 | | - List<Map<X, LocalConcept>> base = getInterpretations(terms.subList(1, terms.size()), meanings); |
| 289 | + List<Map<X, LocalConcept>> base = getSequenceInterpretations(terms.subList(1, terms.size()), meanings); |
278 | 290 | |
279 | 291 | if (m==null || m.size()==0) return base; |
280 | 292 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java |
— | — | @@ -6,6 +6,7 @@ |
7 | 7 | |
8 | 8 | import de.brightbyte.io.Output; |
9 | 9 | import de.brightbyte.util.PersistenceException; |
| 10 | +import de.brightbyte.wikiword.model.PhraseNode; |
10 | 11 | import de.brightbyte.wikiword.model.TermReference; |
11 | 12 | import de.brightbyte.wikiword.model.WikiWordConcept; |
12 | 13 | |
— | — | @@ -54,5 +55,6 @@ |
55 | 56 | public void setTrace(Output trace); |
56 | 57 | |
57 | 58 | public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException; |
| 59 | + public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException; |
58 | 60 | |
59 | 61 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -8,11 +8,53 @@ |
9 | 9 | |
10 | 10 | import de.brightbyte.io.Output; |
11 | 11 | import de.brightbyte.util.PersistenceException; |
| 12 | +import de.brightbyte.wikiword.model.PhraseNode; |
| 13 | +import de.brightbyte.wikiword.model.TermListNode; |
12 | 14 | import de.brightbyte.wikiword.model.TermReference; |
13 | 15 | import de.brightbyte.wikiword.model.WikiWordConcept; |
14 | 16 | |
15 | 17 | public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> { |
16 | 18 | |
| 19 | + public interface NodeListener<T extends TermReference> { //callback that walk() invokes once for every node it visits |
| 20 | + public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence); //seqence is the list walk() is working on; walk() keeps pushing and popping on it, so copy it if it has to be kept |
| 21 | + } |
| 22 | + |
| 23 | + public static class SequenceSetBuilder <T extends TermReference> implements NodeListener<T> { //listener that records a copy of the current sequence whenever a leaf node is reported |
| 24 | + protected List<List<T>> seqencees; //the recorded sequences, in the order the leaves were reported |
| 25 | + |
| 26 | + public SequenceSetBuilder() { |
| 27 | + seqencees = new ArrayList<List<T>>(); |
| 28 | + } |
| 29 | + |
| 30 | + public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) { |
| 31 | + if (node.getSuccessors().isEmpty()) { //is leaf; the result of getSuccessors() is dereferenced here, so a node returning null would cause a NullPointerException |
| 32 | + List<T> p = new ArrayList<T>(seqence); //clone, because walk() goes on modifying the list that was passed in |
| 33 | + seqencees.add(p); |
| 34 | + } |
| 35 | + } |
| 36 | + |
| 37 | + public List<List<T>> getSequences() { |
| 38 | + return seqencees; //the internal list itself, not a copy |
| 39 | + } |
| 40 | + } |
| 41 | + |
| 42 | + public static class TermSetBuilder <T extends TermReference> implements NodeListener<T> { //listener that collects the term reference of every reported node |
| 43 | + protected List<T> terms; //a list, not a set: a term that is reported more than once is stored more than once |
| 44 | + |
| 45 | + public TermSetBuilder() { |
| 46 | + terms = new ArrayList<T>(); |
| 47 | + } |
| 48 | + |
| 49 | + public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) { //seqence is not used by this listener |
| 50 | + T t = node.getTermReference(); |
| 51 | + if (t.getTerm().length()>0) terms.add(t); //nodes whose term text is empty are skipped |
| 52 | + } |
| 53 | + |
| 54 | + public List<T> getTerms() { |
| 55 | + return terms; //the internal list itself, not a copy |
| 56 | + } |
| 57 | + } |
| 58 | + |
17 | 59 | private MeaningCache.Manager<C> meaningCacheManager; |
18 | 60 | |
19 | 61 | private Output trace; |
— | — | @@ -28,9 +70,44 @@ |
29 | 71 | this.meaningOverrides = overrideMap; |
30 | 72 | } |
31 | 73 | |
32 | | - protected <X extends T>Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException { |
33 | | - List<X> todo = terms; |
| 74 | + protected <X extends T>Collection<X> getTerms(PhraseNode<X> root) { //collects the non-empty terms of all nodes visited by walk() starting at root |
| 75 | + TermSetBuilder<X> builder = new TermSetBuilder<X>(); |
| 76 | + walk(root, null, builder); //null: walk() creates its own working list |
| 77 | + return builder.getTerms(); //in visiting order; may contain duplicates, since TermSetBuilder stores into a list |
| 78 | + } |
| 79 | + |
| 80 | + protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root) { //returns one term sequence for every leaf that walk() reaches from root |
| 81 | + SequenceSetBuilder<X> builder = new SequenceSetBuilder<X>(); |
| 82 | + walk(root, null, builder); //null: walk() creates its own working list |
| 83 | + return builder.getSequences(); //each entry is an independent copy made by SequenceSetBuilder |
| 84 | + } |
| 85 | + |
| 86 | + protected <X extends T>void walk(PhraseNode<X> root, List<X> seqence, NodeListener<? super X> nodeListener) { //depth-first walk over root and all its successors; seqence holds the non-empty terms on the path to the current node (null = start with a fresh list) |
| 87 | + if (seqence == null) seqence = new ArrayList<X>(); |
34 | 88 | |
| 89 | + X t = root.getTermReference(); |
| 90 | + if (t.getTerm().length()>0) seqence.add(t); //push; nodes with an empty term text contribute nothing to the path |
| 91 | + |
| 92 | + if (nodeListener!=null) |
| 93 | + nodeListener.onNode(root, seqence); //the listener sees the path including the current node |
| 94 | + |
| 95 | + List<? extends PhraseNode<X>> successors = root.getSuccessors(); //must not be null; a leaf has to return an empty list |
| 96 | + |
| 97 | + for (PhraseNode<X> n: successors) { |
| 98 | + walk(n, seqence, nodeListener); //every recursive call leaves seqence as it found it |
| 99 | + } |
| 100 | + |
| 101 | + if (t.getTerm().length()>0) seqence.remove(seqence.size()-1); //pop by index: remove(Object) would drop the FIRST equal term and corrupt the path if a term repeats |
| 102 | + } |
| 103 | + |
| 104 | + protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException { //convenience overload: looks up the meanings of all terms found under root |
| 105 | + Collection<X> terms = getTerms(root); //flattens the node graph into its terms |
| 106 | + return getMeanings(terms); //delegates to the Collection based overload |
| 107 | + } |
| 108 | + |
| 109 | + protected <X extends T>Map<X, List<? extends C>> getMeanings(Collection<X> terms) throws PersistenceException { |
| 110 | + Collection<X> todo = terms; |
| 111 | + |
35 | 112 | if (meaningOverrides!=null) { |
36 | 113 | todo = new ArrayList<X>(); |
37 | 114 | for (X t: terms) { |
— | — | @@ -52,11 +129,16 @@ |
53 | 130 | } |
54 | 131 | |
55 | 132 | public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException { //disambiguates a plain term list by wrapping it into a linear chain of TermListNodes |
| 133 | + return this.<X>disambiguate(new TermListNode<X>(terms, 0), context); //NOTE(review): TermListNode throws NoSuchElementException if index 0 is out of range, i.e. for an empty list - confirm that callers never pass an empty list |
| 134 | + } |
| 135 | + |
| 136 | + public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException { |
| 137 | + Collection<X> terms = getTerms(root); |
56 | 138 | Map<X, List<? extends C>> meanings = getMeanings(terms); |
57 | | - return disambiguate(terms, meanings, context); |
| 139 | + return disambiguate(root, terms, meanings, context); |
58 | 140 | } |
59 | 141 | |
60 | | - public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException; |
| 142 | + public abstract <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException; |
61 | 143 | |
62 | 144 | public Output getTrace() { |
63 | 145 | return trace; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java |
— | — | @@ -1,82 +0,0 @@ |
2 | | -package de.brightbyte.wikiword.model; |
3 | | - |
4 | | -import java.util.AbstractList; |
5 | | -import java.util.ArrayList; |
6 | | -import java.util.Collections; |
7 | | -import java.util.List; |
8 | | -import java.util.RandomAccess; |
9 | | - |
10 | | -import de.brightbyte.data.filter.Filter; |
11 | | - |
12 | | -public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess { |
13 | | - |
14 | | - protected List<PhraseOccurance> phrases; |
15 | | - protected String text; |
16 | | - |
17 | | - public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) { |
18 | | - this.text = text; |
19 | | - |
20 | | - this.phrases = phrases; |
21 | | - Collections.sort(this.phrases); //essential! |
22 | | - } |
23 | | - |
24 | | - @Override |
25 | | - public PhraseOccurance get(int index) { |
26 | | - return phrases.get(index); |
27 | | - } |
28 | | - |
29 | | - @Override |
30 | | - public int size() { |
31 | | - return phrases.size(); |
32 | | - } |
33 | | - |
34 | | - public String getText() { |
35 | | - return text; |
36 | | - } |
37 | | - |
38 | | - public List<PhraseOccurance> getPhrasesAt(int offs) { |
39 | | - int i = 0; |
40 | | - while (i<size()) { |
41 | | - PhraseOccurance p = get(i); |
42 | | - if (p.getOffset() >= offs) { |
43 | | - offs = p.getOffset(); |
44 | | - break; |
45 | | - } |
46 | | - |
47 | | - i++; |
48 | | - } |
49 | | - |
50 | | - if (i>=size()) return null; |
51 | | - |
52 | | - int j = i; |
53 | | - while (j<size()) { |
54 | | - PhraseOccurance p = get(j); |
55 | | - if (p.getOffset() > offs) break; |
56 | | - j++; |
57 | | - } |
58 | | - |
59 | | - return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first. |
60 | | - } |
61 | | - |
62 | | - public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) { |
63 | | - List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
64 | | - |
65 | | - int i = 0; |
66 | | - |
67 | | - outer: |
68 | | - while (i<size()) { |
69 | | - List<PhraseOccurance> candidates = getPhrasesAt(i); |
70 | | - if (candidates == null) break; |
71 | | - |
72 | | - for (PhraseOccurance p: candidates) { |
73 | | - i = p.getEndOffset(); |
74 | | - if (filter==null || filter.matches(p.getPhrase())) { |
75 | | - phrases.add(p); |
76 | | - continue outer; |
77 | | - } |
78 | | - } |
79 | | - } |
80 | | - |
81 | | - return phrases; |
82 | | - } |
83 | | -} |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java |
— | — | @@ -0,0 +1,136 @@ |
| 2 | +package de.brightbyte.wikiword.model; |
| 3 | + |
| 4 | +import java.util.AbstractList; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collections; |
| 7 | +import java.util.List; |
| 8 | +import java.util.RandomAccess; |
| 9 | + |
| 10 | +import de.brightbyte.wikiword.disambig.Term; |
| 11 | + |
| 12 | +public class PhraseOccuranceSet extends AbstractList<PhraseOccurance> implements RandomAccess { //sorted list of the phrase occurrences found in one text; can also be navigated as a graph of PhraseNodes |
| 13 | + |
| 14 | + protected class Node implements PhraseNode<PhraseOccurance> { //graph node for one phrase; successors are looked up in the enclosing set on demand |
| 15 | + protected PhraseOccurance phrase; |
| 16 | + |
| 17 | + public Node(PhraseOccurance phrase) { |
| 18 | + super(); |
| 19 | + this.phrase = phrase; |
| 20 | + } |
| 21 | + |
| 22 | + public List<? extends PhraseNode<PhraseOccurance>> getSuccessors() { |
| 23 | + return PhraseOccuranceSet.this.getPhraseNodesAt(phrase.getEndOffset()); //nodes for the phrases that follow this phrase's end offset; empty for the last phrase |
| 24 | + } |
| 25 | + |
| 26 | + public PhraseOccurance getTermReference() { |
| 27 | + return phrase; |
| 28 | + } |
| 29 | + |
| 30 | + public String toString() { |
| 31 | + return phrase.toString(); |
| 32 | + } |
| 33 | + } |
| 34 | + |
| 35 | + protected List<PhraseOccurance> phrases; //backing list, kept in sorted order |
| 36 | + protected String text; //the text the phrases were taken from |
| 37 | + |
| 38 | + public PhraseOccuranceSet(String text, List<PhraseOccurance> phrases) { |
| 39 | + this.text = text; |
| 40 | + |
| 41 | + this.phrases = phrases; //no copy is made: the caller's list becomes the backing list |
| 42 | + Collections.sort(this.phrases); //essential! getPhrasesAt() scans the list in this order; note that this sorts the caller's list in place |
| 43 | + } |
| 44 | + |
| 45 | + @Override |
| 46 | + public PhraseOccurance get(int index) { |
| 47 | + return phrases.get(index); |
| 48 | + } |
| 49 | + |
| 50 | + @Override |
| 51 | + public int size() { |
| 52 | + return phrases.size(); |
| 53 | + } |
| 54 | + |
| 55 | + public String getText() { |
| 56 | + return text; |
| 57 | + } |
| 58 | + |
| 59 | + public PhraseNode<PhraseOccurance> getRootNode() { |
| 60 | + return getRootNodeAt(0); //root for the whole text |
| 61 | + } |
| 62 | + |
| 63 | + public PhraseNode<PhraseOccurance> getRootNodeAt(final int ofs) { //artificial start node whose successors are the phrase nodes found from offset ofs on |
| 64 | + return new PhraseNode<PhraseOccurance>(){ |
| 65 | + public PhraseOccurance getTermReference() { |
| 66 | + return new PhraseOccurance("", 0, ofs, 0); //placeholder occurrence, created anew on every call |
| 67 | + } |
| 68 | + |
| 69 | + public List<? extends PhraseNode<PhraseOccurance>> getSuccessors() { |
| 70 | + return getPhraseNodesAt(ofs); |
| 71 | + } |
| 72 | + |
| 73 | + public String toString() { |
| 74 | + return "(root#"+ofs+")"; |
| 75 | + } |
| 76 | + }; |
| 77 | + } |
| 78 | + |
| 79 | + public List<? extends PhraseNode<PhraseOccurance>> getPhraseNodesAt(int offs) { //wraps the result of getPhrasesAt(offs) into nodes; never returns null |
| 80 | + List<PhraseOccurance> phrases = getPhrasesAt(offs); |
| 81 | + if (phrases == null) return Collections.<Node>emptyList(); //FIX: getPhrasesAt() returns null when no phrase is left, which made getSuccessors() fail with a NullPointerException on every leaf |
| 82 | + List<Node> nodes = new ArrayList<Node>(phrases.size()); |
| 83 | + for (PhraseOccurance p: phrases) { |
| 84 | + nodes.add(new Node(p)); |
| 85 | + } |
| 86 | + |
| 87 | + return nodes; |
| 88 | + } |
| 89 | + |
| 90 | + public List<PhraseOccurance> getPhrasesAt(int offs) { //returns the phrases sharing the first phrase offset that is >= offs, or null (not an empty list) if there is none |
| 91 | + int i = 0; |
| 92 | + while (i<size()) { //find the first phrase starting at or after offs |
| 93 | + PhraseOccurance p = get(i); |
| 94 | + if (p.getOffset() >= offs) { |
| 95 | + offs = p.getOffset(); //snap forward to the offset that was actually found |
| 96 | + break; |
| 97 | + } |
| 98 | + |
| 99 | + i++; |
| 100 | + } |
| 101 | + |
| 102 | + if (i>=size()) return null; //nothing starts at or after offs |
| 103 | + |
| 104 | + int j = i; |
| 105 | + while (j<size()) { //extend the range over all phrases starting at that same offset |
| 106 | + PhraseOccurance p = get(j); |
| 107 | + if (p.getOffset() > offs) break; |
| 108 | + j++; |
| 109 | + } |
| 110 | + |
| 111 | + return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first. |
| 112 | + } |
| 113 | + |
| 114 | + /* |
| 115 | + public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) { |
| 116 | + List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 117 | + |
| 118 | + int i = 0; |
| 119 | + |
| 120 | + outer: |
| 121 | + while (i<size()) { |
| 122 | + List<PhraseOccurance> candidates = getPhrasesAt(i); |
| 123 | + if (candidates == null) break; |
| 124 | + |
| 125 | + for (PhraseOccurance p: candidates) { |
| 126 | + i = p.getEndOffset(); |
| 127 | + if (filter==null || filter.matches(p.getPhrase())) { |
| 128 | + phrases.add(p); |
| 129 | + continue outer; |
| 130 | + } |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + return phrases; |
| 135 | + } |
| 136 | + */ |
| 137 | +} |
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 138 | + |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java |
— | — | @@ -1,6 +1,7 @@ |
2 | 2 | package de.brightbyte.wikiword.model; |
3 | 3 | |
4 | 4 | import java.io.Serializable; |
| 5 | +import java.util.List; |
5 | 6 | |
6 | 7 | public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance>, TermReference { |
7 | 8 | |
— | — | @@ -23,7 +24,7 @@ |
24 | 25 | this.offset = offset; |
25 | 26 | this.length = length; |
26 | 27 | } |
27 | | - |
| 28 | + |
28 | 29 | public int getLength() { |
29 | 30 | return length; |
30 | 31 | } |
— | — | @@ -99,4 +100,5 @@ |
100 | 101 | |
101 | 102 | return 0; |
102 | 103 | } |
| 104 | + |
103 | 105 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java |
— | — | @@ -0,0 +1,11 @@ |
| 2 | +package de.brightbyte.wikiword.model; |
| 3 | + |
| 4 | +import java.util.List; |
| 5 | + |
| 6 | +public interface PhraseNode<T extends TermReference> { //node in a graph of term references; each path along getSuccessors() is one possible term sequence |
| 7 | + |
| 8 | + public T getTermReference(); //the term this node stands for |
| 9 | + |
| 10 | + public List<? extends PhraseNode<T>> getSuccessors(); //the nodes that may follow this one; AbstractDisambiguator dereferences the result without a null check, so a leaf should return an empty list rather than null |
| 11 | + |
| 12 | +} |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermListNode.java |
— | — | @@ -0,0 +1,35 @@ |
| 2 | +package de.brightbyte.wikiword.model; |
| 3 | + |
| 4 | +import java.util.Collections; |
| 5 | +import java.util.List; |
| 6 | +import java.util.NoSuchElementException; |
| 7 | + |
| 8 | +public class TermListNode<T extends TermReference> implements PhraseNode<T> { //presents a plain term list as a linear chain of PhraseNodes: the node at index i has the node at index i+1 as its only successor |
| 9 | + |
| 10 | + protected List<T> terms; //the shared term list; not copied |
| 11 | + protected int index; //position of this node's term within terms |
| 12 | + |
| 13 | + protected List<TermListNode<T>> successors; //created lazily by getSuccessors() |
| 14 | + |
| 15 | + public TermListNode(List<T> terms, int index) { |
| 16 | + if (terms==null) throw new NullPointerException(); |
| 17 | + if (index<0 || index>=terms.size()) throw new NoSuchElementException("index out of range"); //an empty list is therefore rejected for every index |
| 18 | + |
| 19 | + this.terms = terms; |
| 20 | + this.index = index; |
| 21 | + } |
| 22 | + |
| 23 | + public T getTermReference() { |
| 24 | + return terms.get(index); |
| 25 | + } |
| 26 | + |
| 27 | + public List<TermListNode<T>> getSuccessors() { //returns a one-element list holding the next node, or an empty list for the last term; never null |
| 28 | + if (successors == null) { |
| 29 | + if (index+1>=terms.size()) successors = Collections.emptyList(); |
| 30 | + else successors = Collections.singletonList(new TermListNode<T>(terms, index+1)); //FIX: the result used to be discarded, so null was returned for every node except the last one |
| 31 | + } |
| 32 | + |
| 33 | + return successors; |
| 34 | + } |
| 35 | + |
| 36 | +} |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java |
— | — | @@ -21,7 +21,7 @@ |
22 | 22 | import de.brightbyte.wikiword.Corpus; |
23 | 23 | import de.brightbyte.wikiword.TweakSet; |
24 | 24 | import de.brightbyte.wikiword.model.PhraseOccurance; |
25 | | -import de.brightbyte.wikiword.model.PhraseOccuranceSequence; |
| 25 | +import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
26 | 26 | |
27 | 27 | public class PlainTextAnalyzer extends AbstractAnalyzer { |
28 | 28 | private LanguageConfiguration config; |
— | — | @@ -190,7 +190,7 @@ |
191 | 191 | return corpus; |
192 | 192 | } |
193 | 193 | |
194 | | - public PhraseOccuranceSequence extractPhrases(CharSequence text, int maxWeight) { |
| 194 | + public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) { |
195 | 195 | ArrayList<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
196 | 196 | |
197 | 197 | text = applyManglers(config.sentenceManglers, text); |
— | — | @@ -204,7 +204,7 @@ |
205 | 205 | buildPhrases(s, ofs, phrases, maxWeight); |
206 | 206 | } |
207 | 207 | |
208 | | - return new PhraseOccuranceSequence(text.toString(), phrases); |
| 208 | + return new PhraseOccuranceSet(text.toString(), phrases); |
209 | 209 | } |
210 | 210 | |
211 | 211 | private PhraseAggregator buildPhrasesAggregator = null; |
— | — | @@ -261,7 +261,7 @@ |
262 | 262 | BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); |
263 | 263 | String s ; |
264 | 264 | while ( (s = in.readLine()) != null ) { |
265 | | - PhraseOccuranceSequence phrases = analyzer.extractPhrases(s, 6); |
| 265 | + PhraseOccuranceSet phrases = analyzer.extractPhrases(s, 6); |
266 | 266 | DebugUtil.dump("", phrases, ConsoleIO.output); |
267 | 267 | } |
268 | 268 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java |
— | — | @@ -110,9 +110,9 @@ |
111 | 111 | |
112 | 112 | @Override |
113 | 113 | protected void process(String line) throws PersistenceException, ParseException { |
114 | | - //TODO: logic for handling overlapping phrases in a PhraseOccuranceSequence |
| 114 | + //TODO: logic for handling overlapping phrases in a PhraseOccuranceSet |
115 | 115 | /* |
116 | | - PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first. |
| 116 | + PhraseOccuranceSet sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first. |
117 | 117 | List<PhraseOccurance> phrases = sequence.getDisjointPhraseSequence(null); |
118 | 118 | Disambiguator.Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(phrases); |
119 | 119 | return result.toString(); //FIXME: annotate! |