Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -159,7 +159,7 @@ |
160 | 160 | this.maxMeanings = maxMeanings; |
161 | 161 | } |
162 | 162 | |
163 | | - protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 163 | + protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException { |
164 | 164 | FeatureFetcher<LocalConcept, Integer> features = featureCacheManager.newCache(); |
165 | 165 | |
166 | 166 | //NOTE: pre-fetch all features in one go |
— | — | @@ -177,7 +177,7 @@ |
178 | 178 | /* (non-Javadoc) |
179 | 179 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
180 | 180 | */ |
181 | | - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 181 | + public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException { |
182 | 182 | if (meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), Collections.<X>emptyList(), 0.0, "no terms or meanings"); |
183 | 183 | |
184 | 184 | int sz = meanings.size(); |
— | — | @@ -199,7 +199,7 @@ |
200 | 200 | return disambiguate(sequences, root, meanings, context); |
201 | 201 | } |
202 | 202 | |
203 | | - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 203 | + public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException { |
204 | 204 | |
205 | 205 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
206 | 206 | |
— | — | @@ -239,7 +239,7 @@ |
240 | 240 | } |
241 | 241 | |
242 | 242 | protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, |
243 | | - Collection<LocalConcept> context, List<Disambiguator.Interpretation<X, LocalConcept>> interpretations, |
| 243 | + Collection<? extends LocalConcept> context, List<Disambiguator.Interpretation<X, LocalConcept>> interpretations, |
244 | 244 | LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
245 | 245 | |
246 | 246 | List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>(); |
— | — | @@ -306,7 +306,7 @@ |
307 | 307 | return interpretations; |
308 | 308 | } |
309 | 309 | |
310 | | - protected <X extends TermReference>Result<X, LocalConcept> getScore(Disambiguator.Interpretation<X, LocalConcept> interp, Collection<LocalConcept> context, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
| 310 | + protected <X extends TermReference>Result<X, LocalConcept> getScore(Disambiguator.Interpretation<X, LocalConcept> interp, Collection<? extends LocalConcept> context, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
311 | 311 | Map<? extends TermReference, LocalConcept> concepts; |
312 | 312 | if (context!=null) { |
313 | 313 | concepts = new HashMap<TermReference, LocalConcept>(); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -92,13 +92,19 @@ |
93 | 93 | } |
94 | 94 | |
95 | 95 | private <X extends T>PhraseNode<X> findLastNode(PhraseNode<X> root, List<X> sequence) { |
| 96 | + if (root.getTermReference().getTerm().length()>0) { |
| 97 | + X t = sequence.get(0); |
| 98 | + if (!t.getTerm().equals(root.getTermReference().getTerm())) return null; |
| 99 | + sequence = sequence.subList(1, sequence.size()); |
| 100 | + } |
| 101 | + |
96 | 102 | terms: for (X t: sequence) { |
97 | 103 | Collection<? extends PhraseNode<X>> successors = root.getSuccessors(); |
98 | 104 | if (successors==null || successors.isEmpty()) |
99 | 105 | return null; |
100 | 106 | |
101 | 107 | for (PhraseNode<X> n: successors) { |
102 | | - if (n.getTermReference().equals(t)) { |
| 108 | + if (n.getTermReference().getTerm().equals(t.getTerm())) { |
103 | 109 | root = n; |
104 | 110 | continue terms; |
105 | 111 | } |
— | — | @@ -164,17 +170,17 @@ |
165 | 171 | return meanings; |
166 | 172 | } |
167 | 173 | |
168 | | - public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException { |
| 174 | + public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<? extends C> context) throws PersistenceException { |
169 | 175 | return this.<X>disambiguate(new TermListNode<X>(terms, 0), context); |
170 | 176 | } |
171 | 177 | |
172 | | - public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException { |
| 178 | + public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<? extends C> context) throws PersistenceException { |
173 | 179 | Collection<X> terms = getTerms(root, Integer.MAX_VALUE); |
174 | 180 | Map<X, List<? extends C>> meanings = getMeanings(terms); |
175 | 181 | return disambiguate(root, meanings, context); |
176 | 182 | } |
177 | 183 | |
178 | | - public abstract <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException; |
| 184 | + public abstract <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) throws PersistenceException; |
179 | 185 | |
180 | 186 | public Output getTrace() { |
181 | 187 | return trace; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -39,7 +39,7 @@ |
40 | 40 | } |
41 | 41 | |
42 | 42 | public <X extends TermReference>Result<X, LocalConcept> evalStep(List<X> baseSequence, Map<X, LocalConcept> interpretation, PhraseNode<X> node, |
43 | | - Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context, |
| 43 | + Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context, |
44 | 44 | LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
45 | 45 | X term = node.getTermReference(); |
46 | 46 | |
— | — | @@ -67,7 +67,7 @@ |
68 | 68 | /* (non-Javadoc) |
69 | 69 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
70 | 70 | */ |
71 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 71 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException { |
72 | 72 | if (meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), Collections.<X>emptyList(), 0.0, "no terms or meanings"); |
73 | 73 | |
74 | 74 | int sz = meanings.size(); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -44,12 +44,12 @@ |
45 | 45 | this.weigthCombiner = weightCombiner; |
46 | 46 | } |
47 | 47 | |
48 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) { |
| 48 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) { |
49 | 49 | Collection<List<X>> sequences = getSequences(root, Integer.MAX_VALUE); |
50 | 50 | return disambiguate(sequences, root, meanings, context); |
51 | 51 | } |
52 | 52 | |
53 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) { |
| 53 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) { |
54 | 54 | Result<X, LocalConcept> best = null; |
55 | 55 | |
56 | 56 | for (List<X> sequence: sequences) { |
— | — | @@ -62,7 +62,7 @@ |
63 | 63 | return best; |
64 | 64 | } |
65 | 65 | |
66 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) { |
| 66 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) { |
67 | 67 | if (sequence.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), Collections.<X>emptyList(), 0.0, "no terms or meanings"); |
68 | 68 | |
69 | 69 | Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java |
— | — | @@ -77,7 +77,7 @@ |
78 | 78 | |
79 | 79 | public void setTrace(Output trace); |
80 | 80 | |
81 | | - public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException; |
82 | | - public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException; |
| 81 | + public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<? extends C> context) throws PersistenceException; |
| 82 | + public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<? extends C> context) throws PersistenceException; |
83 | 83 | |
84 | 84 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermAlternativeListNode.java |
— | — | @@ -23,6 +23,10 @@ |
24 | 24 | this.terms = terms; |
25 | 25 | this.index = index; |
26 | 26 | } |
| 27 | + |
| 28 | + public String toString() { |
| 29 | + return getTermReference().toString(); |
| 30 | + } |
27 | 31 | |
28 | 32 | public T getTermReference() { |
29 | 33 | return term; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermListNode.java |
— | — | @@ -18,6 +18,10 @@ |
19 | 19 | this.terms = terms; |
20 | 20 | this.index = index; |
21 | 21 | } |
| 22 | + |
| 23 | + public String toString() { |
| 24 | + return getTermReference().toString(); |
| 25 | + } |
22 | 26 | |
23 | 27 | public T getTermReference() { |
24 | 28 | return terms.get(index); |
— | — | @@ -26,7 +30,7 @@ |
27 | 31 | public List<TermListNode<T>> getSuccessors() { |
28 | 32 | if (successors == null) { |
29 | 33 | if (index+1>=terms.size()) successors = Collections.emptyList(); |
30 | | - else Collections.singletonList(new TermListNode<T>(terms, index+1)); |
| 34 | + else successors = Collections.singletonList(new TermListNode<T>(terms, index+1)); |
31 | 35 | } |
32 | 36 | |
33 | 37 | return successors; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/mangler/SpellingAlternator.java |
— | — | @@ -7,37 +7,48 @@ |
8 | 8 | import java.util.List; |
9 | 9 | import java.util.Set; |
10 | 10 | |
| 11 | +import de.brightbyte.wikiword.disambig.Term; |
| 12 | + |
11 | 13 | public class SpellingAlternator { |
12 | 14 | protected List<Mangler> manglers = new ArrayList<Mangler>(); |
| 15 | + protected double weightFactor; |
13 | 16 | |
14 | | - public Collection<String> getAlternatives(String term) { |
15 | | - if (manglers.isEmpty()) return Collections.singleton(term); |
| 17 | + public SpellingAlternator(List<Mangler> manglers, double weightFactor) { |
| 18 | + if (manglers==null) throw new NullPointerException(); |
| 19 | + if (weightFactor<=0 || weightFactor>1) throw new IllegalArgumentException("weightFactor must be > 0 and <= 1"); |
16 | 20 | |
17 | | - Set<String> alternatives = new HashSet<String>(); |
18 | | - alternatives.add(term); |
19 | | - |
20 | | - collectAlternatives(term, 0, alternatives); |
| 21 | + this.manglers = manglers; |
| 22 | + this.weightFactor = weightFactor; |
| 23 | + } |
| 24 | + |
| 25 | + public Collection<Term> getAlternatives(String term) { |
| 26 | + if (manglers.isEmpty()) return Collections.singleton(new Term(term)); |
| 27 | + |
| 28 | + Set<Term> alternatives = new HashSet<Term>(); |
| 29 | + |
| 30 | + collectAlternatives(term, 0, 1, alternatives); |
21 | 31 | return alternatives; |
22 | 32 | } |
23 | 33 | |
24 | | - private void collectAlternatives(String term, int index, Set<String> alternatives) { |
| 34 | + private void collectAlternatives(String term, int index, double weight, Set<Term> alternatives) { |
25 | 35 | if (index>=manglers.size()) return; |
26 | | - |
| 36 | + if (!alternatives.add(new Term(term, weight))) return; |
| 37 | + |
27 | 38 | Mangler mangler= manglers.get(index); |
28 | 39 | CharSequence t = mangler.mangle(term); |
29 | 40 | |
30 | | - if (t!=null && alternatives.add(t.toString())) { |
31 | | - collectAlternatives(t.toString(), index+1, alternatives); //branch recursion. NOTE: use index=0 to cover all combinations |
| 41 | + if (t!=null) { |
| 42 | + collectAlternatives(t.toString(), index+1, weight*weightFactor, alternatives); //branch recursion. NOTE: use index=0 to cover all combinations |
32 | 43 | } |
33 | 44 | |
34 | | - collectAlternatives(term, index+1, alternatives); //primitive recursion |
| 45 | + collectAlternatives(term, index+1, weight, alternatives); //primitive recursion |
35 | 46 | } |
36 | 47 | |
37 | | - public List<Collection<String>> getAlternatives(List<String> terms) { |
38 | | - List<Collection<String>> alternatives = new ArrayList<Collection<String>>(terms.size()); |
| 48 | + public List<Collection<Term>> getAlternatives(List<String> terms) { |
| 49 | + List<Collection<Term>> alternatives = new ArrayList<Collection<Term>>(terms.size()); |
39 | 50 | |
40 | 51 | for (String t: terms) { |
41 | | - Collection<String> alt = getAlternatives(t); |
| 52 | + Collection<Term> alt = getAlternatives(t); |
42 | 53 | alternatives.add(alt); |
43 | 54 | } |
44 | 55 | |