Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest.java |
— | — | @@ -7,6 +7,8 @@ |
8 | 8 | import java.util.Map; |
9 | 9 | |
10 | 10 | import de.brightbyte.data.Pair; |
| 11 | +import de.brightbyte.io.ConsoleIO; |
| 12 | +import de.brightbyte.io.Output; |
11 | 13 | import de.brightbyte.util.PersistenceException; |
12 | 14 | import de.brightbyte.wikiword.disambig.Disambiguator.Interpretation; |
13 | 15 | import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
— | — | @@ -16,6 +18,8 @@ |
17 | 19 | |
18 | 20 | public class SlidingCoherenceDisambiguatorTest extends DisambiguatorTestBase { |
19 | 21 | |
| 22 | + private Output traceOutput = ConsoleIO.output; |
| 23 | + |
20 | 24 | public SlidingCoherenceDisambiguatorTest() throws IOException, PersistenceException { |
21 | 25 | super(); |
22 | 26 | } |
— | — | @@ -42,7 +46,25 @@ |
43 | 47 | assertTrue("UK as United_Kingdom", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom )) ); |
44 | 48 | assertTrue("UK as Great_Britain", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain )) ); |
45 | 49 | assertTrue("UK as England", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England )) ); |
| 50 | + |
| 51 | + /////////////////////////////////////////////////////////////////////////////////// |
| 52 | + Term freak = new Term("Freak"); |
| 53 | + Pair<Term, LocalConcept> freak_as_nothing = new Pair<Term, LocalConcept>(freak, null); |
46 | 54 | |
| 55 | + sequence = new ArrayList<Term>(); |
| 56 | + sequence.add(freak); |
| 57 | + sequence.add(london); |
| 58 | + |
| 59 | + interpretations = disambiguator.getSequenceInterpretations(sequence, meaningFetcher.getMeanings(sequence)); |
| 60 | + |
| 61 | + assertEquals("number of interpretations", 3, interpretations.size()); |
| 62 | + |
| 63 | + Interpretation<Term, LocalConcept> first = interpretations.iterator().next(); |
| 64 | + |
| 65 | + assertEquals( first.getSequence(), sequence ); |
| 66 | + Interpretation<Term, LocalConcept> interp = new Disambiguator.Interpretation<Term, LocalConcept>( freak_as_nothing, london_as_City_of_London ); |
| 67 | + assertTrue("London as City_of_London", interpretations.contains( interp) ); |
| 68 | + |
47 | 69 | /////////////////////////////////////////////////////////////////////////////////// |
48 | 70 | |
49 | 71 | sequence = new ArrayList<Term>(); |
— | — | @@ -103,6 +125,7 @@ |
104 | 126 | PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
105 | 127 | |
106 | 128 | SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher); |
| 129 | + disambiguator.setTrace(traceOutput); |
107 | 130 | disambiguator.setInitialWindow(1); |
108 | 131 | disambiguator.setWindow(3); |
109 | 132 | |
— | — | @@ -115,13 +138,59 @@ |
116 | 139 | assertEquals("Underground", sequence.get(1).getTerm()); |
117 | 140 | assertEquals("station", sequence.get(2).getTerm()); |
118 | 141 | |
119 | | - assertNotNull( meanings.get( sequence.get(0).getTerm() ) ); |
120 | | - assertNotNull( meanings.get( sequence.get(1).getTerm() ) ); |
121 | | - assertNotNull( meanings.get( sequence.get(2).getTerm() ) ); |
| 142 | + assertNotNull( meanings.get( sequence.get(0) ) ); |
| 143 | + assertNotNull( meanings.get( sequence.get(1) ) ); |
| 144 | + assertNotNull( meanings.get( sequence.get(2) ) ); |
122 | 145 | |
123 | | - assertEquals("Bank_and_Monument_Underground_station", meanings.get( sequence.get(0).getTerm() ).getName() ); |
124 | | - assertEquals("Subway", meanings.get( sequence.get(1).getTerm() ).getName() ); |
125 | | - assertEquals("Metro_station", meanings.get( sequence.get(2).getTerm() ).getName() ); |
| 146 | + assertEquals("Bank_and_Monument_Underground_stations", meanings.get( sequence.get(0) ).getName() ); |
| 147 | + assertEquals("London_Underground", meanings.get( sequence.get(1) ).getName() ); |
| 148 | + assertEquals("Metro_station", meanings.get( sequence.get(2) ).getName() ); |
| 149 | + |
| 150 | + /////////////////////////////////////////////////////////////////////////// |
| 151 | + disambiguator.setTrace(traceOutput); |
| 152 | + disambiguator.setInitialWindow(2); |
| 153 | + disambiguator.setWindow(3); |
| 154 | + |
| 155 | + result = disambiguator.disambiguate(set.getRootNode(), null); |
| 156 | + |
| 157 | + sequence = result.getSequence(); |
| 158 | + meanings = result.getMeanings(); |
| 159 | + |
| 160 | + assertEquals("Bank and Monument", sequence.get(0).getTerm()); |
| 161 | + assertEquals("Underground", sequence.get(1).getTerm()); |
| 162 | + assertEquals("station", sequence.get(2).getTerm()); |
| 163 | + |
| 164 | + assertNotNull( meanings.get( sequence.get(0) ) ); |
| 165 | + assertNotNull( meanings.get( sequence.get(1) ) ); |
| 166 | + assertNotNull( meanings.get( sequence.get(2) ) ); |
| 167 | + |
| 168 | + assertEquals("Bank_and_Monument_Underground_stations", meanings.get( sequence.get(0) ).getName() ); |
| 169 | + assertEquals("London_Underground", meanings.get( sequence.get(1) ).getName() ); |
| 170 | + assertEquals("Metro_station", meanings.get( sequence.get(2) ).getName() ); |
| 171 | + |
| 172 | + /////////////////////////////////////////////////////////////////////////// |
| 173 | + disambiguator.setTrace(traceOutput); |
| 174 | + disambiguator.setInitialWindow(3); |
| 175 | + disambiguator.setWindow(3); |
| 176 | + |
| 177 | + result = disambiguator.disambiguate(set.getRootNode(), null); |
| 178 | + |
| 179 | + sequence = result.getSequence(); |
| 180 | + meanings = result.getMeanings(); |
| 181 | + |
| 182 | + assertEquals("Bank and Monument", sequence.get(0).getTerm()); |
| 183 | + assertEquals("Underground", sequence.get(1).getTerm()); |
| 184 | + assertEquals("station", sequence.get(2).getTerm()); |
| 185 | + |
| 186 | + assertNotNull( meanings.get( sequence.get(0) ) ); |
| 187 | + assertNotNull( meanings.get( sequence.get(1) ) ); |
| 188 | + assertNotNull( meanings.get( sequence.get(2) ) ); |
| 189 | + |
| 190 | + assertEquals("Bank_and_Monument_Underground_stations", meanings.get( sequence.get(0) ).getName() ); |
| 191 | + assertEquals("London_Underground", meanings.get( sequence.get(1) ).getName() ); |
| 192 | + assertEquals("Metro_station", meanings.get( sequence.get(2) ).getName() ); |
| 193 | + |
| 194 | + throw new UnsupportedOperationException("todo: window 1, 2, ..."); |
126 | 195 | } |
127 | 196 | |
128 | 197 | public void testDisambiguateTerms() throws PersistenceException { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -284,24 +284,25 @@ |
285 | 285 | |
286 | 286 | Collection<Disambiguator.Interpretation<X, LocalConcept>> base = getSequenceInterpretations(sequence.subList(1, sequence.size()), meanings); |
287 | 287 | |
288 | | - if (m==null || m.size()==0) return base; |
289 | | - |
290 | 288 | List<Disambiguator.Interpretation<X, LocalConcept>> interpretations = new ArrayList<Disambiguator.Interpretation<X, LocalConcept>>(); |
291 | 289 | |
292 | 290 | for (Disambiguator.Interpretation<X, LocalConcept> be: base) { |
293 | | - for (LocalConcept c: m) { |
294 | | - Map<X, LocalConcept> e = new HashMap<X, LocalConcept>(); |
295 | | - e.putAll(be.getMeanings()); |
296 | | - e.put(t, c); |
297 | | - |
298 | | - if (!sequence.isEmpty()) { |
| 291 | + if (m==null || m.isEmpty()) { |
| 292 | + Disambiguator.Interpretation<X, LocalConcept>interp = new Disambiguator.Interpretation<X, LocalConcept>(be.getMeanings(), sequence); |
| 293 | + interpretations.add(interp); |
| 294 | + } else { |
| 295 | + for (LocalConcept c: m) { |
| 296 | + Map<X, LocalConcept> e = new HashMap<X, LocalConcept>(); |
| 297 | + e.putAll(be.getMeanings()); |
| 298 | + e.put(t, c); |
| 299 | + |
299 | 300 | Disambiguator.Interpretation<X, LocalConcept>interp = new Disambiguator.Interpretation<X, LocalConcept>(e, sequence); |
300 | 301 | interpretations.add(interp); |
301 | 302 | } |
302 | 303 | } |
303 | 304 | } |
304 | 305 | |
305 | | - trace(" ~ "+t+": "+m.size()+" meanings; collected "+interpretations.size()+" combinations"); |
| 306 | + trace(" ~ "+t+": "+(m==null ? "no": m.size())+" meanings; collected "+interpretations.size()+" combinations"); |
306 | 307 | return interpretations; |
307 | 308 | } |
308 | 309 | |
— | — | @@ -367,6 +368,10 @@ |
368 | 369 | } |
369 | 370 | |
370 | 371 | if (d<0) throw new IllegalArgumentException("encountered negative similarity score ("+d+") for "+a+" / "+b); |
| 372 | + |
| 373 | + assert d>=0; |
| 374 | + assert d<=1; |
| 375 | + |
371 | 376 | sim += d; |
372 | 377 | n ++; //should add up to interp.size*(combo.size()-1)/2, according to Gauss |
373 | 378 | } |
— | — | @@ -384,14 +389,25 @@ |
385 | 390 | } |
386 | 391 | |
387 | 392 | //normalize |
388 | | - sim = sim / n; //scale |
389 | | - pop = pop / c; //scale |
390 | | - weight = weight / c; //scale |
| 393 | + sim = n == 0 ? 0 : sim / n; //scale |
| 394 | + pop = c == 0 ? 0 : pop / c; //scale |
| 395 | + weight = c == 0 ? 0 : weight / c; //scale |
391 | 396 | |
| 397 | + assert pop >= 0; |
| 398 | + assert sim >= 0; |
| 399 | + assert sim <= 1; |
| 400 | + |
392 | 401 | double popf = popularityNormalizer.apply(pop); |
393 | 402 | double simf = similarityNormalizer.apply(sim); |
| 403 | + |
| 404 | + assert popf>=0; |
| 405 | + assert popf<=1; |
| 406 | + assert simf>=0; |
| 407 | + assert simf<=1; |
394 | 408 | |
395 | 409 | double score = scoreCombiner.apply(popf, simf); |
| 410 | + assert score>=0; |
| 411 | + assert score<=1; |
396 | 412 | |
397 | 413 | return new Result<X, LocalConcept>(interp.getMeanings(), interp.getSequence(), score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop+", weight="+weight); |
398 | 414 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -97,8 +97,11 @@ |
98 | 98 | |
99 | 99 | if (initialWindow > 0) { //apply full coherence disambig to initial window size. initialWindow == 1 will trigger a popularity disambig. |
100 | 100 | Collection<List<X>> sequences = getSequences(root, initialWindow); |
101 | | - Result<X, LocalConcept> r = super.disambiguate(sequences, root, meanings, context); |
| 101 | + Result<X, LocalConcept> r; |
102 | 102 | |
| 103 | + if (initialWindow == 1) r = popularityDisambiguator.disambiguate(sequences, root, meanings, context); |
| 104 | + else r = super.disambiguate(sequences, root, meanings, context); |
| 105 | + |
103 | 106 | sequence.addAll(r.getSequence()); |
104 | 107 | currentNode = getLastNode(root, sequence); |
105 | 108 | disambig.putAll(r.getMeanings()); |
— | — | @@ -112,7 +115,8 @@ |
113 | 116 | PhraseNode<X> bestNode = null; |
114 | 117 | |
115 | 118 | for (PhraseNode<X> n: successors) { |
116 | | - Result<X, LocalConcept> r = evalStep(sequence, disambig, currentNode, meanings, context, similarities, features); //empty sequence will trigger popularity disambig |
| 119 | + Result<X, LocalConcept> r = evalStep(sequence, disambig, n, meanings, context, similarities, features); //empty sequence will trigger popularity disambig |
| 120 | + trace("evalStep("+n+"): " + r.toString()); |
117 | 121 | if (best == null || best.getScore() < r.getScore()) { |
118 | 122 | best = r; |
119 | 123 | bestNode = n; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java |
— | — | @@ -23,7 +23,7 @@ |
24 | 24 | private static <T extends TermReference, C extends WikiWordConcept>Map<T, C> buildMeaningMap(List<Pair<T, C>> interpretation) { |
25 | 25 | Map<T, C> sequence = new HashMap<T, C>(interpretation.size()); |
26 | 26 | for (Pair<T, C> p: interpretation) { |
27 | | - sequence.put(p.getA(), p.getB()); |
| 27 | + if (p.getB()!=null) sequence.put(p.getA(), p.getB()); |
28 | 28 | } |
29 | 29 | return sequence; |
30 | 30 | } |