Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java |
— | — | @@ -108,6 +108,40 @@ |
109 | 109 | assertTrue("depth 1000", sameElements(getBankAndMonumentSequences(1000), res)); |
110 | 110 | } |
111 | 111 | |
| 112 | + public void testGetSequences2() throws PersistenceException { |
| 113 | + PopularityDisambiguator<TermReference, LocalConcept> disambiguator = new PopularityDisambiguator<TermReference, LocalConcept>(meaningFetcher, 10); |
| 114 | + PhraseOccuranceSet set = getMargaretOfYorkPhrases(); |
| 115 | + |
| 116 | + Collection<List<PhraseOccurance>> res = disambiguator.getSequences(set.getRootNode(), 3); |
| 117 | + printSequences(res, 1000); |
| 118 | + |
| 119 | + //res = disambiguator.getSequences(set.getRootNode(), 1000); |
| 120 | + //printSequences(res, 1000); |
| 121 | + //TODO: check / compare. right now, we just test performance |
| 122 | + } |
| 123 | + |
| 124 | + protected void printSequences(Collection<List<PhraseOccurance>> res, int max) { |
| 125 | + int i = 0; |
| 126 | + StringBuilder b = new StringBuilder(); |
| 127 | + for (List<PhraseOccurance> seq: res) { |
| 128 | + b.setLength(0); |
| 129 | + |
| 130 | + for (PhraseOccurance p: seq) { |
| 131 | + if (b.length()>0) b.append(" | "); |
| 132 | + b.append(p.getTerm()); |
| 133 | + } |
| 134 | + |
| 135 | + i++; |
| 136 | + |
| 137 | + System.out.println("#"+i+": "+b); |
| 138 | + |
| 139 | + if (i>1000) { |
| 140 | + System.out.println("way too many ("+res.size()+")!"); |
| 141 | + break; |
| 142 | + } |
| 143 | + } |
| 144 | + } |
| 145 | + |
112 | 146 | public void testDisambiguateTerms() throws PersistenceException { |
113 | 147 | PopularityDisambiguator<TermReference, LocalConcept> disambiguator = new PopularityDisambiguator<TermReference, LocalConcept>(meaningFetcher, 10); |
114 | 148 | |
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java |
— | — | @@ -207,7 +207,144 @@ |
208 | 208 | } |
209 | 209 | |
210 | 210 | private String bankAndMonumentText = "The Bank and Monument Underground station"; |
| 211 | + private String margaretofYorkText = "Margaret of York is introduced to her future husband, Charles the Bold on 27th June, 1468 at Damme (vellum)"; |
| 212 | + //private String fromTheFrenchText = "from the French edition of the biography of Alexandre le Grand by Quintus Curtius Rufus (c.42 AD)"; |
| 213 | + //private String translationByVasqueText = "from the French edition of the biography of Alexandre le Grand by Quintus Curtius Rufus (c.42 AD)"; |
211 | 214 | |
| 215 | + protected List<PhraseOccurance> getMargaretOfYorkTerms(int depth) { |
| 216 | + List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 217 | + |
| 218 | + if (depth==0) return phrases; |
| 219 | + |
| 220 | + int ofs = 0; |
| 221 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 8 ), 1, ofs, 8 -ofs ) ); //Margaret |
| 222 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 11 ), 1, ofs, 11 -ofs ) ); //Margaret of |
| 223 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 16 ), 2, ofs, 16 -ofs ) ); //Margaret of York |
| 224 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 19 ), 2, ofs, 19 -ofs ) ); //Margaret of York is |
| 225 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 3, ofs, 30 -ofs ) ); //Margaret of York is introduced |
| 226 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 3, ofs, 33 -ofs ) ); //Margaret of York is introduced to |
| 227 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 3, ofs, 37 -ofs ) ); //Margaret of York is introduced to her |
| 228 | + |
| 229 | + if (depth==1) return phrases; |
| 230 | + |
| 231 | + ofs = 9; |
| 232 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 16 ), 1, ofs, 16 -ofs ) ); //of York |
| 233 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 19 ), 1, ofs, 19 -ofs ) ); //of York is |
| 234 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 2, ofs, 30 -ofs ) ); //of York is introduced |
| 235 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 2, ofs, 33 -ofs ) ); //of York is introduced to |
| 236 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 2, ofs, 37 -ofs ) ); //of York is introduced to her |
| 237 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 4, ofs, 44 -ofs ) ); //of York is introduced to her future |
| 238 | + |
| 239 | + ofs = 12; |
| 240 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 16 ), 1, ofs, 16 -ofs ) ); //York |
| 241 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 19 ), 1, ofs, 19 -ofs ) ); //York is |
| 242 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 2, ofs, 30 -ofs ) ); //York is introduced |
| 243 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 2, ofs, 33 -ofs ) ); //York is introduced to |
| 244 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 2, ofs, 37 -ofs ) ); //York is introduced to her |
| 245 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 4, ofs, 44 -ofs ) ); //York is introduced to her future |
| 246 | + |
| 247 | + if (depth==2) return phrases; |
| 248 | + |
| 249 | + ofs = 17; |
| 250 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 1, ofs, 30 -ofs ) ); //is introduced |
| 251 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 1, ofs, 33 -ofs ) ); //is introduced to |
| 252 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 1, ofs, 37 -ofs ) ); //is introduced to her |
| 253 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 2, ofs, 44 -ofs ) ); //is introduced to her future |
| 254 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 3, ofs, 52 -ofs ) ); //is introduced to her future husband |
| 255 | + |
| 256 | + ofs = 20; |
| 257 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 1, ofs, 30 -ofs ) ); //introduced |
| 258 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 1, ofs, 33 -ofs ) ); //introduced to |
| 259 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 1, ofs, 37 -ofs ) ); //introduced to her |
| 260 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 2, ofs, 44 -ofs ) ); //introduced to her future |
| 261 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 3, ofs, 52 -ofs ) ); //introduced to her future husband |
| 262 | + |
| 263 | + if (depth==3) return phrases; |
| 264 | + |
| 265 | + ofs = 31; |
| 266 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 1, ofs, 44 -ofs ) ); //to her future |
| 267 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 2, ofs, 52 -ofs ) ); //to her future husband |
| 268 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 3, ofs,61 -ofs ) ); //to her future husband, Charles |
| 269 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 3, ofs, 65 -ofs ) ); //to her future husband, Charles the |
| 270 | + |
| 271 | + ofs = 34; |
| 272 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 1, ofs, 44 -ofs ) ); //her future |
| 273 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 2, ofs, 52 -ofs ) ); //her future husband |
| 274 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 3, ofs, 61 -ofs ) ); //her future husband, Charles |
| 275 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65), 3, ofs, 65 -ofs ) ); //her future husband, Charles the |
| 276 | + |
| 277 | + ofs = 38; |
| 278 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 1, ofs, 44 -ofs ) ); //future |
| 279 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 2, ofs, 52 -ofs ) ); //future husband |
| 280 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 3, ofs, 61 -ofs ) ); //future husband, Charles |
| 281 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 3, ofs, 65 -ofs ) ); //future husband, Charles the |
| 282 | + |
| 283 | + if (depth==4) return phrases; |
| 284 | + |
| 285 | + ofs = 45; |
| 286 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 1, ofs, 52 -ofs ) ); //husband |
| 287 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 2, ofs, 61 -ofs ) ); //husband, Charles |
| 288 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 2, ofs, 65 -ofs ) ); //husband, Charles the |
| 289 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 3, ofs, 70 -ofs ) ); //husband, Charles the Bold |
| 290 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 3, ofs, 73 -ofs ) ); //husband, Charles the Bold on |
| 291 | + |
| 292 | + if (depth==5) return phrases; |
| 293 | + |
| 294 | + ofs = 54; |
| 295 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 1, ofs, 61 -ofs ) ); //Charles |
| 296 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 1, ofs, 65 -ofs ) ); //Charles the |
| 297 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 2, ofs, 70 -ofs ) ); //Charles the Bold |
| 298 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 2, ofs, 73 -ofs ) ); //Charles the Bold on |
| 299 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 3, ofs, 78 -ofs ) ); //Charles the Bold on 27th |
| 300 | + |
| 301 | + if (depth==6) return phrases; |
| 302 | + |
| 303 | + ofs = 62; |
| 304 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 1, ofs, 70 -ofs ) ); //the Bold |
| 305 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 1, ofs, 73 -ofs ) ); //the Bold on |
| 306 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 2, ofs, 78 -ofs ) ); //the Bold on 27th |
| 307 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 3, ofs, 83 -ofs ) ); //the Bold on 27th June |
| 308 | + |
| 309 | + ofs = 66; |
| 310 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 1, ofs, 70 -ofs ) ); //Bold |
| 311 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 1, ofs, 73 -ofs ) ); //Bold on |
| 312 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 2, ofs, 78 -ofs ) ); //Bold on 27th |
| 313 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 3, ofs, 83 -ofs ) ); //Bold on 27th June |
| 314 | + |
| 315 | + if (depth==7) return phrases; |
| 316 | + |
| 317 | + ofs = 71; |
| 318 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 1, ofs, 78 -ofs ) ); //on 27th |
| 319 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 2, ofs, 83 -ofs ) ); //on 27th June |
| 320 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 3, ofs, 89 -ofs ) ); //on 27th June, 1468 |
| 321 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 3, ofs, 92 -ofs ) ); //on 27th June, 1468 at |
| 322 | + |
| 323 | + ofs = 73; |
| 324 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 1, ofs, 78 -ofs ) ); //27th |
| 325 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 2, ofs, 83 -ofs ) ); //27th June |
| 326 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 3, ofs, 89 -ofs ) ); //27th June, 1468 |
| 327 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 3, ofs, 92 -ofs ) ); //27th June, 1468 at |
| 328 | + |
| 329 | + if (depth==8) return phrases; |
| 330 | + |
| 331 | + ofs = 78; |
| 332 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 1, ofs, 83 -ofs ) ); //June |
| 333 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 2, ofs, 89 -ofs ) ); //June, 1468 |
| 334 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 2, ofs, 92 -ofs ) ); //June, 1468 at |
| 335 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 98 ), 2, ofs, 98 -ofs ) ); //June, 1468 at Damme |
| 336 | + |
| 337 | + if (depth==9) return phrases; |
| 338 | + |
| 339 | + ofs = 84; |
| 340 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 1, ofs, 89 -ofs ) ); //1468 |
| 341 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 1, ofs, 92 -ofs ) ); //1468 at |
| 342 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 98 ), 2, ofs, 98 -ofs ) ); //1468 at Damme |
| 343 | + phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 107 ), 3, ofs, 107 -ofs ) ); //1468 at Damme (vellum) |
| 344 | + |
| 345 | + return phrases; |
| 346 | + } |
| 347 | + // "Margaret of York is introduced to her future husband, Charles the Bold on 27th June, 1468 at Damme (vellum)" |
| 348 | + |
212 | 349 | protected List<PhraseOccurance> getBankAndMonumentTerms(int depth) { |
213 | 350 | List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
214 | 351 | |
— | — | @@ -384,6 +521,12 @@ |
385 | 522 | return set; |
386 | 523 | } |
387 | 524 | |
| 525 | + protected PhraseOccuranceSet getMargaretOfYorkPhrases() { |
| 526 | + List<PhraseOccurance> phrases = getMargaretOfYorkTerms(1000); |
| 527 | + |
| 528 | + PhraseOccuranceSet set = new PhraseOccuranceSet(margaretofYorkText, phrases); |
| 529 | + return set; |
| 530 | + } |
388 | 531 | |
389 | 532 | public static boolean sameElements(Collection a, Collection b) { |
390 | 533 | if (a==b) return true; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -261,6 +261,8 @@ |
262 | 262 | |
263 | 263 | private <X extends T>CoherenceDisambiguation<X, C> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context, LabeledMatrix<C, C> similarities, FeatureFetcher<C, Integer> features) throws PersistenceException { |
264 | 264 | |
| 265 | + pruneMeaninglessSequences( sequences, meanings ); |
| 266 | + |
265 | 267 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
266 | 268 | |
267 | 269 | Collection<Disambiguator.Interpretation<X, C>> interpretations = getInterpretations(sequences, meanings); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -3,7 +3,9 @@ |
4 | 4 | import java.util.ArrayList; |
5 | 5 | import java.util.Collection; |
6 | 6 | import java.util.Collections; |
| 7 | +import java.util.Iterator; |
7 | 8 | import java.util.List; |
| 9 | +import java.util.ListIterator; |
8 | 10 | import java.util.Map; |
9 | 11 | |
10 | 12 | import de.brightbyte.io.Output; |
— | — | @@ -84,6 +86,20 @@ |
85 | 87 | return builder.getSequences(); |
86 | 88 | } |
87 | 89 | |
| 90 | + protected <X extends T>void pruneMeaninglessSequences(Collection<List<X>> sequences, Map<X, List<? extends C>> meanings) { |
| 91 | + Iterator<List<X>> it = sequences.iterator(); |
| 92 | + outer: while ( it.hasNext() ) { |
| 93 | + List<X> seq = it.next(); |
| 94 | + |
| 95 | + for (X t: seq) { |
| 96 | + if ( meanings.get(t) != null ) { |
| 97 | + continue outer; |
| 98 | + } |
| 99 | + } |
| 100 | + |
| 101 | + it.remove(); |
| 102 | + } |
| 103 | + } |
88 | 104 | |
89 | 105 | protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException { |
90 | 106 | Collection<X> terms = getTerms(root, Integer.MAX_VALUE); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -93,6 +93,8 @@ |
94 | 94 | |
95 | 95 | if (initialWindow > 0) { //apply full coherence disambig to initial window size. initialWindow == 1 will trigger a popularity disambig. |
96 | 96 | Collection<List<X>> sequences = getSequences(root, initialWindow); |
| 97 | + pruneMeaninglessSequences( sequences, meanings ); |
| 98 | + |
97 | 99 | Disambiguation<X, C> r; |
98 | 100 | |
99 | 101 | if (initialWindow == 1) r = popularityDisambiguator.disambiguate(sequences, root, meanings, context); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -69,6 +69,8 @@ |
70 | 70 | public <X extends T>Disambiguation<X, C> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) { |
71 | 71 | Disambiguation<X, C> best = null; |
72 | 72 | |
| 73 | + pruneMeaninglessSequences( sequences, meanings ); |
| 74 | + |
73 | 75 | for (List<X> sequence: sequences) { |
74 | 76 | Disambiguation<X, C> r = disambiguate(sequence, meanings, context); |
75 | 77 | trace(r.toString()); |