Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java |
— | — | @@ -2,9 +2,8 @@ |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
| 6 | +import java.util.Arrays; |
6 | 7 | import java.util.Collection; |
7 | | -import java.util.Collections; |
8 | | -import java.util.HashSet; |
9 | 8 | import java.util.List; |
10 | 9 | import java.util.Map; |
11 | 10 | |
— | — | @@ -34,26 +33,30 @@ |
35 | 34 | terms.add(underground); |
36 | 35 | |
37 | 36 | Collection<Term> res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1); |
38 | | - assertEquals("depth 1", new HashSet<Term>( terms.subList(0, 1) ), res); |
| 37 | + assertTrue("depth 1", sameElements( terms.subList(0, 1), res) ); |
39 | 38 | |
40 | 39 | res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 2); |
41 | | - assertEquals("depth 2", new HashSet<Term>( terms.subList(0, 2) ), res); |
| 40 | + assertTrue("depth 2", sameElements( terms.subList(0, 2), res) ); |
42 | 41 | |
43 | 42 | res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1000); |
44 | | - assertEquals("depth 1000", new HashSet<Term>( terms ), res); |
| 43 | + assertTrue("depth 1000", sameElements( terms, res) ); |
45 | 44 | } |
46 | 45 | |
47 | 46 | public void testGetTermsForNode() throws PersistenceException { |
48 | 47 | PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
49 | 48 | |
| 49 | + //FIXME: Test case for getHorizon |
| 50 | + |
50 | 51 | PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 52 | + |
51 | 53 | Collection<PhraseOccurance> terms = disambiguator.getTerms(set.getRootNode(), 0); |
52 | | - assertEquals("empty term set", Collections.emptySet(), terms); |
| 54 | + assertTrue("empty term set", sameElements( getBankAndMonumentTerms(0), terms) ); |
53 | 55 | |
54 | | - //FIXME: Test case for getHorizon |
| 56 | + terms = disambiguator.getTerms(set.getRootNode(), 1); |
| 57 | + assertTrue("terms from depth 1", sameElements( getBankAndMonumentTerms(1), terms) ); |
55 | 58 | |
56 | | - terms = disambiguator.getTerms(set.getRootNode(), 1); |
57 | | - assertEquals("terms from depth 1", Collections.emptySet() /* fixme */, terms); |
| 59 | + terms = disambiguator.getTerms(set.getRootNode(), 1000); |
| 60 | + assertTrue("terms from depth 1000", sameElements( getBankAndMonumentTerms(1000), terms) ); |
58 | 61 | } |
59 | 62 | |
60 | 63 | public void testGetMeaningsForList() throws PersistenceException { |
— | — | @@ -68,45 +71,80 @@ |
69 | 72 | terms.add(london); |
70 | 73 | terms.add(underground); |
71 | 74 | |
72 | | - Map<Term, List<? extends LocalConcept>> meanings = disambiguator.getMeanings(terms); |
| 75 | + Map<Term, List<? extends LocalConcept>> res = disambiguator.getMeanings(terms); |
73 | 76 | |
74 | | - assertEquals(uk.getTerm(), meanings.get(uk.getTerm()), meanings.get(uk)); |
75 | | - assertEquals(london.getTerm(), meanings.get(london.getTerm()), meanings.get(london)); |
76 | | - assertEquals(underground.getTerm(), meanings.get(underground.getTerm()), meanings.get(underground)); |
| 77 | + assertEquals(uk.getTerm(), meanings.get(uk.getTerm()), res.get(uk)); |
| 78 | + assertEquals(london.getTerm(), meanings.get(london.getTerm()), res.get(london)); |
| 79 | + assertEquals(underground.getTerm(), meanings.get(underground.getTerm()), res.get(underground)); |
77 | 80 | } |
78 | 81 | |
79 | 82 | public void testGetMeaningsForNode() throws PersistenceException { |
80 | | - throw new UnsupportedOperationException("not yet implemented"); |
81 | | - //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
82 | | - //disambiguator.getMeanings(terms); |
| 83 | + PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 84 | + |
| 85 | + PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
| 86 | + Map<PhraseOccurance, List<? extends LocalConcept>> res = disambiguator.getMeanings(set.getRootNode()); |
| 87 | + List<PhraseOccurance> terms = getBankAndMonumentTerms(1000); |
| 88 | + |
| 89 | + for (PhraseOccurance t: terms) { |
| 90 | + List<? extends LocalConcept> m = res.get(t); |
| 91 | + List<? extends LocalConcept> n = meanings.get(t.getTerm()); |
| 92 | + |
| 93 | + assertEquals("meanings for "+t, n, m); |
| 94 | + } |
83 | 95 | } |
84 | 96 | |
85 | 97 | public void testGetSequences() throws PersistenceException { |
86 | | - throw new UnsupportedOperationException("not yet implemented"); |
87 | | - //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
88 | | - //disambiguator.getSequences(root, depth); |
| 98 | + PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 99 | + PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
| 100 | + |
| 101 | + Collection<List<PhraseOccurance>> res = disambiguator.getSequences(set.getRootNode(), 1); |
| 102 | + assertTrue("depth 1", sameElements(getBankAndMonumentSequences(1), res)); |
| 103 | + |
| 104 | + res = disambiguator.getSequences(set.getRootNode(), 2); |
| 105 | + assertTrue("depth 2", sameElements(getBankAndMonumentSequences(2), res)); |
| 106 | + |
| 107 | + res = disambiguator.getSequences(set.getRootNode(), 1000); |
| 108 | + assertTrue("depth 1000", sameElements(getBankAndMonumentSequences(1000), res)); |
89 | 109 | } |
90 | 110 | |
91 | 111 | public void testDisambiguateTerms() throws PersistenceException { |
92 | | - throw new UnsupportedOperationException("not yet implemented"); |
93 | | - /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 112 | + PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
94 | 113 | |
95 | | - String[] sequence = {"UK", "London", "Underground", "Bank"}; |
| 114 | + Term uk = new Term("UK"); |
| 115 | + Term london = new Term("London"); |
| 116 | + Term underground = new Term("Underground"); |
| 117 | + |
| 118 | + List<Term> sequence = Arrays.asList(new Term[] {uk, london, underground}); |
| 119 | + Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(sequence, null); |
96 | 120 | |
97 | | - Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null); |
98 | | - */ |
99 | | - //// .............. /// |
| 121 | + assertEquals("sequence", sequence, result.getSequence()); |
| 122 | + |
| 123 | + assertEquals(uk.getTerm(), getConcept("United_Kingdom"), result.getMeanings().get(uk)); |
| 124 | + assertEquals(london.getTerm(), getConcept("City_of_London"), result.getMeanings().get(london)); |
| 125 | + assertEquals(underground.getTerm(), getConcept("London_Underground"), result.getMeanings().get(underground)); |
100 | 126 | } |
101 | 127 | |
102 | 128 | public void testDisambiguateNode() throws PersistenceException { |
103 | | - throw new UnsupportedOperationException("not yet implemented"); |
104 | | - /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 129 | + PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
105 | 130 | |
106 | | - String[] sequence = {"UK", "London", "Underground", "Bank"}; |
| 131 | + PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
107 | 132 | |
108 | | - Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null); |
109 | | - */ |
110 | | - //// .............. /// |
| 133 | + Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(set.getRootNode(), null); |
| 134 | + |
| 135 | + List<? extends PhraseOccurance> sequence = result.getSequence(); |
| 136 | + Map<? extends PhraseOccurance, ? extends LocalConcept> meanings = result.getMeanings(); |
| 137 | + |
| 138 | + assertEquals("Bank and Monument", sequence.get(0).getTerm()); |
| 139 | + assertEquals("Underground", sequence.get(1).getTerm()); |
| 140 | + assertEquals("station", sequence.get(2).getTerm()); |
| 141 | + |
| 142 | + assertNotNull( meanings.get( sequence.get(0).getTerm() ) ); |
| 143 | + assertNotNull( meanings.get( sequence.get(1).getTerm() ) ); |
| 144 | + assertNotNull( meanings.get( sequence.get(2).getTerm() ) ); |
| 145 | + |
| 146 | + assertEquals("Bank_and_Monument_Underground_station", meanings.get( sequence.get(0).getTerm() ).getName() ); |
| 147 | + assertEquals("Subway", meanings.get( sequence.get(1).getTerm() ).getName() ); |
| 148 | + assertEquals("Metro_station", meanings.get( sequence.get(2).getTerm() ).getName() ); |
111 | 149 | } |
112 | 150 | |
113 | 151 | } |
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java |
— | — | @@ -203,30 +203,195 @@ |
204 | 204 | return m; |
205 | 205 | } |
206 | 206 | |
207 | | - protected PhraseOccuranceSet getBankAndMonumentPhrases() { |
208 | | - String text = "The Bank and Monument Underground station"; |
| 207 | + private String bankAndMonumentText = "The Bank and Monument Underground station"; |
| 208 | + |
| 209 | + protected List<PhraseOccurance> getBankAndMonumentTerms(int depth) { |
209 | 210 | List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 211 | + |
| 212 | + if (depth==0) return phrases; |
210 | 213 | |
211 | | - phrases.add( new PhraseOccurance( text.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank |
212 | | - phrases.add( new PhraseOccurance( text.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument |
213 | | - phrases.add( new PhraseOccurance( text.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground |
| 214 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank |
| 215 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument |
| 216 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground |
214 | 217 | |
215 | | - phrases.add( new PhraseOccurance( text.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank |
216 | | - phrases.add( new PhraseOccurance( text.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument |
217 | | - phrases.add( new PhraseOccurance( text.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground |
| 218 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank |
| 219 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument |
| 220 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground |
218 | 221 | //phrases.add( new PhraseOccurance( text.substring( 4, 41 ), 4, 4, 41-4 ) ); //Bank and Monument Underground station |
| 222 | + |
| 223 | + if (depth==1) return phrases; |
219 | 224 | |
220 | | - phrases.add( new PhraseOccurance( text.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument |
221 | | - phrases.add( new PhraseOccurance( text.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground |
222 | | - phrases.add( new PhraseOccurance( text.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station |
| 225 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument |
| 226 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground |
| 227 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station |
223 | 228 | |
224 | | - phrases.add( new PhraseOccurance( text.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
225 | | - phrases.add( new PhraseOccurance( text.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations |
| 229 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
| 230 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station |
226 | 231 | |
227 | | - phrases.add( new PhraseOccurance( text.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 232 | + phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
228 | 233 | |
229 | | - PhraseOccuranceSet set = new PhraseOccuranceSet(text, phrases); |
| 234 | + return phrases; |
| 235 | + } |
| 236 | + |
| 237 | + protected Collection<List<PhraseOccurance>> getBankAndMonumentSequences(int depth) { |
| 238 | + ArrayList<List<PhraseOccurance>> sequences = new ArrayList<List<PhraseOccurance>>(); |
| 239 | + |
| 240 | + if (depth==0) return sequences; |
| 241 | + |
| 242 | + List<PhraseOccurance> seq1 = new ArrayList<PhraseOccurance>(); |
| 243 | + List<PhraseOccurance> seq11 = new ArrayList<PhraseOccurance>(); |
| 244 | + List<PhraseOccurance> seq111 = new ArrayList<PhraseOccurance>(); |
| 245 | + List<PhraseOccurance> seq1111 = new ArrayList<PhraseOccurance>(); |
| 246 | + List<PhraseOccurance> seq112 = new ArrayList<PhraseOccurance>(); |
| 247 | + List<PhraseOccurance> seq12 = new ArrayList<PhraseOccurance>(); |
| 248 | + List<PhraseOccurance> seq121 = new ArrayList<PhraseOccurance>(); |
| 249 | + List<PhraseOccurance> seq13 = new ArrayList<PhraseOccurance>(); |
| 250 | + |
| 251 | + List<PhraseOccurance> seq2 = new ArrayList<PhraseOccurance>(); |
| 252 | + List<PhraseOccurance> seq21 = new ArrayList<PhraseOccurance>(); |
| 253 | + List<PhraseOccurance> seq211 = new ArrayList<PhraseOccurance>(); |
| 254 | + List<PhraseOccurance> seq22 = new ArrayList<PhraseOccurance>(); |
| 255 | + |
| 256 | + List<PhraseOccurance> seq3 = new ArrayList<PhraseOccurance>(); |
| 257 | + List<PhraseOccurance> seq31 = new ArrayList<PhraseOccurance>(); |
| 258 | + |
| 259 | + List<PhraseOccurance> seq5 = new ArrayList<PhraseOccurance>(); |
| 260 | + List<PhraseOccurance> seq51 = new ArrayList<PhraseOccurance>(); |
| 261 | + List<PhraseOccurance> seq511 = new ArrayList<PhraseOccurance>(); |
| 262 | + List<PhraseOccurance> seq5111 = new ArrayList<PhraseOccurance>(); |
| 263 | + List<PhraseOccurance> seq512 = new ArrayList<PhraseOccurance>(); |
| 264 | + List<PhraseOccurance> seq52 = new ArrayList<PhraseOccurance>(); |
| 265 | + List<PhraseOccurance> seq521 = new ArrayList<PhraseOccurance>(); |
| 266 | + List<PhraseOccurance> seq53 = new ArrayList<PhraseOccurance>(); |
| 267 | + |
| 268 | + List<PhraseOccurance> seq6 = new ArrayList<PhraseOccurance>(); |
| 269 | + List<PhraseOccurance> seq61 = new ArrayList<PhraseOccurance>(); |
| 270 | + List<PhraseOccurance> seq611 = new ArrayList<PhraseOccurance>(); |
| 271 | + List<PhraseOccurance> seq62 = new ArrayList<PhraseOccurance>(); |
| 272 | + |
| 273 | + List<PhraseOccurance> seq7 = new ArrayList<PhraseOccurance>(); |
| 274 | + List<PhraseOccurance> seq71 = new ArrayList<PhraseOccurance>(); |
| 275 | + |
| 276 | + seq1.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank |
| 277 | + seq2.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 21 ), 1, 0, 21 ) ); //The Bank and Monument |
| 278 | + seq3.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground |
| 279 | + seq5.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank |
| 280 | + seq6.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument |
| 281 | + seq7.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground |
| 282 | + |
| 283 | + if (depth==1) { |
| 284 | + sequences.add(seq1); |
| 285 | + sequences.add(seq2); |
| 286 | + sequences.add(seq3); |
| 287 | + sequences.add(seq5); |
| 288 | + sequences.add(seq6); |
| 289 | + sequences.add(seq7); |
| 290 | + |
| 291 | + return sequences; |
| 292 | + } |
| 293 | + |
| 294 | + seq11.addAll(seq1); |
| 295 | + seq11.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument |
| 296 | + seq12.addAll(seq1); |
| 297 | + seq12.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground |
| 298 | + seq13.addAll(seq1); |
| 299 | + seq13.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station |
| 300 | + seq21.addAll(seq2); |
| 301 | + seq21.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
| 302 | + seq22.addAll(seq2); |
| 303 | + seq22.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station |
| 304 | + seq31.addAll(seq3); |
| 305 | + seq31.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 306 | + seq51.addAll(seq5); |
| 307 | + seq51.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument |
| 308 | + seq52.addAll(seq5); |
| 309 | + seq52.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground |
| 310 | + seq53.addAll(seq5); |
| 311 | + seq53.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station |
| 312 | + seq61.addAll(seq6); |
| 313 | + seq61.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
| 314 | + seq62.addAll(seq6); |
| 315 | + seq62.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station |
| 316 | + seq71.addAll(seq7); |
| 317 | + seq71.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 318 | + |
| 319 | + sequences.add(seq13); |
| 320 | + sequences.add(seq22); |
| 321 | + sequences.add(seq31); |
| 322 | + sequences.add(seq53); |
| 323 | + sequences.add(seq62); |
| 324 | + sequences.add(seq71); |
| 325 | + |
| 326 | + if (depth==2) { |
| 327 | + sequences.add(seq11); |
| 328 | + sequences.add(seq12); |
| 329 | + sequences.add(seq21); |
| 330 | + sequences.add(seq51); |
| 331 | + sequences.add(seq52); |
| 332 | + sequences.add(seq61); |
| 333 | + |
| 334 | + return sequences; |
| 335 | + } |
| 336 | + |
| 337 | + seq111.addAll(seq11); |
| 338 | + seq111.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
| 339 | + seq112.addAll(seq11); |
| 340 | + seq112.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station |
| 341 | + seq121.addAll(seq12); |
| 342 | + seq121.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 343 | + seq211.addAll(seq21); |
| 344 | + seq211.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 345 | + seq511.addAll(seq51); |
| 346 | + seq511.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
| 347 | + seq512.addAll(seq51); |
| 348 | + seq512.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station |
| 349 | + seq521.addAll(seq52); |
| 350 | + seq521.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 351 | + seq611.addAll(seq61); |
| 352 | + seq611.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 353 | + |
| 354 | + sequences.add(seq112); |
| 355 | + sequences.add(seq121); |
| 356 | + sequences.add(seq211); |
| 357 | + sequences.add(seq512); |
| 358 | + sequences.add(seq521); |
| 359 | + sequences.add(seq611); |
| 360 | + |
| 361 | + if (depth==3) { |
| 362 | + sequences.add(seq111); |
| 363 | + sequences.add(seq511); |
| 364 | + return sequences; |
| 365 | + } |
| 366 | + |
| 367 | + seq1111.addAll(seq111); |
| 368 | + seq1111.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 369 | + seq5111.addAll(seq511); |
| 370 | + seq5111.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 371 | + |
| 372 | + sequences.add(seq1111); |
| 373 | + sequences.add(seq5111); |
| 374 | + |
| 375 | + return sequences; |
| 376 | + } |
| 377 | + |
| 378 | + protected PhraseOccuranceSet getBankAndMonumentPhrases() { |
| 379 | + List<PhraseOccurance> phrases = getBankAndMonumentTerms(1000); |
| 380 | + |
| 381 | + PhraseOccuranceSet set = new PhraseOccuranceSet(bankAndMonumentText, phrases); |
230 | 382 | return set; |
231 | 383 | } |
232 | 384 | |
| 385 | + |
| 386 | + public static boolean sameElements(Collection a, Collection b) { |
| 387 | + if (a==b) return true; |
| 388 | + if (a==null || b==null) return false; |
| 389 | + if (a.size() != b.size()) return false; |
| 390 | + if (a.equals(b)) return true; |
| 391 | + |
| 392 | + for (Object x: a) { |
| 393 | + if (!b.contains(x)) return false; |
| 394 | + } |
| 395 | + |
| 396 | + return true; |
| 397 | + } |
233 | 398 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -71,7 +71,7 @@ |
72 | 72 | public void setMeaningOverrides(Map<? extends T, C> overrideMap) { |
73 | 73 | this.meaningOverrides = overrideMap; |
74 | 74 | } |
75 | | - |
| 75 | + |
76 | 76 | protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) { |
77 | 77 | TermSetBuilder<X> builder = new TermSetBuilder<X>(); |
78 | 78 | walk(root, null, builder, depth); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.ArrayList; |
4 | 5 | import java.util.Collection; |
5 | 6 | import java.util.Collections; |
6 | 7 | import java.util.HashMap; |
— | — | @@ -69,10 +70,14 @@ |
70 | 71 | double score = 0; |
71 | 72 | int totalPop = 0; |
72 | 73 | |
| 74 | + List<X> resultSequence = new ArrayList<X>(sequence.size()); |
| 75 | + |
73 | 76 | for (X t: sequence) { |
74 | 77 | List<? extends LocalConcept> m = meanings.get(t); |
75 | 78 | if (m==null || m.size()==0) continue; |
76 | 79 | |
| 80 | + resultSequence.add(t); |
| 81 | + |
77 | 82 | if (m.size()>1) Collections.sort(m, popularityComparator); |
78 | 83 | |
79 | 84 | LocalConcept c = m.get(0); |
— | — | @@ -81,13 +86,13 @@ |
82 | 87 | double pop = popularityMeasure.measure(c); |
83 | 88 | totalPop += pop; |
84 | 89 | |
85 | | - Number sc = weigthCombiner.apply(pop, t.getWeight()); |
86 | | - score += sc.doubleValue(); |
| 90 | + double sc = weigthCombiner.apply(pop, t.getWeight()); //FIXME: pop and weight are not on the same scale. |
| 91 | + score += sc; |
87 | 92 | } |
88 | 93 | |
89 | 94 | if (disambig.size()>0) score = score / disambig.size(); |
90 | 95 | |
91 | | - Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, sequence, score, "score="+score+"; pop="+totalPop); |
| 96 | + Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, resultSequence, score, "score="+score+"; pop="+totalPop); |
92 | 97 | return r; |
93 | 98 | } |
94 | 99 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/LinearCombiner.java |
— | — | @@ -22,7 +22,7 @@ |
23 | 23 | } |
24 | 24 | |
25 | 25 | public double apply(double a, double b) { |
26 | | - return b * bias + b * ( 1 - bias ); |
| 26 | + return a * bias + b * ( 1.0 - bias ); |
27 | 27 | //return = Math.sqrt( popf * simf ); //normalized product |
28 | 28 | } |
29 | 29 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java |
— | — | @@ -106,13 +106,14 @@ |
107 | 107 | public Collection<? extends PhraseNode<PhraseOccurance>> getSuccessorsAt(int pos) { |
108 | 108 | Set<PhraseNode<PhraseOccurance>> successors = new HashSet<PhraseNode<PhraseOccurance>>(); |
109 | 109 | |
| 110 | + int horizon = text.length(); |
110 | 111 | while (true) { |
111 | 112 | Collection<? extends PhraseNode<PhraseOccurance>> nodes = PhraseOccuranceSet.this.getPhraseNodesAt(pos); |
112 | | - if (nodes == null || nodes.isEmpty()) break; |
| 113 | + if (nodes != null && !nodes.isEmpty()) { |
| 114 | + successors.addAll(nodes); |
| 115 | + horizon = getHorizon(successors, horizon); |
| 116 | + } |
113 | 117 | |
114 | | - successors.addAll(nodes); |
115 | | - int horizon = getHorizon(successors); |
116 | | - |
117 | 118 | pos ++; |
118 | 119 | if (pos>=horizon) break; |
119 | 120 | } |
— | — | @@ -120,8 +121,7 @@ |
121 | 122 | return successors; |
122 | 123 | } |
123 | 124 | |
124 | | - private int getHorizon(Collection<? extends PhraseNode<PhraseOccurance>> successors) { |
125 | | - int horizon = Integer.MAX_VALUE; |
| 125 | + private int getHorizon(Collection<? extends PhraseNode<PhraseOccurance>> successors, int horizon) { |
126 | 126 | for (PhraseNode<PhraseOccurance> n: successors) { |
127 | 127 | int end = n.getTermReference().getEndOffset(); |
128 | 128 | if (end < horizon) horizon = end; |
— | — | @@ -133,8 +133,16 @@ |
134 | 134 | |
135 | 135 | public Collection<? extends PhraseNode<PhraseOccurance>> getPhraseNodesAt(int offs) { |
136 | 136 | List<PhraseOccurance> phrases = getPhrasesAt(offs); |
137 | | - if (phrases == null) return null; |
138 | | - |
| 137 | + return toNodeList(phrases); |
| 138 | + } |
| 139 | + |
| 140 | + public Collection<? extends PhraseNode<PhraseOccurance>> getPhraseNodesFrom(int offs) { |
| 141 | + List<PhraseOccurance> phrases = getPhrasesFrom(offs); |
| 142 | + return toNodeList(phrases); |
| 143 | + } |
| 144 | + |
| 145 | + protected List<Node> toNodeList(List<PhraseOccurance> phrases) { |
| 146 | + if (phrases==null) return null; |
139 | 147 | List<Node> nodes = new ArrayList<Node>(phrases.size()); |
140 | 148 | |
141 | 149 | for (PhraseOccurance p: phrases) { |
— | — | @@ -144,9 +152,34 @@ |
145 | 153 | return nodes; |
146 | 154 | } |
147 | 155 | |
148 | | - public List<PhraseOccurance> getPhrasesAt(int offs) { |
| 156 | + public List<PhraseOccurance> getPhrasesAt(int at) { |
149 | 157 | int i = 0; |
| 158 | + PhraseOccurance p = null; |
150 | 159 | while (i<size()) { |
| 160 | + p = get(i); |
| 161 | + if (p.getOffset() >= at) { |
| 162 | + break; |
| 163 | + } |
| 164 | + |
| 165 | + i++; |
| 166 | + } |
| 167 | + |
| 168 | + if (p!=null && p.getOffset() > at) return null; |
| 169 | + if (i>=size()) return null; |
| 170 | + |
| 171 | + int j = i; |
| 172 | + while (j<size()) { |
| 173 | + p = get(j); |
| 174 | + if (p.getOffset() > at) break; |
| 175 | + j++; |
| 176 | + } |
| 177 | + |
| 178 | + return subList(i, j); //NOTE: PhraseOccurance.compareTo ensures that the longest phrases come first. |
| 179 | + } |
| 180 | + |
| 181 | + public List<PhraseOccurance> getPhrasesFrom(int offs) { |
| 182 | + int i = 0; |
| 183 | + while (i<size()) { |
151 | 184 | PhraseOccurance p = get(i); |
152 | 185 | if (p.getOffset() >= offs) { |
153 | 186 | offs = p.getOffset(); |