r65970 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r65969 | r65970 | r65971 >
Date:23:07, 5 May 2010
Author:daniel
Status:deferred
Tags:
Comment:
disambig testing and debugging (work in progress)
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/LinearCombiner.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java
@@ -2,9 +2,8 @@
33
44 import java.io.IOException;
55 import java.util.ArrayList;
 6+import java.util.Arrays;
67 import java.util.Collection;
7 -import java.util.Collections;
8 -import java.util.HashSet;
98 import java.util.List;
109 import java.util.Map;
1110
@@ -34,26 +33,30 @@
3534 terms.add(underground);
3635
3736 Collection<Term> res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1);
38 - assertEquals("depth 1", new HashSet<Term>( terms.subList(0, 1) ), res);
 37+ assertTrue("depth 1", sameElements( terms.subList(0, 1), res) );
3938
4039 res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 2);
41 - assertEquals("depth 2", new HashSet<Term>( terms.subList(0, 2) ), res);
 40+ assertTrue("depth 2", sameElements( terms.subList(0, 2), res) );
4241
4342 res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1000);
44 - assertEquals("depth 1000", new HashSet<Term>( terms ), res);
 43+ assertTrue("depth 1000", sameElements( terms, res) );
4544 }
4645
4746 public void testGetTermsForNode() throws PersistenceException {
4847 PhraseOccuranceSet set = getBankAndMonumentPhrases();
4948
 49+ //FIXME: Test case for getHorizon
 50+
5051 PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 52+
5153 Collection<PhraseOccurance> terms = disambiguator.getTerms(set.getRootNode(), 0);
52 - assertEquals("empty term set", Collections.emptySet(), terms);
 54+ assertTrue("empty term set", sameElements( getBankAndMonumentTerms(0), terms) );
5355
54 - //FIXME: Test case for getHorizon
 56+ terms = disambiguator.getTerms(set.getRootNode(), 1);
 57+ assertTrue("terms from depth 1", sameElements( getBankAndMonumentTerms(1), terms) );
5558
56 - terms = disambiguator.getTerms(set.getRootNode(), 1);
57 - assertEquals("terms from depth 1", Collections.emptySet() /* fixme */, terms);
 59+ terms = disambiguator.getTerms(set.getRootNode(), 1000);
 60+ assertTrue("terms from depth 1000", sameElements( getBankAndMonumentTerms(1000), terms) );
5861 }
5962
6063 public void testGetMeaningsForList() throws PersistenceException {
@@ -68,45 +71,80 @@
6972 terms.add(london);
7073 terms.add(underground);
7174
72 - Map<Term, List<? extends LocalConcept>> meanings = disambiguator.getMeanings(terms);
 75+ Map<Term, List<? extends LocalConcept>> res = disambiguator.getMeanings(terms);
7376
74 - assertEquals(uk.getTerm(), meanings.get(uk.getTerm()), meanings.get(uk));
75 - assertEquals(london.getTerm(), meanings.get(london.getTerm()), meanings.get(london));
76 - assertEquals(underground.getTerm(), meanings.get(underground.getTerm()), meanings.get(underground));
 77+ assertEquals(uk.getTerm(), meanings.get(uk.getTerm()), res.get(uk));
 78+ assertEquals(london.getTerm(), meanings.get(london.getTerm()), res.get(london));
 79+ assertEquals(underground.getTerm(), meanings.get(underground.getTerm()), res.get(underground));
7780 }
7881
7982 public void testGetMeaningsForNode() throws PersistenceException {
80 - throw new UnsupportedOperationException("not yet implemented");
81 - //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
82 - //disambiguator.getMeanings(terms);
 83+ PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 84+
 85+ PhraseOccuranceSet set = getBankAndMonumentPhrases();
 86+ Map<PhraseOccurance, List<? extends LocalConcept>> res = disambiguator.getMeanings(set.getRootNode());
 87+ List<PhraseOccurance> terms = getBankAndMonumentTerms(1000);
 88+
 89+ for (PhraseOccurance t: terms) {
 90+ List<? extends LocalConcept> m = res.get(t);
 91+ List<? extends LocalConcept> n = meanings.get(t.getTerm());
 92+
 93+ assertEquals("meanings for "+t, n, m);
 94+ }
8395 }
8496
8597 public void testGetSequences() throws PersistenceException {
86 - throw new UnsupportedOperationException("not yet implemented");
87 - //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
88 - //disambiguator.getSequences(root, depth);
 98+ PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 99+ PhraseOccuranceSet set = getBankAndMonumentPhrases();
 100+
 101+ Collection<List<PhraseOccurance>> res = disambiguator.getSequences(set.getRootNode(), 1);
 102+ assertTrue("depth 1", sameElements(getBankAndMonumentSequences(1), res));
 103+
 104+ res = disambiguator.getSequences(set.getRootNode(), 2);
 105+ assertTrue("depth 2", sameElements(getBankAndMonumentSequences(2), res));
 106+
 107+ res = disambiguator.getSequences(set.getRootNode(), 1000);
 108+ assertTrue("depth 1000", sameElements(getBankAndMonumentSequences(1000), res));
89109 }
90110
91111 public void testDisambiguateTerms() throws PersistenceException {
92 - throw new UnsupportedOperationException("not yet implemented");
93 - /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 112+ PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
94113
95 - String[] sequence = {"UK", "London", "Underground", "Bank"};
 114+ Term uk = new Term("UK");
 115+ Term london = new Term("London");
 116+ Term underground = new Term("Underground");
 117+
 118+ List<Term> sequence = Arrays.asList(new Term[] {uk, london, underground});
 119+ Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(sequence, null);
96120
97 - Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null);
98 - */
99 - //// .............. ///
 121+ assertEquals("sequence", sequence, result.getSequence());
 122+
 123+ assertEquals(uk.getTerm(), getConcept("United_Kingdom"), result.getMeanings().get(uk));
 124+ assertEquals(london.getTerm(), getConcept("City_of_London"), result.getMeanings().get(london));
 125+ assertEquals(underground.getTerm(), getConcept("London_Underground"), result.getMeanings().get(underground));
100126 }
101127
102128 public void testDisambiguateNode() throws PersistenceException {
103 - throw new UnsupportedOperationException("not yet implemented");
104 - /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 129+ PhraseOccuranceSet set = getBankAndMonumentPhrases();
105130
106 - String[] sequence = {"UK", "London", "Underground", "Bank"};
 131+ PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
107132
108 - Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null);
109 - */
110 - //// .............. ///
 133+ Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(set.getRootNode(), null);
 134+
 135+ List<? extends PhraseOccurance> sequence = result.getSequence();
 136+ Map<? extends PhraseOccurance, ? extends LocalConcept> meanings = result.getMeanings();
 137+
 138+ assertEquals("Bank and Monument", sequence.get(0).getTerm());
 139+ assertEquals("Underground", sequence.get(1).getTerm());
 140+ assertEquals("station", sequence.get(2).getTerm());
 141+
 142+ assertNotNull( meanings.get( sequence.get(0).getTerm() ) );
 143+ assertNotNull( meanings.get( sequence.get(1).getTerm() ) );
 144+ assertNotNull( meanings.get( sequence.get(2).getTerm() ) );
 145+
 146+ assertEquals("Bank_and_Monument_Underground_station", meanings.get( sequence.get(0).getTerm() ).getName() );
 147+ assertEquals("Subway", meanings.get( sequence.get(1).getTerm() ).getName() );
 148+ assertEquals("Metro_station", meanings.get( sequence.get(2).getTerm() ).getName() );
111149 }
112150
113151 }
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java
@@ -203,30 +203,195 @@
204204 return m;
205205 }
206206
207 - protected PhraseOccuranceSet getBankAndMonumentPhrases() {
208 - String text = "The Bank and Monument Underground station";
 207+ private String bankAndMonumentText = "The Bank and Monument Underground station";
 208+
 209+ protected List<PhraseOccurance> getBankAndMonumentTerms(int depth) {
209210 List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
 211+
 212+ if (depth==0) return phrases;
210213
211 - phrases.add( new PhraseOccurance( text.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank
212 - phrases.add( new PhraseOccurance( text.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument
213 - phrases.add( new PhraseOccurance( text.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground
 214+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank
 215+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument
 216+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground
214217
215 - phrases.add( new PhraseOccurance( text.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank
216 - phrases.add( new PhraseOccurance( text.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument
217 - phrases.add( new PhraseOccurance( text.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground
 218+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank
 219+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument
 220+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground
218221 //phrases.add( new PhraseOccurance( text.substring( 4, 41 ), 4, 4, 41-4 ) ); //Bank and Monument Underground station
 222+
 223+ if (depth==1) return phrases;
219224
220 - phrases.add( new PhraseOccurance( text.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument
221 - phrases.add( new PhraseOccurance( text.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground
222 - phrases.add( new PhraseOccurance( text.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station
 225+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument
 226+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground
 227+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station
223228
224 - phrases.add( new PhraseOccurance( text.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
225 - phrases.add( new PhraseOccurance( text.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations
 229+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
 230+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations
226231
227 - phrases.add( new PhraseOccurance( text.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 232+ phrases.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
228233
229 - PhraseOccuranceSet set = new PhraseOccuranceSet(text, phrases);
 234+ return phrases;
 235+ }
 236+
 237+ protected Collection<List<PhraseOccurance>> getBankAndMonumentSequences(int depth) {
 238+ ArrayList<List<PhraseOccurance>> sequences = new ArrayList<List<PhraseOccurance>>();
 239+
 240+ if (depth==0) return sequences;
 241+
 242+ List<PhraseOccurance> seq1 = new ArrayList<PhraseOccurance>();
 243+ List<PhraseOccurance> seq11 = new ArrayList<PhraseOccurance>();
 244+ List<PhraseOccurance> seq111 = new ArrayList<PhraseOccurance>();
 245+ List<PhraseOccurance> seq1111 = new ArrayList<PhraseOccurance>();
 246+ List<PhraseOccurance> seq112 = new ArrayList<PhraseOccurance>();
 247+ List<PhraseOccurance> seq12 = new ArrayList<PhraseOccurance>();
 248+ List<PhraseOccurance> seq121 = new ArrayList<PhraseOccurance>();
 249+ List<PhraseOccurance> seq13 = new ArrayList<PhraseOccurance>();
 250+
 251+ List<PhraseOccurance> seq2 = new ArrayList<PhraseOccurance>();
 252+ List<PhraseOccurance> seq21 = new ArrayList<PhraseOccurance>();
 253+ List<PhraseOccurance> seq211 = new ArrayList<PhraseOccurance>();
 254+ List<PhraseOccurance> seq22 = new ArrayList<PhraseOccurance>();
 255+
 256+ List<PhraseOccurance> seq3 = new ArrayList<PhraseOccurance>();
 257+ List<PhraseOccurance> seq31 = new ArrayList<PhraseOccurance>();
 258+
 259+ List<PhraseOccurance> seq5 = new ArrayList<PhraseOccurance>();
 260+ List<PhraseOccurance> seq51 = new ArrayList<PhraseOccurance>();
 261+ List<PhraseOccurance> seq511 = new ArrayList<PhraseOccurance>();
 262+ List<PhraseOccurance> seq5111 = new ArrayList<PhraseOccurance>();
 263+ List<PhraseOccurance> seq512 = new ArrayList<PhraseOccurance>();
 264+ List<PhraseOccurance> seq52 = new ArrayList<PhraseOccurance>();
 265+ List<PhraseOccurance> seq521 = new ArrayList<PhraseOccurance>();
 266+ List<PhraseOccurance> seq53 = new ArrayList<PhraseOccurance>();
 267+
 268+ List<PhraseOccurance> seq6 = new ArrayList<PhraseOccurance>();
 269+ List<PhraseOccurance> seq61 = new ArrayList<PhraseOccurance>();
 270+ List<PhraseOccurance> seq611 = new ArrayList<PhraseOccurance>();
 271+ List<PhraseOccurance> seq62 = new ArrayList<PhraseOccurance>();
 272+
 273+ List<PhraseOccurance> seq7 = new ArrayList<PhraseOccurance>();
 274+ List<PhraseOccurance> seq71 = new ArrayList<PhraseOccurance>();
 275+
 276+ seq1.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank
 277+ seq2.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 21 ), 1, 0, 21 ) ); //The Bank and Monument
 278+ seq3.add( new PhraseOccurance( bankAndMonumentText.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground
 279+ seq5.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank
 280+ seq6.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument
 281+ seq7.add( new PhraseOccurance( bankAndMonumentText.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground
 282+
 283+ if (depth==1) {
 284+ sequences.add(seq1);
 285+ sequences.add(seq2);
 286+ sequences.add(seq3);
 287+ sequences.add(seq5);
 288+ sequences.add(seq6);
 289+ sequences.add(seq7);
 290+
 291+ return sequences;
 292+ }
 293+
 294+ seq11.addAll(seq1);
 295+ seq11.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument
 296+ seq12.addAll(seq1);
 297+ seq12.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground
 298+ seq13.addAll(seq1);
 299+ seq13.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station
 300+ seq21.addAll(seq2);
 301+ seq21.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
 302+ seq22.addAll(seq2);
 303+ seq22.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations
 304+ seq31.addAll(seq3);
 305+ seq31.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 306+ seq51.addAll(seq5);
 307+ seq51.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument
 308+ seq52.addAll(seq5);
 309+ seq52.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground
 310+ seq53.addAll(seq5);
 311+ seq53.add( new PhraseOccurance( bankAndMonumentText.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station
 312+ seq61.addAll(seq6);
 313+ seq61.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
 314+ seq62.addAll(seq6);
 315+ seq62.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations
 316+ seq71.addAll(seq7);
 317+ seq71.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 318+
 319+ sequences.add(seq13);
 320+ sequences.add(seq22);
 321+ sequences.add(seq31);
 322+ sequences.add(seq53);
 323+ sequences.add(seq62);
 324+ sequences.add(seq71);
 325+
 326+ if (depth==2) {
 327+ sequences.add(seq11);
 328+ sequences.add(seq12);
 329+ sequences.add(seq21);
 330+ sequences.add(seq51);
 331+ sequences.add(seq52);
 332+ sequences.add(seq61);
 333+
 334+ return sequences;
 335+ }
 336+
 337+ seq111.addAll(seq11);
 338+ seq111.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
 339+ seq112.addAll(seq11);
 340+ seq112.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations
 341+ seq121.addAll(seq12);
 342+ seq121.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 343+ seq211.addAll(seq21);
 344+ seq211.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 345+ seq511.addAll(seq51);
 346+ seq511.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
 347+ seq512.addAll(seq51);
 348+ seq512.add( new PhraseOccurance( bankAndMonumentText.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations
 349+ seq521.addAll(seq52);
 350+ seq521.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 351+ seq611.addAll(seq61);
 352+ seq611.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 353+
 354+ sequences.add(seq112);
 355+ sequences.add(seq121);
 356+ sequences.add(seq211);
 357+ sequences.add(seq512);
 358+ sequences.add(seq521);
 359+ sequences.add(seq611);
 360+
 361+ if (depth==3) {
 362+ sequences.add(seq111);
 363+ sequences.add(seq511);
 364+ return sequences;
 365+ }
 366+
 367+ seq1111.addAll(seq111);
 368+ seq1111.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 369+ seq5111.addAll(seq511);
 370+ seq5111.add( new PhraseOccurance( bankAndMonumentText.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 371+
 372+ sequences.add(seq1111);
 373+ sequences.add(seq5111);
 374+
 375+ return sequences;
 376+ }
 377+
 378+ protected PhraseOccuranceSet getBankAndMonumentPhrases() {
 379+ List<PhraseOccurance> phrases = getBankAndMonumentTerms(1000);
 380+
 381+ PhraseOccuranceSet set = new PhraseOccuranceSet(bankAndMonumentText, phrases);
230382 return set;
231383 }
232384
 385+
 386+ public static boolean sameElements(Collection a, Collection b) {
 387+ if (a==b) return true;
 388+ if (a==null || b==null) return false;
 389+ if (a.size() != b.size()) return false;
 390+ if (a.equals(b)) return true;
 391+
 392+ for (Object x: a) {
 393+ if (!b.contains(x)) return false;
 394+ }
 395+
 396+ return true;
 397+ }
233398 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -71,7 +71,7 @@
7272 public void setMeaningOverrides(Map<? extends T, C> overrideMap) {
7373 this.meaningOverrides = overrideMap;
7474 }
75 -
 75+
7676 protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) {
7777 TermSetBuilder<X> builder = new TermSetBuilder<X>();
7878 walk(root, null, builder, depth);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -1,5 +1,6 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.ArrayList;
45 import java.util.Collection;
56 import java.util.Collections;
67 import java.util.HashMap;
@@ -69,10 +70,14 @@
7071 double score = 0;
7172 int totalPop = 0;
7273
 74+ List<X> resultSequence = new ArrayList<X>(sequence.size());
 75+
7376 for (X t: sequence) {
7477 List<? extends LocalConcept> m = meanings.get(t);
7578 if (m==null || m.size()==0) continue;
7679
 80+ resultSequence.add(t);
 81+
7782 if (m.size()>1) Collections.sort(m, popularityComparator);
7883
7984 LocalConcept c = m.get(0);
@@ -81,13 +86,13 @@
8287 double pop = popularityMeasure.measure(c);
8388 totalPop += pop;
8489
85 - Number sc = weigthCombiner.apply(pop, t.getWeight());
86 - score += sc.doubleValue();
 90+ double sc = weigthCombiner.apply(pop, t.getWeight()); //FIXME: pop and weight are not in the same scale.
 91+ score += sc;
8792 }
8893
8994 if (disambig.size()>0) score = score / disambig.size();
9095
91 - Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, sequence, score, "score="+score+"; pop="+totalPop);
 96+ Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, resultSequence, score, "score="+score+"; pop="+totalPop);
9297 return r;
9398 }
9499
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/LinearCombiner.java
@@ -22,7 +22,7 @@
2323 }
2424
2525 public double apply(double a, double b) {
26 - return b * bias + b * ( 1 - bias );
 26+ return a * bias + b * ( 1.0 - bias );
2727 //return = Math.sqrt( popf * simf ); //normalized produkt
2828 }
2929 }
\ No newline at end of file
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
@@ -106,13 +106,14 @@
107107 public Collection<? extends PhraseNode<PhraseOccurance>> getSuccessorsAt(int pos) {
108108 Set<PhraseNode<PhraseOccurance>> successors = new HashSet<PhraseNode<PhraseOccurance>>();
109109
 110+ int horizon = text.length();
110111 while (true) {
111112 Collection<? extends PhraseNode<PhraseOccurance>> nodes = PhraseOccuranceSet.this.getPhraseNodesAt(pos);
112 - if (nodes == null || nodes.isEmpty()) break;
 113+ if (nodes != null && !nodes.isEmpty()) {
 114+ successors.addAll(nodes);
 115+ horizon = getHorizon(successors, horizon);
 116+ }
113117
114 - successors.addAll(nodes);
115 - int horizon = getHorizon(successors);
116 -
117118 pos ++;
118119 if (pos>=horizon) break;
119120 }
@@ -120,8 +121,7 @@
121122 return successors;
122123 }
123124
124 - private int getHorizon(Collection<? extends PhraseNode<PhraseOccurance>> successors) {
125 - int horizon = Integer.MAX_VALUE;
 125+ private int getHorizon(Collection<? extends PhraseNode<PhraseOccurance>> successors, int horizon) {
126126 for (PhraseNode<PhraseOccurance> n: successors) {
127127 int end = n.getTermReference().getEndOffset();
128128 if (end < horizon) horizon = end;
@@ -133,8 +133,16 @@
134134
135135 public Collection<? extends PhraseNode<PhraseOccurance>> getPhraseNodesAt(int offs) {
136136 List<PhraseOccurance> phrases = getPhrasesAt(offs);
137 - if (phrases == null) return null;
138 -
 137+ return toNodeList(phrases);
 138+ }
 139+
 140+ public Collection<? extends PhraseNode<PhraseOccurance>> getPhraseNodesFrom(int offs) {
 141+ List<PhraseOccurance> phrases = getPhrasesFrom(offs);
 142+ return toNodeList(phrases);
 143+ }
 144+
 145+ protected List<Node> toNodeList(List<PhraseOccurance> phrases) {
 146+ if (phrases==null) return null;
139147 List<Node> nodes = new ArrayList<Node>(phrases.size());
140148
141149 for (PhraseOccurance p: phrases) {
@@ -144,9 +152,34 @@
145153 return nodes;
146154 }
147155
148 - public List<PhraseOccurance> getPhrasesAt(int offs) {
 156+ public List<PhraseOccurance> getPhrasesAt(int at) {
149157 int i = 0;
 158+ PhraseOccurance p = null;
150159 while (i<size()) {
 160+ p = get(i);
 161+ if (p.getOffset() >= at) {
 162+ break;
 163+ }
 164+
 165+ i++;
 166+ }
 167+
 168+ if (p!=null && p.getOffset() > at) return null;
 169+ if (i>=size()) return null;
 170+
 171+ int j = i;
 172+ while (j<size()) {
 173+ p = get(j);
 174+ if (p.getOffset() > at) break;
 175+ j++;
 176+ }
 177+
 178+ return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first.
 179+ }
 180+
 181+ public List<PhraseOccurance> getPhrasesFrom(int offs) {
 182+ int i = 0;
 183+ while (i<size()) {
151184 PhraseOccurance p = get(i);
152185 if (p.getOffset() >= offs) {
153186 offs = p.getOffset();

Status & tagging log