r73119 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r73118‎ | r73119 | r73120 >
Date:16:09, 16 September 2010
Author:daniel
Status:deferred
Tags:
Comment:
messing with phrase detection
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java
@@ -108,6 +108,40 @@
109109 assertTrue("depth 1000", sameElements(getBankAndMonumentSequences(1000), res));
110110 }
111111
 112+ public void testGetSequences2() throws PersistenceException {
 113+ PopularityDisambiguator<TermReference, LocalConcept> disambiguator = new PopularityDisambiguator<TermReference, LocalConcept>(meaningFetcher, 10);
 114+ PhraseOccuranceSet set = getMargaretOfYorkPhrases();
 115+
 116+ Collection<List<PhraseOccurance>> res = disambiguator.getSequences(set.getRootNode(), 3);
 117+ printSequences(res, 1000);
 118+
 119+ //res = disambiguator.getSequences(set.getRootNode(), 1000);
 120+ //printSequences(res, 1000);
 121+ //TODO: check / compare. right now, we just test performance
 122+ }
 123+
 124+ protected void printSequences(Collection<List<PhraseOccurance>> res, int max) {
 125+ int i = 0;
 126+ StringBuilder b = new StringBuilder();
 127+ for (List<PhraseOccurance> seq: res) {
 128+ b.setLength(0);
 129+
 130+ for (PhraseOccurance p: seq) {
 131+ if (b.length()>0) b.append(" | ");
 132+ b.append(p.getTerm());
 133+ }
 134+
 135+ i++;
 136+
 137+ System.out.println("#"+i+": "+b);
 138+
 139+ if (i>1000) {
 140+ System.out.println("way too many ("+res.size()+")!");
 141+ break;
 142+ }
 143+ }
 144+ }
 145+
112146 public void testDisambiguateTerms() throws PersistenceException {
113147 PopularityDisambiguator<TermReference, LocalConcept> disambiguator = new PopularityDisambiguator<TermReference, LocalConcept>(meaningFetcher, 10);
114148
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java
@@ -207,7 +207,144 @@
208208 }
209209
210210 private String bankAndMonumentText = "The Bank and Monument Underground station";
 211+ private String margaretofYorkText = "Margaret of York is introduced to her future husband, Charles the Bold on 27th June, 1468 at Damme (vellum)";
 212+ //private String fromTheFrenchText = "from the French edition of the biography of Alexandre le Grand by Quintus Curtius Rufus (c.42 AD)";
 213+ //private String translationByVasqueText = "from the French edition of the biography of Alexandre le Grand by Quintus Curtius Rufus (c.42 AD)";
211214
 215+ protected List<PhraseOccurance> getMargaretOfYorkTerms(int depth) {
 216+ List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
 217+
 218+ if (depth==0) return phrases;
 219+
 220+ int ofs = 0;
 221+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 8 ), 1, ofs, 8 -ofs ) ); //Margaret
 222+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 11 ), 1, ofs, 11 -ofs ) ); //Margaret of
 223+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 16 ), 2, ofs, 16 -ofs ) ); //Margaret of York
 224+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 19 ), 2, ofs, 19 -ofs ) ); //Margaret of York is
 225+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 3, ofs, 30 -ofs ) ); //Margaret of York is introduced
 226+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 3, ofs, 33 -ofs ) ); //Margaret of York is introduced to
 227+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 3, ofs, 37 -ofs ) ); //Margaret of York is introduced to her
 228+
 229+ if (depth==1) return phrases;
 230+
 231+ ofs = 9;
 232+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 16 ), 1, ofs, 16 -ofs ) ); //of York
 233+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 19 ), 1, ofs, 19 -ofs ) ); //of York is
 234+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 2, ofs, 30 -ofs ) ); //of York is introduced
 235+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 2, ofs, 33 -ofs ) ); //of York is introduced to
 236+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 2, ofs, 37 -ofs ) ); //of York is introduced to her
 237+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 4, ofs, 44 -ofs ) ); //of York is introduced to her future
 238+
 239+ ofs = 12;
 240+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 16 ), 1, ofs, 16 -ofs ) ); //York
 241+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 19 ), 1, ofs, 19 -ofs ) ); //York is
 242+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 2, ofs, 30 -ofs ) ); //York is introduced
 243+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 2, ofs, 33 -ofs ) ); //York is introduced to
 244+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 2, ofs, 37 -ofs ) ); //York is introduced to her
 245+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 4, ofs, 44 -ofs ) ); //York is introduced to her future
 246+
 247+ if (depth==2) return phrases;
 248+
 249+ ofs = 17;
 250+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 1, ofs, 30 -ofs ) ); //is introduced
 251+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 1, ofs, 33 -ofs ) ); //is introduced to
 252+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 1, ofs, 37 -ofs ) ); //is introduced to her
 253+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 2, ofs, 44 -ofs ) ); //is introduced to her future
 254+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 3, ofs, 52 -ofs ) ); //is introduced to her future husband
 255+
 256+ ofs = 20;
 257+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 30 ), 1, ofs, 30 -ofs ) ); //introduced
 258+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 33 ), 1, ofs, 33 -ofs ) ); //introduced to
 259+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 37 ), 1, ofs, 37 -ofs ) ); //introduced to her
 260+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 2, ofs, 44 -ofs ) ); //introduced to her future
 261+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 3, ofs, 52 -ofs ) ); //introduced to her future husband
 262+
 263+ if (depth==3) return phrases;
 264+
 265+ ofs = 31;
 266+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 1, ofs, 44 -ofs ) ); //to her future
 267+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 2, ofs, 52 -ofs ) ); //to her future husband
 268+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 3, ofs,61 -ofs ) ); //to her future husband, Charles
 269+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 3, ofs, 65 -ofs ) ); //to her future husband, Charles the
 270+
 271+ ofs = 34;
 272+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 1, ofs, 44 -ofs ) ); //her future
 273+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 2, ofs, 52 -ofs ) ); //her future husband
 274+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 3, ofs, 61 -ofs ) ); //her future husband, Charles
 275+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65), 3, ofs, 65 -ofs ) ); //her future husband, Charles the
 276+
 277+ ofs = 38;
 278+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 44 ), 1, ofs, 44 -ofs ) ); //future
 279+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 2, ofs, 52 -ofs ) ); //future husband
 280+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 3, ofs, 61 -ofs ) ); //future husband, Charles
 281+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 3, ofs, 65 -ofs ) ); //future husband, Charles the
 282+
 283+ if (depth==4) return phrases;
 284+
 285+ ofs = 45;
 286+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 52 ), 1, ofs, 52 -ofs ) ); //husband
 287+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 2, ofs, 61 -ofs ) ); //husband, Charles
 288+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 2, ofs, 65 -ofs ) ); //husband, Charles the
 289+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 3, ofs, 70 -ofs ) ); //husband, Charles the Bold
 290+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 3, ofs, 73 -ofs ) ); //husband, Charles the Bold on
 291+
 292+ if (depth==5) return phrases;
 293+
 294+ ofs = 54;
 295+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 61 ), 1, ofs, 61 -ofs ) ); //Charles
 296+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 65 ), 1, ofs, 65 -ofs ) ); //Charles the
 297+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 2, ofs, 70 -ofs ) ); //Charles the Bold
 298+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 2, ofs, 73 -ofs ) ); //Charles the Bold on
 299+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 3, ofs, 78 -ofs ) ); //Charles the Bold on 27th
 300+
 301+ if (depth==6) return phrases;
 302+
 303+ ofs = 62;
 304+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 1, ofs, 70 -ofs ) ); //the Bold
 305+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 1, ofs, 73 -ofs ) ); //the Bold on
 306+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 2, ofs, 78 -ofs ) ); //the Bold on 27th
 307+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 3, ofs, 83 -ofs ) ); //the Bold on 27th June
 308+
 309+ ofs = 66;
 310+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 70 ), 1, ofs, 70 -ofs ) ); //Bold
 311+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 73 ), 1, ofs, 73 -ofs ) ); //Bold on
 312+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 2, ofs, 78 -ofs ) ); //Bold on 27th
 313+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 3, ofs, 83 -ofs ) ); //Bold on 27th June
 314+
 315+ if (depth==7) return phrases;
 316+
 317+ ofs = 71;
 318+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 1, ofs, 78 -ofs ) ); //on 27th
 319+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 2, ofs, 83 -ofs ) ); //on 27th June
 320+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 3, ofs, 89 -ofs ) ); //on 27th June, 1468
 321+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 3, ofs, 92 -ofs ) ); //on 27th June, 1468 at
 322+
 323+ ofs = 73;
 324+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 78 ), 1, ofs, 78 -ofs ) ); //27th
 325+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 2, ofs, 83 -ofs ) ); //27th June
 326+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 3, ofs, 89 -ofs ) ); //27th June, 1468
 327+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 3, ofs, 92 -ofs ) ); //27th June, 1468 at
 328+
 329+ if (depth==8) return phrases;
 330+
 331+ ofs = 78;
 332+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 83 ), 1, ofs, 83 -ofs ) ); //June
 333+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 2, ofs, 89 -ofs ) ); //June, 1468
 334+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 2, ofs, 92 -ofs ) ); //June, 1468 at
 335+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 98 ), 2, ofs, 98 -ofs ) ); //June, 1468 at Damme
 336+
 337+ if (depth==9) return phrases;
 338+
 339+ ofs = 84;
 340+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 89 ), 1, ofs, 89 -ofs ) ); //1468
 341+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 92 ), 1, ofs, 92 -ofs ) ); //1468 at
 342+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 98 ), 2, ofs, 98 -ofs ) ); //1468 at Damme
 343+ phrases.add( new PhraseOccurance( margaretofYorkText.substring( ofs, 107 ), 3, ofs, 107 -ofs ) ); //1468 at Damme (vellum)
 344+
 345+ return phrases;
 346+ }
 347+ // "Margaret of York is introduced to her future husband, Charles the Bold on 27th June, 1468 at Damme (vellum)"
 348+
212349 protected List<PhraseOccurance> getBankAndMonumentTerms(int depth) {
213350 List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
214351
@@ -384,6 +521,12 @@
385522 return set;
386523 }
387524
 525+ protected PhraseOccuranceSet getMargaretOfYorkPhrases() {
 526+ List<PhraseOccurance> phrases = getMargaretOfYorkTerms(1000);
 527+
 528+ PhraseOccuranceSet set = new PhraseOccuranceSet(margaretofYorkText, phrases);
 529+ return set;
 530+ }
388531
389532 public static boolean sameElements(Collection a, Collection b) {
390533 if (a==b) return true;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -261,6 +261,8 @@
262262
263263 private <X extends T>CoherenceDisambiguation<X, C> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context, LabeledMatrix<C, C> similarities, FeatureFetcher<C, Integer> features) throws PersistenceException {
264264
 265+ pruneMeaninglessSequences( sequences, meanings );
 266+
265267 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
266268
267269 Collection<Disambiguator.Interpretation<X, C>> interpretations = getInterpretations(sequences, meanings);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -3,7 +3,9 @@
44 import java.util.ArrayList;
55 import java.util.Collection;
66 import java.util.Collections;
 7+import java.util.Iterator;
78 import java.util.List;
 9+import java.util.ListIterator;
810 import java.util.Map;
911
1012 import de.brightbyte.io.Output;
@@ -84,6 +86,20 @@
8587 return builder.getSequences();
8688 }
8789
 90+ protected <X extends T>void pruneMeaninglessSequences(Collection<List<X>> sequences, Map<X, List<? extends C>> meanings) {
 91+ Iterator<List<X>> it = sequences.iterator();
 92+ outer: while ( it.hasNext() ) {
 93+ List<X> seq = it.next();
 94+
 95+ for (X t: seq) {
 96+ if ( meanings.get(t) != null ) {
 97+ continue outer;
 98+ }
 99+ }
 100+
 101+ it.remove();
 102+ }
 103+ }
88104
89105 protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException {
90106 Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java
@@ -93,6 +93,8 @@
9494
9595 if (initialWindow > 0) { //apply full coherence disambig to initial window size. initialWindow == 1 will trigger a popularity disambig.
9696 Collection<List<X>> sequences = getSequences(root, initialWindow);
 97+ pruneMeaninglessSequences( sequences, meanings );
 98+
9799 Disambiguation<X, C> r;
98100
99101 if (initialWindow == 1) r = popularityDisambiguator.disambiguate(sequences, root, meanings, context);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -69,6 +69,8 @@
7070 public <X extends T>Disambiguation<X, C> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) {
7171 Disambiguation<X, C> best = null;
7272
 73+ pruneMeaninglessSequences( sequences, meanings );
 74+
7375 for (List<X> sequence: sequences) {
7476 Disambiguation<X, C> r = disambiguate(sequence, meanings, context);
7577 trace(r.toString());

Status & tagging log