r68826 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r68825‎ | r68826 | r68827 >
Date:10:31, 1 July 2010
Author:daniel
Status:deferred
Tags:
Comment:
make coherence score more sensitive to frequency: normalize before averaging and use a softer normalization curve
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java
@@ -6,10 +6,13 @@
77 import java.util.List;
88 import java.util.Map;
99
 10+import de.brightbyte.data.LabeledMatrix;
 11+import de.brightbyte.data.MapLabeledMatrix;
1012 import de.brightbyte.data.Pair;
1113 import de.brightbyte.io.ConsoleIO;
1214 import de.brightbyte.io.Output;
1315 import de.brightbyte.util.PersistenceException;
 16+import de.brightbyte.wikiword.disambig.CoherenceDisambiguator.CoherenceDisambiguation;
1417 import de.brightbyte.wikiword.disambig.Disambiguator.Interpretation;
1518 import de.brightbyte.wikiword.disambig.Disambiguator.Disambiguation;
1619 import de.brightbyte.wikiword.model.LocalConcept;
@@ -24,6 +27,33 @@
2528 super();
2629 }
2730
 31+ public void testGetScore() throws PersistenceException {
 32+ CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10);
 33+
 34+ LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
 35+
 36+ LocalConcept city_of_London = getConcept("City_of_London");
 37+ LocalConcept united_Kingdom = getConcept("United_Kingdom");
 38+
 39+ //united_Kingdom.setCardinality(100000);
 40+
 41+ Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(new Term("UK"), united_Kingdom);
 42+ Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(new Term("London"), city_of_London);
 43+
 44+ CoherenceDisambiguator.Interpretation interp = new CoherenceDisambiguator.Interpretation(uk_as_United_Kingdom, london_as_City_of_London);
 45+ CoherenceDisambiguation r1 = disambiguator.getScore(interp, null, similarities, featureFetcher);
 46+
 47+ int oldPop = city_of_London.getCardinality();
 48+ city_of_London.setCardinality(oldPop*2);
 49+
 50+ CoherenceDisambiguation r2 = disambiguator.getScore(interp, null, similarities, featureFetcher);
 51+ city_of_London.setCardinality(oldPop);
 52+
 53+ double score1 = r1.getScore();
 54+ double score2 = r2.getScore();
 55+ assertTrue("More popularity implies better score", score1 < score2 );
 56+ }
 57+
2858 public void testGetSequenceInterpretations() throws PersistenceException {
2959 CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10);
3060
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -77,7 +77,10 @@
7878 if (pop<0.5) return 0;
7979 if (pop<1) pop=1;
8080
81 - double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //XXX: black voodoo magic ad hoc formula with no deeper meaning.
 81+ //XXX: black voodoo magic ad hoc formula with no deeper meaning.
 82+ //double n = 1 - 1/(Math.log(pop)+1); //normalized log scale
 83+ //double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //dampened normalized log scale
 84+ double n =1 - (0.5/Math.sqrt((pop+200)/200)); //nice and smooth, but has magic params that may depend on the wiki
8285 return n;
8386 }
8487 };
@@ -436,7 +439,8 @@
437440
438441 LabeledVector<Integer> sum = ConceptFeatures.newIntFeaturVector( concepts.size() * 200 ); //XXX: magic number
439442 Map<Integer, ConceptFeatures<C, Integer>> disambigFeatures = new HashMap<Integer, ConceptFeatures<C, Integer>>();
440 - double sim = 0, pop = 0, weight = 0;
 443+ double sim = 0, pop = 0, weight = 0, popf = 0, simf = 0;
 444+
441445 int i=0, j=0;
442446 for (Map.Entry<TermReference, C> ea: concepts.entrySet()) {
443447 C a = ea.getValue();
@@ -496,6 +500,8 @@
497501 d = doubleSanity(d, "normal similarity score for "+a+" / "+b, "check similarityMeasure!", 0, 0.1, 1, 0.1);
498502
499503 sim += d;
 504+ simf += similarityNormalizer.apply(d);
 505+
500506 simCount ++;
501507 }
502508
@@ -508,7 +514,9 @@
509515
510516 p = weightCombiner.apply(p, w);
511517
512 - pop += p; //XXX: keep raw and processed pop
 518+ pop += p;
 519+ popf += popularityNormalizer.apply(p);
 520+
513521 weight += w;
514522 }
515523
@@ -523,14 +531,15 @@
524532
525533 sim = n == 0 ? 0 : sim / n; //scale
526534 pop = c == 0 ? 0 : pop / c; //scale
 535+
 536+ simf = n == 0 ? 0 : simf / n; //scale
 537+ popf = c == 0 ? 0 : popf / c; //scale
 538+
527539 weight = c == 0 ? 0 : weight / c; //scale
528540
529541 pop = doubleSanity(pop, "normal popularity", "check popularityMeasure!", 0, 0.1, Double.MAX_VALUE, 0);
530542 sim = doubleSanity(sim, "normal average simility", "ooops!", 0, 0.1, 1, 0.1);
531543
532 - double popf = popularityNormalizer.apply(pop);
533 - double simf = similarityNormalizer.apply(sim);
534 -
535544 popf = doubleSanity(popf, "normal popularity", "check popularityNormalizer!", 0, 0.1, 1, 0.1);
536545 simf = doubleSanity(simf, "normal similarity", "check similarityNormalizer!", 0, 0.1, 1, 0.1);
537546

Status & tagging log