Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java |
— | — | @@ -6,10 +6,13 @@ |
7 | 7 | import java.util.List; |
8 | 8 | import java.util.Map; |
9 | 9 | |
| 10 | +import de.brightbyte.data.LabeledMatrix; |
| 11 | +import de.brightbyte.data.MapLabeledMatrix; |
10 | 12 | import de.brightbyte.data.Pair; |
11 | 13 | import de.brightbyte.io.ConsoleIO; |
12 | 14 | import de.brightbyte.io.Output; |
13 | 15 | import de.brightbyte.util.PersistenceException; |
| 16 | +import de.brightbyte.wikiword.disambig.CoherenceDisambiguator.CoherenceDisambiguation; |
14 | 17 | import de.brightbyte.wikiword.disambig.Disambiguator.Interpretation; |
15 | 18 | import de.brightbyte.wikiword.disambig.Disambiguator.Disambiguation; |
16 | 19 | import de.brightbyte.wikiword.model.LocalConcept; |
— | — | @@ -24,6 +27,33 @@ |
25 | 28 | super(); |
26 | 29 | } |
27 | 30 | |
| 31 | + public void testGetScore() throws PersistenceException { /* Verifies that getScore() returns a strictly higher score for the same interpretation after City_of_London's cardinality has been doubled. */ |
| 32 | + CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10); |
| 33 | + |
| 34 | + LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); /* this test never adds entries to the matrix itself. NOTE(review): the meaning of the 'true' flag is not visible here - confirm */ |
| 35 | + |
| 36 | + LocalConcept city_of_London = getConcept("City_of_London"); |
| 37 | + LocalConcept united_Kingdom = getConcept("United_Kingdom"); |
| 38 | + |
| 39 | + //united_Kingdom.setCardinality(100000); |
| 40 | + |
| 41 | + Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(new Term("UK"), united_Kingdom); /* pairs the term "UK" with the United_Kingdom concept */ |
| 42 | + Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(new Term("London"), city_of_London); /* pairs the term "London" with the City_of_London concept */ |
| 43 | + |
| 44 | + CoherenceDisambiguator.Interpretation interp = new CoherenceDisambiguator.Interpretation(uk_as_United_Kingdom, london_as_City_of_London); /* one interpretation built from both term/concept pairs; reused for both scoring calls */ |
| 45 | + CoherenceDisambiguation r1 = disambiguator.getScore(interp, null, similarities, featureFetcher); /* baseline result with the unmodified cardinality; the second argument is passed as null */ |
| 46 | + |
| 47 | + int oldPop = city_of_London.getCardinality(); /* remember the original cardinality so it can be restored below */ |
| 48 | + city_of_London.setCardinality(oldPop*2); /* double the cardinality on the same concept instance that interp references */ |
| 49 | + |
| 50 | + CoherenceDisambiguation r2 = disambiguator.getScore(interp, null, similarities, featureFetcher); /* same call as for r1, now with the doubled cardinality */ |
| 51 | + city_of_London.setCardinality(oldPop); /* restore the original value; not in a finally block, so it is skipped if the getScore call above throws */ |
| 52 | + |
| 53 | + double score1 = r1.getScore(); |
| 54 | + double score2 = r2.getScore(); |
| 55 | + assertTrue("More popularity implies better score", score1 < score2 ); /* strict inequality: equal scores fail the test */ |
| 56 | + } |
| 57 | + |
28 | 58 | public void testGetSequenceInterpretations() throws PersistenceException { |
29 | 59 | CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10); |
30 | 60 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -77,7 +77,10 @@ |
78 | 78 | if (pop<0.5) return 0; |
79 | 79 | if (pop<1) pop=1; |
80 | 80 | |
81 | | - double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //XXX: black voodoo magic ad hoc formula with no deeper meaing. |
| 81 | + //XXX: black voodoo magic ad hoc formula with no deeper meaning. |
| 82 | + //double n = 1 - 1/(Math.log(pop)+1); //normalized log scale |
| 83 | + //double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //dampened normalized log scale |
| 84 | + double n =1 - (0.5/Math.sqrt((pop+200)/200)); //nice and smooth, but has magic params that may depend on the wiki |
82 | 85 | return n; |
83 | 86 | } |
84 | 87 | }; |
— | — | @@ -436,7 +439,8 @@ |
437 | 440 | |
438 | 441 | LabeledVector<Integer> sum = ConceptFeatures.newIntFeaturVector( concepts.size() * 200 ); //XXX: magic number |
439 | 442 | Map<Integer, ConceptFeatures<C, Integer>> disambigFeatures = new HashMap<Integer, ConceptFeatures<C, Integer>>(); |
440 | | - double sim = 0, pop = 0, weight = 0; |
| 443 | + double sim = 0, pop = 0, weight = 0, popf = 0, simf = 0; |
| 444 | + |
441 | 445 | int i=0, j=0; |
442 | 446 | for (Map.Entry<TermReference, C> ea: concepts.entrySet()) { |
443 | 447 | C a = ea.getValue(); |
— | — | @@ -496,6 +500,8 @@ |
497 | 501 | d = doubleSanity(d, "normal similarity score for "+a+" / "+b, "check similarityMeasure!", 0, 0.1, 1, 0.1); |
498 | 502 | |
499 | 503 | sim += d; |
| 504 | + simf += similarityNormalizer.apply(d); |
| 505 | + |
500 | 506 | simCount ++; |
501 | 507 | } |
502 | 508 | |
— | — | @@ -508,7 +514,9 @@ |
509 | 515 | |
510 | 516 | p = weightCombiner.apply(p, w); |
511 | 517 | |
512 | | - pop += p; //XXX: keep raw and processed pop |
| 518 | + pop += p; |
| 519 | + popf += popularityNormalizer.apply(p); |
| 520 | + |
513 | 521 | weight += w; |
514 | 522 | } |
515 | 523 | |
— | — | @@ -523,14 +531,15 @@ |
524 | 532 | |
525 | 533 | sim = n == 0 ? 0 : sim / n; //scale |
526 | 534 | pop = c == 0 ? 0 : pop / c; //scale |
| 535 | + |
| 536 | + simf = n == 0 ? 0 : simf / n; //scale |
| 537 | + popf = c == 0 ? 0 : popf / c; //scale |
| 538 | + |
527 | 539 | weight = c == 0 ? 0 : weight / c; //scale |
528 | 540 | |
529 | 541 | pop = doubleSanity(pop, "normal popularity", "check popularityMeasure!", 0, 0.1, Double.MAX_VALUE, 0); |
530 | 542 | sim = doubleSanity(sim, "normal average simility", "ooops!", 0, 0.1, 1, 0.1); |
531 | 543 | |
532 | | - double popf = popularityNormalizer.apply(pop); |
533 | | - double simf = similarityNormalizer.apply(sim); |
534 | | - |
535 | 544 | popf = doubleSanity(popf, "normal popularity", "check popularityNormalizer!", 0, 0.1, 1, 0.1); |
536 | 545 | simf = doubleSanity(simf, "normal similarity", "check similarityNormalizer!", 0, 0.1, 1, 0.1); |
537 | 546 | |