Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java |
— | — | @@ -7,6 +7,8 @@ |
8 | 8 | import java.util.List; |
9 | 9 | import java.util.Map; |
10 | 10 | |
| 11 | +import de.brightbyte.io.ConsoleIO; |
| 12 | +import de.brightbyte.io.Output; |
11 | 13 | import de.brightbyte.util.PersistenceException; |
12 | 14 | import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
13 | 15 | import de.brightbyte.wikiword.model.LocalConcept; |
— | — | @@ -16,6 +18,8 @@ |
17 | 19 | |
18 | 20 | public class PopularityDisambiguatorTest extends DisambiguatorTestBase { |
19 | 21 | |
| 22 | + private Output traceOut = ConsoleIO.output; |
| 23 | + |
20 | 24 | public PopularityDisambiguatorTest() throws IOException, PersistenceException { |
21 | 25 | super(); |
22 | 26 | } |
— | — | @@ -121,14 +125,15 @@ |
122 | 126 | |
123 | 127 | assertEquals(uk.getTerm(), getConcept("United_Kingdom"), result.getMeanings().get(uk)); |
124 | 128 | assertEquals(london.getTerm(), getConcept("City_of_London"), result.getMeanings().get(london)); |
125 | | - assertEquals(underground.getTerm(), getConcept("London_Underground"), result.getMeanings().get(underground)); |
| 129 | + assertEquals(underground.getTerm(), getConcept("Subway"), result.getMeanings().get(underground)); |
126 | 130 | } |
127 | 131 | |
128 | 132 | public void testDisambiguateNode() throws PersistenceException { |
129 | 133 | PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
130 | 134 | |
131 | 135 | PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
132 | | - |
| 136 | + disambiguator.setTrace(traceOut); |
| 137 | + |
133 | 138 | Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(set.getRootNode(), null); |
134 | 139 | |
135 | 140 | List<? extends PhraseOccurance> sequence = result.getSequence(); |
— | — | @@ -138,13 +143,13 @@ |
139 | 144 | assertEquals("Underground", sequence.get(1).getTerm()); |
140 | 145 | assertEquals("station", sequence.get(2).getTerm()); |
141 | 146 | |
142 | | - assertNotNull( meanings.get( sequence.get(0).getTerm() ) ); |
143 | | - assertNotNull( meanings.get( sequence.get(1).getTerm() ) ); |
144 | | - assertNotNull( meanings.get( sequence.get(2).getTerm() ) ); |
| 147 | + assertNotNull( meanings.get( sequence.get(0) ) ); |
| 148 | + assertNotNull( meanings.get( sequence.get(1) ) ); |
| 149 | + assertNotNull( meanings.get( sequence.get(2) ) ); |
145 | 150 | |
146 | | - assertEquals("Bank_and_Monument_Underground_station", meanings.get( sequence.get(0).getTerm() ).getName() ); |
147 | | - assertEquals("Subway", meanings.get( sequence.get(1).getTerm() ).getName() ); |
148 | | - assertEquals("Metro_station", meanings.get( sequence.get(2).getTerm() ).getName() ); |
| 151 | + assertEquals("Bank_and_Monument_Underground_stations", meanings.get( sequence.get(0) ).getName() ); |
| 152 | + assertEquals("Subway", meanings.get( sequence.get(1) ).getName() ); |
| 153 | + assertEquals("Bus_station", meanings.get( sequence.get(2) ).getName() ); |
149 | 154 | } |
150 | 155 | |
151 | 156 | } |
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest-meanings.csv |
— | — | @@ -5,9 +5,9 @@ |
6 | 6 | London 2001 City_of_London 1000 90 |
7 | 7 | London 2002 Greater_London 888 80 |
8 | 8 | London 2003 London_city_council 12 10 |
9 | | -Underground 3001 London_Underground 1000 90 |
10 | | -Underground 3002 Subway 888 60 |
11 | | -Bank 4001 Bank 1000 90 |
| 9 | +Underground 3001 London_Underground 888 90 |
| 10 | +Underground 3002 Subway 999 60 |
| 11 | +Bank 4001 Bank 777 90 |
12 | 12 | Bank 4002 Bank_(sitting) 666 80 |
13 | 13 | Bank 4003 Bank_(geology) 230 60 |
14 | 14 | Bank 4004 Bank_of_England 220 60 |
— | — | @@ -15,7 +15,7 @@ |
16 | 16 | Bank and Monument Underground station 5001 Bank_and_Monument_Underground_stations 50 90 |
17 | 17 | Bank and Monument stations 5001 Bank_and_Monument_Underground_stations 66 60 |
18 | 18 | Bank and Monument 5001 Bank_and_Monument_Underground_stations 200 90 |
19 | | -Monument 6001 Monument 1000 90 |
| 19 | +Monument 6001 Monument 888 90 |
20 | 20 | Monument 6002 Some_silly_monument 100 60 |
21 | 21 | Monument 5001 Bank_and_Monument_Underground_stations 100 10 |
22 | 22 | station 7001 Bus_station 1000 90 |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/ProductCombiner.java |
— | — | @@ -0,0 +1,22 @@ |
| 2 | +/** |
| 3 | + * Combiner that returns the plain product of its two arguments. |
| 4 | + */ |
| 5 | +package de.brightbyte.wikiword.disambig; |
| 6 | + |
| 7 | +import de.brightbyte.data.Functor2; |
| 8 | + |
| 9 | +final class ProductCombiner implements Functor2.Double { |
| 10 | + |
| 11 | + public static final ProductCombiner instance = new ProductCombiner(); |
| 12 | + |
| 13 | + /** |
| 14 | + * Creates a combiner; a shared one is also available as {@link #instance}. |
| 15 | + */ |
| 16 | + public ProductCombiner() { |
| 17 | + } |
| 18 | + |
| 19 | + public double apply(double a, double b) { |
| 20 | + return a * b; |
| 21 | + // disabled alternative: Math.sqrt( a * b ), the normalized product |
| 22 | + } |
| 23 | +} |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -48,12 +48,6 @@ |
49 | 49 | } |
50 | 50 | }; |
51 | 51 | |
52 | | - private Functor.Double weightNormalizer = new Functor.Double() { //NOTE: must map [0:inf] to [0:1] and grow monotonously |
53 | | - public double apply(double pop) { |
54 | | - return 1 - 1/(Math.sqrt(Math.log(pop))+1); //XXX: black voodoo magic ad hoc formula with no deeper meaing. |
55 | | - } |
56 | | - }; |
57 | | - |
58 | 52 | private Functor.Double similarityNormalizer = new Functor.Double() { //NOTE: must map [0:1] to [0:1] and grow monotonously |
59 | 53 | public double apply(double sim) { |
60 | 54 | return Math.sqrt(Math.sqrt(sim)); //XXX: black voodoo magic ad hoc formula with no deeper meaing. |
— | — | @@ -61,7 +55,7 @@ |
62 | 56 | }; |
63 | 57 | |
64 | 58 | protected Functor2.Double scoreCombiner = new LinearCombiner(0.8); |
65 | | - protected Functor2.Double weightCombiner = new LinearCombiner(0.5); |
| 59 | + protected Functor2.Double weightCombiner = ProductCombiner.instance; |
66 | 60 | |
67 | 61 | public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) { |
68 | 62 | this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, |
— | — | @@ -382,23 +376,23 @@ |
383 | 377 | if (p<1) p= 1; |
384 | 378 | if (w<1) w= 1; |
385 | 379 | |
386 | | - pop += p; |
| 380 | + p = weightCombiner.apply(p, w); |
| 381 | + |
| 382 | + pop += p; //XXX: keep raw and processed pop |
387 | 383 | weight += w; |
388 | 384 | c ++; |
389 | 385 | } |
390 | 386 | |
391 | 387 | //normalize |
392 | | - sim = sim / n; //normalize |
393 | | - pop = pop / c; //normalize |
394 | | - weight = weight / c; //normalize |
| 388 | + sim = sim / n; //scale |
| 389 | + pop = pop / c; //scale |
| 390 | + weight = weight / c; //scale |
395 | 391 | |
396 | 392 | double popf = popularityNormalizer.apply(pop); |
397 | 393 | double simf = similarityNormalizer.apply(sim); |
398 | | - double weightf = weightNormalizer.apply(weight); |
| 394 | + |
| 395 | + double score = scoreCombiner.apply(popf, simf); |
399 | 396 | |
400 | | - double score = weightCombiner.apply(weightf, popf); |
401 | | - score = scoreCombiner.apply(score, simf); |
402 | | - |
403 | 397 | return new Result<X, LocalConcept>(interp.getMeanings(), interp.getSequence(), score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop+", weight="+weight); |
404 | 398 | } |
405 | 399 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/LinearBooster.java |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +/** |
| 3 | + * Booster that multiplies its argument by a fixed scale factor. |
| 4 | + */ |
| 5 | +package de.brightbyte.wikiword.disambig; |
| 6 | + |
| 7 | +import de.brightbyte.data.Functor; |
| 8 | + |
| 9 | +final class LinearBooster implements Functor.Double { |
| 10 | + |
| 11 | + private double scale; |
| 12 | + |
| 13 | + /** |
| 14 | + * @param scale factor that {@link #apply(double)} multiplies its argument by |
| 15 | + */ |
| 16 | + public LinearBooster(double scale) { |
| 17 | + this.scale = scale; |
| 18 | + } |
| 19 | + |
| 20 | + public LinearBooster() { |
| 21 | + this(1.0); |
| 22 | + } |
| 23 | + |
| 24 | + public double apply(double a) { |
| 25 | + return a * scale; |
| 26 | + } |
| 27 | +} |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/ProductSqrtCombiner.java |
— | — | @@ -0,0 +1,23 @@ |
| 2 | +/** |
| 3 | + * Combiner that returns the square root of the product of two values, each required to lie in [0, 1]. |
| 4 | + */ |
| 5 | +package de.brightbyte.wikiword.disambig; |
| 6 | + |
| 7 | +import de.brightbyte.data.Functor2; |
| 8 | + |
| 9 | +final class ProductSqrtCombiner implements Functor2.Double { |
| 10 | + |
| 11 | + public static final ProductSqrtCombiner instance = new ProductSqrtCombiner(); |
| 12 | + |
| 13 | + /** |
| 14 | + * Creates a combiner; a shared one is also available as {@link #instance}. |
| 15 | + */ |
| 16 | + public ProductSqrtCombiner() { |
| 17 | + } |
| 18 | + |
| 19 | + public double apply(double a, double b) { |
| 20 | + if (a<0 || a>1) throw new IllegalArgumentException("ProductSqrt is only defined for values 0 <= x <= 1"); |
| 21 | + if (b<0 || b>1) throw new IllegalArgumentException("ProductSqrt is only defined for values 0 <= x <= 1"); |
| 22 | + return Math.sqrt( a * b ); //normalized product |
| 23 | + } |
| 24 | +} |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -1,12 +1,12 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
4 | | -import java.util.ArrayList; |
5 | 4 | import java.util.Collection; |
6 | 5 | import java.util.Collections; |
7 | 6 | import java.util.HashMap; |
8 | 7 | import java.util.List; |
9 | 8 | import java.util.Map; |
10 | 9 | |
| 10 | +import de.brightbyte.data.Functor; |
11 | 11 | import de.brightbyte.data.Functor2; |
12 | 12 | import de.brightbyte.data.measure.Measure; |
13 | 13 | import de.brightbyte.data.measure.Measure.Comparator; |
— | — | @@ -20,7 +20,8 @@ |
21 | 21 | protected Measure<WikiWordConcept> popularityMeasure; |
22 | 22 | protected Comparator<LocalConcept> popularityComparator; |
23 | 23 | |
24 | | - protected Functor2.Double weigthCombiner = new LinearCombiner(0.5); |
| 24 | + protected Functor.Double weightBooster = SquareBooster.instance; |
| 25 | + protected Functor2.Double weigthCombiner = new ProductCombiner(); //NOTE: pop and weight are not in the same scale. |
25 | 26 | |
26 | 27 | public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher) { |
27 | 28 | this(meaningFetcher, WikiWordConcept.theCardinality); |
— | — | @@ -55,11 +56,13 @@ |
56 | 57 | |
57 | 58 | for (List<X> sequence: sequences) { |
58 | 59 | Result<X, LocalConcept> r = disambiguate(sequence, meanings, context); |
| 60 | + trace(r.toString()); |
59 | 61 | if (best == null || best.getScore() < r.getScore()) { |
60 | 62 | best = r; |
61 | 63 | } |
62 | 64 | } |
63 | 65 | |
| 66 | + trace("best:" + best.toString()); |
64 | 67 | return best; |
65 | 68 | } |
66 | 69 | |
— | — | @@ -70,14 +73,10 @@ |
71 | 74 | double score = 0; |
72 | 75 | int totalPop = 0; |
73 | 76 | |
74 | | - List<X> resultSequence = new ArrayList<X>(sequence.size()); |
75 | | - |
76 | 77 | for (X t: sequence) { |
77 | 78 | List<? extends LocalConcept> m = meanings.get(t); |
78 | 79 | if (m==null || m.size()==0) continue; |
79 | 80 | |
80 | | - resultSequence.add(t); |
81 | | - |
82 | 81 | if (m.size()>1) Collections.sort(m, popularityComparator); |
83 | 82 | |
84 | 83 | LocalConcept c = m.get(0); |
— | — | @@ -86,13 +85,14 @@ |
87 | 86 | double pop = popularityMeasure.measure(c); |
88 | 87 | totalPop += pop; |
89 | 88 | |
90 | | - double sc = weigthCombiner.apply(pop, t.getWeight()); //FIXME: pop and weight are not in the same scale. |
| 89 | + double w = weightBooster.apply(t.getWeight()); |
| 90 | + double sc = weigthCombiner.apply(pop, w); |
91 | 91 | score += sc; |
92 | 92 | } |
93 | 93 | |
94 | | - if (disambig.size()>0) score = score / disambig.size(); |
| 94 | + if (disambig.size()>0) score = score / sequence.size(); //NOTE: treat unknown terms as having pop = 0 |
95 | 95 | |
96 | | - Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, resultSequence, score, "score="+score+"; pop="+totalPop); |
| 96 | + Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, sequence, score, "score="+score+"; pop="+totalPop); |
97 | 97 | return r; |
98 | 98 | } |
99 | 99 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SquareBooster.java |
— | — | @@ -0,0 +1,21 @@ |
| 2 | +/** |
| 3 | + * Booster that returns the square of its argument. |
| 4 | + */ |
| 5 | +package de.brightbyte.wikiword.disambig; |
| 6 | + |
| 7 | +import de.brightbyte.data.Functor; |
| 8 | + |
| 9 | +final class SquareBooster implements Functor.Double { |
| 10 | + |
| 11 | + public static final SquareBooster instance = new SquareBooster(); |
| 12 | + |
| 13 | + /** |
| 14 | + * Creates a booster; a shared one is also available as {@link #instance}. |
| 15 | + */ |
| 16 | + public SquareBooster() { |
| 17 | + } |
| 18 | + |
| 19 | + public double apply(double a) { |
| 20 | + return a * a; |
| 21 | + } |
| 22 | +} |
\ No newline at end of file |