r64443 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r64442 | r64443 | r64444 >
Date: 16:08, 31 March 2010
Author: daniel
Status: deferred
Tags:
Comment:
popularity measure and comparator
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/TermRelatedness.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java
@@ -1,6 +1,7 @@
22 package de.brightbyte.wikiword.disambig;
33
44 import java.util.ArrayList;
 5+import java.util.Collection;
56 import java.util.Collections;
67 import java.util.HashMap;
78 import java.util.List;
@@ -37,21 +38,31 @@
3839 /* (non-Javadoc)
3940 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
4041 */
41 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException {
42 - if (window < 2 || terms.size()<2 || meanings.size()<2)
43 - return popularityDisambiguator.disambiguate(terms, meanings);
 42+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 43+ if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings");
 44+
 45+ int sz = Math.min(terms.size(), meanings.size());
 46+ if (context!=null) sz += context.size();
 47+
 48+ if (window < 2 || sz<2) {
 49+ return popularityDisambiguator.disambiguate(terms, meanings, context);
 50+ }
4451
4552 pruneMeanings(meanings);
4653
47 - if (meanings.size()<2)
48 - return popularityDisambiguator.disambiguate(terms, meanings);
 54+ sz = Math.min(terms.size(), meanings.size());
 55+ if (context!=null) sz += context.size();
 56+
 57+ if (sz<2) {
 58+ return popularityDisambiguator.disambiguate(terms, meanings, context);
 59+ }
4960
5061 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
5162
5263 Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(meanings.size());
5364
5465 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
55 - FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings);
 66+ FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context);
5667
5768 for (int i= window; ; i++) {
5869 int from = i-window;
@@ -63,10 +74,10 @@
6475 Result r ;
6576
6677 if (to-from < 2) {
67 - r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings);
 78+ r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings, context);
6879 } else {
6980 List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings);
70 - r = getBestInterpretation(terms, meanings, interpretations, similarities, features);
 81+ r = getBestInterpretation(terms, meanings, context, interpretations, similarities, features);
7182 }
7283
7384 for (int j=from; j<to; j++) {
@@ -82,7 +93,7 @@
8394 if (to+1>terms.size()) break;
8495 }
8596
86 - return getScore(disambig, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates.
 97+ return getScore(disambig, context, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates.
8798 }
8899
89100 protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(int from, int to, List<X> terms, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/TermRelatedness.java
@@ -1,12 +1,14 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.Collection;
 5+
46 import de.brightbyte.data.measure.Similarity;
57 import de.brightbyte.util.PersistenceException;
68 import de.brightbyte.util.UncheckedPersistenceException;
79 import de.brightbyte.wikiword.model.TermReference;
810 import de.brightbyte.wikiword.model.WikiWordConcept;
911
10 -public class TermRelatedness implements Similarity<String> {
 12+public class TermRelatedness<C extends WikiWordConcept> implements Similarity<String> {
1113
1214 public static class Relatedness {
1315 public final double relatedness;
@@ -27,15 +29,17 @@
2830 }
2931
3032 protected Similarity<WikiWordConcept> relatedness;
31 - protected Disambiguator<TermReference, ? extends WikiWordConcept> disambig;
 33+ protected Disambiguator<TermReference, C> disambig;
 34+ private Collection<C> context;
3235
33 - public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig) {
34 - this(disambig, null);
 36+ public TermRelatedness(Disambiguator<TermReference,C> disambig) {
 37+ this(disambig, null, null);
3538 }
3639
37 - public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig, Similarity<WikiWordConcept> relatedness) {
 40+ public TermRelatedness(Disambiguator<TermReference, C> disambig, Similarity<WikiWordConcept> relatedness, Collection<C> context) {
3841 this.relatedness = relatedness;
3942 this.disambig = disambig;
 43+ this.context = context;
4044 }
4145
4246 public double similarity(String a, String b) {
@@ -47,7 +51,7 @@
4852
4953 public Relatedness relatedness(String a, String b) {
5054 try {
51 - Disambiguator.Result<Term, ? extends WikiWordConcept> r = disambig.<Term>disambiguate(Term.asTerms(a, b));
 55+ Disambiguator.Result<Term, ? extends WikiWordConcept> r = disambig.<Term>disambiguate(Term.asTerms(a, b), context);
5256 if (r==null || r.getMeanings().size()!=2) return null;
5357
5458 double d;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -1,5 +1,6 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.Collection;
45 import java.util.Collections;
56 import java.util.HashMap;
67 import java.util.List;
@@ -14,7 +15,7 @@
1516 public class PopularityDisambiguator extends AbstractDisambiguator<TermReference, LocalConcept> {
1617
1718 protected Measure<WikiWordConcept> popularityMeasure;
18 - protected Comparator<WikiWordConcept> popularityComparator;
 19+ protected Comparator<LocalConcept> popularityComparator;
1920
2021 public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher) {
2122 this(meaningFetcher, WikiWordConcept.theCardinality);
@@ -23,11 +24,21 @@
2425 public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, Measure<WikiWordConcept> popularityMeasure) {
2526 super(meaningFetcher);
2627
 28+ this.setPopularityMeasure(popularityMeasure);
 29+ }
 30+
 31+ public Measure<WikiWordConcept> getPopularityMeasure() {
 32+ return popularityMeasure;
 33+ }
 34+
 35+ public void setPopularityMeasure(Measure<WikiWordConcept> popularityMeasure) {
2736 this.popularityMeasure = popularityMeasure;
28 - this.popularityComparator = new Measure.Comparator<WikiWordConcept>(popularityMeasure, true);
 37+ this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true);
2938 }
3039
31 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) {
 40+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) {
 41+ if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings");
 42+
3243 Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>();
3344 int pop = 0;
3445 for (X t: terms) {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -1,7 +1,9 @@
22 package de.brightbyte.wikiword.disambig;
33
44 import java.util.ArrayList;
 5+import java.util.Collection;
56 import java.util.Collections;
 7+import java.util.Comparator;
68 import java.util.HashMap;
79 import java.util.Iterator;
810 import java.util.List;
@@ -36,6 +38,7 @@
3739 protected Similarity<LabeledVector<Integer>> similarityMeasure;
3840 protected Measure<WikiWordConcept> popularityMeasure;
3941 protected PopularityDisambiguator popularityDisambiguator;
 42+ protected Comparator<LocalConcept> popularityComparator;
4043
4144 private Functor.Double popularityFactor = new Functor.Double() { //NOTE: must map [0:inf] to [0:1] and grow monotonously
4245
@@ -71,12 +74,48 @@
7275 if (popularityMeasure==null) throw new NullPointerException();
7376 if (sim==null) throw new NullPointerException();
7477 if (featureFetcher==null) throw new NullPointerException();
75 - this.popularityMeasure = popularityMeasure;
76 - this.similarityMeasure = sim;
 78+
7779 this.featureCacheManager = new FeatureCache.Manager<LocalConcept, Integer>(featureFetcher, 10); //TODO: depth
7880 this.popularityDisambiguator = new PopularityDisambiguator(meaningFetcher, popularityMeasure);
 81+
 82+ this.setPopularityMeasure(popularityMeasure);
 83+ this.setSimilarityMeasure(sim);
7984 }
8085
 86+ public Functor.Double getPopularityFactor() {
 87+ return popularityFactor;
 88+ }
 89+
 90+ public void setPopularityFactor(Functor.Double popularityFactor) {
 91+ this.popularityFactor = popularityFactor;
 92+ }
 93+
 94+ public Measure<WikiWordConcept> getPopularityMeasure() {
 95+ return popularityMeasure;
 96+ }
 97+
 98+ public void setPopularityMeasure(Measure<WikiWordConcept> popularityMeasure) {
 99+ this.popularityMeasure = popularityMeasure;
 100+ this.popularityDisambiguator.setPopularityMeasure(popularityMeasure);
 101+ this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true);
 102+ }
 103+
 104+ public Functor2.Double getScoreCombiner() {
 105+ return scoreCombiner;
 106+ }
 107+
 108+ public void setScoreCombiner(Functor2.Double scoreCombiner) {
 109+ this.scoreCombiner = scoreCombiner;
 110+ }
 111+
 112+ public Functor.Double getSimilarityFactor() {
 113+ return similarityFactor;
 114+ }
 115+
 116+ public void setSimilarityFactor(Functor.Double similarityFactor) {
 117+ this.similarityFactor = similarityFactor;
 118+ }
 119+
81120 public void setFeatureFetcher(FeatureFetcher<LocalConcept, Integer> featureFetcher) {
82121 this.featureCacheManager = new FeatureCache.Manager<LocalConcept, Integer>(featureFetcher, 10); //FIXME: depth
83122 }
@@ -123,7 +162,7 @@
124163 this.maxMeanings = maxMeanings;
125164 }
126165
127 - protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings) throws PersistenceException {
 166+ protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
128167 FeatureFetcher<LocalConcept, Integer> features = featureCacheManager.newCache();
129168
130169 //NOTE: pre-fetch all features in one go
@@ -132,6 +171,7 @@
133172 concepts.addAll(m);
134173 }
135174
 175+ if (context!=null) concepts.addAll(context);
136176 features.getFeatures(concepts);
137177
138178 return features;
@@ -140,23 +180,32 @@
141181 /* (non-Javadoc)
142182 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
143183 */
144 - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException {
145 - if (terms.size()<2 || meanings.size()<2)
146 - return popularityDisambiguator.disambiguate(terms, meanings);
 184+ public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 185+ if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings");
147186
 187+ int sz = Math.min(terms.size(), meanings.size());
 188+ if (context!=null) sz += context.size();
 189+
 190+ if (sz<2) {
 191+ return popularityDisambiguator.disambiguate(terms, meanings, context);
 192+ }
 193+
148194 pruneMeanings(meanings);
149195
150 - if (meanings.size()<2)
151 - return popularityDisambiguator.disambiguate(terms, meanings);
 196+ sz = Math.min(terms.size(), meanings.size());
 197+ if (context!=null) sz += context.size();
 198+ if (sz <2) {
 199+ return popularityDisambiguator.disambiguate(terms, meanings, context);
 200+ }
152201
153202 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
154203
155204 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
156 - FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings);
 205+ FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context);
157206
158207 List<Map<X, LocalConcept>> interpretations = getInterpretations(terms, meanings);
159208
160 - return getBestInterpretation(terms, meanings, interpretations, similarities, features);
 209+ return getBestInterpretation(terms, meanings, context, interpretations, similarities, features);
161210 }
162211
163212 protected void pruneMeanings(Map<? extends TermReference, List<? extends LocalConcept>> meanings) {
@@ -179,7 +228,7 @@
180229
181230 if (m.size()==0) eit.remove();
182231 else if (m.size()>maxMeanings) {
183 - Collections.sort(m, WikiWordConcept.byCardinality);
 232+ Collections.sort(m, popularityComparator);
184233 m = m.subList(0, maxMeanings);
185234 e.setValue(m);
186235 }
@@ -187,14 +236,14 @@
188237 }
189238
190239 protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(List<X> terms, Map<X, List<? extends LocalConcept>> meanings,
191 - List<Map<X, LocalConcept>> interpretations,
 240+ Collection<LocalConcept> context, List<Map<X, LocalConcept>> interpretations,
192241 LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
193242
194243 List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>();
195244
196245 double traceLimit = -1;
197246 for (Map<X, LocalConcept> interp: interpretations) {
198 - Result<X, LocalConcept> r = getScore(interp, similarities, features);
 247+ Result<X, LocalConcept> r = getScore(interp, context, similarities, features);
199248
200249 if (r.getScore() >= minScore) {
201250 rankings.add(r);
@@ -205,7 +254,7 @@
206255 }
207256
208257 if (rankings.size()==0) {
209 - return popularityDisambiguator.disambiguate(terms, meanings);
 258+ return popularityDisambiguator.disambiguate(terms, meanings, context);
210259 }
211260
212261 Collections.sort(rankings);
@@ -244,15 +293,24 @@
245294 return interpretations;
246295 }
247296
248 - protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
 297+ protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, Collection<LocalConcept> context, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
249298 double sim = 0;
250299 double pop = 0;
251300
 301+ Collection<LocalConcept> concepts;
 302+ if (context!=null) {
 303+ concepts = new ArrayList<LocalConcept>();
 304+ concepts.addAll(interp.values());
 305+ concepts.addAll(context);
 306+ } else {
 307+ concepts = interp.values();
 308+ }
 309+
252310 int i=0, j=0, n=0, c=0;
253 - for (LocalConcept a: interp.values()) {
 311+ for (LocalConcept a: concepts) {
254312 i++;
255313 j=0;
256 - for (LocalConcept b: interp.values()) {
 314+ for (LocalConcept b: concepts) {
257315 j++;
258316 if (i==j) break;
259317
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java
@@ -1,5 +1,6 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.Collection;
45 import java.util.List;
56 import java.util.Map;
67
@@ -52,6 +53,6 @@
5354
5455 public void setTrace(Output trace);
5556
56 - public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException;
 57+ public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException;
5758
5859 }
\ No newline at end of file
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -1,5 +1,6 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.Collection;
45 import java.util.List;
56 import java.util.Map;
67
@@ -19,13 +20,13 @@
2021 this.meaningCacheManager = new MeaningCache.Manager<C>(meaningFetcher, 10);
2122 }
2223
23 - public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException {
 24+ public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException {
2425 MeaningCache<C> mcache = meaningCacheManager.newCache();
2526 Map<X, List<? extends C>> meanings = mcache.getMeanings(terms);
26 - return disambiguate(terms, meanings);
 27+ return disambiguate(terms, meanings, context);
2728 }
2829
29 - public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings) throws PersistenceException;
 30+ public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException;
3031
3132 public Output getTrace() {
3233 return trace;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java
@@ -500,7 +500,7 @@
501501 }
502502
503503 public void showDisambiguation(List<String> terms, ConsoleOutput out) throws PersistenceException {
504 - Disambiguator.Result r = getDisambiguator().disambiguate(terms);
 504+ Disambiguator.Result r = getDisambiguator().disambiguate(terms, null);
505505 out.writeInterpretation(r.getMeanings());
506506 }
507507
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java
@@ -144,12 +144,11 @@
145145 stripClutterManglers.add( new RegularExpressionMangler("rank\\s*=\\s*\\[\\[List[ _]of[ _][-\\w\\d\\s]+?\\|\\s*Ranked\\s+\\{\\{[-\\w\\d\\s]+?counties\\s*\\|\\s*\\w+=[-\\w\\d\\s]+\\}\\}\\]\\]", "", 0));
146146
147147 conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PLACE,
148 - "^(Geography_of|Places|Villages|Towns|Cities|Counties|Countries|Municipalities|States|Provinces|Territories|Federal_states|Islands|Regions|Domains|Communes|Districts)" +
149 - "(_|$)|_(places|villages|towns|cities|counties|countries|municipalities|states|provinces|territories|federal_states|islands|regions|domains|communes|districts)$", 0));
 148+ "^(Geography_of|Places|Villages|Towns|Cities|Captials?|Counties|Countries|Municipalities|Settlements|States|Provinces|Territories|Federal_states|Islands|Regions|Domains|Communes|Districts|Locations)" +
 149+ "(_|$)|_(places|villages|towns|cities|capitals|counties|countries|municipalities|settlements|states|provinces|territories|federal_states|islands|regions|domains|communes|districts|locations)$", 0));
150150
 151+ conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PLACE, "^(Geobox|Infobox_([Ss]ettlement|[Cc]ountry|[Ss]tate|[Ll]ocation))$", 0));
151152
152 - conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.PLACE, "Geobox"));
153 -
154153 conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PERSON, "^(Male|Female|People)_|_(people|men|women|births|deaths)$", 0));
155154 conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PERSON, "^(Persondata|Lifetime|BD|BIRTH-DEATH-SORT|Infobox.*_(person|[aA]rtist|creator|writer|musician|biography|clergy|scientist))$", 0));
156155
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java
@@ -8,12 +8,14 @@
99
1010 import de.brightbyte.data.cursor.DataCursor;
1111 import de.brightbyte.data.cursor.DataSink;
 12+import de.brightbyte.data.measure.Measure;
1213 import de.brightbyte.io.ConsoleIO;
1314 import de.brightbyte.io.LineCursor;
1415 import de.brightbyte.io.OutputSink;
1516 import de.brightbyte.text.Chunker;
1617 import de.brightbyte.text.RegularExpressionChunker;
1718 import de.brightbyte.util.PersistenceException;
 19+import de.brightbyte.wikiword.ConceptType;
1820 import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer;
1921 import de.brightbyte.wikiword.disambig.Disambiguator;
2022 import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator;
@@ -23,6 +25,7 @@
2426 import de.brightbyte.wikiword.disambig.Disambiguator.Result;
2527 import de.brightbyte.wikiword.model.LocalConcept;
2628 import de.brightbyte.wikiword.model.TermReference;
 29+import de.brightbyte.wikiword.model.WikiWordConcept;
2730 import de.brightbyte.wikiword.store.DatabaseConceptStores;
2831 import de.brightbyte.wikiword.store.FeatureStore;
2932 import de.brightbyte.wikiword.store.LocalConceptStore;
@@ -64,10 +67,26 @@
6568 }
6669
6770 protected void init() throws PersistenceException, InstantiationException {
68 - StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore());
 71+ WikiWordConceptStore.ConceptQuerySpec spec = new WikiWordConceptStore.ConceptQuerySpec();
 72+ //spec.setRequireType(ConceptType.PLACE); //FIXME: config! //NOTE: type tags are currently too bad, need to rebuild; use soft boost instead.
 73+
 74+ StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore(), spec);
6975 StoredFeatureFetcher<LocalConcept, Integer> featureFetcher = new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore());
7076 disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true );
7177
 78+ Measure<WikiWordConcept> popularityMeasure = new Measure<WikiWordConcept>(){ //boost locations //FIXME: configure!
 79+ public double measure(WikiWordConcept concept) {
 80+ double score = concept.getCardinality();
 81+
 82+ if (concept.getType().equals(ConceptType.PLACE))
 83+ score *= 10; //XXX: magic number...
 84+
 85+ return score;
 86+ }
 87+ };
 88+
 89+ ((SlidingCoherenceDisambiguator)disambiguator).setPopularityMeasure(popularityMeasure);
 90+
7291 analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(getCorpus(), tweaks);
7392 analyzer.initialize();
7493
@@ -88,10 +107,10 @@
89108 return result.toString(); //FIXME: annotate!
90109 */
91110
92 - List<Term> terms = Term.asTerms(chunker.chunk(line));
 111+ List<Term> terms = Term.asTerms(chunker.chunk(line.trim()));
93112 if (flip) Collections.reverse(terms);
94113
95 - Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(terms);
 114+ Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(terms, null);
96115 if (flip) Collections.reverse(terms);
97116
98117 return assembleMeanings(terms, result);

Status & tagging log