Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -1,6 +1,7 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
4 | 4 | import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
5 | 6 | import java.util.Collections; |
6 | 7 | import java.util.HashMap; |
7 | 8 | import java.util.List; |
— | — | @@ -37,21 +38,31 @@ |
38 | 39 | /* (non-Javadoc) |
39 | 40 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
40 | 41 | */ |
41 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException { |
42 | | - if (window < 2 || terms.size()<2 || meanings.size()<2) |
43 | | - return popularityDisambiguator.disambiguate(terms, meanings); |
| 42 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 43 | + if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings"); |
| 44 | + |
| 45 | + int sz = Math.min(terms.size(), meanings.size()); |
| 46 | + if (context!=null) sz += context.size(); |
| 47 | + |
| 48 | + if (window < 2 || sz<2) { |
| 49 | + return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 50 | + } |
44 | 51 | |
45 | 52 | pruneMeanings(meanings); |
46 | 53 | |
47 | | - if (meanings.size()<2) |
48 | | - return popularityDisambiguator.disambiguate(terms, meanings); |
| 54 | + sz = Math.min(terms.size(), meanings.size()); |
| 55 | + if (context!=null) sz += context.size(); |
| 56 | + |
| 57 | + if (sz<2) { |
| 58 | + return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 59 | + } |
49 | 60 | |
50 | 61 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
51 | 62 | |
52 | 63 | Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(meanings.size()); |
53 | 64 | |
54 | 65 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
55 | | - FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings); |
| 66 | + FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context); |
56 | 67 | |
57 | 68 | for (int i= window; ; i++) { |
58 | 69 | int from = i-window; |
— | — | @@ -63,10 +74,10 @@ |
64 | 75 | Result r ; |
65 | 76 | |
66 | 77 | if (to-from < 2) { |
67 | | - r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings); |
| 78 | + r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings, context); |
68 | 79 | } else { |
69 | 80 | List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings); |
70 | | - r = getBestInterpretation(terms, meanings, interpretations, similarities, features); |
| 81 | + r = getBestInterpretation(terms, meanings, context, interpretations, similarities, features); |
71 | 82 | } |
72 | 83 | |
73 | 84 | for (int j=from; j<to; j++) { |
— | — | @@ -82,7 +93,7 @@ |
83 | 94 | if (to+1>terms.size()) break; |
84 | 95 | } |
85 | 96 | |
86 | | - return getScore(disambig, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates. |
| 97 | + return getScore(disambig, context, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates. |
87 | 98 | } |
88 | 99 | |
89 | 100 | protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(int from, int to, List<X> terms, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/TermRelatedness.java |
— | — | @@ -1,12 +1,14 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.Collection; |
| 5 | + |
4 | 6 | import de.brightbyte.data.measure.Similarity; |
5 | 7 | import de.brightbyte.util.PersistenceException; |
6 | 8 | import de.brightbyte.util.UncheckedPersistenceException; |
7 | 9 | import de.brightbyte.wikiword.model.TermReference; |
8 | 10 | import de.brightbyte.wikiword.model.WikiWordConcept; |
9 | 11 | |
10 | | -public class TermRelatedness implements Similarity<String> { |
| 12 | +public class TermRelatedness<C extends WikiWordConcept> implements Similarity<String> { |
11 | 13 | |
12 | 14 | public static class Relatedness { |
13 | 15 | public final double relatedness; |
— | — | @@ -27,15 +29,17 @@ |
28 | 30 | } |
29 | 31 | |
30 | 32 | protected Similarity<WikiWordConcept> relatedness; |
31 | | - protected Disambiguator<TermReference, ? extends WikiWordConcept> disambig; |
| 33 | + protected Disambiguator<TermReference, C> disambig; |
| 34 | + private Collection<C> context; |
32 | 35 | |
33 | | - public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig) { |
34 | | - this(disambig, null); |
| 36 | + public TermRelatedness(Disambiguator<TermReference,C> disambig) { |
| 37 | + this(disambig, null, null); |
35 | 38 | } |
36 | 39 | |
37 | | - public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig, Similarity<WikiWordConcept> relatedness) { |
| 40 | + public TermRelatedness(Disambiguator<TermReference, C> disambig, Similarity<WikiWordConcept> relatedness, Collection<C> context) { |
38 | 41 | this.relatedness = relatedness; |
39 | 42 | this.disambig = disambig; |
| 43 | + this.context = context; |
40 | 44 | } |
41 | 45 | |
42 | 46 | public double similarity(String a, String b) { |
— | — | @@ -47,7 +51,7 @@ |
48 | 52 | |
49 | 53 | public Relatedness relatedness(String a, String b) { |
50 | 54 | try { |
51 | | - Disambiguator.Result<Term, ? extends WikiWordConcept> r = disambig.<Term>disambiguate(Term.asTerms(a, b)); |
| 55 | + Disambiguator.Result<Term, ? extends WikiWordConcept> r = disambig.<Term>disambiguate(Term.asTerms(a, b), context); |
52 | 56 | if (r==null || r.getMeanings().size()!=2) return null; |
53 | 57 | |
54 | 58 | double d; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.Collection; |
4 | 5 | import java.util.Collections; |
5 | 6 | import java.util.HashMap; |
6 | 7 | import java.util.List; |
— | — | @@ -14,7 +15,7 @@ |
15 | 16 | public class PopularityDisambiguator extends AbstractDisambiguator<TermReference, LocalConcept> { |
16 | 17 | |
17 | 18 | protected Measure<WikiWordConcept> popularityMeasure; |
18 | | - protected Comparator<WikiWordConcept> popularityComparator; |
| 19 | + protected Comparator<LocalConcept> popularityComparator; |
19 | 20 | |
20 | 21 | public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher) { |
21 | 22 | this(meaningFetcher, WikiWordConcept.theCardinality); |
— | — | @@ -23,11 +24,21 @@ |
24 | 25 | public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, Measure<WikiWordConcept> popularityMeasure) { |
25 | 26 | super(meaningFetcher); |
26 | 27 | |
| 28 | + this.setPopularityMeasure(popularityMeasure); |
| 29 | + } |
| 30 | + |
| 31 | + public Measure<WikiWordConcept> getPopularityMeasure() { |
| 32 | + return popularityMeasure; |
| 33 | + } |
| 34 | + |
| 35 | + public void setPopularityMeasure(Measure<WikiWordConcept> popularityMeasure) { |
27 | 36 | this.popularityMeasure = popularityMeasure; |
28 | | - this.popularityComparator = new Measure.Comparator<WikiWordConcept>(popularityMeasure, true); |
| 37 | + this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true); |
29 | 38 | } |
30 | 39 | |
31 | | - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) { |
| 40 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) { |
| 41 | + if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings"); |
| 42 | + |
32 | 43 | Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(); |
33 | 44 | int pop = 0; |
34 | 45 | for (X t: terms) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -1,7 +1,9 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
4 | 4 | import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
5 | 6 | import java.util.Collections; |
| 7 | +import java.util.Comparator; |
6 | 8 | import java.util.HashMap; |
7 | 9 | import java.util.Iterator; |
8 | 10 | import java.util.List; |
— | — | @@ -36,6 +38,7 @@ |
37 | 39 | protected Similarity<LabeledVector<Integer>> similarityMeasure; |
38 | 40 | protected Measure<WikiWordConcept> popularityMeasure; |
39 | 41 | protected PopularityDisambiguator popularityDisambiguator; |
| 42 | + protected Comparator<LocalConcept> popularityComparator; |
40 | 43 | |
41 | 44 | private Functor.Double popularityFactor = new Functor.Double() { //NOTE: must map [0:inf] to [0:1] and grow monotonously |
42 | 45 | |
— | — | @@ -71,12 +74,48 @@ |
72 | 75 | if (popularityMeasure==null) throw new NullPointerException(); |
73 | 76 | if (sim==null) throw new NullPointerException(); |
74 | 77 | if (featureFetcher==null) throw new NullPointerException(); |
75 | | - this.popularityMeasure = popularityMeasure; |
76 | | - this.similarityMeasure = sim; |
| 78 | + |
77 | 79 | this.featureCacheManager = new FeatureCache.Manager<LocalConcept, Integer>(featureFetcher, 10); //TODO: depth |
78 | 80 | this.popularityDisambiguator = new PopularityDisambiguator(meaningFetcher, popularityMeasure); |
| 81 | + |
| 82 | + this.setPopularityMeasure(popularityMeasure); |
| 83 | + this.setSimilarityMeasure(sim); |
79 | 84 | } |
80 | 85 | |
| 86 | + public Functor.Double getPopularityFactor() { |
| 87 | + return popularityFactor; |
| 88 | + } |
| 89 | + |
| 90 | + public void setPopularityFactor(Functor.Double popularityFactor) { |
| 91 | + this.popularityFactor = popularityFactor; |
| 92 | + } |
| 93 | + |
| 94 | + public Measure<WikiWordConcept> getPopularityMeasure() { |
| 95 | + return popularityMeasure; |
| 96 | + } |
| 97 | + |
| 98 | + public void setPopularityMeasure(Measure<WikiWordConcept> popularityMeasure) { |
| 99 | + this.popularityMeasure = popularityMeasure; |
| 100 | + this.popularityDisambiguator.setPopularityMeasure(popularityMeasure); |
| 101 | + this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true); |
| 102 | + } |
| 103 | + |
| 104 | + public Functor2.Double getScoreCombiner() { |
| 105 | + return scoreCombiner; |
| 106 | + } |
| 107 | + |
| 108 | + public void setScoreCombiner(Functor2.Double scoreCombiner) { |
| 109 | + this.scoreCombiner = scoreCombiner; |
| 110 | + } |
| 111 | + |
| 112 | + public Functor.Double getSimilarityFactor() { |
| 113 | + return similarityFactor; |
| 114 | + } |
| 115 | + |
| 116 | + public void setSimilarityFactor(Functor.Double similarityFactor) { |
| 117 | + this.similarityFactor = similarityFactor; |
| 118 | + } |
| 119 | + |
81 | 120 | public void setFeatureFetcher(FeatureFetcher<LocalConcept, Integer> featureFetcher) { |
82 | 121 | this.featureCacheManager = new FeatureCache.Manager<LocalConcept, Integer>(featureFetcher, 10); //FIXME: depth |
83 | 122 | } |
— | — | @@ -123,7 +162,7 @@ |
124 | 163 | this.maxMeanings = maxMeanings; |
125 | 164 | } |
126 | 165 | |
127 | | - protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings) throws PersistenceException { |
| 166 | + protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
128 | 167 | FeatureFetcher<LocalConcept, Integer> features = featureCacheManager.newCache(); |
129 | 168 | |
130 | 169 | //NOTE: pre-fetch all features in one go |
— | — | @@ -132,6 +171,7 @@ |
133 | 172 | concepts.addAll(m); |
134 | 173 | } |
135 | 174 | |
| 175 | + if (context!=null) concepts.addAll(context); |
136 | 176 | features.getFeatures(concepts); |
137 | 177 | |
138 | 178 | return features; |
— | — | @@ -140,23 +180,32 @@ |
141 | 181 | /* (non-Javadoc) |
142 | 182 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
143 | 183 | */ |
144 | | - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException { |
145 | | - if (terms.size()<2 || meanings.size()<2) |
146 | | - return popularityDisambiguator.disambiguate(terms, meanings); |
| 184 | + public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException { |
| 185 | + if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings"); |
147 | 186 | |
| 187 | + int sz = Math.min(terms.size(), meanings.size()); |
| 188 | + if (context!=null) sz += context.size(); |
| 189 | + |
| 190 | + if (sz<2) { |
| 191 | + return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 192 | + } |
| 193 | + |
148 | 194 | pruneMeanings(meanings); |
149 | 195 | |
150 | | - if (meanings.size()<2) |
151 | | - return popularityDisambiguator.disambiguate(terms, meanings); |
| 196 | + sz = Math.min(terms.size(), meanings.size()); |
| 197 | + if (context!=null) sz += context.size(); |
| 198 | + if (sz <2) { |
| 199 | + return popularityDisambiguator.disambiguate(terms, meanings, context); |
| 200 | + } |
152 | 201 | |
153 | 202 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
154 | 203 | |
155 | 204 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
156 | | - FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings); |
| 205 | + FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context); |
157 | 206 | |
158 | 207 | List<Map<X, LocalConcept>> interpretations = getInterpretations(terms, meanings); |
159 | 208 | |
160 | | - return getBestInterpretation(terms, meanings, interpretations, similarities, features); |
| 209 | + return getBestInterpretation(terms, meanings, context, interpretations, similarities, features); |
161 | 210 | } |
162 | 211 | |
163 | 212 | protected void pruneMeanings(Map<? extends TermReference, List<? extends LocalConcept>> meanings) { |
— | — | @@ -179,7 +228,7 @@ |
180 | 229 | |
181 | 230 | if (m.size()==0) eit.remove(); |
182 | 231 | else if (m.size()>maxMeanings) { |
183 | | - Collections.sort(m, WikiWordConcept.byCardinality); |
| 232 | + Collections.sort(m, popularityComparator); |
184 | 233 | m = m.subList(0, maxMeanings); |
185 | 234 | e.setValue(m); |
186 | 235 | } |
— | — | @@ -187,14 +236,14 @@ |
188 | 237 | } |
189 | 238 | |
190 | 239 | protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, |
191 | | - List<Map<X, LocalConcept>> interpretations, |
| 240 | + Collection<LocalConcept> context, List<Map<X, LocalConcept>> interpretations, |
192 | 241 | LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
193 | 242 | |
194 | 243 | List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>(); |
195 | 244 | |
196 | 245 | double traceLimit = -1; |
197 | 246 | for (Map<X, LocalConcept> interp: interpretations) { |
198 | | - Result<X, LocalConcept> r = getScore(interp, similarities, features); |
| 247 | + Result<X, LocalConcept> r = getScore(interp, context, similarities, features); |
199 | 248 | |
200 | 249 | if (r.getScore() >= minScore) { |
201 | 250 | rankings.add(r); |
— | — | @@ -205,7 +254,7 @@ |
206 | 255 | } |
207 | 256 | |
208 | 257 | if (rankings.size()==0) { |
209 | | - return popularityDisambiguator.disambiguate(terms, meanings); |
| 258 | + return popularityDisambiguator.disambiguate(terms, meanings, context); |
210 | 259 | } |
211 | 260 | |
212 | 261 | Collections.sort(rankings); |
— | — | @@ -244,15 +293,24 @@ |
245 | 294 | return interpretations; |
246 | 295 | } |
247 | 296 | |
248 | | - protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
| 297 | + protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, Collection<LocalConcept> context, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
249 | 298 | double sim = 0; |
250 | 299 | double pop = 0; |
251 | 300 | |
| 301 | + Collection<LocalConcept> concepts; |
| 302 | + if (context!=null) { |
| 303 | + concepts = new ArrayList<LocalConcept>(); |
| 304 | + concepts.addAll(interp.values()); |
| 305 | + concepts.addAll(context); |
| 306 | + } else { |
| 307 | + concepts = interp.values(); |
| 308 | + } |
| 309 | + |
252 | 310 | int i=0, j=0, n=0, c=0; |
253 | | - for (LocalConcept a: interp.values()) { |
| 311 | + for (LocalConcept a: concepts) { |
254 | 312 | i++; |
255 | 313 | j=0; |
256 | | - for (LocalConcept b: interp.values()) { |
| 314 | + for (LocalConcept b: concepts) { |
257 | 315 | j++; |
258 | 316 | if (i==j) break; |
259 | 317 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.Collection; |
4 | 5 | import java.util.List; |
5 | 6 | import java.util.Map; |
6 | 7 | |
— | — | @@ -52,6 +53,6 @@ |
53 | 54 | |
54 | 55 | public void setTrace(Output trace); |
55 | 56 | |
56 | | - public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException; |
| 57 | + public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException; |
57 | 58 | |
58 | 59 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.Collection; |
4 | 5 | import java.util.List; |
5 | 6 | import java.util.Map; |
6 | 7 | |
— | — | @@ -19,13 +20,13 @@ |
20 | 21 | this.meaningCacheManager = new MeaningCache.Manager<C>(meaningFetcher, 10); |
21 | 22 | } |
22 | 23 | |
23 | | - public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException { |
| 24 | + public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException { |
24 | 25 | MeaningCache<C> mcache = meaningCacheManager.newCache(); |
25 | 26 | Map<X, List<? extends C>> meanings = mcache.getMeanings(terms); |
26 | | - return disambiguate(terms, meanings); |
| 27 | + return disambiguate(terms, meanings, context); |
27 | 28 | } |
28 | 29 | |
29 | | - public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings) throws PersistenceException; |
| 30 | + public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException; |
30 | 31 | |
31 | 32 | public Output getTrace() { |
32 | 33 | return trace; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java |
— | — | @@ -500,7 +500,7 @@ |
501 | 501 | } |
502 | 502 | |
503 | 503 | public void showDisambiguation(List<String> terms, ConsoleOutput out) throws PersistenceException { |
504 | | - Disambiguator.Result r = getDisambiguator().disambiguate(terms); |
| 504 | + Disambiguator.Result r = getDisambiguator().disambiguate(terms, null); |
505 | 505 | out.writeInterpretation(r.getMeanings()); |
506 | 506 | } |
507 | 507 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/WikiConfiguration_enwiki.java |
— | — | @@ -144,12 +144,11 @@ |
145 | 145 | stripClutterManglers.add( new RegularExpressionMangler("rank\\s*=\\s*\\[\\[List[ _]of[ _][-\\w\\d\\s]+?\\|\\s*Ranked\\s+\\{\\{[-\\w\\d\\s]+?counties\\s*\\|\\s*\\w+=[-\\w\\d\\s]+\\}\\}\\]\\]", "", 0)); |
146 | 146 | |
147 | 147 | conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PLACE, |
148 | | - "^(Geography_of|Places|Villages|Towns|Cities|Counties|Countries|Municipalities|States|Provinces|Territories|Federal_states|Islands|Regions|Domains|Communes|Districts)" + |
149 | | - "(_|$)|_(places|villages|towns|cities|counties|countries|municipalities|states|provinces|territories|federal_states|islands|regions|domains|communes|districts)$", 0)); |
| 148 | + "^(Geography_of|Places|Villages|Towns|Cities|Capitals?|Counties|Countries|Municipalities|Settlements|States|Provinces|Territories|Federal_states|Islands|Regions|Domains|Communes|Districts|Locations)" + |
| 149 | + "(_|$)|_(places|villages|towns|cities|capitals|counties|countries|municipalities|settlements|states|provinces|territories|federal_states|islands|regions|domains|communes|districts|locations)$", 0)); |
150 | 150 | |
| 151 | + conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PLACE, "^(Geobox|Infobox_([Ss]ettlement|[Cc]ountry|[Ss]tate|[Ll]ocation))$", 0)); |
151 | 152 | |
152 | | - conceptTypeSensors.add( new HasTemplateSensor<ConceptType>(ConceptType.PLACE, "Geobox")); |
153 | | - |
154 | 153 | conceptTypeSensors.add( new HasCategoryLikeSensor<ConceptType>(ConceptType.PERSON, "^(Male|Female|People)_|_(people|men|women|births|deaths)$", 0)); |
155 | 154 | conceptTypeSensors.add( new HasTemplateLikeSensor<ConceptType>(ConceptType.PERSON, "^(Persondata|Lifetime|BD|BIRTH-DEATH-SORT|Infobox.*_(person|[aA]rtist|creator|writer|musician|biography|clergy|scientist))$", 0)); |
156 | 155 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java |
— | — | @@ -8,12 +8,14 @@ |
9 | 9 | |
10 | 10 | import de.brightbyte.data.cursor.DataCursor; |
11 | 11 | import de.brightbyte.data.cursor.DataSink; |
| 12 | +import de.brightbyte.data.measure.Measure; |
12 | 13 | import de.brightbyte.io.ConsoleIO; |
13 | 14 | import de.brightbyte.io.LineCursor; |
14 | 15 | import de.brightbyte.io.OutputSink; |
15 | 16 | import de.brightbyte.text.Chunker; |
16 | 17 | import de.brightbyte.text.RegularExpressionChunker; |
17 | 18 | import de.brightbyte.util.PersistenceException; |
| 19 | +import de.brightbyte.wikiword.ConceptType; |
18 | 20 | import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer; |
19 | 21 | import de.brightbyte.wikiword.disambig.Disambiguator; |
20 | 22 | import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator; |
— | — | @@ -23,6 +25,7 @@ |
24 | 26 | import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
25 | 27 | import de.brightbyte.wikiword.model.LocalConcept; |
26 | 28 | import de.brightbyte.wikiword.model.TermReference; |
| 29 | +import de.brightbyte.wikiword.model.WikiWordConcept; |
27 | 30 | import de.brightbyte.wikiword.store.DatabaseConceptStores; |
28 | 31 | import de.brightbyte.wikiword.store.FeatureStore; |
29 | 32 | import de.brightbyte.wikiword.store.LocalConceptStore; |
— | — | @@ -64,10 +67,26 @@ |
65 | 68 | } |
66 | 69 | |
67 | 70 | protected void init() throws PersistenceException, InstantiationException { |
68 | | - StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore()); |
| 71 | + WikiWordConceptStore.ConceptQuerySpec spec = new WikiWordConceptStore.ConceptQuerySpec(); |
| 72 | + //spec.setRequireType(ConceptType.PLACE); //FIXME: config! //NOTE: type tags are currently too bad, need to rebuild; use soft boost instead. |
| 73 | + |
| 74 | + StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore(), spec); |
69 | 75 | StoredFeatureFetcher<LocalConcept, Integer> featureFetcher = new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore()); |
70 | 76 | disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true ); |
71 | 77 | |
| 78 | + Measure<WikiWordConcept> popularityMeasure = new Measure<WikiWordConcept>(){ //boost locations //FIXME: configure! |
| 79 | + public double measure(WikiWordConcept concept) { |
| 80 | + double score = concept.getCardinality(); //base popularity is the concept's cardinality |
| 81 | + |
| 82 | + if (ConceptType.PLACE.equals(concept.getType())) //constant-first equals: a null type is not boosted instead of throwing NPE |
| 83 | + score *= 10; //XXX: magic number... |
| 84 | + |
| 85 | + return score; |
| 86 | + } |
| 87 | + }; |
| 88 | + |
| 89 | + ((SlidingCoherenceDisambiguator)disambiguator).setPopularityMeasure(popularityMeasure); |
| 90 | + |
72 | 91 | analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(getCorpus(), tweaks); |
73 | 92 | analyzer.initialize(); |
74 | 93 | |
— | — | @@ -88,10 +107,10 @@ |
89 | 108 | return result.toString(); //FIXME: annotate! |
90 | 109 | */ |
91 | 110 | |
92 | | - List<Term> terms = Term.asTerms(chunker.chunk(line)); |
| 111 | + List<Term> terms = Term.asTerms(chunker.chunk(line.trim())); |
93 | 112 | if (flip) Collections.reverse(terms); |
94 | 113 | |
95 | | - Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(terms); |
| 114 | + Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(terms, null); |
96 | 115 | if (flip) Collections.reverse(terms); |
97 | 116 | |
98 | 117 | return assembleMeanings(terms, result); |