Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/WindowCoherenceDisambiguator.java |
— | — | @@ -1,79 +0,0 @@ |
2 | | -package de.brightbyte.wikiword.disambig; |
3 | | - |
4 | | -import java.util.Collections; |
5 | | -import java.util.HashMap; |
6 | | -import java.util.List; |
7 | | -import java.util.Map; |
8 | | - |
9 | | -import de.brightbyte.data.LabeledMatrix; |
10 | | -import de.brightbyte.data.LabeledVector; |
11 | | -import de.brightbyte.data.MapLabeledMatrix; |
12 | | -import de.brightbyte.data.measure.Measure; |
13 | | -import de.brightbyte.data.measure.Similarity; |
14 | | -import de.brightbyte.util.PersistenceException; |
15 | | -import de.brightbyte.wikiword.model.LocalConcept; |
16 | | - |
17 | | -public class WindowCoherenceDisambiguator<K> extends CoherenceDisambiguator<K> { |
18 | | - |
19 | | - protected int window = 2; |
20 | | - |
21 | | - public WindowCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<LocalConcept> popularityMeasure, Similarity<LabeledVector<K>> sim, int window) { |
22 | | - super(meaningFetcher, featureFetcher, popularityMeasure, sim); |
23 | | - |
24 | | - this.window = window; |
25 | | - } |
26 | | - |
27 | | - /* (non-Javadoc) |
28 | | - * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
29 | | - */ |
30 | | - public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException { |
31 | | - Map<String, LocalConcept> disambig = new HashMap<String, LocalConcept>(meanings.size()); |
32 | | - |
33 | | - LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
34 | | - FeatureCache<LocalConcept, K> features = new FeatureCache<LocalConcept, K>(featureFetcher); //TODO: keep a chain of n caches, resulting in LRU logic. |
35 | | - |
36 | | - for (int i=0; i<terms.size(); i++) { |
37 | | - int from = i-window+1; |
38 | | - int to = i+1; |
39 | | - |
40 | | - if (from<0) from = 0; |
41 | | - if (to>terms.size()) to = terms.size(); |
42 | | - |
43 | | - String t = terms.get(i); |
44 | | - LocalConcept m; |
45 | | - |
46 | | - if (to-from < 2) { |
47 | | - Result r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings); |
48 | | - m = (LocalConcept)r.getMeanings().get(t); //UGLY cast |
49 | | - } else { |
50 | | - List<Map<String, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings); |
51 | | - Result r = getBestInterpretation(terms, meanings, interpretations, similarities, features); |
52 | | - m = (LocalConcept)r.getMeanings().get(t); //UGLY cast |
53 | | - } |
54 | | - |
55 | | - disambig.put(t, m); |
56 | | - } |
57 | | - |
58 | | - return getScore(disambig, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates. |
59 | | - } |
60 | | - |
61 | | - protected List<Map<String, LocalConcept>> getInterpretations(int from, int to, List<String> terms, Map<String, LocalConcept> known, Map<String, List<LocalConcept>> meanings) { |
62 | | - Map<String, List<LocalConcept>> mset = new HashMap<String, List<LocalConcept>>(); |
63 | | - |
64 | | - if (to>terms.size()) to = terms.size(); |
65 | | - |
66 | | - for (int i=from; i<to; i++) { |
67 | | - List<LocalConcept> m; |
68 | | - |
69 | | - String t = terms.get(i); |
70 | | - LocalConcept c = known.get(t); |
71 | | - |
72 | | - if (c!=null) m = Collections.singletonList(c); |
73 | | - else m = meanings.get(t); |
74 | | - |
75 | | - mset.put(t, m); |
76 | | - } |
77 | | - |
78 | | - return getInterpretations(terms.subList(from, to), mset); |
79 | | - } |
80 | | -} |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -0,0 +1,109 @@ |
| 2 | +package de.brightbyte.wikiword.disambig; |
| 3 | + |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Collections; |
| 6 | +import java.util.HashMap; |
| 7 | +import java.util.List; |
| 8 | +import java.util.Map; |
| 9 | + |
| 10 | +import de.brightbyte.data.LabeledMatrix; |
| 11 | +import de.brightbyte.data.LabeledVector; |
| 12 | +import de.brightbyte.data.MapLabeledMatrix; |
| 13 | +import de.brightbyte.data.measure.CosineVectorSimilarity; |
| 14 | +import de.brightbyte.data.measure.Measure; |
| 15 | +import de.brightbyte.data.measure.ScalarVectorSimilarity; |
| 16 | +import de.brightbyte.data.measure.Similarity; |
| 17 | +import de.brightbyte.util.PersistenceException; |
| 18 | +import de.brightbyte.wikiword.model.LocalConcept; |
| 19 | +import de.brightbyte.wikiword.model.WikiWordRanking; |
| 20 | + |
| 21 | +/** |
| 22 | + * Disambiguates a term sequence by sliding a window over it: each window is resolved to its best |
| 23 | + * interpretation, and meanings fixed by earlier windows are kept for the windows that follow. |
| 24 | + */ |
| 25 | +public class SlidingCoherenceDisambiguator<K> extends CoherenceDisambiguator<K> { |
| 26 | + |
| 27 | +    /** Number of preceding terms evaluated together with the current term (a window spans up to window+1 terms). */ |
| 28 | +    protected int window; |
| 29 | + |
| 30 | +    /** Convenience constructor: cardinality as popularity measure, window of 5. */ |
| 31 | +    public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, boolean featuresAreNormalized) { |
| 32 | +        this(meaningFetcher, featureFetcher, WikiWordRanking.theCardinality, |
| 33 | +                featuresAreNormalized ? ScalarVectorSimilarity.<K>getInstance() : CosineVectorSimilarity.<K>getInstance(), //if pre-normalized, use scalar to calc cosine |
| 34 | +                5); |
| 35 | +    } |
| 36 | + |
| 37 | +    /** Creates a disambiguator that looks back over the given number of preceding terms. */ |
| 38 | +    public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<WikiWordRanking> popularityMeasure, Similarity<LabeledVector<K>> sim, int window) { |
| 39 | +        super(meaningFetcher, featureFetcher, popularityMeasure, sim); |
| 40 | +        this.window = window; |
| 41 | +    } |
| 42 | + |
| 43 | +    /** |
| 44 | +     * Resolves the terms window by window; falls back to pure popularity when there is too little context. |
| 45 | +     * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
| 46 | +     */ |
| 47 | +    public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException { |
| 48 | +        if (window < 2 || terms.size()<2 || meanings.size()<2) |
| 49 | +            return popularityDisambiguator.disambiguate(terms, meanings); |
| 50 | + |
| 51 | +        //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
| 52 | +        Map<String, LocalConcept> disambig = new HashMap<String, LocalConcept>(meanings.size()); |
| 53 | + |
| 54 | +        LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
| 55 | +        FeatureCache<LocalConcept, K> features = new FeatureCache<LocalConcept, K>(featureFetcher); //TODO: keep a chain of n caches, resulting in LRU logic. |
| 56 | + |
| 57 | +        for (int i= window; ; i++) { |
| 58 | +            int from = i-window; |
| 59 | +            int to = i+1; |
| 60 | + |
| 61 | +            if (from<0) from = 0; |
| 62 | +            if (to>terms.size()) to = terms.size(); |
| 63 | + |
| 64 | +            Result r; |
| 65 | +            if (to-from < 2) { |
| 66 | +                r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings); |
| 67 | +            } else { |
| 68 | +                List<Map<String, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings); |
| 69 | +                r = getBestInterpretation(terms, meanings, interpretations, similarities, features); |
| 70 | +            } |
| 71 | + |
| 72 | +            //record only meanings that were not already fixed by an earlier window |
| 73 | +            for (int j=from; j<to; j++) { |
| 74 | +                String t = terms.get(j); |
| 75 | +                if (disambig.containsKey(t)) continue; |
| 76 | + |
| 77 | +                LocalConcept m = (LocalConcept)r.getMeanings().get(t); //UGLY cast |
| 78 | +                if (m!=null) disambig.put(t, m); |
| 79 | +            } |
| 80 | + |
| 81 | +            if (to+1>terms.size()) break; //window has reached the end of the sequence |
| 82 | +        } |
| 83 | + |
| 84 | +        return getScore(disambig, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates. |
| 85 | +    } |
| 86 | + |
| 87 | +    /** |
| 88 | +     * Builds the candidate interpretations for the terms at positions from (inclusive) to to (exclusive). |
| 89 | +     * Terms already present in known are pinned to that meaning; terms without any candidate meaning are |
| 90 | +     * skipped. from and to always index the unfiltered terms list. |
| 91 | +     */ |
| 92 | +    protected List<Map<String, LocalConcept>> getInterpretations(int from, int to, List<String> terms, Map<String, LocalConcept> known, Map<String, List<LocalConcept>> meanings) { |
| 93 | +        if (to>terms.size()) to = terms.size(); |
| 94 | + |
| 95 | +        List<String> windowTerms = new ArrayList<String>(); |
| 96 | +        Map<String, List<LocalConcept>> mset = new HashMap<String, List<LocalConcept>>(); |
| 97 | + |
| 98 | +        for (int i=from; i<to; i++) { |
| 99 | +            String t = terms.get(i); |
| 100 | +            LocalConcept c = known.get(t); |
| 101 | +            List<LocalConcept> m = c!=null ? Collections.singletonList(c) : meanings.get(t); |
| 102 | +            if (m==null || m.isEmpty()) continue; //no known meaning: leave the term out of this window |
| 103 | + |
| 104 | +            windowTerms.add(t); |
| 105 | +            mset.put(t, m); |
| 106 | +        } |
| 107 | + |
| 108 | +        return getInterpretations(windowTerms, mset); |
| 109 | +    } |
| 110 | +} |
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 111 | + |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java |
— | — | @@ -4,20 +4,27 @@ |
5 | 5 | |
6 | 6 | import de.brightbyte.data.cursor.DataSet; |
7 | 7 | import de.brightbyte.util.PersistenceException; |
| 8 | +import de.brightbyte.wikiword.ConceptType; |
8 | 9 | import de.brightbyte.wikiword.model.LocalConcept; |
9 | 10 | import de.brightbyte.wikiword.store.LocalConceptStore; |
10 | 11 | |
11 | 12 | public class StoredMeaningFetcher implements MeaningFetcher<LocalConcept> { |
12 | 13 | protected LocalConceptStore store; |
| 14 | + protected ConceptType type; |
13 | 15 | |
14 | 16 | public StoredMeaningFetcher(LocalConceptStore store) { |
| 17 | + this(store, null); |
| 18 | + } |
| 19 | + |
| 20 | + public StoredMeaningFetcher(LocalConceptStore store, ConceptType type) { |
15 | 21 | if (store==null) throw new NullPointerException(); |
16 | 22 | |
17 | 23 | this.store = store; |
| 24 | + this.type = type; |
18 | 25 | } |
19 | 26 | |
20 | 27 | public List<LocalConcept> getMeanings(String term) throws PersistenceException { |
21 | | - DataSet<LocalConcept> m = store.getMeanings(term); //FIXME: filter/cut-off rules, sort order! //XXX: relevance value? |
| 28 | + DataSet<LocalConcept> m = store.getMeanings(term, type); //FIXME: filter/cut-off rules, sort order! //XXX: relevance value? |
22 | 29 | return m.load(); |
23 | 30 | } |
24 | 31 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -8,17 +8,22 @@ |
9 | 9 | import de.brightbyte.data.measure.Measure; |
10 | 10 | import de.brightbyte.data.measure.Measure.Comparator; |
11 | 11 | import de.brightbyte.wikiword.model.LocalConcept; |
| 12 | +import de.brightbyte.wikiword.model.WikiWordRanking; |
12 | 13 | |
13 | 14 | public class PopularityDisambiguator extends AbstractDisambiguator { |
14 | 15 | |
15 | | - protected Measure<LocalConcept> popularityMeasure; |
16 | | - protected Comparator<LocalConcept> popularityComparator; |
| 16 | + protected Measure<WikiWordRanking> popularityMeasure; |
| 17 | + protected Comparator<WikiWordRanking> popularityComparator; |
17 | 18 | |
18 | | - public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, Measure<LocalConcept> popularityMeasure) { |
| 19 | + public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher) { |
| 20 | + this(meaningFetcher, WikiWordRanking.theCardinality); |
| 21 | + } |
| 22 | + |
| 23 | + public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, Measure<WikiWordRanking> popularityMeasure) { |
19 | 24 | super(meaningFetcher); |
20 | 25 | |
21 | 26 | this.popularityMeasure = popularityMeasure; |
22 | | - this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true); |
| 27 | + this.popularityComparator = new Measure.Comparator<WikiWordRanking>(popularityMeasure, true); |
23 | 28 | } |
24 | 29 | |
25 | 30 | public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -9,23 +9,32 @@ |
10 | 10 | import de.brightbyte.data.LabeledMatrix; |
11 | 11 | import de.brightbyte.data.LabeledVector; |
12 | 12 | import de.brightbyte.data.MapLabeledMatrix; |
| 13 | +import de.brightbyte.data.measure.CosineVectorSimilarity; |
13 | 14 | import de.brightbyte.data.measure.Measure; |
| 15 | +import de.brightbyte.data.measure.ScalarVectorSimilarity; |
14 | 16 | import de.brightbyte.data.measure.Similarity; |
15 | 17 | import de.brightbyte.util.PersistenceException; |
16 | 18 | import de.brightbyte.wikiword.model.ConceptFeatures; |
17 | 19 | import de.brightbyte.wikiword.model.LocalConcept; |
| 20 | +import de.brightbyte.wikiword.model.WikiWordRanking; |
18 | 21 | |
19 | 22 | public class CoherenceDisambiguator<K> extends AbstractDisambiguator { |
20 | 23 | |
21 | 24 | protected int minPopularity = 2; //FIXME: use complex cutoff specifier! |
22 | | - protected double scoreThreshold = 0.002; |
23 | | - protected double popularityBias = 0.01; |
| 25 | + protected double scoreThreshold = 0.002; //FIXME: magic number |
| 26 | + protected double popularityBias = 0.01; //FIXME: magic number |
| 27 | + |
24 | 28 | protected Similarity<LabeledVector<K>> similarityMeasure; |
25 | 29 | protected FeatureFetcher<LocalConcept, K> featureFetcher; |
26 | | - protected Measure<LocalConcept> popularityMeasure; |
| 30 | + protected Measure<WikiWordRanking> popularityMeasure; |
27 | 31 | protected PopularityDisambiguator popularityDisambiguator; |
28 | 32 | |
29 | | - public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<LocalConcept> popularityMeasure, Similarity<LabeledVector<K>> sim) { |
| 33 | + public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, boolean featuresAreNormalized) { |
| 34 | + this(meaningFetcher, featureFetcher, WikiWordRanking.theCardinality, |
| 35 | + featuresAreNormalized ? ScalarVectorSimilarity.<K>getInstance() : CosineVectorSimilarity.<K>getInstance()); //if pre-normalized, use scalar to calc cosine |
| 36 | + } |
| 37 | + |
| 38 | + public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<WikiWordRanking> popularityMeasure, Similarity<LabeledVector<K>> sim) { |
30 | 39 | super(meaningFetcher); |
31 | 40 | |
32 | 41 | if (popularityMeasure==null) throw new NullPointerException(); |
— | — | @@ -83,9 +92,8 @@ |
84 | 93 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
85 | 94 | */ |
86 | 95 | public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException { |
87 | | - if (meanings.size()==1) { |
88 | | - return popularityDisambiguator.disambiguate(terms, meanings); |
89 | | - } |
| 96 | + if (terms.size()<2 || meanings.size()<2) |
| 97 | + return popularityDisambiguator.disambiguate(terms, meanings); |
90 | 98 | |
91 | 99 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
92 | 100 | FeatureCache<LocalConcept, K> features = new FeatureCache<LocalConcept, K>(featureFetcher); //TODO: keep a chain of n caches, resulting in LRU logic. |
— | — | @@ -136,10 +144,7 @@ |
137 | 145 | |
138 | 146 | protected List<Map<String, LocalConcept>> getInterpretations(List<String> terms, Map<String, List<LocalConcept>> meanings) { |
139 | 147 | if (terms.size()==0) { |
140 | | - List<Map<String, LocalConcept>> combinations = new ArrayList<Map<String, LocalConcept>>(); |
141 | | - Map<String, LocalConcept> e = new HashMap<String, LocalConcept>(); |
142 | | - combinations.add(e); |
143 | | - return combinations; |
| 148 | + return Collections.singletonList(Collections.<String, LocalConcept>emptyMap()); |
144 | 149 | } |
145 | 150 | |
146 | 151 | String t = terms.get(0); |
— | — | @@ -153,6 +158,9 @@ |
154 | 159 | |
155 | 160 | for (Map<String, LocalConcept> be: base) { |
156 | 161 | for (LocalConcept c: m) { |
| 162 | + double p = popularityMeasure.measure(c); |
| 163 | + if (p<minPopularity) continue; |
| 164 | + |
157 | 165 | Map<String, LocalConcept> e = new HashMap<String, LocalConcept>(); |
158 | 166 | e.putAll(be); |
159 | 167 | e.put(t, c); |
— | — | @@ -165,7 +173,7 @@ |
166 | 174 | return interpretations; |
167 | 175 | } |
168 | 176 | |
169 | | - protected Result getScore(Map<String, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache features) throws PersistenceException { |
| 177 | + protected Result getScore(Map<String, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, K> features) throws PersistenceException { |
170 | 178 | double sim = 0; |
171 | 179 | double pop = 0; |
172 | 180 | |
— | — | @@ -190,6 +198,13 @@ |
191 | 199 | ConceptFeatures<LocalConcept, K> fa = features.getFeatures(a); |
192 | 200 | ConceptFeatures<LocalConcept, K> fb = features.getFeatures(b); |
193 | 201 | |
| 202 | + //force relevance/cardinality to the figures from the meaning lookup |
| 203 | + //not strictly necessary, but nice to keep it consistent. |
| 204 | + fa.getConceptReference().setCardinality(a.getCardinality()); |
| 205 | + fa.getConceptReference().setRelevance(a.getRelevance()); |
| 206 | + fb.getConceptReference().setCardinality(b.getCardinality()); |
| 207 | + fb.getConceptReference().setRelevance(b.getRelevance()); |
| 208 | + |
194 | 209 | d = similarityMeasure.similarity(fa.getFeatureVector(), fb.getFeatureVector()); |
195 | 210 | similarities.set(a, b, d); |
196 | 211 | } |
— | — | @@ -197,12 +212,12 @@ |
198 | 213 | |
199 | 214 | if (d<0) throw new IllegalArgumentException("encountered negative similarity score ("+d+") for "+a+" / "+b); |
200 | 215 | sim += d; |
201 | | - n ++; //should add up to combo.size*(combo.size()-1)/2, according to Gauss |
| 216 | + n ++; //should add up to interp.size()*(interp.size()-1)/2, according to Gauss |
202 | 217 | } |
203 | 218 | |
204 | | - int card = a.getCardinality(); //XXX: this may be local cardinality (indegree), we want the frequency of the meaning-assignment! |
205 | | - if (card<=0) card= 1; |
206 | | - pop += card; |
| 219 | + double p = popularityMeasure.measure(a); |
| 220 | + if (p<1) p= 1; |
| 221 | + pop += p; |
207 | 222 | c ++; |
208 | 223 | } |
209 | 224 | |
— | — | @@ -210,7 +225,7 @@ |
211 | 226 | sim = sim / n; |
212 | 227 | pop = pop / c; |
213 | 228 | |
214 | | - double popf = 1 - 1/(Math.sqrt(pop)+1); //converge against 1 |
| 229 | + double popf = 1 - 1/(Math.sqrt(pop)+1); //converge against 1 //XXX: black voodoo magic ad hoc formula with no deeper meaning. |
215 | 230 | |
216 | 231 | double score = popf * popularityBias + sim * ( 1 - popularityBias ); |
217 | 232 | return new Result(interp, score, sim, pop); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.ArrayList; |
4 | 5 | import java.util.HashMap; |
5 | 6 | import java.util.List; |
6 | 7 | import java.util.Map; |
— | — | @@ -24,7 +25,7 @@ |
25 | 26 | |
26 | 27 | for (String t: terms) { |
27 | 28 | List<LocalConcept> m = meaningFetcher.getMeanings(t); |
28 | | - meanings.put(t, m); |
| 29 | + if (m!=null && m.size()>0) meanings.put(t, m); |
29 | 30 | } |
30 | 31 | |
31 | 32 | return meanings; |
— | — | @@ -32,7 +33,6 @@ |
33 | 34 | |
34 | 35 | public Result disambiguate(List<String> terms) throws PersistenceException { |
35 | 36 | Map<String, List<LocalConcept>> meanings = fetchMeanings(terms); |
36 | | - |
37 | 37 | return disambiguate(terms, meanings); |
38 | 38 | } |
39 | 39 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseLocalConceptStore.java |
— | — | @@ -119,10 +119,12 @@ |
120 | 120 | */ |
121 | 121 | } |
122 | 122 | |
123 | | - protected String meaningWhere(String term) { |
124 | | - return " JOIN "+meaningTable.getSQLName()+" as M ON C.id = M.concept " + |
125 | | - " WHERE M.term_text = "+database.quoteString(term)+" " + |
126 | | - " ORDER BY freq DESC"; |
| 123 | + protected String meaningWhere(String term, ConceptType t) { |
| 124 | + String sql = " JOIN "+meaningTable.getSQLName()+" as M ON C.id = M.concept "; |
| 125 | + sql += " WHERE M.term_text = "+database.quoteString(term)+" "; |
| 126 | + if (t!=null) sql += " AND C.type = "+t.getCode()+" "; |
| 127 | + sql += " ORDER BY freq DESC"; |
| 128 | + return sql; |
127 | 129 | } |
128 | 130 | |
129 | 131 | @Override |
— | — | @@ -143,10 +145,14 @@ |
144 | 146 | return corpus; |
145 | 147 | } |
146 | 148 | |
147 | | - public DataSet<LocalConceptReference> listMeanings(String term) |
| 149 | + public DataSet<LocalConceptReference> listMeanings(String term) throws PersistenceException { |
| 150 | + return this.listMeanings(term, null); |
| 151 | + } |
| 152 | + |
| 153 | + public DataSet<LocalConceptReference> listMeanings(String term, ConceptType t) |
148 | 154 | throws PersistenceException { |
149 | 155 | |
150 | | - String sql = referenceSelect("M.freq") + meaningWhere(term); |
| 156 | + String sql = referenceSelect("M.freq") + meaningWhere(term, t); |
151 | 157 | |
152 | 158 | return new QueryDataSet<LocalConceptReference>(database, getRowReferenceFactory(), "listMeanings", sql, false); |
153 | 159 | } |
— | — | @@ -159,6 +165,10 @@ |
160 | 166 | return ((DatabaseLocalConceptInfoStore)getConceptInfoStore()).getMeanings(term); |
161 | 167 | } |
162 | 168 | |
| 169 | + public DataSet<LocalConcept> getMeanings(String term, ConceptType t) throws PersistenceException { |
| 170 | + return ((DatabaseLocalConceptInfoStore)getConceptInfoStore()).getMeanings(term, t); |
| 171 | + } |
| 172 | + |
163 | 173 | public TermReference pickRandomTerm(int top) throws PersistenceException { |
164 | 174 | return ((LocalStatisticsStore<LocalConcept, LocalConceptReference>)getStatisticsStore()).pickRandomTerm(top); |
165 | 175 | } |
— | — | @@ -313,10 +323,16 @@ |
314 | 324 | } |
315 | 325 | |
316 | 326 | |
317 | | - public DataSet<LocalConcept> getMeanings(String term) |
| 327 | + public DataSet<LocalConcept> getMeanings(String term) |
318 | 328 | throws PersistenceException { |
| 329 | + |
| 330 | + return getMeanings(term, null); |
| 331 | + } |
319 | 332 | |
320 | | - String sql = conceptSelect("M.freq") + meaningWhere(term); |
| 333 | + public DataSet<LocalConcept> getMeanings(String term, ConceptType t) |
| 334 | + throws PersistenceException { |
| 335 | + |
| 336 | + String sql = conceptSelect("M.freq") + meaningWhere(term, t); |
321 | 337 | |
322 | 338 | return new QueryDataSet<LocalConcept>(database, new ConceptFactory(), "getMeanins", sql, false); |
323 | 339 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseWikiWordConceptStore.java |
— | — | @@ -279,6 +279,10 @@ |
280 | 280 | } |
281 | 281 | } |
282 | 282 | |
| 283 | + public FeatureStore<T, Integer> getFeatureStore() throws PersistenceException { |
| 284 | + return getProximityStore(); |
| 285 | + } |
| 286 | + |
283 | 287 | public ProximityStore<T, R, Integer> getProximityStore() throws PersistenceException { |
284 | 288 | try { |
285 | 289 | if (proximityStore==null) proximityStore = newProximityStore(); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/LocalConceptStore.java |
— | — | @@ -2,6 +2,7 @@ |
3 | 3 | |
4 | 4 | import de.brightbyte.data.cursor.DataSet; |
5 | 5 | import de.brightbyte.util.PersistenceException; |
| 6 | +import de.brightbyte.wikiword.ConceptType; |
6 | 7 | import de.brightbyte.wikiword.model.LocalConcept; |
7 | 8 | import de.brightbyte.wikiword.model.LocalConceptReference; |
8 | 9 | import de.brightbyte.wikiword.model.TermReference; |
— | — | @@ -17,9 +18,13 @@ |
18 | 19 | public abstract DataSet<ConceptReference> getNarrowerConcepts() throws PersistenceException; |
19 | 20 | */ |
20 | 21 | |
21 | | - //TODO: relevance limit? order? |
| 22 | + //TODO: relevance limit? order? filter? |
| 23 | + public abstract DataSet<LocalConceptReference> listMeanings(String term, ConceptType t) throws PersistenceException; |
| 24 | + |
22 | 25 | public abstract DataSet<LocalConceptReference> listMeanings(String term) throws PersistenceException; |
23 | 26 | |
| 27 | + public abstract DataSet<LocalConcept> getMeanings(String term, ConceptType t) throws PersistenceException; |
| 28 | + |
24 | 29 | public abstract DataSet<LocalConcept> getMeanings(String term) throws PersistenceException; |
25 | 30 | |
26 | 31 | public int getNumberOfTerms() throws PersistenceException; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/WikiWordConceptStore.java |
— | — | @@ -15,6 +15,7 @@ |
16 | 16 | |
17 | 17 | public StatisticsStore getStatisticsStore() throws PersistenceException; |
18 | 18 | public ConceptInfoStore<T> getConceptInfoStore() throws PersistenceException; |
| 19 | + public FeatureStore<T, Integer> getFeatureStore() throws PersistenceException; |
19 | 20 | public ProximityStore<T, R, Integer> getProximityStore() throws PersistenceException; |
20 | 21 | |
21 | 22 | public T getConcept(int id) throws PersistenceException; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseFeatureStore.java |
— | — | @@ -148,13 +148,15 @@ |
149 | 149 | String conceptField, String nameField, String cardinalityField, String relevanceField, |
150 | 150 | String keyField, String valueField) throws PersistenceException { |
151 | 151 | try { |
152 | | - LabeledVector<Integer> v = readVector(rs, conceptField, keyField, valueField, new MapLabeledVector<Integer>()); |
153 | | - |
| 152 | + rs.next(); //TODO: what to return if this fails?? |
154 | 153 | int id = DatabaseUtil.asInt(rs.getObject(conceptField)); |
155 | 154 | String n = nameField == null ? null : DatabaseUtil.asString(rs.getObject(nameField)); |
156 | 155 | int c = cardinalityField == null ? 1 : DatabaseUtil.asInt(rs.getObject(cardinalityField)); |
157 | 156 | double r = relevanceField == null ? 1 : DatabaseUtil.asDouble(rs.getObject(relevanceField)); |
| 157 | + rs.previous(); |
158 | 158 | |
| 159 | + LabeledVector<Integer> v = readVector(rs, conceptField, keyField, valueField, new MapLabeledVector<Integer>()); |
| 160 | + |
159 | 161 | R ref = referenceFactory.newInstance(id, n, c, r); |
160 | 162 | return new ConceptFeatures<T, Integer>(ref, v); |
161 | 163 | } catch (SQLException e) { |
— | — | @@ -174,7 +176,7 @@ |
175 | 177 | Object c = rs.getObject(conceptField); |
176 | 178 | if (concept<0) concept = DatabaseUtil.asInt(c); |
177 | 179 | else if (concept!=DatabaseUtil.asInt(c)) { |
178 | | - rs.previous(); //push back |
| 180 | + if (!rs.previous()) throw new RuntimeException ("push-back failed on result set! "+rs.getClass()); //push back |
179 | 181 | break; |
180 | 182 | } |
181 | 183 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java |
— | — | @@ -17,6 +17,7 @@ |
18 | 18 | |
19 | 19 | public WikiWordConcept(WikiWordConceptReference reference, DatasetIdentifier dataset, ConceptType type) { |
20 | 20 | if (type==null) throw new NullPointerException(); |
| 21 | + if (reference==null) throw new NullPointerException(); |
21 | 22 | |
22 | 23 | this.dataset = dataset; |
23 | 24 | this.type = type; |
— | — | @@ -44,17 +45,25 @@ |
45 | 46 | } |
46 | 47 | |
47 | 48 | public int getCardinality() { |
48 | | - return reference==null ? 1 : reference.getCardinality(); |
| 49 | + return reference.getCardinality(); |
49 | 50 | } |
50 | 51 | |
51 | 52 | public double getRelevance() { |
52 | | - return reference==null ? 1 : reference.getRelevance(); |
| 53 | + return reference.getRelevance(); |
53 | 54 | } |
54 | 55 | |
55 | 56 | public boolean hasRanking() { |
56 | | - return reference != null && ( reference.getCardinality()>0 || reference.getRelevance()>0 ); |
| 57 | + return ( reference.getCardinality()>0 || reference.getRelevance()>0 ); |
57 | 58 | } |
58 | 59 | |
| 60 | + public void setCardinality(int cardinality) { |
| 61 | + reference.setCardinality(cardinality); |
| 62 | + } |
| 63 | + |
| 64 | + public void setRelevance(double relevance) { |
| 65 | + reference.setRelevance(relevance); |
| 66 | + } |
| 67 | + |
59 | 68 | @Override |
60 | 69 | public int hashCode() { |
61 | 70 | return reference.hashCode(); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordReference.java |
— | — | @@ -39,8 +39,8 @@ |
40 | 40 | protected final int id; |
41 | 41 | protected final String name; |
42 | 42 | |
43 | | - protected final int cardinality; |
44 | | - protected final double relevance; |
| 43 | + protected int cardinality; |
| 44 | + protected double relevance; |
45 | 45 | |
46 | 46 | public WikiWordReference(final int id, final String name, final int cardinality, final double relevance) { |
47 | 47 | this.cardinality = cardinality; |
— | — | @@ -57,7 +57,14 @@ |
58 | 58 | return relevance; |
59 | 59 | } |
60 | 60 | |
| 61 | + public void setRelevance(double relevance) { |
| 62 | + this.relevance = relevance; |
| 63 | + } |
61 | 64 | |
| 65 | + public void setCardinality(int cardinality) { |
| 66 | + this.cardinality = cardinality; |
| 67 | + } |
| 68 | + |
62 | 69 | public int getId() { |
63 | 70 | return id; |
64 | 71 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java |
— | — | @@ -17,16 +17,22 @@ |
18 | 18 | import de.brightbyte.util.PersistenceException; |
19 | 19 | import de.brightbyte.wikiword.ConsoleApp; |
20 | 20 | import de.brightbyte.wikiword.Corpus; |
| 21 | +import de.brightbyte.wikiword.disambig.Disambiguator; |
| 22 | +import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator; |
| 23 | +import de.brightbyte.wikiword.disambig.StoredFeatureFetcher; |
| 24 | +import de.brightbyte.wikiword.disambig.StoredMeaningFetcher; |
21 | 25 | import de.brightbyte.wikiword.model.AbstractConceptOutput; |
22 | 26 | import de.brightbyte.wikiword.model.ConceptFeatures; |
23 | 27 | import de.brightbyte.wikiword.model.ConceptOutput; |
24 | 28 | import de.brightbyte.wikiword.model.GlobalConcept; |
25 | 29 | import de.brightbyte.wikiword.model.LocalConcept; |
| 30 | +import de.brightbyte.wikiword.model.LocalConceptReference; |
26 | 31 | import de.brightbyte.wikiword.model.WikiWordConcept; |
27 | 32 | import de.brightbyte.wikiword.model.WikiWordConceptReference; |
28 | 33 | import de.brightbyte.wikiword.model.WikiWordReference; |
29 | 34 | import de.brightbyte.wikiword.rdf.RdfOutput; |
30 | 35 | import de.brightbyte.wikiword.store.DatabaseConceptStores; |
| 36 | +import de.brightbyte.wikiword.store.FeatureStore; |
31 | 37 | import de.brightbyte.wikiword.store.GlobalConceptStore; |
32 | 38 | import de.brightbyte.wikiword.store.LocalConceptStore; |
33 | 39 | import de.brightbyte.wikiword.store.ProximityStore; |
— | — | @@ -34,6 +40,8 @@ |
35 | 41 | |
36 | 42 | public class QueryConsole extends ConsoleApp<WikiWordConceptStore> { |
37 | 43 | |
| 44 | + protected Disambiguator disambiguator; |
| 45 | + |
38 | 46 | public QueryConsole() { |
39 | 47 | super(true, true); |
40 | 48 | } |
— | — | @@ -104,6 +112,10 @@ |
105 | 113 | output.writeConcepts(meanings); |
106 | 114 | } |
107 | 115 | |
| 116 | + public void writeConceptReferences(DataSet<? extends WikiWordConceptReference<? extends WikiWordConcept>> meanings) throws PersistenceException { |
| 117 | + output.writeConceptReferences(meanings); |
| 118 | + } |
| 119 | + |
108 | 120 | public void writeGlobalConcept(GlobalConcept concept) throws PersistenceException { |
109 | 121 | output.writeGlobalConcept(concept); |
110 | 122 | } |
— | — | @@ -123,6 +135,17 @@ |
124 | 136 | } |
125 | 137 | } |
126 | 138 | |
| 139 | + public void writeInterpretation(Map<String, ? extends WikiWordConcept> interp) throws PersistenceException { |
| 140 | + //XXX: hack! |
| 141 | + try { |
| 142 | + writer.write(interp.toString()); |
| 143 | + writer.write("\n"); |
| 144 | + writer.flush(); |
| 145 | + } catch (IOException e) { |
| 146 | + throw new PersistenceException(e); |
| 147 | + } |
| 148 | + } |
| 149 | + |
127 | 150 | } |
128 | 151 | |
129 | 152 | protected class ConceptDumper extends AbstractConceptOutput { |
— | — | @@ -344,6 +367,10 @@ |
345 | 368 | String id = params.get(1); |
346 | 369 | showFeatureVector(Integer.parseInt(id), out); |
347 | 370 | } |
| 371 | + else if (cmd.equals("d") || cmd.equals("dis") || cmd.equals("disambig") || cmd.equals("disambiguate")) { |
| 372 | + List<String> terms = params.subList(1,params.size()); |
| 373 | + showDisambiguation(terms, out); |
| 374 | + } |
348 | 375 | else if (cmd.equals("ls") || cmd.equals("list")) { |
349 | 376 | listConcepts(out); |
350 | 377 | } |
— | — | @@ -369,6 +396,21 @@ |
370 | 397 | return conceptStore.getProximityStore(); |
371 | 398 | } |
372 | 399 | |
| 400 | + protected FeatureStore<LocalConcept, Integer> getFeatureStore() throws PersistenceException { |
| 401 | + return conceptStore.getFeatureStore(); |
| 402 | + } |
| 403 | + |
| 404 | + protected Disambiguator getDisambiguator() throws PersistenceException { |
| 405 | + if (disambiguator==null) disambiguator = |
| 406 | + new SlidingCoherenceDisambiguator<Integer>( |
| 407 | + new StoredMeaningFetcher(getLocalConceptStore()), |
| 408 | + new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore()), |
| 409 | + true |
| 410 | + ); |
| 411 | + |
| 412 | + return disambiguator; |
| 413 | + } |
| 414 | + |
373 | 415 | public void dumpStats() throws PersistenceException { |
374 | 416 | Map<String, ? extends Number> m = ((WikiWordConceptStore)conceptStore).getStatisticsStore().getStatistics(); |
375 | 417 | |
— | — | @@ -387,8 +429,8 @@ |
388 | 430 | } |
389 | 431 | |
390 | 432 | public void listMeaningsLocal(String term, ConsoleOutput out) throws PersistenceException { |
391 | | - DataSet<LocalConcept> meanings = getLocalConceptStore().getMeanings(term); |
392 | | - out.writeConcepts(meanings); |
| 433 | + DataSet<LocalConceptReference> meanings = getLocalConceptStore().listMeanings(term); |
| 434 | + out.writeConceptReferences(meanings); |
393 | 435 | } |
394 | 436 | |
395 | 437 | public void listMeaningsGlobal(String lang, String term, ConsoleOutput out) throws PersistenceException { |
— | — | @@ -442,6 +484,11 @@ |
443 | 485 | out.writeFeatureVector(conceptFeatures.getFeatureVector()); |
444 | 486 | } |
445 | 487 | |
| 488 | + public void showDisambiguation(List<String> terms, ConsoleOutput out) throws PersistenceException { |
| 489 | + Disambiguator.Result r = getDisambiguator().disambiguate(terms); |
| 490 | + out.writeInterpretation(r.getMeanings()); |
| 491 | + } |
| 492 | + |
446 | 493 | public static void main(String[] argv) throws Exception { |
447 | 494 | QueryConsole q = new QueryConsole(); |
448 | 495 | q.launch(argv); |