Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -7,6 +7,8 @@ |
8 | 8 | import java.util.List; |
9 | 9 | import java.util.Map; |
10 | 10 | |
| 11 | +import de.brightbyte.data.Functor2; |
| 12 | +import de.brightbyte.data.Functors; |
11 | 13 | import de.brightbyte.data.LabeledMatrix; |
12 | 14 | import de.brightbyte.data.LabeledVector; |
13 | 15 | import de.brightbyte.data.MapLabeledMatrix; |
— | — | @@ -23,20 +25,45 @@ |
24 | 26 | public class SlidingCoherenceDisambiguator extends CoherenceDisambiguator { |
25 | 27 | |
26 | 28 | protected int window ; |
27 | | - protected boolean runningStart = false; |
28 | 29 | |
29 | 30 | public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) { |
30 | | - this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, |
| 31 | + this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, Functors.Double.product2, |
31 | 32 | featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance(), //if pre-normalized, use the scalar product to compute the cosine |
32 | 33 | 5); |
33 | 34 | } |
34 | 35 | |
35 | | - public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<Integer>> sim, int window) { |
36 | | - super(meaningFetcher, featureFetcher, popularityMeasure, sim); |
| 36 | + public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Functor2<? extends Number, Number, Number> weightCombiner, Similarity<LabeledVector<Integer>> sim, int window) { |
| 37 | + super(meaningFetcher, featureFetcher, popularityMeasure, weightCombiner, sim); |
37 | 38 | |
38 | 39 | this.window = window; |
39 | 40 | } |
40 | 41 | |
| 42 | + public <X extends TermReference>Result<X, LocalConcept> evalStep(List<X> baseSequence, Map<X, LocalConcept> interpretation, PhraseNode<X> node, |
| 43 | + Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context, |
| 44 | + LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
| 45 | + X term = node.getTermReference(); |
| 46 | + |
| 47 | + List<X> sequence = new ArrayList<X>(baseSequence); |
| 48 | + sequence.add(term); |
| 49 | + |
| 50 | + int to = sequence.size(); |
| 51 | + int from = to - window; |
| 52 | + if (from<0) from = 0; |
| 53 | + |
| 54 | + List<X> frame = sequence.subList(from, to); |
| 55 | + |
| 56 | + Result<X, LocalConcept> r ; |
| 57 | + |
| 58 | + if (to-from < 2) { |
| 59 | + r = popularityDisambiguator.disambiguate(node, frame, meanings, context); |
| 60 | + } else { |
| 61 | + List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, frame, interpretation, meanings); |
| 62 | + r = getBestInterpretation(node, frame, meanings, context, interpretations, similarities, features); |
| 63 | + } |
| 64 | + |
| 65 | + return r; |
| 66 | + } |
| 67 | + |
41 | 68 | /* (non-Javadoc) |
42 | 69 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
43 | 70 | */ |
— | — | @@ -60,41 +87,35 @@ |
61 | 88 | } |
62 | 89 | |
63 | 90 | //CAVEAT: because the map disambig can hold only one meaning per term, the same term cannot occur with two different meanings within the same term sequence. |
64 | | - |
65 | | - Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(meanings.size()); |
66 | 91 | |
67 | 92 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
68 | 93 | FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context); |
| 94 | + |
| 95 | + Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(meanings.size()); |
| 96 | + PhraseNode<X> currentNode = root; |
| 97 | + List<X> sequence = new ArrayList<X>(); |
69 | 98 | |
70 | | - int start = runningStart ? 0 : window; |
71 | | - |
72 | | - for (int i= start; ; i++) { |
73 | | - int from = i-window; |
74 | | - int to = i+1; |
| 99 | + while (true) { |
| 100 | + List<? extends PhraseNode<X>> successors = currentNode.getSuccessors(); |
| 101 | + if (successors==null || successors.isEmpty()) break; |
75 | 102 | |
76 | | - if (from<0) from = 0; |
77 | | - if (to>terms.size()) to = terms.size(); |
| 103 | + Result<X, LocalConcept> best = null; |
| 104 | + PhraseNode<X> bestNode = null; |
78 | 105 | |
79 | | - Result r ; |
80 | | - |
81 | | - if (to-from < 2) { |
82 | | - r = popularityDisambiguator.disambiguate(root..., terms.subList(from, to), meanings, context); |
83 | | - } else { |
84 | | - List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings); |
85 | | - r = getBestInterpretation(terms, meanings, context, interpretations, similarities, features); |
| 106 | + for (PhraseNode<X> n: successors) { |
| 107 | + Result<X, LocalConcept> r = evalStep(sequence, disambig, currentNode, meanings, context, similarities, features); |
| 108 | + if (best == null || best.getScore() < r.getScore()) { |
| 109 | + best = r; |
| 110 | + bestNode = n; |
| 111 | + } |
86 | 112 | } |
87 | | - |
88 | | - for (int j=from; j<to; j++) { |
89 | | - X t = terms.get(j); |
90 | | - if (disambig.containsKey(t)) continue; |
91 | | - |
92 | | - LocalConcept m; |
93 | | - |
94 | | - m = (LocalConcept)r.getMeanings().get(t); //UGLY cast |
95 | | - if (m!=null) disambig.put(t, m); |
96 | | - } |
97 | 113 | |
98 | | - if (to+1>terms.size()) break; |
| 114 | + X term = bestNode.getTermReference(); |
| 115 | + currentNode = bestNode; |
| 116 | + sequence.add(term); |
| 117 | + |
| 118 | + LocalConcept meaning = best.getMeanings().get(term); |
| 119 | + disambig.put(term, meaning); |
99 | 120 | } |
100 | 121 | |
101 | 122 | return getScore(disambig, context, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates. |
— | — | @@ -128,12 +149,4 @@ |
129 | 150 | return getSequenceInterpretations(terms.subList(from, to), mset); |
130 | 151 | } |
131 | 152 | |
132 | | - public boolean getRunningStart() { |
133 | | - return runningStart; |
134 | | - } |
135 | | - |
136 | | - public void setRunningStart(boolean runningStart) { |
137 | | - this.runningStart = runningStart; |
138 | | - } |
139 | | - |
140 | 153 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -6,6 +6,8 @@ |
7 | 7 | import java.util.List; |
8 | 8 | import java.util.Map; |
9 | 9 | |
| 10 | +import de.brightbyte.data.Functor2; |
| 11 | +import de.brightbyte.data.Functors; |
10 | 12 | import de.brightbyte.data.measure.Measure; |
11 | 13 | import de.brightbyte.data.measure.Measure.Comparator; |
12 | 14 | import de.brightbyte.wikiword.model.LocalConcept; |
— | — | @@ -16,16 +18,18 @@ |
17 | 19 | public class PopularityDisambiguator extends AbstractDisambiguator<TermReference, LocalConcept> { |
18 | 20 | |
19 | 21 | protected Measure<WikiWordConcept> popularityMeasure; |
| 22 | + protected Functor2<? extends Number, Number, Number> weigthCombiner; |
20 | 23 | protected Comparator<LocalConcept> popularityComparator; |
21 | 24 | |
22 | 25 | public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher) { |
23 | | - this(meaningFetcher, WikiWordConcept.theCardinality); |
| 26 | + this(meaningFetcher, WikiWordConcept.theCardinality, Functors.Double.product2); |
24 | 27 | } |
25 | 28 | |
26 | | - public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, Measure<WikiWordConcept> popularityMeasure) { |
| 29 | + public PopularityDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, Measure<WikiWordConcept> popularityMeasure, Functor2<? extends Number, Number, Number> weightCombiner) { |
27 | 30 | super(meaningFetcher); |
28 | 31 | |
29 | 32 | this.setPopularityMeasure(popularityMeasure); |
| 33 | + this.setWeightCombiner(weightCombiner); |
30 | 34 | } |
31 | 35 | |
32 | 36 | public Measure<WikiWordConcept> getPopularityMeasure() { |
— | — | @@ -37,11 +41,17 @@ |
38 | 42 | this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true); |
39 | 43 | } |
40 | 44 | |
| 45 | + public void setWeightCombiner(Functor2<? extends Number, Number, Number> weightCombiner) { |
| 46 | + this.weigthCombiner = weightCombiner; |
| 47 | + } |
| 48 | + |
41 | 49 | public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Collection<X> terms, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) { |
42 | 50 | if (terms.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), 0.0, "no terms or meanings"); |
43 | 51 | |
44 | 52 | Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(); |
45 | | - int pop = 0; |
| 53 | + double score = 0; |
| 54 | + int totalPop = 0; |
| 55 | + |
46 | 56 | for (X t: terms) { |
47 | 57 | List<? extends LocalConcept> m = meanings.get(t); |
48 | 58 | if (m==null || m.size()==0) continue; |
— | — | @@ -51,12 +61,16 @@ |
52 | 62 | LocalConcept c = m.get(0); |
53 | 63 | disambig.put(t, c); |
54 | 64 | |
55 | | - pop += Math.log(c.getCardinality()); |
| 65 | + double pop = popularityMeasure.measure(c); |
| 66 | + totalPop += pop; |
| 67 | + |
| 68 | + Number sc = weigthCombiner.apply(pop, t.getWeight()); |
| 69 | + score += sc.doubleValue(); |
56 | 70 | } |
57 | 71 | |
58 | | - if (disambig.size()>0) pop = pop / disambig.size(); |
| 72 | + if (disambig.size()>0) score = score / disambig.size(); |
59 | 73 | |
60 | | - Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, pop, "pop="+pop); |
| 74 | + Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, score, "score="+score+"; pop="+totalPop); |
61 | 75 | return r; |
62 | 76 | } |
63 | 77 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | |
14 | 14 | import de.brightbyte.data.Functor; |
15 | 15 | import de.brightbyte.data.Functor2; |
| 16 | +import de.brightbyte.data.Functors; |
16 | 17 | import de.brightbyte.data.LabeledMatrix; |
17 | 18 | import de.brightbyte.data.LabeledVector; |
18 | 19 | import de.brightbyte.data.MapLabeledMatrix; |
— | — | @@ -38,6 +39,7 @@ |
39 | 40 | |
40 | 41 | protected Similarity<LabeledVector<Integer>> similarityMeasure; |
41 | 42 | protected Measure<WikiWordConcept> popularityMeasure; |
| 43 | + protected Functor2<? extends Number, Number, Number> weightCombiner; |
42 | 44 | protected PopularityDisambiguator popularityDisambiguator; |
43 | 45 | protected Comparator<LocalConcept> popularityComparator; |
44 | 46 | |
— | — | @@ -65,11 +67,11 @@ |
66 | 68 | }; |
67 | 69 | |
68 | 70 | public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) { |
69 | | - this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, |
| 71 | + this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, Functors.Double.product2, |
70 | 72 | featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance()); //if pre-normalized, use the scalar product to compute the cosine |
71 | 73 | } |
72 | 74 | |
73 | | - public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<Integer>> sim) { |
| 75 | + public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Functor2<? extends Number, Number, Number> weightCombiner, Similarity<LabeledVector<Integer>> sim) { |
74 | 76 | super(meaningFetcher); |
75 | 77 | |
76 | 78 | if (popularityMeasure==null) throw new NullPointerException(); |
— | — | @@ -77,9 +79,10 @@ |
78 | 80 | if (featureFetcher==null) throw new NullPointerException(); |
79 | 81 | |
80 | 82 | this.featureCacheManager = new FeatureCache.Manager<LocalConcept, Integer>(featureFetcher, 10); //TODO: depth |
81 | | - this.popularityDisambiguator = new PopularityDisambiguator(meaningFetcher, popularityMeasure); |
| 83 | + this.popularityDisambiguator = new PopularityDisambiguator(meaningFetcher, popularityMeasure, weightCombiner); |
82 | 84 | |
83 | 85 | this.setPopularityMeasure(popularityMeasure); |
| 86 | + this.setWeightCombiner(weightCombiner); |
84 | 87 | this.setSimilarityMeasure(sim); |
85 | 88 | } |
86 | 89 | |
— | — | @@ -101,6 +104,11 @@ |
102 | 105 | this.popularityComparator = new Measure.Comparator<LocalConcept>(popularityMeasure, true); |
103 | 106 | } |
104 | 107 | |
| 108 | + public void setWeightCombiner(Functor2<? extends Number, Number, Number> weightCombiner) { |
| 109 | + this.weightCombiner = weightCombiner; |
| 110 | + this.popularityDisambiguator.setWeightCombiner(weightCombiner); |
| 111 | + } |
| 112 | + |
105 | 113 | public Functor2.Double getScoreCombiner() { |
106 | 114 | return scoreCombiner; |
107 | 115 | } |
— | — | @@ -306,23 +314,31 @@ |
307 | 315 | } |
308 | 316 | |
309 | 317 | protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, Collection<LocalConcept> context, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
310 | | - double sim = 0; |
311 | | - double pop = 0; |
312 | | - |
313 | | - Collection<LocalConcept> concepts; |
| 318 | + Map<? extends TermReference, LocalConcept> concepts; |
314 | 319 | if (context!=null) { |
315 | | - concepts = new ArrayList<LocalConcept>(); |
316 | | - concepts.addAll(interp.values()); |
317 | | - concepts.addAll(context); |
| 320 | + concepts = new HashMap<TermReference, LocalConcept>(); |
| 321 | + |
| 322 | + for (Map.Entry<X, LocalConcept> e: interp.entrySet()) { |
| 323 | + ((HashMap<TermReference, LocalConcept>)concepts).put(e.getKey(), e.getValue()); |
| 324 | + } |
| 325 | + |
| 326 | + for (LocalConcept c: context) { |
| 327 | + ((HashMap<TermReference, LocalConcept>)concepts).put(new Term("", 1), c); |
| 328 | + } |
318 | 329 | } else { |
319 | | - concepts = interp.values(); |
| 330 | + concepts = interp; |
320 | 331 | } |
321 | 332 | |
| 333 | + double sim = 0, pop = 0, weight = 0; |
322 | 334 | int i=0, j=0, n=0, c=0; |
323 | | - for (LocalConcept a: concepts) { |
| 335 | + for (Map.Entry<? extends TermReference, LocalConcept> ea: concepts.entrySet()) { |
| 336 | + LocalConcept a = ea.getValue(); |
| 337 | + TermReference term = ea.getKey(); |
| 338 | + |
324 | 339 | i++; |
325 | 340 | j=0; |
326 | | - for (LocalConcept b: concepts) { |
| 341 | + for (Map.Entry<? extends TermReference, LocalConcept> eb: concepts.entrySet()) { |
| 342 | + LocalConcept b = eb.getValue(); |
327 | 343 | j++; |
328 | 344 | if (i==j) break; |
329 | 345 | |
— | — | @@ -357,21 +373,27 @@ |
358 | 374 | } |
359 | 375 | |
360 | 376 | double p = popularityMeasure.measure(a); |
| 377 | + double w = term.getWeight(); |
361 | 378 | if (p<1) p= 1; |
362 | | - pop += p; |
| 379 | + if (w<1) w= 1; |
| 380 | + |
| 381 | + pop += weightCombiner.apply(p, w).doubleValue(); |
| 382 | + |
| 383 | + weight += w; |
363 | 384 | c ++; |
364 | 385 | } |
365 | 386 | |
366 | 387 | //normalize |
367 | 388 | sim = sim / n; //normalize |
368 | 389 | pop = pop / c; //normalize |
| 390 | + weight = weight / c; //normalize |
369 | 391 | |
370 | 392 | double popf = popularityFactor.apply(pop); |
371 | 393 | double simf = similarityFactor.apply(sim); |
372 | 394 | |
373 | 395 | double score = scoreCombiner.apply(popf, simf); |
374 | 396 | |
375 | | - return new Result<X, LocalConcept>(interp, score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop); |
| 397 | + return new Result<X, LocalConcept>(interp, score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop+", weight="+weight); |
376 | 398 | } |
377 | 399 | |
378 | 400 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Term.java |
— | — | @@ -9,16 +9,25 @@ |
10 | 10 | public class Term implements TermReference { |
11 | 11 | |
12 | 12 | private final String term; |
13 | | - |
| 13 | + private final double weight; |
| 14 | + |
14 | 15 | public Term(final String term) { |
15 | | - super(); |
| 16 | + this(term, 1); |
| 17 | + } |
| 18 | + |
| 19 | + public Term(final String term, final double weight) { |
16 | 20 | this.term = term; |
| 21 | + this.weight = weight; |
17 | 22 | } |
18 | 23 | |
19 | 24 | public String getTerm() { |
20 | 25 | return term; |
21 | 26 | } |
22 | 27 | |
| 28 | + public double getWeight() { |
| 29 | + return weight; |
| 30 | + } |
| 31 | + |
23 | 32 | public String toString() { |
24 | 33 | return getTerm(); |
25 | 34 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java |
— | — | @@ -1,7 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.model; |
3 | 3 | |
4 | 4 | import java.io.Serializable; |
5 | | -import java.util.List; |
6 | 5 | |
7 | 6 | public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance>, TermReference { |
8 | 7 | |
— | — | @@ -45,7 +44,7 @@ |
46 | 45 | return getPhrase(); |
47 | 46 | } |
48 | 47 | |
49 | | - public int getWeight() { |
| 48 | + public double getWeight() { |
50 | 49 | return weight; |
51 | 50 | } |
52 | 51 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermMeaning.java |
— | — | @@ -27,6 +27,10 @@ |
28 | 28 | public double getScore() { |
29 | 29 | return score; |
30 | 30 | } |
| 31 | + |
| 32 | + public double getWeight() { |
| 33 | + return getScore(); |
| 34 | + } |
31 | 35 | |
32 | 36 | public String getTerm() { |
33 | 37 | return term; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermReference.java |
— | — | @@ -3,5 +3,5 @@ |
4 | 4 | public interface TermReference { |
5 | 5 | |
6 | 6 | public String getTerm(); |
7 | | - |
| 7 | + public double getWeight(); |
8 | 8 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java |
— | — | @@ -19,16 +19,23 @@ |
20 | 20 | import de.brightbyte.util.PersistenceException; |
21 | 21 | import de.brightbyte.wikiword.ConsoleApp; |
22 | 22 | import de.brightbyte.wikiword.Corpus; |
| 23 | +import de.brightbyte.wikiword.TweakSet; |
23 | 24 | import de.brightbyte.wikiword.disambig.Disambiguator; |
| 25 | +import de.brightbyte.wikiword.disambig.PhraseExtractor; |
24 | 26 | import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator; |
25 | 27 | import de.brightbyte.wikiword.disambig.StoredFeatureFetcher; |
26 | 28 | import de.brightbyte.wikiword.disambig.StoredMeaningFetcher; |
| 29 | +import de.brightbyte.wikiword.disambig.Term; |
27 | 30 | import de.brightbyte.wikiword.model.AbstractConceptOutput; |
28 | 31 | import de.brightbyte.wikiword.model.ConceptFeatures; |
29 | 32 | import de.brightbyte.wikiword.model.ConceptOutput; |
30 | 33 | import de.brightbyte.wikiword.model.ConceptRelations; |
31 | 34 | import de.brightbyte.wikiword.model.GlobalConcept; |
32 | 35 | import de.brightbyte.wikiword.model.LocalConcept; |
| 36 | +import de.brightbyte.wikiword.model.PhraseNode; |
| 37 | +import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
| 38 | +import de.brightbyte.wikiword.model.TermListNode; |
| 39 | +import de.brightbyte.wikiword.model.TermReference; |
33 | 40 | import de.brightbyte.wikiword.model.WikiWordConcept; |
34 | 41 | import de.brightbyte.wikiword.rdf.RdfOutput; |
35 | 42 | import de.brightbyte.wikiword.store.DatabaseConceptStores; |
— | — | @@ -45,6 +52,7 @@ |
46 | 53 | protected ConceptQuerySpec minimalConceptSpec; |
47 | 54 | protected ConceptQuerySpec resolvedConceptSpec; |
48 | 55 | protected ConceptQuerySpec detailedConceptSpec; |
| 56 | + private PhraseExtractor phraseExtractor; |
49 | 57 | |
50 | 58 | public QueryConsole() { |
51 | 59 | super(true, true); |
— | — | @@ -80,6 +88,9 @@ |
81 | 89 | throw new PersistenceException(e); |
82 | 90 | } |
83 | 91 | } |
| 92 | + |
| 93 | + //FIXME: make the line below possible... |
| 94 | + //phraseExtractor = PlaintextAnalyzer.getPlaintextAnalyzer(getCorpus(), getTweaks()); |
84 | 95 | } |
85 | 96 | |
86 | 97 | public ConceptOutput getOutput() { |
— | — | @@ -375,12 +386,8 @@ |
376 | 387 | showFeatureVector(id, out); |
377 | 388 | } |
378 | 389 | else if (cmd.equals("d") || cmd.equals("dis") || cmd.equals("disambig") || cmd.equals("disambiguate")) { |
379 | | - List<String> terms = new ArrayList<String>(params.size()-1); |
380 | | - for (Object t: params.subList(1,params.size())) { |
381 | | - terms.add(t.toString()); |
382 | | - } |
383 | | - |
384 | | - showDisambiguation(terms, out); |
| 390 | + PhraseNode<? extends TermReference> root = getPhrases(params.get(1).toString()); |
| 391 | + showDisambiguation(root, out); |
385 | 392 | } |
386 | 393 | else if (cmd.equals("ls") || cmd.equals("list")) { |
387 | 394 | listConcepts(out); |
— | — | @@ -391,6 +398,21 @@ |
392 | 399 | } |
393 | 400 | } |
394 | 401 | |
| 402 | + protected PhraseNode<? extends TermReference> getPhrases(String s) { |
| 403 | + if (s.indexOf('|')>0) { |
| 404 | + String[] ss = s.split("\\s\\|\\s"); |
| 405 | + List<Term> terms = new ArrayList<Term>(ss.length); |
| 406 | + for (String t: ss) { |
| 407 | + terms.add(new Term(t)); |
| 408 | + } |
| 409 | + |
| 410 | + return new TermListNode<Term>(terms, 0); |
| 411 | + } else { |
| 412 | + PhraseOccuranceSet occurances = phraseExtractor.extractPhrases(s, 6); |
| 413 | + return occurances.getRootNode(); |
| 414 | + } |
| 415 | + } |
| 416 | + |
395 | 417 | public boolean isGlobalThesaurus() { |
396 | 418 | return !isDatasetLocal(); |
397 | 419 | } |
— | — | @@ -499,8 +521,8 @@ |
500 | 522 | out.writeFeatureVector(conceptFeatures.getFeatureVector()); |
501 | 523 | } |
502 | 524 | |
503 | | - public void showDisambiguation(List<String> terms, ConsoleOutput out) throws PersistenceException { |
504 | | - Disambiguator.Result r = getDisambiguator().disambiguate(terms, null); |
| 525 | + public void showDisambiguation(PhraseNode<? extends TermReference> root, ConsoleOutput out) throws PersistenceException { |
| 526 | + Disambiguator.Result r = getDisambiguator().disambiguate(root, null); |
505 | 527 | out.writeInterpretation(r.getMeanings()); |
506 | 528 | } |
507 | 529 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java |
— | — | @@ -20,10 +20,11 @@ |
21 | 21 | import de.brightbyte.io.ConsoleIO; |
22 | 22 | import de.brightbyte.wikiword.Corpus; |
23 | 23 | import de.brightbyte.wikiword.TweakSet; |
| 24 | +import de.brightbyte.wikiword.disambig.PhraseExtractor; |
24 | 25 | import de.brightbyte.wikiword.model.PhraseOccurance; |
25 | 26 | import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
26 | 27 | |
27 | | -public class PlainTextAnalyzer extends AbstractAnalyzer { |
| 28 | +public class PlainTextAnalyzer extends AbstractAnalyzer implements PhraseExtractor { |
28 | 29 | private LanguageConfiguration config; |
29 | 30 | |
30 | 31 | private Matcher sentenceMatcher; |