Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
@@ -3,13 +3,15 @@
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
+import java.util.ListIterator;
 import java.util.Map;

+import de.brightbyte.data.Functor2;
 import de.brightbyte.data.cursor.DataSet;
 import de.brightbyte.io.Output;
 import de.brightbyte.util.PersistenceException;
+import de.brightbyte.wikiword.model.TermReference;
 import de.brightbyte.wikiword.model.WikiWordConcept;
-import de.brightbyte.wikiword.model.TermReference;
 import de.brightbyte.wikiword.store.WikiWordConceptStore;
 import de.brightbyte.wikiword.store.WikiWordConceptStore.ConceptQuerySpec;

@@ -17,6 +19,7 @@
     protected WikiWordConceptStore store;
     protected ConceptQuerySpec spec;
     protected Output trace;
+    protected Functor2<WikiWordConcept, WikiWordConcept, String> meaningMangler;

     public StoredMeaningFetcher(WikiWordConceptStore store) {
         this(store, null);
@@ -29,10 +32,29 @@
         this.spec = type;
     }

+    public Functor2<WikiWordConcept, WikiWordConcept, String> getMeaningMangler() {
+        return meaningMangler;
+    }
+
+    public void setMeaningMangler(Functor2<WikiWordConcept, WikiWordConcept, String> meaningMangler) {
+        this.meaningMangler = meaningMangler;
+    }
+
     public List<WikiWordConcept> getMeanings(String term) throws PersistenceException {
-        DataSet<WikiWordConcept> m = store.getMeanings(term, spec); //FIXME: filter/cut-off rules, sort order! //XXX: relevance value?
+        DataSet<WikiWordConcept> m = store.getMeanings(term, spec);
         List<WikiWordConcept> meanigns = m.load();

+        if ( meaningMangler != null ) {
+            ListIterator<WikiWordConcept> it = meanigns.listIterator();
+            while (it.hasNext()) {
+                WikiWordConcept c = it.next();
+                WikiWordConcept c2 = meaningMangler.apply(c, term);
+
+                if ( c2 == null ) it.remove();
+                else if ( c != c2 ) it.set(c2);
+            }
+        }
+
         trace("fetched "+meanigns.size()+" meanings for \""+term+"\"");
         return meanigns;
     }
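
The new meaningMangler hook lets a caller veto or rewrite each concept on its way out of the store: returning null drops the candidate meaning, returning a different object replaces it in the result list. Below is a minimal sketch of such a mangler. The filter rule and the helper class are purely illustrative, getName() is assumed to be the usual getter for the name property, and Functor2 is assumed to declare a single apply(A, B) method whose result type is its first type parameter, which is what the loop in getMeanings() implies.

    import de.brightbyte.data.Functor2;
    import de.brightbyte.wikiword.model.WikiWordConcept;

    // Illustrative mangler: drop "List of ..." articles from the candidate meanings
    // and keep everything else unchanged. Returning null removes the entry; a
    // different instance would replace it in place.
    public class MeaningFilters {
        public static Functor2<WikiWordConcept, WikiWordConcept, String> dropListArticles() {
            return new Functor2<WikiWordConcept, WikiWordConcept, String>() {
                public WikiWordConcept apply(WikiWordConcept concept, String term) {
                    String name = concept.getName();
                    if (name != null && name.startsWith("List of ")) return null; // drop this meaning
                    return concept; // keep as-is
                }
            };
        }
    }

A fetcher would pick this up with fetcher.setMeaningMangler(MeaningFilters.dropListArticles()); leaving the mangler unset keeps the previous behaviour.
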
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -248,7 +248,7 @@
             return getScore(r.getInterpretation(), context, similarities, features);
         }

-        Collection<List<X>> sequences = getSequences(root, Integer.MAX_VALUE);
+        Collection<List<X>> sequences = getSequences(root, getPhraseSearchDepth());
         return disambiguate(sequences, root, meanings, context);
     }

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -21,6 +21,8 @@

     private Map<String, C> meaningOverrides;

+    private int phraseSearchDepth = 8; //FIXME: magic...
+
     public AbstractDisambiguator(MeaningFetcher<? extends C> meaningFetcher, int cacheCapacity) {
         if (meaningFetcher==null) throw new NullPointerException();

@@ -28,6 +30,14 @@
         this.meaningFetcher = meaningFetcher;
     }

+    public int getPhraseSearchDepth() {
+        return phraseSearchDepth;
+    }
+
+    public void setPhraseSearchDepth(int phraseSearchDepth) {
+        this.phraseSearchDepth = phraseSearchDepth;
+    }
+
     public MeaningFetcher<? extends C> getMeaningFetcher() {
         return meaningFetcher;
     }
@@ -86,7 +96,7 @@


     protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException {
-        Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
+        Collection<X> terms = getTerms(root, phraseSearchDepth);
         return getMeanings(terms);
     }

@@ -117,7 +127,7 @@
     }

     public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X> root, Collection<? extends C> context) throws PersistenceException {
-        Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
+        Collection<X> terms = getTerms(root, phraseSearchDepth);
         Map<X, List<? extends C>> meanings = getMeanings(terms);
         return disambiguate(root, meanings, context);
     }
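
The new phraseSearchDepth property (default 8, flagged above as a magic number) replaces the hard-wired Integer.MAX_VALUE bound that getTerms() and getSequences() used to receive; the CoherenceDisambiguator change above and the PopularityDisambiguator change below both read it via getPhraseSearchDepth(). A minimal tuning sketch follows; the helper class is illustrative only, and the raw AbstractDisambiguator type is used just to stay agnostic about its type parameters.

    import de.brightbyte.wikiword.disambig.AbstractDisambiguator;

    // Illustrative helper: bound how deep a disambiguator walks the phrase tree.
    // Applies to any subclass (CoherenceDisambiguator, PopularityDisambiguator, ...).
    public class DisambiguatorTuning {
        public static void limitPhraseSearch(AbstractDisambiguator disambiguator, int depth) {
            if (depth < 1) throw new IllegalArgumentException("depth must be positive: " + depth);
            disambiguator.setPhraseSearchDepth(depth); // previously effectively unbounded
        }
    }
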
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
@@ -4,6 +4,6 @@

 public interface PhraseExtractor {

-    public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight);
+    public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight, int maxDepth);

 }
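
Every PhraseExtractor implementation now has to accept and honour the extra maxDepth bound. A sketch of a delegating implementor, assuming a fully configured PlainTextAnalyzer (which already offers the matching three-argument extractPhrases(), see further down) is available to wrap:

    import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer;
    import de.brightbyte.wikiword.disambig.PhraseExtractor;
    import de.brightbyte.wikiword.model.PhraseOccuranceSet;

    // Illustrative adapter: forward both limits to the analyzer unchanged.
    public class AnalyzerPhraseExtractor implements PhraseExtractor {
        private final PlainTextAnalyzer analyzer;

        public AnalyzerPhraseExtractor(PlainTextAnalyzer analyzer) {
            this.analyzer = analyzer;
        }

        public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight, int maxDepth) {
            return analyzer.extractPhrases(s, maxWeight, maxDepth);
        }
    }
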
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -62,7 +62,7 @@
     }

     public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) {
-        Collection<List<X>> sequences = getSequences(root, Integer.MAX_VALUE);
+        Collection<List<X>> sequences = getSequences(root, getPhraseSearchDepth());
         return disambiguate(sequences, root, meanings, context);
     }

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
@@ -66,7 +66,6 @@
     }

     public void setName(String name) {
-        if (this.name!=null) throw new IllegalStateException("property already initialized");
         this.name = name;
     }

@@ -91,7 +90,6 @@
     }

     public void setFeatures(ConceptFeatures<? extends WikiWordConcept, Integer> features) {
-        if (this.features!=null) throw new IllegalStateException("property already initialized");
         if (features.getConcept()!=null && !this.equals(features.getConcept())) throw new IllegalArgumentException("ConceptFeatures bound to a different concept: "+features.getConcept());
         this.features = features;
     }
@@ -101,7 +99,6 @@
     }

     public void setProperties(ConceptProperties<? extends WikiWordConcept> properties) {
-        if (this.properties!=null) throw new IllegalStateException("property already initialized");
         if (properties.getConcept()!=null && !this.equals(properties.getConcept())) throw new IllegalArgumentException("ConceptFeatures bound to a different concept: "+features.getConcept());
         this.properties = properties;
     }
@@ -111,7 +108,6 @@
     }

     public void setResources(ConceptResources<? extends WikiWordConcept> resources) {
-        if (this.resources!=null) throw new IllegalStateException("property already initialized");
         this.resources = resources;
     }

@@ -120,7 +116,6 @@
     }

     public void setRelations(ConceptRelations<? extends WikiWordConcept> relations) {
-        if (this.relations!=null) throw new IllegalStateException("property already initialized");
         this.relations = relations;
     }

@@ -137,7 +132,6 @@
     }

     public void setTerms(TermReference[] terms) {
-        if (this.terms!=null) throw new IllegalStateException("property already initialized");
         this.terms = terms;
     }

@@ -146,7 +140,6 @@
     }

     public void setType(ConceptType type) {
-        if (this.type!=null && !this.type.equals(ConceptType.UNKNOWN)) throw new IllegalStateException("property already initialized");
         this.type = type;
     }

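All of the write-once guards are removed, so these setters now follow last-write-wins semantics instead of throwing IllegalStateException on a second call. That is what lets a meaning mangler (see StoredMeaningFetcher above) adjust concepts the store has already populated, at the cost of the old protection against accidental double initialization. A tiny sketch of the relaxed contract; the helper and the idea of a "canonical" name are illustrative only.

    import de.brightbyte.wikiword.model.WikiWordConcept;

    // Illustrative helper: overwrite the name of an already-populated concept.
    // Before this change, the second assignment of any property would have thrown
    // IllegalStateException("property already initialized").
    public class ConceptPatcher {
        public static void rename(WikiWordConcept concept, String canonicalName) {
            concept.setName(canonicalName); // repeated calls are now permitted
        }
    }
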
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
@@ -354,7 +354,7 @@
         }
     }

-    public void buildAggregatePhrases( int start, double minWeight, double maxWeight, Matcher phraseBreak ) {
+    public void buildAggregatePhrases( int start, double minWeight, double maxWeight, int maxDepth, Matcher phraseBreak ) {
         AggregatePhraseBuilder builder = new AggregatePhraseBuilder( minWeight, maxWeight, phraseBreak );

         if (isEmpty()) return;
@@ -363,7 +363,7 @@

         for (int i=start; i<end; i++) {
             if (hasPhrasesAt(i)) {
-                builder.walk(getRootNodeAt(i), 0, null, Integer.MAX_VALUE, maxWeight);
+                builder.walk(getRootNodeAt(i), 0, null, maxDepth, maxWeight);
             }
         }

Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
@@ -73,29 +73,29 @@
     }

     public void testExtractPhrases() {
-        PhraseOccuranceSet phrases = extractPhrases("", 3);
+        PhraseOccuranceSet phrases = extractPhrases("", 3, 3);
         assertEquals(0, phrases.size());
         assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));

-        phrases = extractPhrases("foo", 3);
+        phrases = extractPhrases("foo", 3, 3);
         assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(0)));

-        phrases = extractPhrases(" foo ", 3);
+        phrases = extractPhrases(" foo ", 3, 3);
         assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));
         assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(1)));
         assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesFrom(0)));
     }

     public void testExtractPhrases2() {
-        PhraseOccuranceSet phrases = extractPhrases("red green blue yellow black", 3);
+        PhraseOccuranceSet phrases = extractPhrases("red green blue yellow black", 3, 6);
         assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
         assertEquals(theList( "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));

-        phrases = extractPhrases("red green blue yellow black", 5);
+        phrases = extractPhrases("red green blue yellow black", 5, 10);
         assertEquals(theList( "red green blue yellow black", "red green blue yellow", "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
         assertEquals(theList( "green blue yellow black", "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));

-        phrases = extractPhrases("and red and green and blue and yellow", 3);
+        phrases = extractPhrases("and red and green and blue and yellow", 3, 12);
         assertEquals(theList( "and red and green and blue",
                 "and red and green and",
                 "and red and green",
@@ -111,14 +111,14 @@
                 ),
                 getWordList(phrases.getPhrasesAt(4)));

-        phrases = extractPhrases("red green blue. yellow black", 5);
+        phrases = extractPhrases("red green blue. yellow black", 5, 10);
         assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
         assertEquals(theList( "blue" ), getWordList(phrases.getPhrasesAt(10)));
         assertEquals(theList( "yellow black", "yellow" ), getWordList(phrases.getPhrasesAt(16)));
     }

     public void testExtractPhrases3() {
-        PhraseOccuranceSet phrases = extractPhrases("Krababbel: l'Foo-Bar", 3);
+        PhraseOccuranceSet phrases = extractPhrases("Krababbel: l'Foo-Bar", 3, 6);
         assertEquals(theList( "Krababbel"), getWordList(phrases.getPhrasesAt(0)));

         assertEquals(theList( "l'Foo-Bar",
@@ -134,7 +134,7 @@
         assertEquals(theList( "Bar"),
                 getWordList(phrases.getPhrasesAt(17)));

-        phrases = extractPhrases("harald's 'schlaaand", 3);
+        phrases = extractPhrases("harald's 'schlaaand", 3, 3);
         assertEquals(theList( "harald's 'schlaaand",
                 "harald's",
                 "harald"
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
@@ -28,7 +28,7 @@
     public void runCommand(String cmd, List<Object> params, ConsoleOutput out) throws Exception {
         if (cmd.equals("phrases") || cmd.equals("p")) {
             Object s = params.get(1);
-            PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s.toString(), 5);
+            PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s.toString(), 5, 5);
             out.writeList(occurances);
             out.dumpPhraseTree(occurances.getRootNode());
         } else {
@@ -40,7 +40,7 @@
         if (s.indexOf('|')>0 || s.indexOf(';')>0 ) {
             return super.getPhrases(s);
         } else {
-            PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s, 5);
+            PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s, 5, 5);
             return occurances.getRootNode();
         }
     }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
@@ -271,7 +271,7 @@
         }
     } */

-    public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) {
+    public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight, int maxDepth) {
         PhraseOccuranceSet phrases = new PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());

         text = applyManglers(config.sentenceManglers, text);
@@ -285,7 +285,7 @@
             buildPhrases(s, ofs, phrases, maxWeight);
             if (phrases.isEmpty()) continue;

-            phrases.buildAggregatePhrases(ofs, 0, maxWeight, phraseBreakeMatcher);
+            phrases.buildAggregatePhrases(ofs, 0, maxWeight, maxDepth, phraseBreakeMatcher);
         }

         if (phrases.isEmpty()) return phrases;
@@ -373,7 +373,7 @@
         BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
         String s ;
         while ( (s = in.readLine()) != null ) {
-            PhraseOccuranceSet phrases = analyzer.extractPhrases(s, 6);
+            PhraseOccuranceSet phrases = analyzer.extractPhrases(s, 6, 6);
             DebugUtil.dump("", phrases, ConsoleIO.output);
         }
     }
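
For callers, extractPhrases() now takes the depth bound explicitly: maxWeight still caps (roughly) how many words an extracted phrase may span, while the new maxDepth caps how deep buildAggregatePhrases() walks the phrase tree, which used to be unbounded. A small caller-side sketch follows; how the analyzer instance is created and configured is assumed, and passing the same value for both limits simply mirrors the call sites above.

    import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer;
    import de.brightbyte.wikiword.model.PhraseOccuranceSet;

    // Illustrative probe: extract phrases with both limits set to the same value
    // and report how many phrase occurrences were found.
    public class PhraseProbe {
        public static int countPhrases(PlainTextAnalyzer analyzer, String text, int weightAndDepth) {
            PhraseOccuranceSet phrases = analyzer.extractPhrases(text, weightAndDepth, weightAndDepth);
            return phrases.size();
        }
    }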