r72990 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r72989 | r72990 | r72991 >
Date:16:41, 14 September 2010
Author:daniel
Status:deferred
Tags:
Comment:
Allow meanings to be mangled (replaced or dropped) on load; limit the recursion depth in the phrase detector.
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
@@ -3,13 +3,15 @@
44 import java.util.Collection;
55 import java.util.HashMap;
66 import java.util.List;
 7+import java.util.ListIterator;
78 import java.util.Map;
89
 10+import de.brightbyte.data.Functor2;
911 import de.brightbyte.data.cursor.DataSet;
1012 import de.brightbyte.io.Output;
1113 import de.brightbyte.util.PersistenceException;
 14+import de.brightbyte.wikiword.model.TermReference;
1215 import de.brightbyte.wikiword.model.WikiWordConcept;
13 -import de.brightbyte.wikiword.model.TermReference;
1416 import de.brightbyte.wikiword.store.WikiWordConceptStore;
1517 import de.brightbyte.wikiword.store.WikiWordConceptStore.ConceptQuerySpec;
1618
@@ -17,6 +19,7 @@
1820 protected WikiWordConceptStore store;
1921 protected ConceptQuerySpec spec;
2022 protected Output trace;
 23+ protected Functor2<WikiWordConcept, WikiWordConcept, String> meaningMangler;
2124
2225 public StoredMeaningFetcher(WikiWordConceptStore store) {
2326 this(store, null);
@@ -29,10 +32,29 @@
3033 this.spec = type;
3134 }
3235
 36+ public Functor2<WikiWordConcept, WikiWordConcept, String> getMeaningMangler() {
 37+ return meaningMangler;
 38+ }
 39+
 40+ public void setMeaningMangler(Functor2<WikiWordConcept, WikiWordConcept, String> meaningMangler) {
 41+ this.meaningMangler = meaningMangler;
 42+ }
 43+
3344 public List<WikiWordConcept> getMeanings(String term) throws PersistenceException {
34 - DataSet<WikiWordConcept> m = store.getMeanings(term, spec); //FIXME: filter/cut-off rules, sort order! //XXX: relevance value?
 45+ DataSet<WikiWordConcept> m = store.getMeanings(term, spec);
3546 List<WikiWordConcept> meanigns = m.load();
3647
 48+ if ( meaningMangler != null ) {
 49+ ListIterator<WikiWordConcept> it = meanigns.listIterator();
 50+ while (it.hasNext()) {
 51+ WikiWordConcept c = it.next();
 52+ WikiWordConcept c2 = meaningMangler.apply(c, term);
 53+
 54+ if ( c2 == null ) it.remove();
 55+ else if ( c != c2 ) it.set(c2);
 56+ }
 57+ }
 58+
3759 trace("fetched "+meanigns.size()+" meanings for \""+term+"\"");
3860 return meanigns;
3961 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -248,7 +248,7 @@
249249 return getScore(r.getInterpretation(), context, similarities, features);
250250 }
251251
252 - Collection<List<X>> sequences = getSequences(root, Integer.MAX_VALUE);
 252+ Collection<List<X>> sequences = getSequences(root, getPhraseSearchDepth());
253253 return disambiguate(sequences, root, meanings, context);
254254 }
255255
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -21,6 +21,8 @@
2222
2323 private Map<String, C> meaningOverrides;
2424
 25+ private int phraseSearchDepth = 8; //FIXME: magic...
 26+
2527 public AbstractDisambiguator(MeaningFetcher<? extends C> meaningFetcher, int cacheCapacity) {
2628 if (meaningFetcher==null) throw new NullPointerException();
2729
@@ -28,6 +30,14 @@
2931 this.meaningFetcher = meaningFetcher;
3032 }
3133
 34+ public int getPhraseSearchDepth() {
 35+ return phraseSearchDepth;
 36+ }
 37+
 38+ public void setPhraseSearchDepth(int phraseSearchDepth) {
 39+ this.phraseSearchDepth = phraseSearchDepth;
 40+ }
 41+
3242 public MeaningFetcher<? extends C> getMeaningFetcher() {
3343 return meaningFetcher;
3444 }
@@ -86,7 +96,7 @@
8797
8898
8999 protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException {
90 - Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
 100+ Collection<X> terms = getTerms(root, phraseSearchDepth);
91101 return getMeanings(terms);
92102 }
93103
@@ -117,7 +127,7 @@
118128 }
119129
120130 public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X> root, Collection<? extends C> context) throws PersistenceException {
121 - Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
 131+ Collection<X> terms = getTerms(root, phraseSearchDepth);
122132 Map<X, List<? extends C>> meanings = getMeanings(terms);
123133 return disambiguate(root, meanings, context);
124134 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
@@ -4,6 +4,6 @@
55
66 public interface PhraseExtractor {
77
8 - public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight);
 8+ public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight, int maxDepth);
99
1010 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -62,7 +62,7 @@
6363 }
6464
6565 public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) {
66 - Collection<List<X>> sequences = getSequences(root, Integer.MAX_VALUE);
 66+ Collection<List<X>> sequences = getSequences(root, getPhraseSearchDepth());
6767 return disambiguate(sequences, root, meanings, context);
6868 }
6969
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
@@ -66,7 +66,6 @@
6767 }
6868
6969 public void setName(String name) {
70 - if (this.name!=null) throw new IllegalStateException("property already initialized");
7170 this.name = name;
7271 }
7372
@@ -91,7 +90,6 @@
9291 }
9392
9493 public void setFeatures(ConceptFeatures<? extends WikiWordConcept, Integer> features) {
95 - if (this.features!=null) throw new IllegalStateException("property already initialized");
9694 if (features.getConcept()!=null && !this.equals(features.getConcept())) throw new IllegalArgumentException("ConceptFeatures bound to a different concept: "+features.getConcept());
9795 this.features = features;
9896 }
@@ -101,7 +99,6 @@
102100 }
103101
104102 public void setProperties(ConceptProperties<? extends WikiWordConcept> properties) {
105 - if (this.properties!=null) throw new IllegalStateException("property already initialized");
106103 if (properties.getConcept()!=null && !this.equals(properties.getConcept())) throw new IllegalArgumentException("ConceptFeatures bound to a different concept: "+features.getConcept());
107104 this.properties = properties;
108105 }
@@ -111,7 +108,6 @@
112109 }
113110
114111 public void setResources(ConceptResources<? extends WikiWordConcept> resources) {
115 - if (this.resources!=null) throw new IllegalStateException("property already initialized");
116112 this.resources = resources;
117113 }
118114
@@ -120,7 +116,6 @@
121117 }
122118
123119 public void setRelations(ConceptRelations<? extends WikiWordConcept> relations) {
124 - if (this.relations!=null) throw new IllegalStateException("property already initialized");
125120 this.relations = relations;
126121 }
127122
@@ -137,7 +132,6 @@
138133 }
139134
140135 public void setTerms(TermReference[] terms) {
141 - if (this.terms!=null) throw new IllegalStateException("property already initialized");
142136 this.terms = terms;
143137 }
144138
@@ -146,7 +140,6 @@
147141 }
148142
149143 public void setType(ConceptType type) {
150 - if (this.type!=null && !this.type.equals(ConceptType.UNKNOWN)) throw new IllegalStateException("property already initialized");
151144 this.type = type;
152145 }
153146
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
@@ -354,7 +354,7 @@
355355 }
356356 }
357357
358 - public void buildAggregatePhrases( int start, double minWeight, double maxWeight, Matcher phraseBreak ) {
 358+ public void buildAggregatePhrases( int start, double minWeight, double maxWeight, int maxDepth, Matcher phraseBreak ) {
359359 AggregatePhraseBuilder builder = new AggregatePhraseBuilder( minWeight, maxWeight, phraseBreak );
360360
361361 if (isEmpty()) return;
@@ -363,7 +363,7 @@
364364
365365 for (int i=start; i<end; i++) {
366366 if (hasPhrasesAt(i)) {
367 - builder.walk(getRootNodeAt(i), 0, null, Integer.MAX_VALUE, maxWeight);
 367+ builder.walk(getRootNodeAt(i), 0, null, maxDepth, maxWeight);
368368 }
369369 }
370370
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
@@ -73,29 +73,29 @@
7474 }
7575
7676 public void testExtractPhrases() {
77 - PhraseOccuranceSet phrases = extractPhrases("", 3);
 77+ PhraseOccuranceSet phrases = extractPhrases("", 3, 3);
7878 assertEquals(0, phrases.size());
7979 assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));
8080
81 - phrases = extractPhrases("foo", 3);
 81+ phrases = extractPhrases("foo", 3, 3);
8282 assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(0)));
8383
84 - phrases = extractPhrases(" foo ", 3);
 84+ phrases = extractPhrases(" foo ", 3, 3);
8585 assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));
8686 assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(1)));
8787 assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesFrom(0)));
8888 }
8989
9090 public void testExtractPhrases2() {
91 - PhraseOccuranceSet phrases = extractPhrases("red green blue yellow black", 3);
 91+ PhraseOccuranceSet phrases = extractPhrases("red green blue yellow black", 3, 6);
9292 assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
9393 assertEquals(theList( "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
9494
95 - phrases = extractPhrases("red green blue yellow black", 5);
 95+ phrases = extractPhrases("red green blue yellow black", 5, 10);
9696 assertEquals(theList( "red green blue yellow black", "red green blue yellow", "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
9797 assertEquals(theList( "green blue yellow black", "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
9898
99 - phrases = extractPhrases("and red and green and blue and yellow", 3);
 99+ phrases = extractPhrases("and red and green and blue and yellow", 3, 12);
100100 assertEquals(theList( "and red and green and blue",
101101 "and red and green and",
102102 "and red and green",
@@ -111,14 +111,14 @@
112112 ),
113113 getWordList(phrases.getPhrasesAt(4)));
114114
115 - phrases = extractPhrases("red green blue. yellow black", 5);
 115+ phrases = extractPhrases("red green blue. yellow black", 5, 10);
116116 assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
117117 assertEquals(theList( "blue" ), getWordList(phrases.getPhrasesAt(10)));
118118 assertEquals(theList( "yellow black", "yellow" ), getWordList(phrases.getPhrasesAt(16)));
119119 }
120120
121121 public void testExtractPhrases3() {
122 - PhraseOccuranceSet phrases = extractPhrases("Krababbel: l'Foo-Bar", 3);
 122+ PhraseOccuranceSet phrases = extractPhrases("Krababbel: l'Foo-Bar", 3, 6);
123123 assertEquals(theList( "Krababbel"), getWordList(phrases.getPhrasesAt(0)));
124124
125125 assertEquals(theList( "l'Foo-Bar",
@@ -134,7 +134,7 @@
135135 assertEquals(theList( "Bar"),
136136 getWordList(phrases.getPhrasesAt(17)));
137137
138 - phrases = extractPhrases("harald's 'schlaaand", 3);
 138+ phrases = extractPhrases("harald's 'schlaaand", 3, 3);
139139 assertEquals(theList( "harald's 'schlaaand",
140140 "harald's",
141141 "harald"
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
@@ -28,7 +28,7 @@
2929 public void runCommand(String cmd, List<Object> params, ConsoleOutput out) throws Exception {
3030 if (cmd.equals("phrases") || cmd.equals("p")) {
3131 Object s = params.get(1);
32 - PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s.toString(), 5);
 32+ PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s.toString(), 5, 5);
3333 out.writeList(occurances);
3434 out.dumpPhraseTree(occurances.getRootNode());
3535 } else {
@@ -40,7 +40,7 @@
4141 if (s.indexOf('|')>0 || s.indexOf(';')>0 ) {
4242 return super.getPhrases(s);
4343 } else {
44 - PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s, 5);
 44+ PhraseOccuranceSet occurances = plainTextAnalyzer.extractPhrases(s, 5, 5);
4545 return occurances.getRootNode();
4646 }
4747 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
@@ -271,7 +271,7 @@
272272 }
273273 } */
274274
275 - public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) {
 275+ public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight, int maxDepth) {
276276 PhraseOccuranceSet phrases = new PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
277277
278278 text = applyManglers(config.sentenceManglers, text);
@@ -285,7 +285,7 @@
286286 buildPhrases(s, ofs, phrases, maxWeight);
287287 if (phrases.isEmpty()) continue;
288288
289 - phrases.buildAggregatePhrases(ofs, 0, maxWeight, phraseBreakeMatcher);
 289+ phrases.buildAggregatePhrases(ofs, 0, maxWeight, maxDepth, phraseBreakeMatcher);
290290 }
291291
292292 if (phrases.isEmpty()) return phrases;
@@ -373,7 +373,7 @@
374374 BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
375375 String s ;
376376 while ( (s = in.readLine()) != null ) {
377 - PhraseOccuranceSet phrases = analyzer.extractPhrases(s, 6);
 377+ PhraseOccuranceSet phrases = analyzer.extractPhrases(s, 6, 6);
378378 DebugUtil.dump("", phrases, ConsoleIO.output);
379379 }
380380 }

Status & tagging log