r64388 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r64387‎ | r64388 | r64389 >
Date:14:29, 30 March 2010
Author:daniel
Status:deferred
Tags:
Comment:
fetcher caches
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureCache.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningCache.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningFetcher.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureCache.java
@@ -11,6 +11,34 @@
1212 import de.brightbyte.wikiword.model.WikiWordConcept;
1313
1414 public class FeatureCache<C extends WikiWordConcept, K> implements FeatureFetcher<C, K> {
 15+
 16+ protected static class Manager<C extends WikiWordConcept, K> {
 17+ protected int maxDepth;
 18+
 19+ protected FeatureFetcher<C, K> root;
 20+ protected List<FeatureCache<C, K>> stack;
 21+
 22+ public Manager(FeatureFetcher<C, K> root, int maxDepth) {
 23+ this.stack = new ArrayList<FeatureCache<C, K>>(maxDepth+1);
 24+ this.maxDepth = maxDepth;
 25+ this.root = root;
 26+ }
 27+
 28+ private FeatureFetcher<C, K> getTop() {
 29+ if (stack.isEmpty()) return root;
 30+ else return stack.get(stack.size()-1);
 31+ }
 32+
 33+ public synchronized FeatureCache<C, K> newCache() {
 34+ FeatureCache<C, K> cache = new FeatureCache<C, K>( getTop() );
 35+ stack.add(cache);
 36+
 37+ if (stack.size()>maxDepth) stack.remove(0);
 38+ if (!stack.isEmpty()) stack.get(0).setParent(root);
 39+
 40+ return cache;
 41+ }
 42+ }
1543
1644 protected FeatureFetcher<C, K> parent;
1745
@@ -54,11 +82,11 @@
5583 return features;
5684 }
5785
58 - public FeatureFetcher getParent() {
 86+ public FeatureFetcher<C, K> getParent() {
5987 return parent;
6088 }
6189
62 - public void setParent(FeatureCache<C, K> parent) {
 90+ public void setParent(FeatureFetcher<C, K> parent) {
6391 if (parent == null) throw new NullPointerException();
6492 if (parent == this) throw new IllegalArgumentException("can't be my own parent");
6593 //TODO: prevent cycles
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningFetcher.java
@@ -1,10 +1,14 @@
22 package de.brightbyte.wikiword.disambig;
33
44 import java.util.List;
 5+import java.util.Map;
56
67 import de.brightbyte.util.PersistenceException;
 8+import de.brightbyte.wikiword.model.TermReference;
79 import de.brightbyte.wikiword.model.WikiWordConcept;
810
911 public interface MeaningFetcher<C extends WikiWordConcept> {
10 - public List<C> getMeanings(String term) throws PersistenceException;
 12+ public List<? extends C> getMeanings(String term) throws PersistenceException;
 13+
 14+ public <X extends TermReference>Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException;
1115 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java
@@ -51,7 +51,7 @@
5252 Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(meanings.size());
5353
5454 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
55 - FeatureCache<LocalConcept, Integer> features = getFeatureCache(meanings);
 55+ FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings);
5656
5757 for (int i= window; ; i++) {
5858 int from = i-window;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
@@ -1,11 +1,14 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.HashMap;
45 import java.util.List;
 6+import java.util.Map;
57
68 import de.brightbyte.data.cursor.DataSet;
79 import de.brightbyte.io.Output;
810 import de.brightbyte.util.PersistenceException;
911 import de.brightbyte.wikiword.model.LocalConcept;
 12+import de.brightbyte.wikiword.model.TermReference;
1013 import de.brightbyte.wikiword.store.LocalConceptStore;
1114 import de.brightbyte.wikiword.store.WikiWordConceptStore.ConceptQuerySpec;
1215
@@ -31,6 +34,18 @@
3235 return m.load();
3336 }
3437
 38+ public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings(List<X> terms) throws PersistenceException {
 39+ Map<X, List<? extends LocalConcept>> meanings = new HashMap<X, List<? extends LocalConcept>>();
 40+
 41+ for (X t: terms) {
 42+ List<LocalConcept> m = getMeanings(t.getTerm());
 43+ if (m!=null && m.size()>0) meanings.put(t, m);
 44+ }
 45+
 46+ return meanings;
 47+ }
 48+
 49+
3550 public Output getTrace() {
3651 return trace;
3752 }
@@ -42,5 +57,5 @@
4358 protected void trace(String msg) {
4459 if (trace!=null) trace.println(msg);
4560 }
46 -
 61+
4762 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/MeaningCache.java
@@ -0,0 +1,107 @@
 2+package de.brightbyte.wikiword.disambig;
 3+
 4+import java.util.ArrayList;
 5+import java.util.HashMap;
 6+import java.util.List;
 7+import java.util.Map;
 8+
 9+import de.brightbyte.util.PersistenceException;
 10+import de.brightbyte.wikiword.model.TermReference;
 11+import de.brightbyte.wikiword.model.WikiWordConcept;
 12+
 13+public class MeaningCache<C extends WikiWordConcept> implements MeaningFetcher<C> {
 14+
 15+ protected static class Manager<C extends WikiWordConcept> {
 16+ protected int maxDepth;
 17+
 18+ protected MeaningFetcher<? extends C> root;
 19+ protected List<MeaningCache<C>> stack;
 20+
 21+ public Manager(MeaningFetcher<? extends C> root, int maxDepth) {
 22+ this.stack = new ArrayList<MeaningCache<C>>(maxDepth+1);
 23+ this.maxDepth = maxDepth;
 24+ this.root = root;
 25+ }
 26+
 27+ private MeaningFetcher<? extends C> getTop() {
 28+ if (stack.isEmpty()) return root;
 29+ else return stack.get(stack.size()-1);
 30+ }
 31+
 32+ public synchronized MeaningCache<C> newCache() {
 33+ MeaningCache<C> cache = new MeaningCache<C>( getTop() );
 34+ stack.add(cache);
 35+
 36+ if (stack.size()>maxDepth) stack.remove(0);
 37+ if (!stack.isEmpty()) stack.get(0).setParent(root);
 38+
 39+ return cache;
 40+ }
 41+ }
 42+
 43+ protected MeaningFetcher<C> parent;
 44+
 45+ protected Map<String, List<? extends C>> cache;
 46+
 47+ public MeaningCache(MeaningFetcher<? extends C> parent) {
 48+ if (parent==null) throw new NullPointerException();
 49+ this.setParent(parent);
 50+ this.cache = new HashMap<String, List<? extends C>>();
 51+ }
 52+
 53+
 54+ public MeaningFetcher<? extends C> getParent() {
 55+ return parent;
 56+ }
 57+
 58+ public void setParent(MeaningFetcher<? extends C> parent) {
 59+ if (parent == null) throw new NullPointerException();
 60+ if (parent == this) throw new IllegalArgumentException("can't be my own parent");
 61+ //TODO: prevent cycles
 62+
 63+ this.parent = (MeaningFetcher<C>)(Object)parent; //XXX: ugly cast. generics are a pain.
 64+ }
 65+
 66+ public void clear() {
 67+ cache.clear();
 68+ }
 69+
 70+
 71+ public List<? extends C> getMeanings(String term) throws PersistenceException {
 72+ List<? extends C> meanings = cache.get(term);
 73+
 74+ if (meanings==null) {
 75+ meanings = parent.getMeanings(term);
 76+ cache.put(term, meanings);
 77+ }
 78+
 79+ return meanings;
 80+ }
 81+
 82+
 83+ public <X extends TermReference> Map<X, List<? extends C>> getMeanings(List<X> terms) throws PersistenceException {
 84+ Map<X, List<? extends C>> meanings= new HashMap<X, List<? extends C>>();
 85+ List<X> todo = new ArrayList<X>(terms.size());
 86+
 87+ for (X t: terms) {
 88+ List<? extends C> m = cache.get(t.getTerm());
 89+ if (m!=null) {
 90+ meanings.put(t, m);
 91+ continue;
 92+ } else {
 93+ todo.add(t);
 94+ }
 95+ }
 96+
 97+ Map<X, List<? extends C>> parentMeanings = parent.getMeanings(todo); //XXX: ugly cast, generics are a pain
 98+
 99+ meanings.putAll(parentMeanings);
 100+
 101+ for (Map.Entry<X, List<? extends C>> e: parentMeanings.entrySet()) {
 102+ cache.put(e.getKey().getTerm(), e.getValue());
 103+ }
 104+
 105+ return meanings;
 106+ }
 107+
 108+}
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -42,7 +42,7 @@
4343 pop += Math.log(c.getCardinality());
4444 }
4545
46 - pop = pop / disambig.size();
 46+ if (disambig.size()>0) pop = pop / disambig.size();
4747
4848 Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, pop, "pop="+pop);
4949 return r;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -9,6 +9,7 @@
1010 import java.util.Map.Entry;
1111
1212 import de.brightbyte.data.Functor;
 13+import de.brightbyte.data.Functor2;
1314 import de.brightbyte.data.LabeledMatrix;
1415 import de.brightbyte.data.LabeledVector;
1516 import de.brightbyte.data.MapLabeledMatrix;
@@ -30,8 +31,9 @@
3132 protected double minScore = 0.1; //FIXME: magic number. should "somehow" match popularityFactor and similarityFactor
3233 protected double popularityBias = 0.2; //FIXME: magic number. should "somehow" match popularityFactor and similarityFactor
3334
 35+ protected FeatureCache.Manager<LocalConcept, Integer> featureCacheManager;
 36+
3437 protected Similarity<LabeledVector<Integer>> similarityMeasure;
35 - protected FeatureFetcher<LocalConcept, Integer> featureFetcher;
3638 protected Measure<WikiWordConcept> popularityMeasure;
3739 protected PopularityDisambiguator popularityDisambiguator;
3840
@@ -48,7 +50,16 @@
4951 return Math.sqrt(Math.sqrt(sim)); //XXX: black voodoo magic ad hoc formula with no deeper meaning.
5052 }
5153 };
 54+
 55+ private Functor2.Double scoreCombiner = new Functor2.Double() { //NOTE: must map ([0:1][0:1]) to [0:1] and grow monotonically over both params.
 56+
 57+ public double apply(double popf, double simf) {
 58+ return popf * popularityBias + simf * ( 1 - popularityBias ); //linear combination
 59+ //return = Math.sqrt( popf * simf ); //normalized product
 60+ }
5261
 62+ };
 63+
5364 public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) {
5465 this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality,
5566 featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance()); //if pre-normalized, use scalar to calc cosine
@@ -62,16 +73,12 @@
6374 if (featureFetcher==null) throw new NullPointerException();
6475 this.popularityMeasure = popularityMeasure;
6576 this.similarityMeasure = sim;
66 - this.featureFetcher = featureFetcher;
 77+ this.featureCacheManager = new FeatureCache.Manager<LocalConcept, Integer>(featureFetcher, 10); //TODO: depth
6778 this.popularityDisambiguator = new PopularityDisambiguator(meaningFetcher, popularityMeasure);
6879 }
6980
70 - public FeatureFetcher getFeatureFetcher() {
71 - return featureFetcher;
72 - }
73 -
7481 public void setFeatureFetcher(FeatureFetcher<LocalConcept, Integer> featureFetcher) {
75 - this.featureFetcher = featureFetcher;
 82+ this.featureCacheManager = new FeatureCache.Manager<LocalConcept, Integer>(featureFetcher, 10); //FIXME: depth
7683 }
7784
7885 public Similarity<LabeledVector<Integer>> getSimilarityMeasure() {
@@ -116,9 +123,8 @@
117124 this.maxMeanings = maxMeanings;
118125 }
119126
120 - protected FeatureCache<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings) throws PersistenceException {
121 - //TODO: keep a chain of n caches, resulting in LRU logic.
122 - FeatureCache<LocalConcept, Integer> features = new FeatureCache<LocalConcept, Integer>(featureFetcher);
 127+ protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings) throws PersistenceException {
 128+ FeatureFetcher<LocalConcept, Integer> features = featureCacheManager.newCache();
123129
124130 //NOTE: pre-fetch all features in one go
125131 List<LocalConcept> concepts = new ArrayList<LocalConcept>(meanings.size()*10);
@@ -146,7 +152,7 @@
147153 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
148154
149155 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
150 - FeatureCache<LocalConcept, Integer> features = getFeatureCache(meanings);
 156+ FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings);
151157
152158 List<Map<X, LocalConcept>> interpretations = getInterpretations(terms, meanings);
153159
@@ -182,7 +188,7 @@
183189
184190 protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(List<X> terms, Map<X, List<? extends LocalConcept>> meanings,
185191 List<Map<X, LocalConcept>> interpretations,
186 - LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, Integer> features) throws PersistenceException {
 192+ LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
187193
188194 List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>();
189195
@@ -238,7 +244,7 @@
239245 return interpretations;
240246 }
241247
242 - protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, Integer> features) throws PersistenceException {
 248+ protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
243249 double sim = 0;
244250 double pop = 0;
245251
@@ -293,9 +299,7 @@
294300 double popf = popularityFactor.apply(pop);
295301 double simf = similarityFactor.apply(sim);
296302
297 - //FIXME: functor!
298 - double score = popf * popularityBias + simf * ( 1 - popularityBias );
299 - //double score = Math.sqrt( popf * simf ); //FIXME: functor!
 303+ double score = scoreCombiner.apply(popf, simf);
300304
301305 return new Result<X, LocalConcept>(interp, score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop);
302306 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -1,6 +1,5 @@
22 package de.brightbyte.wikiword.disambig;
33
4 -import java.util.HashMap;
54 import java.util.List;
65 import java.util.Map;
76
@@ -11,27 +10,18 @@
1211
1312 public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> {
1413
15 - protected MeaningFetcher<? extends C> meaningFetcher;
 14+ protected MeaningCache.Manager<C> meaningCacheManager;
 15+
1616 protected Output trace;
1717
1818 public AbstractDisambiguator(MeaningFetcher<? extends C> meaningFetcher) {
1919 if (meaningFetcher==null) throw new NullPointerException();
20 - this.meaningFetcher = meaningFetcher;
 20+ this.meaningCacheManager = new MeaningCache.Manager<C>(meaningFetcher, 10);
2121 }
2222
23 - protected <X extends T>Map<X, List<? extends C>> fetchMeanings(List<X> terms) throws PersistenceException {
24 - Map<X, List<? extends C>> meanings = new HashMap<X, List<? extends C>>();
25 -
26 - for (X t: terms) {
27 - List<? extends C> m = meaningFetcher.getMeanings(t.getTerm());
28 - if (m!=null && m.size()>0) meanings.put(t, m);
29 - }
30 -
31 - return meanings;
32 - }
33 -
3423 public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException {
35 - Map<X, List<? extends C>> meanings = fetchMeanings(terms);
 24+ MeaningCache<C> mcache = meaningCacheManager.newCache();
 25+ Map<X, List<? extends C>> meanings = mcache.getMeanings(terms);
3626 return disambiguate(terms, meanings);
3727 }
3828
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java
@@ -70,7 +70,7 @@
7171
7272 for (PhraseOccurance p: candidates) {
7373 i = p.getEndOffset();
74 - if (filter.matches(p.getPhrase())) {
 74+ if (filter==null || filter.matches(p.getPhrase())) {
7575 phrases.add(p);
7676 continue outer;
7777 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java
@@ -14,7 +14,6 @@
1515 import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator;
1616 import de.brightbyte.wikiword.disambig.StoredFeatureFetcher;
1717 import de.brightbyte.wikiword.disambig.StoredMeaningFetcher;
18 -import de.brightbyte.wikiword.disambig.Disambiguator.Result;
1918 import de.brightbyte.wikiword.model.LocalConcept;
2019 import de.brightbyte.wikiword.model.PhraseOccurance;
2120 import de.brightbyte.wikiword.model.PhraseOccuranceSequence;
@@ -29,8 +28,8 @@
3029 protected PlainTextAnalyzer analyzer;
3130 private int phraseLength;
3231
33 - public WordSenseIndexer(boolean allowGlobal, boolean allowLocal) {
34 - super(allowGlobal, allowLocal);
 32+ public WordSenseIndexer() {
 33+ super(false, true);
3534 }
3635
3736 @Override
@@ -63,6 +62,7 @@
6463 disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true );
6564
6665 analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(getCorpus(), tweaks);
 66+ analyzer.initialize();
6767
6868 phraseLength = args.getIntOption("phrase-length", tweaks.getTweak("wikiSenseIndexer.phraseLength", 6));
6969 }
@@ -75,4 +75,9 @@
7676 return result.toString(); //FIXME: annotate!
7777 }
7878
 79+ public static void main(String[] argv) throws Exception {
 80+ WordSenseIndexer q = new WordSenseIndexer();
 81+ q.launch(argv);
 82+ }
 83+
7984 }

Status & tagging log