r65882 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r65881 | r65882 | r65883 >
Date:21:53, 3 May 2010
Author:daniel
Status:deferred
Tags:
Comment:
disambiguator fixes (incomplete, something is still wrong)
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermAlternativeListNode.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermListNode.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/mangler/SpellingAlternator.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -159,7 +159,7 @@
160160 this.maxMeanings = maxMeanings;
161161 }
162162
163 - protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 163+ protected FeatureFetcher<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException {
164164 FeatureFetcher<LocalConcept, Integer> features = featureCacheManager.newCache();
165165
166166 //NOTE: pre-fetch all features in one go
@@ -177,7 +177,7 @@
178178 /* (non-Javadoc)
179179 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
180180 */
181 - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 181+ public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException {
182182 if (meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), Collections.<X>emptyList(), 0.0, "no terms or meanings");
183183
184184 int sz = meanings.size();
@@ -199,7 +199,7 @@
200200 return disambiguate(sequences, root, meanings, context);
201201 }
202202
203 - public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 203+ public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException {
204204
205205 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
206206
@@ -239,7 +239,7 @@
240240 }
241241
242242 protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings,
243 - Collection<LocalConcept> context, List<Disambiguator.Interpretation<X, LocalConcept>> interpretations,
 243+ Collection<? extends LocalConcept> context, List<Disambiguator.Interpretation<X, LocalConcept>> interpretations,
244244 LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
245245
246246 List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>();
@@ -306,7 +306,7 @@
307307 return interpretations;
308308 }
309309
310 - protected <X extends TermReference>Result<X, LocalConcept> getScore(Disambiguator.Interpretation<X, LocalConcept> interp, Collection<LocalConcept> context, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
 310+ protected <X extends TermReference>Result<X, LocalConcept> getScore(Disambiguator.Interpretation<X, LocalConcept> interp, Collection<? extends LocalConcept> context, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
311311 Map<? extends TermReference, LocalConcept> concepts;
312312 if (context!=null) {
313313 concepts = new HashMap<TermReference, LocalConcept>();
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -92,13 +92,19 @@
9393 }
9494
9595 private <X extends T>PhraseNode<X> findLastNode(PhraseNode<X> root, List<X> sequence) {
 96+ if (root.getTermReference().getTerm().length()>0) {
 97+ X t = sequence.get(0);
 98+ if (!t.getTerm().equals(root.getTermReference().getTerm())) return null;
 99+ sequence = sequence.subList(1, sequence.size());
 100+ }
 101+
96102 terms: for (X t: sequence) {
97103 Collection<? extends PhraseNode<X>> successors = root.getSuccessors();
98104 if (successors==null || successors.isEmpty())
99105 return null;
100106
101107 for (PhraseNode<X> n: successors) {
102 - if (n.getTermReference().equals(t)) {
 108+ if (n.getTermReference().getTerm().equals(t.getTerm())) {
103109 root = n;
104110 continue terms;
105111 }
@@ -164,17 +170,17 @@
165171 return meanings;
166172 }
167173
168 - public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException {
 174+ public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<? extends C> context) throws PersistenceException {
169175 return this.<X>disambiguate(new TermListNode<X>(terms, 0), context);
170176 }
171177
172 - public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException {
 178+ public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<? extends C> context) throws PersistenceException {
173179 Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
174180 Map<X, List<? extends C>> meanings = getMeanings(terms);
175181 return disambiguate(root, meanings, context);
176182 }
177183
178 - public abstract <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<C> context) throws PersistenceException;
 184+ public abstract <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) throws PersistenceException;
179185
180186 public Output getTrace() {
181187 return trace;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java
@@ -39,7 +39,7 @@
4040 }
4141
4242 public <X extends TermReference>Result<X, LocalConcept> evalStep(List<X> baseSequence, Map<X, LocalConcept> interpretation, PhraseNode<X> node,
43 - Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context,
 43+ Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context,
4444 LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
4545 X term = node.getTermReference();
4646
@@ -67,7 +67,7 @@
6868 /* (non-Javadoc)
6969 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
7070 */
71 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) throws PersistenceException {
 71+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) throws PersistenceException {
7272 if (meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), Collections.<X>emptyList(), 0.0, "no terms or meanings");
7373
7474 int sz = meanings.size();
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -44,12 +44,12 @@
4545 this.weigthCombiner = weightCombiner;
4646 }
4747
48 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) {
 48+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) {
4949 Collection<List<X>> sequences = getSequences(root, Integer.MAX_VALUE);
5050 return disambiguate(sequences, root, meanings, context);
5151 }
5252
53 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) {
 53+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(Collection<List<X>> sequences, PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) {
5454 Result<X, LocalConcept> best = null;
5555
5656 for (List<X> sequence: sequences) {
@@ -62,7 +62,7 @@
6363 return best;
6464 }
6565
66 - public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings, Collection<LocalConcept> context) {
 66+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings, Collection<? extends LocalConcept> context) {
6767 if (sequence.isEmpty() || meanings.isEmpty()) return new Disambiguator.Result<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), Collections.<X>emptyList(), 0.0, "no terms or meanings");
6868
6969 Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>();
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java
@@ -77,7 +77,7 @@
7878
7979 public void setTrace(Output trace);
8080
81 - public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<C> context) throws PersistenceException;
82 - public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<C> context) throws PersistenceException;
 81+ public <X extends T>Result<X, C> disambiguate(List<X> terms, Collection<? extends C> context) throws PersistenceException;
 82+ public <X extends T>Result<X, C> disambiguate(PhraseNode<X> root, Collection<? extends C> context) throws PersistenceException;
8383
8484 }
\ No newline at end of file
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermAlternativeListNode.java
@@ -23,6 +23,10 @@
2424 this.terms = terms;
2525 this.index = index;
2626 }
 27+
 28+ public String toString() {
 29+ return getTermReference().toString();
 30+ }
2731
2832 public T getTermReference() {
2933 return term;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermListNode.java
@@ -18,6 +18,10 @@
1919 this.terms = terms;
2020 this.index = index;
2121 }
 22+
 23+ public String toString() {
 24+ return getTermReference().toString();
 25+ }
2226
2327 public T getTermReference() {
2428 return terms.get(index);
@@ -26,7 +30,7 @@
2731 public List<TermListNode<T>> getSuccessors() {
2832 if (successors == null) {
2933 if (index+1>=terms.size()) successors = Collections.emptyList();
30 - else Collections.singletonList(new TermListNode<T>(terms, index+1));
 34+ else successors = Collections.singletonList(new TermListNode<T>(terms, index+1));
3135 }
3236
3337 return successors;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/mangler/SpellingAlternator.java
@@ -7,37 +7,48 @@
88 import java.util.List;
99 import java.util.Set;
1010
 11+import de.brightbyte.wikiword.disambig.Term;
 12+
1113 public class SpellingAlternator {
1214 protected List<Mangler> manglers = new ArrayList<Mangler>();
 15+ protected double weightFactor;
1316
14 - public Collection<String> getAlternatives(String term) {
15 - if (manglers.isEmpty()) return Collections.singleton(term);
 17+ public SpellingAlternator(List<Mangler> manglers, double weightFactor) {
 18+ if (manglers==null) throw new NullPointerException();
 19+ if (weightFactor<=0 || weightFactor>1) throw new IllegalArgumentException("weightFactor must be > 0 and <= 1");
1620
17 - Set<String> alternatives = new HashSet<String>();
18 - alternatives.add(term);
19 -
20 - collectAlternatives(term, 0, alternatives);
 21+ this.manglers = manglers;
 22+ this.weightFactor = weightFactor;
 23+ }
 24+
 25+ public Collection<Term> getAlternatives(String term) {
 26+ if (manglers.isEmpty()) return Collections.singleton(new Term(term));
 27+
 28+ Set<Term> alternatives = new HashSet<Term>();
 29+
 30+ collectAlternatives(term, 0, 1, alternatives);
2131 return alternatives;
2232 }
2333
24 - private void collectAlternatives(String term, int index, Set<String> alternatives) {
 34+ private void collectAlternatives(String term, int index, double weight, Set<Term> alternatives) {
2535 if (index>=manglers.size()) return;
26 -
 36+ if (!alternatives.add(new Term(term, weight))) return;
 37+
2738 Mangler mangler= manglers.get(index);
2839 CharSequence t = mangler.mangle(term);
2940
30 - if (t!=null && alternatives.add(t.toString())) {
31 - collectAlternatives(t.toString(), index+1, alternatives); //branch recursion. NOTE: use index=0 to cover all combinations
 41+ if (t!=null) {
 42+ collectAlternatives(t.toString(), index+1, weight*weightFactor, alternatives); //branch recursion. NOTE: use index=0 to cover all combinations
3243 }
3344
34 - collectAlternatives(term, index+1, alternatives); //primitive recursion
 45+ collectAlternatives(term, index+1, weight, alternatives); //primitive recursion
3546 }
3647
37 - public List<Collection<String>> getAlternatives(List<String> terms) {
38 - List<Collection<String>> alternatives = new ArrayList<Collection<String>>(terms.size());
 48+ public List<Collection<Term>> getAlternatives(List<String> terms) {
 49+ List<Collection<Term>> alternatives = new ArrayList<Collection<Term>>(terms.size());
3950
4051 for (String t: terms) {
41 - Collection<String> alt = getAlternatives(t);
 52+ Collection<Term> alt = getAlternatives(t);
4253 alternatives.add(alt);
4354 }
4455

Status & tagging log