r65955 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r65954‎ | r65955 | r65956 >
Date:16:19, 5 May 2010
Author:daniel
Status:deferred
Tags:
Comment:
disambig testing and debugging (work in progress)
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest.java
@@ -1,204 +1,107 @@
22 package de.brightbyte.wikiword.disambig;
33
44 import java.io.IOException;
5 -import java.io.InputStream;
6 -import java.net.URL;
75 import java.util.ArrayList;
86 import java.util.Collection;
9 -import java.util.HashMap;
107 import java.util.List;
118 import java.util.Map;
129
13 -import de.brightbyte.abstraction.ListAbstractor;
14 -import de.brightbyte.data.LabeledVector;
15 -import de.brightbyte.data.MapLabeledVector;
16 -import de.brightbyte.data.cursor.DataCursor;
17 -import de.brightbyte.io.ChunkingCursor;
18 -import de.brightbyte.io.GroupingCursor;
19 -import de.brightbyte.io.LineCursor;
20 -import de.brightbyte.text.CsvLineChunker;
 10+import de.brightbyte.data.Pair;
2111 import de.brightbyte.util.PersistenceException;
22 -import de.brightbyte.wikiword.ConceptType;
23 -import de.brightbyte.wikiword.Corpus;
24 -import de.brightbyte.wikiword.TweakSet;
 12+import de.brightbyte.wikiword.disambig.Disambiguator.Interpretation;
2513 import de.brightbyte.wikiword.disambig.Disambiguator.Result;
26 -import de.brightbyte.wikiword.model.ConceptFeatures;
2714 import de.brightbyte.wikiword.model.LocalConcept;
2815 import de.brightbyte.wikiword.model.PhraseOccurance;
2916 import de.brightbyte.wikiword.model.PhraseOccuranceSet;
30 -import de.brightbyte.wikiword.model.TermReference;
31 -import junit.framework.TestCase;
3217
33 -public class SlidingCoherenceDisambiguatorTest extends TestCase {
 18+public class SlidingCoherenceDisambiguatorTest extends DisambiguatorTestBase {
3419
35 - protected Map<String, List<? extends LocalConcept>> meanings = new HashMap<String, List<? extends LocalConcept>>();
36 - protected Map<Integer, ConceptFeatures<LocalConcept, Integer>> features = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>();
37 -
38 - protected static DataCursor<List<String>> openTableCursor(InputStream in, String enc) throws IOException {
39 - ChunkingCursor cursor = new ChunkingCursor(new LineCursor(in, enc), CsvLineChunker.tsv);
40 - return cursor;
 20+ public SlidingCoherenceDisambiguatorTest() throws IOException, PersistenceException {
 21+ super();
4122 }
4223
43 - protected static DataCursor<List<List<String>>> openGroupedTableCursor(InputStream in, String enc, int groupBy, boolean skipHeader) throws IOException, PersistenceException {
44 - DataCursor<List<String>> c = openTableCursor(in, enc);
45 - if (skipHeader) c.next(); //skip first line
 24+ public void testGetSequenceInterpretations() throws PersistenceException {
 25+ SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher);
4626
47 - return new GroupingCursor<List<String>, String>(c, new ListAbstractor.Accessor<String>(groupBy));
48 - }
49 -
50 - protected static void readMeanings(Corpus corpus, InputStream in, Map<String, List<? extends LocalConcept>> meanings) throws IOException, PersistenceException {
51 - DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true);
 27+ Term uk = new Term("UK");
 28+ Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(uk, getConcept("United_Kingdom"));
 29+ Pair<Term, LocalConcept> uk_as_Great_Britain = new Pair<Term, LocalConcept>(uk, getConcept("Great_Britain"));
 30+ Pair<Term, LocalConcept> uk_as_England = new Pair<Term, LocalConcept>(uk, getConcept("England"));
5231
53 - List<List<String>> group;
54 - while ((group = cursor.next()) != null) {
55 - List<LocalConcept> concepts = new ArrayList<LocalConcept>(group.size());
56 - String term = null;
57 -
58 - for (List<String> row: group) {
59 - term = row.get(0);
60 - int id = Integer.parseInt(row.get(1));
61 - String name = row.get(2);
62 - int freq = Integer.parseInt(row.get(3));
63 - int rule = Integer.parseInt(row.get(4));
64 -
65 - int score = ((rule==10 || rule==30) && freq<2) ? 0 : freq*rule;
66 -
67 - LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name);
68 - c.setCardinality(freq);
69 - c.setRelevance(score);
70 -
71 - concepts.add(c);
72 - }
73 -
74 - if (term!=null) meanings.put(term, concepts);
75 - }
 32+ Term london = new Term("London");
 33+ Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(london, getConcept("City_of_London"));
 34+ Pair<Term, LocalConcept> london_as_Greater_London = new Pair<Term, LocalConcept>(london, getConcept("Greater_London"));
 35+ Pair<Term, LocalConcept> london_as_London_city_council = new Pair<Term, LocalConcept>(london, getConcept("London_city_council"));
 36+
 37+ List<Term> sequence = new ArrayList<Term>();
 38+ sequence.add(uk);
 39+
 40+ Collection<Interpretation<Term, LocalConcept>> interpretations = disambiguator.getSequenceInterpretations(sequence, meaningFetcher.getMeanings(sequence));
7641
77 - cursor.close();
78 - }
79 -
80 - protected static void readFeatures(Corpus corpus, InputStream in, Map<Integer, ConceptFeatures<LocalConcept, Integer>> features) throws IOException, PersistenceException {
81 - DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true);
 42+ assertEquals("number of interpretations", 3, interpretations.size());
 43+ assertTrue("UK as United_Kingdom", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom )) );
 44+ assertTrue("UK as Great_Britain", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain )) );
 45+ assertTrue("UK as England", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England )) );
8246
83 - List<List<String>> group;
84 - while ((group = cursor.next()) != null) {
85 - LabeledVector<Integer> v = new MapLabeledVector<Integer>();
86 - Integer id = null;
87 - String name = null;
88 -
89 - for (List<String> row: group) {
90 - id = new Integer(row.get(0));
91 - name = row.get(1);
92 -
93 - int feature = Integer.parseInt(row.get(2));
94 - double value = Double.parseDouble(row.get(3));
 47+ ///////////////////////////////////////////////////////////////////////////////////
 48+
 49+ sequence = new ArrayList<Term>();
 50+ sequence.add(uk);
 51+ sequence.add(london);
9552
96 - v.set(feature, value);
97 - }
98 -
99 - if (id!=null) {
100 - double len = v.getLength();
101 - v = v.scaled(len); //normalize
102 -
103 - LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name);
104 - ConceptFeatures<LocalConcept, Integer> f = new ConceptFeatures<LocalConcept, Integer>(c, v);
105 - features.put(id, f);
106 - }
107 - }
 53+ interpretations = disambiguator.getSequenceInterpretations(sequence, meaningFetcher.getMeanings(sequence));
10854
109 - cursor.close();
110 - }
111 -
112 - private MeaningFetcher<LocalConcept> meaningFetcher = new MeaningFetcher<LocalConcept>() {
113 -
114 - public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings(
115 - Collection<X> terms) throws PersistenceException {
116 - Map<X, List<? extends LocalConcept>> m = new HashMap<X, List<? extends LocalConcept>>();
117 -
118 - for (X t: terms) {
119 - List<? extends LocalConcept> n = getMeanings(t.getTerm());
120 - m.put(t, n);
121 - }
122 -
123 - return m;
124 - }
125 -
126 - public List<? extends LocalConcept> getMeanings(String term)
127 - throws PersistenceException {
128 - return meanings.get(term);
129 - }
130 -
131 - };
132 -
133 - private FeatureFetcher<LocalConcept, Integer> featureFetcher = new FeatureFetcher<LocalConcept, Integer>() {
134 -
135 - public boolean getFeaturesAreNormalized() {
136 - return true;
137 - }
138 -
139 - public Map<Integer, ConceptFeatures<LocalConcept, Integer>> getFeatures(
140 - Collection<? extends LocalConcept> concepts) throws PersistenceException {
141 - Map<Integer, ConceptFeatures<LocalConcept, Integer>> m = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>();
142 -
143 - for (LocalConcept c: concepts) {
144 - ConceptFeatures<LocalConcept, Integer> f = getFeatures(c);
145 - m.put(c.getId(), f);
146 - }
147 -
148 - return m;
149 - }
150 -
151 - public ConceptFeatures<LocalConcept, Integer> getFeatures(LocalConcept c)
152 - throws PersistenceException {
153 - return features.get(c.getId());
154 - }
155 -
156 - };
157 -
158 - protected Corpus corpus;
159 - protected TweakSet tweaks;
160 -
161 - public SlidingCoherenceDisambiguatorTest() throws IOException, PersistenceException {
162 - tweaks = new TweakSet();
163 - corpus = Corpus.forName("TEST", "en", tweaks);
 55+ assertEquals("number of interpretations", 9, interpretations.size());
16456
165 - URL meaningFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-meanings.csv");
166 - URL featureFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-features.csv");
 57+ assertTrue("UK as United_Kingdom; London as City_of_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom, london_as_City_of_London )) );
 58+ assertTrue("UK as Great_Britain; London as City_of_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain, london_as_City_of_London )) );
 59+ assertTrue("UK as England; London as City_of_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England, london_as_City_of_London )) );
16760
168 - readMeanings(corpus, meaningFile.openStream(), meanings);
169 - readFeatures(corpus, featureFile.openStream(), features);
 61+ assertTrue("UK as United_Kingdom; London as Greater_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom, london_as_Greater_London )) );
 62+ assertTrue("UK as Great_Britain; London as Greater_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain, london_as_Greater_London )) );
 63+ assertTrue("UK as England; London as Greater_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England, london_as_Greater_London )) );
 64+
 65+ assertTrue("UK as United_Kingdom; London as London_city_council", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom, london_as_London_city_council )) );
 66+ assertTrue("UK as Great_Britain; London as London_city_council", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain, london_as_London_city_council )) );
 67+ assertTrue("UK as England; London as London_city_council", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England, london_as_London_city_council )) );
17068 }
17169
172 - protected List<Term> terms(String... terms) {
173 - List<Term> list = new ArrayList<Term>();
174 - for (String t: terms) list.add(new Term(t));
175 - return list;
176 - }
177 -
178 - public void testDisambiguatePhraseNode() throws PersistenceException {
179 - String text = "The Bank and Monument Underground station";
180 - // 012345678901234567890123456789012345678901234567890
181 - List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
 70+ public void testGetInterpretations() throws PersistenceException {
 71+ SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher);
18272
183 - phrases.add( new PhraseOccurance( text.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank
184 - phrases.add( new PhraseOccurance( text.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument
185 - phrases.add( new PhraseOccurance( text.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground
 73+ Term uk = new Term("UK");
 74+ Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(uk, getConcept("United_Kingdom"));
 75+ Pair<Term, LocalConcept> uk_as_Great_Britain = new Pair<Term, LocalConcept>(uk, getConcept("Great_Britain"));
 76+ Pair<Term, LocalConcept> uk_as_England = new Pair<Term, LocalConcept>(uk, getConcept("England"));
18677
187 - phrases.add( new PhraseOccurance( text.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank
188 - phrases.add( new PhraseOccurance( text.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument
189 - phrases.add( new PhraseOccurance( text.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground
190 - //phrases.add( new PhraseOccurance( text.substring( 4, 41 ), 4, 4, 41-4 ) ); //Bank and Monument Underground station
 78+ Term london = new Term("London");
 79+ Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(london, getConcept("City_of_London"));
 80+ Pair<Term, LocalConcept> london_as_Greater_London = new Pair<Term, LocalConcept>(london, getConcept("Greater_London"));
 81+ Pair<Term, LocalConcept> london_as_London_city_council = new Pair<Term, LocalConcept>(london, getConcept("London_city_council"));
19182
192 - phrases.add( new PhraseOccurance( text.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument
193 - phrases.add( new PhraseOccurance( text.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground
194 - phrases.add( new PhraseOccurance( text.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station
 83+ Term underground = new Term("Underground");
 84+ Pair<Term, LocalConcept> underground_as_Subway = new Pair<Term, LocalConcept>(underground, getConcept("Subway"));
 85+ Pair<Term, LocalConcept> underground_as_London_Undrerground = new Pair<Term, LocalConcept>(underground, getConcept("London_Underground"));
19586
196 - phrases.add( new PhraseOccurance( text.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
197 - phrases.add( new PhraseOccurance( text.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station
 87+ List<Term> sequence = new ArrayList<Term>();
 88+ sequence.add(uk);
 89+
 90+ List<List<Term>> sequences = new ArrayList<List<Term>>();
 91+ sequences.add(sequence);
19892
199 - phrases.add( new PhraseOccurance( text.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 93+ Collection<Interpretation<Term, LocalConcept>> interpretations = disambiguator.getInterpretations(sequences, meaningFetcher.getMeanings(sequence));
20094
201 - PhraseOccuranceSet set = new PhraseOccuranceSet(text, phrases);
 95+ assertEquals("number of interpretations", 3, interpretations.size());
 96+ assertTrue("UK as United_Kingdom", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom )) );
 97+ assertTrue("UK as Great_Britain", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain )) );
 98+ assertTrue("UK as England", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England )) );
20299
 100+ ///////////////////////////////////////////////////////////////////////////////////////////////
 101+ }
 102+
 103+ public void testDisambiguatePhraseNode() throws PersistenceException {
 104+ PhraseOccuranceSet set = getBankAndMonumentPhrases();
 105+
203106 SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher);
204107 disambiguator.setInitialWindow(1);
205108 disambiguator.setWindow(3);
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java
@@ -0,0 +1,112 @@
 2+package de.brightbyte.wikiword.disambig;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+import java.util.Collections;
 8+import java.util.HashSet;
 9+import java.util.List;
 10+import java.util.Map;
 11+
 12+import de.brightbyte.util.PersistenceException;
 13+import de.brightbyte.wikiword.disambig.Disambiguator.Result;
 14+import de.brightbyte.wikiword.model.LocalConcept;
 15+import de.brightbyte.wikiword.model.PhraseOccurance;
 16+import de.brightbyte.wikiword.model.PhraseOccuranceSet;
 17+import de.brightbyte.wikiword.model.TermListNode;
 18+
 19+public class PopularityDisambiguatorTest extends DisambiguatorTestBase {
 20+
 21+ public PopularityDisambiguatorTest() throws IOException, PersistenceException {
 22+ super();
 23+ }
 24+
 25+ public void testGetTermsForList() throws PersistenceException {
 26+ PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 27+
 28+ Term uk = new Term("UK");
 29+ Term london = new Term("London");
 30+ Term underground = new Term("Underground");
 31+
 32+ ArrayList<Term> terms = new ArrayList<Term>();
 33+ terms.add(uk);
 34+ terms.add(london);
 35+ terms.add(underground);
 36+
 37+ Collection<Term> res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1);
 38+ assertEquals("depth 1", new HashSet<Term>( terms.subList(0, 1) ), res);
 39+
 40+ res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 2);
 41+ assertEquals("depth 2", new HashSet<Term>( terms.subList(0, 2) ), res);
 42+
 43+ res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1000);
 44+ assertEquals("depth 1000", new HashSet<Term>( terms ), res);
 45+ }
 46+
 47+ public void testGetTermsForNode() throws PersistenceException {
 48+ PhraseOccuranceSet set = getBankAndMonumentPhrases();
 49+
 50+ PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 51+ Collection<PhraseOccurance> terms = disambiguator.getTerms(set.getRootNode(), 0);
 52+ assertEquals("empty term set", Collections.emptySet(), terms);
 53+
 54+ //FIXME: Test case for getHorizon
 55+
 56+ terms = disambiguator.getTerms(set.getRootNode(), 1);
 57+ assertEquals("terms from depth 1", Collections.emptySet() /* fixme */, terms);
 58+ }
 59+
 60+ public void testGetMeaningsForList() throws PersistenceException {
 61+ PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 62+
 63+ Term uk = new Term("UK");
 64+ Term london = new Term("London");
 65+ Term underground = new Term("Underground");
 66+
 67+ ArrayList<Term> terms = new ArrayList<Term>();
 68+ terms.add(uk);
 69+ terms.add(london);
 70+ terms.add(underground);
 71+
 72+ Map<Term, List<? extends LocalConcept>> meanings = disambiguator.getMeanings(terms);
 73+
 74+ assertEquals(uk.getTerm(), meanings.get(uk.getTerm()), meanings.get(uk));
 75+ assertEquals(london.getTerm(), meanings.get(london.getTerm()), meanings.get(london));
 76+ assertEquals(underground.getTerm(), meanings.get(underground.getTerm()), meanings.get(underground));
 77+ }
 78+
 79+ public void testGetMeaningsForNode() throws PersistenceException {
 80+ throw new UnsupportedOperationException("not yet implemented");
 81+ //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 82+ //disambiguator.getMeanings(terms);
 83+ }
 84+
 85+ public void testGetSequences() throws PersistenceException {
 86+ throw new UnsupportedOperationException("not yet implemented");
 87+ //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 88+ //disambiguator.getSequences(root, depth);
 89+ }
 90+
 91+ public void testDisambiguateTerms() throws PersistenceException {
 92+ throw new UnsupportedOperationException("not yet implemented");
 93+ /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 94+
 95+ String[] sequence = {"UK", "London", "Underground", "Bank"};
 96+
 97+ Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null);
 98+ */
 99+ //// .............. ///
 100+ }
 101+
 102+ public void testDisambiguateNode() throws PersistenceException {
 103+ throw new UnsupportedOperationException("not yet implemented");
 104+ /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher);
 105+
 106+ String[] sequence = {"UK", "London", "Underground", "Bank"};
 107+
 108+ Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null);
 109+ */
 110+ //// .............. ///
 111+ }
 112+
 113+}
Property changes on: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java
___________________________________________________________________
Name: svn:mergeinfo
1114 +
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java
@@ -0,0 +1,232 @@
 2+package de.brightbyte.wikiword.disambig;
 3+
 4+import java.io.IOException;
 5+import java.io.InputStream;
 6+import java.net.URL;
 7+import java.util.ArrayList;
 8+import java.util.Collection;
 9+import java.util.HashMap;
 10+import java.util.List;
 11+import java.util.Map;
 12+
 13+import junit.framework.TestCase;
 14+import de.brightbyte.abstraction.ListAbstractor;
 15+import de.brightbyte.data.LabeledVector;
 16+import de.brightbyte.data.MapLabeledVector;
 17+import de.brightbyte.data.cursor.DataCursor;
 18+import de.brightbyte.io.ChunkingCursor;
 19+import de.brightbyte.io.GroupingCursor;
 20+import de.brightbyte.io.LineCursor;
 21+import de.brightbyte.text.CsvLineChunker;
 22+import de.brightbyte.util.PersistenceException;
 23+import de.brightbyte.wikiword.ConceptType;
 24+import de.brightbyte.wikiword.Corpus;
 25+import de.brightbyte.wikiword.TweakSet;
 26+import de.brightbyte.wikiword.model.ConceptFeatures;
 27+import de.brightbyte.wikiword.model.LocalConcept;
 28+import de.brightbyte.wikiword.model.PhraseOccurance;
 29+import de.brightbyte.wikiword.model.PhraseOccuranceSet;
 30+import de.brightbyte.wikiword.model.TermReference;
 31+
 32+public class DisambiguatorTestBase extends TestCase {
 33+
 34+ protected Map<String, List<? extends LocalConcept>> meanings = new HashMap<String, List<? extends LocalConcept>>();
 35+ protected Map<Integer, ConceptFeatures<LocalConcept, Integer>> features = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>();
 36+ protected Map<Integer, LocalConcept> conceptsById = new HashMap<Integer, LocalConcept>();
 37+ protected Map<String, LocalConcept> conceptsByName = new HashMap<String, LocalConcept>();
 38+
 39+ protected static DataCursor<List<String>> openTableCursor(InputStream in, String enc) throws IOException {
 40+ ChunkingCursor cursor = new ChunkingCursor(new LineCursor(in, enc), CsvLineChunker.tsv);
 41+ return cursor;
 42+ }
 43+
 44+ protected static DataCursor<List<List<String>>> openGroupedTableCursor(InputStream in, String enc, int groupBy, boolean skipHeader) throws IOException, PersistenceException {
 45+ DataCursor<List<String>> c = openTableCursor(in, enc);
 46+ if (skipHeader) c.next(); //skip first line
 47+
 48+ return new GroupingCursor<List<String>, String>(c, new ListAbstractor.Accessor<String>(groupBy));
 49+ }
 50+
 51+ protected static void readMeanings(Corpus corpus, InputStream in, Map<String, List<? extends LocalConcept>> meanings) throws IOException, PersistenceException {
 52+ DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true);
 53+
 54+ List<List<String>> group;
 55+ while ((group = cursor.next()) != null) {
 56+ List<LocalConcept> concepts = new ArrayList<LocalConcept>(group.size());
 57+ String term = null;
 58+
 59+ for (List<String> row: group) {
 60+ term = row.get(0);
 61+ int id = Integer.parseInt(row.get(1));
 62+ String name = row.get(2);
 63+ int freq = Integer.parseInt(row.get(3));
 64+ int rule = Integer.parseInt(row.get(4));
 65+
 66+ int score = ((rule==10 || rule==30) && freq<2) ? 0 : freq*rule;
 67+
 68+ LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name);
 69+ c.setCardinality(freq);
 70+ c.setRelevance(score);
 71+
 72+ concepts.add(c);
 73+ }
 74+
 75+ if (term!=null) meanings.put(term, concepts);
 76+ }
 77+
 78+ cursor.close();
 79+ }
 80+
 81+ protected static void readFeatures(Corpus corpus, InputStream in, Map<Integer, ConceptFeatures<LocalConcept, Integer>> features) throws IOException, PersistenceException {
 82+ DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true);
 83+
 84+ List<List<String>> group;
 85+ while ((group = cursor.next()) != null) {
 86+ LabeledVector<Integer> v = new MapLabeledVector<Integer>();
 87+ Integer id = null;
 88+ String name = null;
 89+
 90+ for (List<String> row: group) {
 91+ id = new Integer(row.get(0));
 92+ name = row.get(1);
 93+
 94+ int feature = Integer.parseInt(row.get(2));
 95+ double value = Double.parseDouble(row.get(3));
 96+
 97+ v.set(feature, value);
 98+ }
 99+
 100+ if (id!=null) {
 101+ double len = v.getLength();
 102+ v = v.scaled(len); //normalize
 103+
 104+ LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name);
 105+ ConceptFeatures<LocalConcept, Integer> f = new ConceptFeatures<LocalConcept, Integer>(c, v);
 106+ features.put(id, f);
 107+ }
 108+ }
 109+
 110+ cursor.close();
 111+ }
 112+
 113+ protected MeaningFetcher<LocalConcept> meaningFetcher = new MeaningFetcher<LocalConcept>() {
 114+
 115+ public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings(
 116+ Collection<X> terms) throws PersistenceException {
 117+ Map<X, List<? extends LocalConcept>> m = new HashMap<X, List<? extends LocalConcept>>();
 118+
 119+ for (X t: terms) {
 120+ List<? extends LocalConcept> n = getMeanings(t.getTerm());
 121+ if (n!=null) m.put(t, n);
 122+ }
 123+
 124+ return m;
 125+ }
 126+
 127+ public List<? extends LocalConcept> getMeanings(String term)
 128+ throws PersistenceException {
 129+ return meanings.get(term);
 130+ }
 131+
 132+ };
 133+
 134+ protected FeatureFetcher<LocalConcept, Integer> featureFetcher = new FeatureFetcher<LocalConcept, Integer>() {
 135+
 136+ public boolean getFeaturesAreNormalized() {
 137+ return true;
 138+ }
 139+
 140+ public Map<Integer, ConceptFeatures<LocalConcept, Integer>> getFeatures(
 141+ Collection<? extends LocalConcept> concepts) throws PersistenceException {
 142+ Map<Integer, ConceptFeatures<LocalConcept, Integer>> m = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>();
 143+
 144+ for (LocalConcept c: concepts) {
 145+ ConceptFeatures<LocalConcept, Integer> f = getFeatures(c);
 146+ m.put(c.getId(), f);
 147+ }
 148+
 149+ return m;
 150+ }
 151+
 152+ public ConceptFeatures<LocalConcept, Integer> getFeatures(LocalConcept c)
 153+ throws PersistenceException {
 154+ return features.get(c.getId());
 155+ }
 156+
 157+ };
 158+
 159+ protected Corpus corpus;
 160+ protected TweakSet tweaks;
 161+
 162+ public DisambiguatorTestBase() throws IOException, PersistenceException {
 163+ tweaks = new TweakSet();
 164+ corpus = Corpus.forName("TEST", "en", tweaks);
 165+
 166+ URL meaningFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-meanings.csv");
 167+ URL featureFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-features.csv");
 168+
 169+ readMeanings(corpus, meaningFile.openStream(), meanings);
 170+ readFeatures(corpus, featureFile.openStream(), features);
 171+
 172+ for (List<? extends LocalConcept> concepts: meanings.values()) {
 173+ for (LocalConcept c: concepts) {
 174+ conceptsById.put(c.getId(), c);
 175+ conceptsByName.put(c.getName(), c);
 176+ }
 177+ }
 178+ }
 179+
 180+ protected List<Term> terms(String... terms) {
 181+ List<Term> list = new ArrayList<Term>();
 182+ for (String t: terms) list.add(new Term(t));
 183+ return list;
 184+ }
 185+
 186+ protected LocalConcept getConcept(String name) {
 187+ LocalConcept c = conceptsByName.get(name);
 188+ return c;
 189+ }
 190+
 191+ protected LocalConcept getConcept(int id) {
 192+ LocalConcept c = conceptsById.get(id);
 193+ return c;
 194+ }
 195+
 196+ protected <X extends TermReference>Map<X, List<? extends LocalConcept>> getMeanings(Collection<List<X>> sequences) throws PersistenceException {
 197+ Map<X, List<? extends LocalConcept>> m = new HashMap<X, List<? extends LocalConcept>>();
 198+
 199+ for (List<X> seq: sequences) {
 200+ Map<X, List<? extends LocalConcept>> meanings = meaningFetcher.getMeanings(seq);
 201+ m.putAll(meanings);
 202+ }
 203+
 204+ return m;
 205+ }
 206+
 207+ protected PhraseOccuranceSet getBankAndMonumentPhrases() {
 208+ String text = "The Bank and Monument Underground station";
 209+ List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
 210+
 211+ phrases.add( new PhraseOccurance( text.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank
 212+ phrases.add( new PhraseOccurance( text.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument
 213+ phrases.add( new PhraseOccurance( text.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground
 214+
 215+ phrases.add( new PhraseOccurance( text.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank
 216+ phrases.add( new PhraseOccurance( text.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument
 217+ phrases.add( new PhraseOccurance( text.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground
 218+ //phrases.add( new PhraseOccurance( text.substring( 4, 41 ), 4, 4, 41-4 ) ); //Bank and Monument Underground station
 219+
 220+ phrases.add( new PhraseOccurance( text.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument
 221+ phrases.add( new PhraseOccurance( text.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground
 222+ phrases.add( new PhraseOccurance( text.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station
 223+
 224+ phrases.add( new PhraseOccurance( text.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground
 225+ phrases.add( new PhraseOccurance( text.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station
 226+
 227+ phrases.add( new PhraseOccurance( text.substring( 34, 41 ), 1, 34, 41-34 ) ); //station
 228+
 229+ PhraseOccuranceSet set = new PhraseOccuranceSet(text, phrases);
 230+ return set;
 231+ }
 232+
 233+}
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -206,7 +206,7 @@
207207 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
208208 FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context);
209209
210 - List<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(sequences, meanings);
 210+ Collection<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(sequences, meanings);
211211
212212 return getBestInterpretation(root, meanings, context, interpretations, similarities, features);
213213 }
@@ -240,7 +240,7 @@
241241 }
242242
243243 protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings,
244 - Collection<? extends LocalConcept> context, List<Disambiguator.Interpretation<X, LocalConcept>> interpretations,
 244+ Collection<? extends LocalConcept> context, Collection<Disambiguator.Interpretation<X, LocalConcept>> interpretations,
245245 LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException {
246246
247247 List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>();
@@ -269,17 +269,18 @@
270270 return r;
271271 }
272272
273 - protected <X extends TermReference>List<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(Collection<List<X>> sequences, Map<X, List<? extends LocalConcept>> meanings) {
 273+ public <X extends TermReference>Collection<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(Collection<List<X>> sequences, Map<X, List<? extends LocalConcept>> meanings) {
274274 List<Disambiguator.Interpretation<X, LocalConcept>> interpretations = new ArrayList<Disambiguator.Interpretation<X, LocalConcept>>();
275275 for (List<X> sq: sequences) {
276 - List<Disambiguator.Interpretation<X, LocalConcept>> sqint = getSequenceInterpretations(sq, meanings);
 276+ if (sq.isEmpty()) continue;
 277+ Collection<Disambiguator.Interpretation<X, LocalConcept>> sqint = getSequenceInterpretations(sq, meanings);
277278 interpretations.addAll(sqint);
278279 }
279280
280281 return interpretations;
281282 }
282283
283 - protected <X extends TermReference>List<Disambiguator.Interpretation<X, LocalConcept>> getSequenceInterpretations(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings) {
 284+ public <X extends TermReference>Collection<Disambiguator.Interpretation<X, LocalConcept>> getSequenceInterpretations(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings) {
284285 if (sequence.size()==0) {
285286 return Collections.singletonList(new Disambiguator.Interpretation<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), sequence));
286287 }
@@ -287,7 +288,7 @@
288289 X t = sequence.get(0);
289290 List<? extends LocalConcept> m = meanings.get(t);
290291
291 - List<Disambiguator.Interpretation<X, LocalConcept>> base = getSequenceInterpretations(sequence.subList(1, sequence.size()), meanings);
 292+ Collection<Disambiguator.Interpretation<X, LocalConcept>> base = getSequenceInterpretations(sequence.subList(1, sequence.size()), meanings);
292293
293294 if (m==null || m.size()==0) return base;
294295
@@ -299,7 +300,10 @@
300301 e.putAll(be.getMeanings());
301302 e.put(t, c);
302303
303 - interpretations.add(new Disambiguator.Interpretation<X, LocalConcept>(e, sequence));
 304+ if (!sequence.isEmpty()) {
 305+ Disambiguator.Interpretation<X, LocalConcept>interp = new Disambiguator.Interpretation<X, LocalConcept>(e, sequence);
 306+ interpretations.add(interp);
 307+ }
304308 }
305309 }
306310
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -18,7 +18,7 @@
1919 public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> {
2020
2121 public interface NodeListener<T extends TermReference> {
22 - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence);
 22+ public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal);
2323 }
2424
2525 public static class SequenceSetBuilder <T extends TermReference> implements NodeListener<T> {
@@ -28,9 +28,8 @@
2929 seqencees = new ArrayList<List<T>>();
3030 }
3131
32 - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) {
33 - Collection<?> successors = node.getSuccessors();
34 - if (successors==null || successors.isEmpty()) { //is leaf
 32+ public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) {
 33+ if (terminal) {
3534 List<T> p = new ArrayList<T>(seqence); //clone
3635 seqencees.add(p);
3736 }
@@ -48,7 +47,7 @@
4948 terms = new HashSet<T>();
5049 }
5150
52 - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) {
 51+ public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) {
5352 T t = node.getTermReference();
5453 if (t.getTerm().length()>0) terms.add(t);
5554 }
@@ -127,13 +126,17 @@
128127
129128 X t = root.getTermReference();
130129 if (t.getTerm().length()>0) seqence.add(t); //push
 130+ else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes.
131131
 132+ boolean terminal = (depth<=1);
 133+
 134+ Collection<? extends PhraseNode<X>> successors = terminal ? null : root.getSuccessors();
 135+ if (successors==null || successors.isEmpty()) terminal = true;
 136+
132137 if (nodeListener!=null)
133 - nodeListener.onNode(root, seqence);
 138+ nodeListener.onNode(root, seqence, terminal);
134139
135 - Collection<? extends PhraseNode<X>> successors = root.getSuccessors();
136 -
137 - if (depth>1 && successors!=null) {
 140+ if (!terminal) {
138141 for (PhraseNode<X> n: successors) {
139142 walk(n, seqence, nodeListener, depth-1);
140143 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java
@@ -57,7 +57,7 @@
5858 if (to-from < 2) {
5959 r = popularityDisambiguator.disambiguate(frame, meanings, context);
6060 } else {
61 - List<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(frame, interpretation, meanings);
 61+ Collection<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(frame, interpretation, meanings);
6262 r = getBestInterpretation(node, meanings, context, interpretations, similarities, features);
6363 }
6464
@@ -97,7 +97,7 @@
9898
9999 if (initialWindow > 0) { //apply full coherence disambig to initial window size. initialWindow == 1 will trigger a popularity disambig.
100100 Collection<List<X>> sequences = getSequences(root, initialWindow);
101 - Result<X, LocalConcept> r = disambiguate(sequences, root, meanings, context);
 101+ Result<X, LocalConcept> r = super.disambiguate(sequences, root, meanings, context);
102102
103103 sequence.addAll(r.getSequence());
104104 currentNode = getLastNode(root, sequence);
@@ -130,7 +130,7 @@
131131 return getScore(new Disambiguator.Interpretation<X, LocalConcept>(disambig, sequence), context, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates.
132132 }
133133
134 - protected <X extends TermReference>List<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(List<X> frame, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) {
 134+ protected <X extends TermReference>Collection<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(List<X> frame, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) {
135135 //strip out all terms with no known meaning
136136 if (meanings.keySet().size() != frame.size()) {
137137 List<X> t = new ArrayList<X>(frame.size());
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java
@@ -1,9 +1,13 @@
22 package de.brightbyte.wikiword.disambig;
33
 4+import java.util.ArrayList;
 5+import java.util.Arrays;
46 import java.util.Collection;
 7+import java.util.HashMap;
58 import java.util.List;
69 import java.util.Map;
710
 11+import de.brightbyte.data.Pair;
812 import de.brightbyte.io.Output;
913 import de.brightbyte.util.PersistenceException;
1014 import de.brightbyte.wikiword.model.PhraseNode;
@@ -11,12 +15,39 @@
1216 import de.brightbyte.wikiword.model.WikiWordConcept;
1317
1418 public interface Disambiguator<T extends TermReference, C extends WikiWordConcept> {
15 -
 19+
1620 public static class Interpretation<T extends TermReference, C extends WikiWordConcept> {
1721 private final Map<T, C> meanings;
1822 private final List<T> sequence;
1923
 24+ private static <T extends TermReference, C extends WikiWordConcept>Map<T, C> buildMeaningMap(List<Pair<T, C>> interpretation) {
 25+ Map<T, C> sequence = new HashMap<T, C>(interpretation.size());
 26+ for (Pair<T, C> p: interpretation) {
 27+ sequence.put(p.getA(), p.getB());
 28+ }
 29+ return sequence;
 30+ }
 31+
 32+ private static <T extends TermReference, C extends WikiWordConcept>List<T> buildTermSequence(List<Pair<T, C>> interpretation) {
 33+ List<T> sequence = new ArrayList<T>(interpretation.size());
 34+ for (Pair<T, C> p: interpretation) {
 35+ sequence.add(p.getA());
 36+ }
 37+ return sequence;
 38+ }
 39+
 40+ public Interpretation(Pair<T, C>... interpretation) {
 41+ this(Arrays.asList(interpretation));
 42+ }
 43+
 44+ public Interpretation(List<Pair<T, C>> interpretation) {
 45+ this(buildMeaningMap(interpretation), buildTermSequence(interpretation));
 46+ }
 47+
2048 public Interpretation(final Map<T, C> meanings, final List<T> sequence) {
 49+ if (meanings==null) throw new NullPointerException();
 50+ if (sequence==null) throw new NullPointerException();
 51+
2152 this.meanings = meanings;
2253 this.sequence = sequence;
2354 }
@@ -24,9 +55,61 @@
2556 public Map<T, C> getMeanings() {
2657 return meanings;
2758 }
 59+
2860 public List<T> getSequence() {
2961 return sequence;
3062 }
 63+
 64+ public String toString() {
 65+ if (sequence.isEmpty()) return "()";
 66+
 67+ StringBuilder b = new StringBuilder();
 68+ b.append("(");
 69+
 70+ for (T t: sequence) {
 71+ C c = meanings.get(t);
 72+ b.append(t);
 73+ b.append("=>");
 74+ b.append(c);
 75+ b.append("; ");
 76+ }
 77+
 78+ b.append(")");
 79+ return b.toString();
 80+ }
 81+
 82+ @Override
 83+ public int hashCode() {
 84+ final int PRIME = 31;
 85+ int result = 1;
 86+ result = PRIME * result + ((meanings == null) ? 0 : meanings.hashCode());
 87+ result = PRIME * result + ((sequence == null) ? 0 : sequence.hashCode());
 88+ return result;
 89+ }
 90+
 91+ @Override
 92+ public boolean equals(Object obj) {
 93+ if (this == obj)
 94+ return true;
 95+ if (obj == null)
 96+ return false;
 97+ if (getClass() != obj.getClass())
 98+ return false;
 99+ final Interpretation other = (Interpretation) obj;
 100+ if (meanings == null) {
 101+ if (other.meanings != null)
 102+ return false;
 103+ } else if (!meanings.equals(other.meanings))
 104+ return false;
 105+ if (sequence == null) {
 106+ if (other.sequence != null)
 107+ return false;
 108+ } else if (!sequence.equals(other.sequence))
 109+ return false;
 110+ return true;
 111+ }
 112+
 113+
31114 }
32115
33116 public static class Result<T extends TermReference, C extends WikiWordConcept> implements Comparable {

Status & tagging log