Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest.java |
— | — | @@ -1,204 +1,107 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | | -import java.io.InputStream; |
6 | | -import java.net.URL; |
7 | 5 | import java.util.ArrayList; |
8 | 6 | import java.util.Collection; |
9 | | -import java.util.HashMap; |
10 | 7 | import java.util.List; |
11 | 8 | import java.util.Map; |
12 | 9 | |
13 | | -import de.brightbyte.abstraction.ListAbstractor; |
14 | | -import de.brightbyte.data.LabeledVector; |
15 | | -import de.brightbyte.data.MapLabeledVector; |
16 | | -import de.brightbyte.data.cursor.DataCursor; |
17 | | -import de.brightbyte.io.ChunkingCursor; |
18 | | -import de.brightbyte.io.GroupingCursor; |
19 | | -import de.brightbyte.io.LineCursor; |
20 | | -import de.brightbyte.text.CsvLineChunker; |
| 10 | +import de.brightbyte.data.Pair; |
21 | 11 | import de.brightbyte.util.PersistenceException; |
22 | | -import de.brightbyte.wikiword.ConceptType; |
23 | | -import de.brightbyte.wikiword.Corpus; |
24 | | -import de.brightbyte.wikiword.TweakSet; |
| 12 | +import de.brightbyte.wikiword.disambig.Disambiguator.Interpretation; |
25 | 13 | import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
26 | | -import de.brightbyte.wikiword.model.ConceptFeatures; |
27 | 14 | import de.brightbyte.wikiword.model.LocalConcept; |
28 | 15 | import de.brightbyte.wikiword.model.PhraseOccurance; |
29 | 16 | import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
30 | | -import de.brightbyte.wikiword.model.TermReference; |
31 | | -import junit.framework.TestCase; |
32 | 17 | |
33 | | -public class SlidingCoherenceDisambiguatorTest extends TestCase { |
| 18 | +public class SlidingCoherenceDisambiguatorTest extends DisambiguatorTestBase { |
34 | 19 | |
35 | | - protected Map<String, List<? extends LocalConcept>> meanings = new HashMap<String, List<? extends LocalConcept>>(); |
36 | | - protected Map<Integer, ConceptFeatures<LocalConcept, Integer>> features = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>(); |
37 | | - |
38 | | - protected static DataCursor<List<String>> openTableCursor(InputStream in, String enc) throws IOException { |
39 | | - ChunkingCursor cursor = new ChunkingCursor(new LineCursor(in, enc), CsvLineChunker.tsv); |
40 | | - return cursor; |
| 20 | + public SlidingCoherenceDisambiguatorTest() throws IOException, PersistenceException { |
| 21 | + super(); |
41 | 22 | } |
42 | 23 | |
43 | | - protected static DataCursor<List<List<String>>> openGroupedTableCursor(InputStream in, String enc, int groupBy, boolean skipHeader) throws IOException, PersistenceException { |
44 | | - DataCursor<List<String>> c = openTableCursor(in, enc); |
45 | | - if (skipHeader) c.next(); //skip first line |
| 24 | + public void testGetSequenceInterpretations() throws PersistenceException { |
| 25 | + SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher); |
46 | 26 | |
47 | | - return new GroupingCursor<List<String>, String>(c, new ListAbstractor.Accessor<String>(groupBy)); |
48 | | - } |
49 | | - |
50 | | - protected static void readMeanings(Corpus corpus, InputStream in, Map<String, List<? extends LocalConcept>> meanings) throws IOException, PersistenceException { |
51 | | - DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true); |
| 27 | + Term uk = new Term("UK"); |
| 28 | + Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(uk, getConcept("United_Kingdom")); |
| 29 | + Pair<Term, LocalConcept> uk_as_Great_Britain = new Pair<Term, LocalConcept>(uk, getConcept("Great_Britain")); |
| 30 | + Pair<Term, LocalConcept> uk_as_England = new Pair<Term, LocalConcept>(uk, getConcept("England")); |
52 | 31 | |
53 | | - List<List<String>> group; |
54 | | - while ((group = cursor.next()) != null) { |
55 | | - List<LocalConcept> concepts = new ArrayList<LocalConcept>(group.size()); |
56 | | - String term = null; |
57 | | - |
58 | | - for (List<String> row: group) { |
59 | | - term = row.get(0); |
60 | | - int id = Integer.parseInt(row.get(1)); |
61 | | - String name = row.get(2); |
62 | | - int freq = Integer.parseInt(row.get(3)); |
63 | | - int rule = Integer.parseInt(row.get(4)); |
64 | | - |
65 | | - int score = ((rule==10 || rule==30) && freq<2) ? 0 : freq*rule; |
66 | | - |
67 | | - LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name); |
68 | | - c.setCardinality(freq); |
69 | | - c.setRelevance(score); |
70 | | - |
71 | | - concepts.add(c); |
72 | | - } |
73 | | - |
74 | | - if (term!=null) meanings.put(term, concepts); |
75 | | - } |
| 32 | + Term london = new Term("London"); |
| 33 | + Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(london, getConcept("City_of_London")); |
| 34 | + Pair<Term, LocalConcept> london_as_Greater_London = new Pair<Term, LocalConcept>(london, getConcept("Greater_London")); |
| 35 | + Pair<Term, LocalConcept> london_as_London_city_council = new Pair<Term, LocalConcept>(london, getConcept("London_city_council")); |
| 36 | + |
| 37 | + List<Term> sequence = new ArrayList<Term>(); |
| 38 | + sequence.add(uk); |
| 39 | + |
| 40 | + Collection<Interpretation<Term, LocalConcept>> interpretations = disambiguator.getSequenceInterpretations(sequence, meaningFetcher.getMeanings(sequence)); |
76 | 41 | |
77 | | - cursor.close(); |
78 | | - } |
79 | | - |
80 | | - protected static void readFeatures(Corpus corpus, InputStream in, Map<Integer, ConceptFeatures<LocalConcept, Integer>> features) throws IOException, PersistenceException { |
81 | | - DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true); |
| 42 | + assertEquals("number of interpretations", 3, interpretations.size()); |
| 43 | + assertTrue("UK as United_Kingdom", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom )) ); |
| 44 | + assertTrue("UK as Great_Britain", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain )) ); |
| 45 | + assertTrue("UK as England", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England )) ); |
82 | 46 | |
83 | | - List<List<String>> group; |
84 | | - while ((group = cursor.next()) != null) { |
85 | | - LabeledVector<Integer> v = new MapLabeledVector<Integer>(); |
86 | | - Integer id = null; |
87 | | - String name = null; |
88 | | - |
89 | | - for (List<String> row: group) { |
90 | | - id = new Integer(row.get(0)); |
91 | | - name = row.get(1); |
92 | | - |
93 | | - int feature = Integer.parseInt(row.get(2)); |
94 | | - double value = Double.parseDouble(row.get(3)); |
| 47 | + /////////////////////////////////////////////////////////////////////////////////// |
| 48 | + |
| 49 | + sequence = new ArrayList<Term>(); |
| 50 | + sequence.add(uk); |
| 51 | + sequence.add(london); |
95 | 52 | |
96 | | - v.set(feature, value); |
97 | | - } |
98 | | - |
99 | | - if (id!=null) { |
100 | | - double len = v.getLength(); |
101 | | - v = v.scaled(len); //normalize |
102 | | - |
103 | | - LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name); |
104 | | - ConceptFeatures<LocalConcept, Integer> f = new ConceptFeatures<LocalConcept, Integer>(c, v); |
105 | | - features.put(id, f); |
106 | | - } |
107 | | - } |
| 53 | + interpretations = disambiguator.getSequenceInterpretations(sequence, meaningFetcher.getMeanings(sequence)); |
108 | 54 | |
109 | | - cursor.close(); |
110 | | - } |
111 | | - |
112 | | - private MeaningFetcher<LocalConcept> meaningFetcher = new MeaningFetcher<LocalConcept>() { |
113 | | - |
114 | | - public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings( |
115 | | - Collection<X> terms) throws PersistenceException { |
116 | | - Map<X, List<? extends LocalConcept>> m = new HashMap<X, List<? extends LocalConcept>>(); |
117 | | - |
118 | | - for (X t: terms) { |
119 | | - List<? extends LocalConcept> n = getMeanings(t.getTerm()); |
120 | | - m.put(t, n); |
121 | | - } |
122 | | - |
123 | | - return m; |
124 | | - } |
125 | | - |
126 | | - public List<? extends LocalConcept> getMeanings(String term) |
127 | | - throws PersistenceException { |
128 | | - return meanings.get(term); |
129 | | - } |
130 | | - |
131 | | - }; |
132 | | - |
133 | | - private FeatureFetcher<LocalConcept, Integer> featureFetcher = new FeatureFetcher<LocalConcept, Integer>() { |
134 | | - |
135 | | - public boolean getFeaturesAreNormalized() { |
136 | | - return true; |
137 | | - } |
138 | | - |
139 | | - public Map<Integer, ConceptFeatures<LocalConcept, Integer>> getFeatures( |
140 | | - Collection<? extends LocalConcept> concepts) throws PersistenceException { |
141 | | - Map<Integer, ConceptFeatures<LocalConcept, Integer>> m = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>(); |
142 | | - |
143 | | - for (LocalConcept c: concepts) { |
144 | | - ConceptFeatures<LocalConcept, Integer> f = getFeatures(c); |
145 | | - m.put(c.getId(), f); |
146 | | - } |
147 | | - |
148 | | - return m; |
149 | | - } |
150 | | - |
151 | | - public ConceptFeatures<LocalConcept, Integer> getFeatures(LocalConcept c) |
152 | | - throws PersistenceException { |
153 | | - return features.get(c.getId()); |
154 | | - } |
155 | | - |
156 | | - }; |
157 | | - |
158 | | - protected Corpus corpus; |
159 | | - protected TweakSet tweaks; |
160 | | - |
161 | | - public SlidingCoherenceDisambiguatorTest() throws IOException, PersistenceException { |
162 | | - tweaks = new TweakSet(); |
163 | | - corpus = Corpus.forName("TEST", "en", tweaks); |
| 55 | + assertEquals("number of interpretations", 9, interpretations.size()); |
164 | 56 | |
165 | | - URL meaningFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-meanings.csv"); |
166 | | - URL featureFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-features.csv"); |
| 57 | + assertTrue("UK as United_Kingdom; London as City_of_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom, london_as_City_of_London )) ); |
| 58 | + assertTrue("UK as Great_Britain; London as City_of_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain, london_as_City_of_London )) ); |
| 59 | + assertTrue("UK as England; London as City_of_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England, london_as_City_of_London )) ); |
167 | 60 | |
168 | | - readMeanings(corpus, meaningFile.openStream(), meanings); |
169 | | - readFeatures(corpus, featureFile.openStream(), features); |
| 61 | + assertTrue("UK as United_Kingdom; London as Greater_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom, london_as_Greater_London )) ); |
| 62 | + assertTrue("UK as Great_Britain; London as Greater_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain, london_as_Greater_London )) ); |
| 63 | + assertTrue("UK as England; London as Greater_London", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England, london_as_Greater_London )) ); |
| 64 | + |
| 65 | + assertTrue("UK as United_Kingdom; London as London_city_council", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom, london_as_London_city_council )) ); |
| 66 | + assertTrue("UK as Great_Britain; London as London_city_council", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain, london_as_London_city_council )) ); |
| 67 | + assertTrue("UK as England; London as London_city_council", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England, london_as_London_city_council )) ); |
170 | 68 | } |
171 | 69 | |
172 | | - protected List<Term> terms(String... terms) { |
173 | | - List<Term> list = new ArrayList<Term>(); |
174 | | - for (String t: terms) list.add(new Term(t)); |
175 | | - return list; |
176 | | - } |
177 | | - |
178 | | - public void testDisambiguatePhraseNode() throws PersistenceException { |
179 | | - String text = "The Bank and Monument Underground station"; |
180 | | - // 012345678901234567890123456789012345678901234567890 |
181 | | - List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 70 | + public void testGetInterpretations() throws PersistenceException { |
| 71 | + SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher); |
182 | 72 | |
183 | | - phrases.add( new PhraseOccurance( text.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank |
184 | | - phrases.add( new PhraseOccurance( text.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument |
185 | | - phrases.add( new PhraseOccurance( text.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground |
| 73 | + Term uk = new Term("UK"); |
| 74 | + Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(uk, getConcept("United_Kingdom")); |
| 75 | + Pair<Term, LocalConcept> uk_as_Great_Britain = new Pair<Term, LocalConcept>(uk, getConcept("Great_Britain")); |
| 76 | + Pair<Term, LocalConcept> uk_as_England = new Pair<Term, LocalConcept>(uk, getConcept("England")); |
186 | 77 | |
187 | | - phrases.add( new PhraseOccurance( text.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank |
188 | | - phrases.add( new PhraseOccurance( text.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument |
189 | | - phrases.add( new PhraseOccurance( text.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground |
190 | | - //phrases.add( new PhraseOccurance( text.substring( 4, 41 ), 4, 4, 41-4 ) ); //Bank and Monument Underground station |
| 78 | + Term london = new Term("London"); |
| 79 | + Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(london, getConcept("City_of_London")); |
| 80 | + Pair<Term, LocalConcept> london_as_Greater_London = new Pair<Term, LocalConcept>(london, getConcept("Greater_London")); |
| 81 | + Pair<Term, LocalConcept> london_as_London_city_council = new Pair<Term, LocalConcept>(london, getConcept("London_city_council")); |
191 | 82 | |
192 | | - phrases.add( new PhraseOccurance( text.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument |
193 | | - phrases.add( new PhraseOccurance( text.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground |
194 | | - phrases.add( new PhraseOccurance( text.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station |
| 83 | + Term underground = new Term("Underground"); |
| 84 | + Pair<Term, LocalConcept> underground_as_Subway = new Pair<Term, LocalConcept>(underground, getConcept("Subway")); |
| 85 | + Pair<Term, LocalConcept> underground_as_London_Undrerground = new Pair<Term, LocalConcept>(underground, getConcept("London_Underground")); |
195 | 86 | |
196 | | - phrases.add( new PhraseOccurance( text.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
197 | | - phrases.add( new PhraseOccurance( text.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground stations |
| 87 | + List<Term> sequence = new ArrayList<Term>(); |
| 88 | + sequence.add(uk); |
| 89 | + |
| 90 | + List<List<Term>> sequences = new ArrayList<List<Term>>(); |
| 91 | + sequences.add(sequence); |
198 | 92 | |
199 | | - phrases.add( new PhraseOccurance( text.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 93 | + Collection<Interpretation<Term, LocalConcept>> interpretations = disambiguator.getInterpretations(sequences, meaningFetcher.getMeanings(sequence)); |
200 | 94 | |
201 | | - PhraseOccuranceSet set = new PhraseOccuranceSet(text, phrases); |
| 95 | + assertEquals("number of interpretations", 3, interpretations.size()); |
| 96 | + assertTrue("UK as United_Kingdom", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_United_Kingdom )) ); |
| 97 | + assertTrue("UK as Great_Britain", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_Great_Britain )) ); |
| 98 | + assertTrue("UK as England", interpretations.contains( new Disambiguator.Interpretation<Term, LocalConcept>( uk_as_England )) ); |
202 | 99 | |
| 100 | + /////////////////////////////////////////////////////////////////////////////////////////////// |
| 101 | + } |
| 102 | + |
| 103 | + public void testDisambiguatePhraseNode() throws PersistenceException { |
| 104 | + PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
| 105 | + |
203 | 106 | SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher); |
204 | 107 | disambiguator.setInitialWindow(1); |
205 | 108 | disambiguator.setWindow(3); |
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java |
— | — | @@ -0,0 +1,112 @@ |
| 2 | +package de.brightbyte.wikiword.disambig; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collection; |
| 7 | +import java.util.Collections; |
| 8 | +import java.util.HashSet; |
| 9 | +import java.util.List; |
| 10 | +import java.util.Map; |
| 11 | + |
| 12 | +import de.brightbyte.util.PersistenceException; |
| 13 | +import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
| 14 | +import de.brightbyte.wikiword.model.LocalConcept; |
| 15 | +import de.brightbyte.wikiword.model.PhraseOccurance; |
| 16 | +import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
| 17 | +import de.brightbyte.wikiword.model.TermListNode; |
| 18 | + |
| 19 | +public class PopularityDisambiguatorTest extends DisambiguatorTestBase { |
| 20 | + |
| 21 | + public PopularityDisambiguatorTest() throws IOException, PersistenceException { |
| 22 | + super(); |
| 23 | + } |
| 24 | + |
| 25 | + public void testGetTermsForList() throws PersistenceException { |
| 26 | + PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 27 | + |
| 28 | + Term uk = new Term("UK"); |
| 29 | + Term london = new Term("London"); |
| 30 | + Term underground = new Term("Underground"); |
| 31 | + |
| 32 | + ArrayList<Term> terms = new ArrayList<Term>(); |
| 33 | + terms.add(uk); |
| 34 | + terms.add(london); |
| 35 | + terms.add(underground); |
| 36 | + |
| 37 | + Collection<Term> res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1); |
| 38 | + assertEquals("depth 1", new HashSet<Term>( terms.subList(0, 1) ), res); |
| 39 | + |
| 40 | + res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 2); |
| 41 | + assertEquals("depth 2", new HashSet<Term>( terms.subList(0, 2) ), res); |
| 42 | + |
| 43 | + res = disambiguator.getTerms(new TermListNode<Term>(terms, 0), 1000); |
| 44 | + assertEquals("depth 1000", new HashSet<Term>( terms ), res); |
| 45 | + } |
| 46 | + |
| 47 | + public void testGetTermsForNode() throws PersistenceException { |
| 48 | + PhraseOccuranceSet set = getBankAndMonumentPhrases(); |
| 49 | + |
| 50 | + PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 51 | + Collection<PhraseOccurance> terms = disambiguator.getTerms(set.getRootNode(), 0); |
| 52 | + assertEquals("empty term set", Collections.emptySet(), terms); |
| 53 | + |
 | 54 | + //FIXME: add a test case for getHorizon |
| 55 | + |
| 56 | + terms = disambiguator.getTerms(set.getRootNode(), 1); |
 | 57 | + assertEquals("terms from depth 1", Collections.emptySet() /* FIXME: placeholder expectation; the real depth-1 terms still need to be filled in */, terms); |
| 58 | + } |
| 59 | + |
| 60 | + public void testGetMeaningsForList() throws PersistenceException { |
| 61 | + PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 62 | + |
| 63 | + Term uk = new Term("UK"); |
| 64 | + Term london = new Term("London"); |
| 65 | + Term underground = new Term("Underground"); |
| 66 | + |
| 67 | + ArrayList<Term> terms = new ArrayList<Term>(); |
| 68 | + terms.add(uk); |
| 69 | + terms.add(london); |
| 70 | + terms.add(underground); |
| 71 | + |
| 72 | + Map<Term, List<? extends LocalConcept>> meanings = disambiguator.getMeanings(terms); |
| 73 | + |
| 74 | + assertEquals(uk.getTerm(), meanings.get(uk.getTerm()), meanings.get(uk)); |
| 75 | + assertEquals(london.getTerm(), meanings.get(london.getTerm()), meanings.get(london)); |
| 76 | + assertEquals(underground.getTerm(), meanings.get(underground.getTerm()), meanings.get(underground)); |
| 77 | + } |
| 78 | + |
| 79 | + public void testGetMeaningsForNode() throws PersistenceException { |
| 80 | + throw new UnsupportedOperationException("not yet implemented"); |
| 81 | + //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 82 | + //disambiguator.getMeanings(terms); |
| 83 | + } |
| 84 | + |
| 85 | + public void testGetSequences() throws PersistenceException { |
| 86 | + throw new UnsupportedOperationException("not yet implemented"); |
| 87 | + //PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 88 | + //disambiguator.getSequences(root, depth); |
| 89 | + } |
| 90 | + |
| 91 | + public void testDisambiguateTerms() throws PersistenceException { |
| 92 | + throw new UnsupportedOperationException("not yet implemented"); |
| 93 | + /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 94 | + |
| 95 | + String[] sequence = {"UK", "London", "Underground", "Bank"}; |
| 96 | + |
| 97 | + Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null); |
| 98 | + */ |
| 99 | + //// .............. /// |
| 100 | + } |
| 101 | + |
| 102 | + public void testDisambiguateNode() throws PersistenceException { |
| 103 | + throw new UnsupportedOperationException("not yet implemented"); |
| 104 | + /*PopularityDisambiguator disambiguator = new PopularityDisambiguator(meaningFetcher); |
| 105 | + |
| 106 | + String[] sequence = {"UK", "London", "Underground", "Bank"}; |
| 107 | + |
| 108 | + Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null); |
| 109 | + */ |
| 110 | + //// .............. /// |
| 111 | + } |
| 112 | + |
| 113 | +} |
Property changes on: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/PopularityDisambiguatorTest.java |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 114 | + |
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/DisambiguatorTestBase.java |
— | — | @@ -0,0 +1,232 @@ |
| 2 | +package de.brightbyte.wikiword.disambig; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.InputStream; |
| 6 | +import java.net.URL; |
| 7 | +import java.util.ArrayList; |
| 8 | +import java.util.Collection; |
| 9 | +import java.util.HashMap; |
| 10 | +import java.util.List; |
| 11 | +import java.util.Map; |
| 12 | + |
| 13 | +import junit.framework.TestCase; |
| 14 | +import de.brightbyte.abstraction.ListAbstractor; |
| 15 | +import de.brightbyte.data.LabeledVector; |
| 16 | +import de.brightbyte.data.MapLabeledVector; |
| 17 | +import de.brightbyte.data.cursor.DataCursor; |
| 18 | +import de.brightbyte.io.ChunkingCursor; |
| 19 | +import de.brightbyte.io.GroupingCursor; |
| 20 | +import de.brightbyte.io.LineCursor; |
| 21 | +import de.brightbyte.text.CsvLineChunker; |
| 22 | +import de.brightbyte.util.PersistenceException; |
| 23 | +import de.brightbyte.wikiword.ConceptType; |
| 24 | +import de.brightbyte.wikiword.Corpus; |
| 25 | +import de.brightbyte.wikiword.TweakSet; |
| 26 | +import de.brightbyte.wikiword.model.ConceptFeatures; |
| 27 | +import de.brightbyte.wikiword.model.LocalConcept; |
| 28 | +import de.brightbyte.wikiword.model.PhraseOccurance; |
| 29 | +import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
| 30 | +import de.brightbyte.wikiword.model.TermReference; |
| 31 | + |
| 32 | +public class DisambiguatorTestBase extends TestCase { |
| 33 | + |
| 34 | + protected Map<String, List<? extends LocalConcept>> meanings = new HashMap<String, List<? extends LocalConcept>>(); |
| 35 | + protected Map<Integer, ConceptFeatures<LocalConcept, Integer>> features = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>(); |
| 36 | + protected Map<Integer, LocalConcept> conceptsById = new HashMap<Integer, LocalConcept>(); |
| 37 | + protected Map<String, LocalConcept> conceptsByName = new HashMap<String, LocalConcept>(); |
| 38 | + |
| 39 | + protected static DataCursor<List<String>> openTableCursor(InputStream in, String enc) throws IOException { |
| 40 | + ChunkingCursor cursor = new ChunkingCursor(new LineCursor(in, enc), CsvLineChunker.tsv); |
| 41 | + return cursor; |
| 42 | + } |
| 43 | + |
| 44 | + protected static DataCursor<List<List<String>>> openGroupedTableCursor(InputStream in, String enc, int groupBy, boolean skipHeader) throws IOException, PersistenceException { |
| 45 | + DataCursor<List<String>> c = openTableCursor(in, enc); |
| 46 | + if (skipHeader) c.next(); //skip first line |
| 47 | + |
| 48 | + return new GroupingCursor<List<String>, String>(c, new ListAbstractor.Accessor<String>(groupBy)); |
| 49 | + } |
| 50 | + |
| 51 | + protected static void readMeanings(Corpus corpus, InputStream in, Map<String, List<? extends LocalConcept>> meanings) throws IOException, PersistenceException { |
| 52 | + DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true); |
| 53 | + |
| 54 | + List<List<String>> group; |
| 55 | + while ((group = cursor.next()) != null) { |
| 56 | + List<LocalConcept> concepts = new ArrayList<LocalConcept>(group.size()); |
| 57 | + String term = null; |
| 58 | + |
| 59 | + for (List<String> row: group) { |
| 60 | + term = row.get(0); |
| 61 | + int id = Integer.parseInt(row.get(1)); |
| 62 | + String name = row.get(2); |
| 63 | + int freq = Integer.parseInt(row.get(3)); |
| 64 | + int rule = Integer.parseInt(row.get(4)); |
| 65 | + |
| 66 | + int score = ((rule==10 || rule==30) && freq<2) ? 0 : freq*rule; |
| 67 | + |
| 68 | + LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name); |
| 69 | + c.setCardinality(freq); |
| 70 | + c.setRelevance(score); |
| 71 | + |
| 72 | + concepts.add(c); |
| 73 | + } |
| 74 | + |
| 75 | + if (term!=null) meanings.put(term, concepts); |
| 76 | + } |
| 77 | + |
| 78 | + cursor.close(); |
| 79 | + } |
| 80 | + |
| 81 | + protected static void readFeatures(Corpus corpus, InputStream in, Map<Integer, ConceptFeatures<LocalConcept, Integer>> features) throws IOException, PersistenceException { |
| 82 | + DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true); |
| 83 | + |
| 84 | + List<List<String>> group; |
| 85 | + while ((group = cursor.next()) != null) { |
| 86 | + LabeledVector<Integer> v = new MapLabeledVector<Integer>(); |
| 87 | + Integer id = null; |
| 88 | + String name = null; |
| 89 | + |
| 90 | + for (List<String> row: group) { |
| 91 | + id = new Integer(row.get(0)); |
| 92 | + name = row.get(1); |
| 93 | + |
| 94 | + int feature = Integer.parseInt(row.get(2)); |
| 95 | + double value = Double.parseDouble(row.get(3)); |
| 96 | + |
| 97 | + v.set(feature, value); |
| 98 | + } |
| 99 | + |
| 100 | + if (id!=null) { |
| 101 | + double len = v.getLength(); |
| 102 | + v = v.scaled(len); //normalize |
| 103 | + |
| 104 | + LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name); |
| 105 | + ConceptFeatures<LocalConcept, Integer> f = new ConceptFeatures<LocalConcept, Integer>(c, v); |
| 106 | + features.put(id, f); |
| 107 | + } |
| 108 | + } |
| 109 | + |
| 110 | + cursor.close(); |
| 111 | + } |
| 112 | + |
| 113 | + protected MeaningFetcher<LocalConcept> meaningFetcher = new MeaningFetcher<LocalConcept>() { |
| 114 | + |
| 115 | + public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings( |
| 116 | + Collection<X> terms) throws PersistenceException { |
| 117 | + Map<X, List<? extends LocalConcept>> m = new HashMap<X, List<? extends LocalConcept>>(); |
| 118 | + |
| 119 | + for (X t: terms) { |
| 120 | + List<? extends LocalConcept> n = getMeanings(t.getTerm()); |
| 121 | + if (n!=null) m.put(t, n); |
| 122 | + } |
| 123 | + |
| 124 | + return m; |
| 125 | + } |
| 126 | + |
| 127 | + public List<? extends LocalConcept> getMeanings(String term) |
| 128 | + throws PersistenceException { |
| 129 | + return meanings.get(term); |
| 130 | + } |
| 131 | + |
| 132 | + }; |
| 133 | + |
| 134 | + protected FeatureFetcher<LocalConcept, Integer> featureFetcher = new FeatureFetcher<LocalConcept, Integer>() { |
| 135 | + |
| 136 | + public boolean getFeaturesAreNormalized() { |
| 137 | + return true; |
| 138 | + } |
| 139 | + |
| 140 | + public Map<Integer, ConceptFeatures<LocalConcept, Integer>> getFeatures( |
| 141 | + Collection<? extends LocalConcept> concepts) throws PersistenceException { |
| 142 | + Map<Integer, ConceptFeatures<LocalConcept, Integer>> m = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>(); |
| 143 | + |
| 144 | + for (LocalConcept c: concepts) { |
| 145 | + ConceptFeatures<LocalConcept, Integer> f = getFeatures(c); |
| 146 | + m.put(c.getId(), f); |
| 147 | + } |
| 148 | + |
| 149 | + return m; |
| 150 | + } |
| 151 | + |
| 152 | + public ConceptFeatures<LocalConcept, Integer> getFeatures(LocalConcept c) |
| 153 | + throws PersistenceException { |
| 154 | + return features.get(c.getId()); |
| 155 | + } |
| 156 | + |
| 157 | + }; |
| 158 | + |
| 159 | + protected Corpus corpus; |
| 160 | + protected TweakSet tweaks; |
| 161 | + |
| 162 | + public DisambiguatorTestBase() throws IOException, PersistenceException { |
| 163 | + tweaks = new TweakSet(); |
| 164 | + corpus = Corpus.forName("TEST", "en", tweaks); |
| 165 | + |
| 166 | + URL meaningFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-meanings.csv"); |
| 167 | + URL featureFile = getClass().getResource("SlidingCoherenceDisambiguatorTest-features.csv"); |
| 168 | + |
| 169 | + readMeanings(corpus, meaningFile.openStream(), meanings); |
| 170 | + readFeatures(corpus, featureFile.openStream(), features); |
| 171 | + |
| 172 | + for (List<? extends LocalConcept> concepts: meanings.values()) { |
| 173 | + for (LocalConcept c: concepts) { |
| 174 | + conceptsById.put(c.getId(), c); |
| 175 | + conceptsByName.put(c.getName(), c); |
| 176 | + } |
| 177 | + } |
| 178 | + } |
| 179 | + |
| 180 | + protected List<Term> terms(String... terms) { |
| 181 | + List<Term> list = new ArrayList<Term>(); |
| 182 | + for (String t: terms) list.add(new Term(t)); |
| 183 | + return list; |
| 184 | + } |
| 185 | + |
| 186 | + protected LocalConcept getConcept(String name) { |
| 187 | + LocalConcept c = conceptsByName.get(name); |
| 188 | + return c; |
| 189 | + } |
| 190 | + |
| 191 | + protected LocalConcept getConcept(int id) { |
| 192 | + LocalConcept c = conceptsById.get(id); |
| 193 | + return c; |
| 194 | + } |
| 195 | + |
| 196 | + protected <X extends TermReference>Map<X, List<? extends LocalConcept>> getMeanings(Collection<List<X>> sequences) throws PersistenceException { |
| 197 | + Map<X, List<? extends LocalConcept>> m = new HashMap<X, List<? extends LocalConcept>>(); |
| 198 | + |
| 199 | + for (List<X> seq: sequences) { |
| 200 | + Map<X, List<? extends LocalConcept>> meanings = meaningFetcher.getMeanings(seq); |
| 201 | + m.putAll(meanings); |
| 202 | + } |
| 203 | + |
| 204 | + return m; |
| 205 | + } |
| 206 | + |
| 207 | + protected PhraseOccuranceSet getBankAndMonumentPhrases() { |
| 208 | + String text = "The Bank and Monument Underground station"; |
| 209 | + List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 210 | + |
| 211 | + phrases.add( new PhraseOccurance( text.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank |
| 212 | + phrases.add( new PhraseOccurance( text.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument |
| 213 | + phrases.add( new PhraseOccurance( text.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground |
| 214 | + |
| 215 | + phrases.add( new PhraseOccurance( text.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank |
| 216 | + phrases.add( new PhraseOccurance( text.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument |
| 217 | + phrases.add( new PhraseOccurance( text.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground |
| 218 | + //phrases.add( new PhraseOccurance( text.substring( 4, 41 ), 4, 4, 41-4 ) ); //Bank and Monument Underground station |
| 219 | + |
| 220 | + phrases.add( new PhraseOccurance( text.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument |
| 221 | + phrases.add( new PhraseOccurance( text.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground |
| 222 | + phrases.add( new PhraseOccurance( text.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station |
| 223 | + |
| 224 | + phrases.add( new PhraseOccurance( text.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
| 225 | + phrases.add( new PhraseOccurance( text.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station |
| 226 | + |
| 227 | + phrases.add( new PhraseOccurance( text.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 228 | + |
| 229 | + PhraseOccuranceSet set = new PhraseOccuranceSet(text, phrases); |
| 230 | + return set; |
| 231 | + } |
| 232 | + |
| 233 | +} |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -206,7 +206,7 @@ |
207 | 207 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
208 | 208 | FeatureFetcher<LocalConcept, Integer> features = getFeatureCache(meanings, context); |
209 | 209 | |
210 | | - List<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(sequences, meanings); |
| 210 | + Collection<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(sequences, meanings); |
211 | 211 | |
212 | 212 | return getBestInterpretation(root, meanings, context, interpretations, similarities, features); |
213 | 213 | } |
— | — | @@ -240,7 +240,7 @@ |
241 | 241 | } |
242 | 242 | |
243 | 243 | protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(PhraseNode<X> root, Map<X, List<? extends LocalConcept>> meanings, |
244 | | - Collection<? extends LocalConcept> context, List<Disambiguator.Interpretation<X, LocalConcept>> interpretations, |
| 244 | + Collection<? extends LocalConcept> context, Collection<Disambiguator.Interpretation<X, LocalConcept>> interpretations, |
245 | 245 | LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureFetcher<LocalConcept, Integer> features) throws PersistenceException { |
246 | 246 | |
247 | 247 | List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>(); |
— | — | @@ -269,17 +269,18 @@ |
270 | 270 | return r; |
271 | 271 | } |
272 | 272 | |
273 | | - protected <X extends TermReference>List<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(Collection<List<X>> sequences, Map<X, List<? extends LocalConcept>> meanings) { |
| 273 | + public <X extends TermReference>Collection<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(Collection<List<X>> sequences, Map<X, List<? extends LocalConcept>> meanings) { |
274 | 274 | List<Disambiguator.Interpretation<X, LocalConcept>> interpretations = new ArrayList<Disambiguator.Interpretation<X, LocalConcept>>(); |
275 | 275 | for (List<X> sq: sequences) { |
276 | | - List<Disambiguator.Interpretation<X, LocalConcept>> sqint = getSequenceInterpretations(sq, meanings); |
| 276 | + if (sq.isEmpty()) continue; |
| 277 | + Collection<Disambiguator.Interpretation<X, LocalConcept>> sqint = getSequenceInterpretations(sq, meanings); |
277 | 278 | interpretations.addAll(sqint); |
278 | 279 | } |
279 | 280 | |
280 | 281 | return interpretations; |
281 | 282 | } |
282 | 283 | |
283 | | - protected <X extends TermReference>List<Disambiguator.Interpretation<X, LocalConcept>> getSequenceInterpretations(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings) { |
| 284 | + public <X extends TermReference>Collection<Disambiguator.Interpretation<X, LocalConcept>> getSequenceInterpretations(List<X> sequence, Map<X, List<? extends LocalConcept>> meanings) { |
284 | 285 | if (sequence.size()==0) { |
285 | 286 | return Collections.singletonList(new Disambiguator.Interpretation<X, LocalConcept>(Collections.<X, LocalConcept>emptyMap(), sequence)); |
286 | 287 | } |
— | — | @@ -287,7 +288,7 @@ |
288 | 289 | X t = sequence.get(0); |
289 | 290 | List<? extends LocalConcept> m = meanings.get(t); |
290 | 291 | |
291 | | - List<Disambiguator.Interpretation<X, LocalConcept>> base = getSequenceInterpretations(sequence.subList(1, sequence.size()), meanings); |
| 292 | + Collection<Disambiguator.Interpretation<X, LocalConcept>> base = getSequenceInterpretations(sequence.subList(1, sequence.size()), meanings); |
292 | 293 | |
293 | 294 | if (m==null || m.size()==0) return base; |
294 | 295 | |
— | — | @@ -299,7 +300,10 @@ |
300 | 301 | e.putAll(be.getMeanings()); |
301 | 302 | e.put(t, c); |
302 | 303 | |
303 | | - interpretations.add(new Disambiguator.Interpretation<X, LocalConcept>(e, sequence)); |
| 304 | + if (!sequence.isEmpty()) { |
| 305 | + Disambiguator.Interpretation<X, LocalConcept>interp = new Disambiguator.Interpretation<X, LocalConcept>(e, sequence); |
| 306 | + interpretations.add(interp); |
| 307 | + } |
304 | 308 | } |
305 | 309 | } |
306 | 310 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -18,7 +18,7 @@ |
19 | 19 | public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> { |
20 | 20 | |
21 | 21 | public interface NodeListener<T extends TermReference> { |
22 | | - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence); |
| 22 | + public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal); |
23 | 23 | } |
24 | 24 | |
25 | 25 | public static class SequenceSetBuilder <T extends TermReference> implements NodeListener<T> { |
— | — | @@ -28,9 +28,8 @@ |
29 | 29 | seqencees = new ArrayList<List<T>>(); |
30 | 30 | } |
31 | 31 | |
32 | | - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) { |
33 | | - Collection<?> successors = node.getSuccessors(); |
34 | | - if (successors==null || successors.isEmpty()) { //is leaf |
| 32 | + public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) { |
| 33 | + if (terminal) { |
35 | 34 | List<T> p = new ArrayList<T>(seqence); //clone |
36 | 35 | seqencees.add(p); |
37 | 36 | } |
— | — | @@ -48,7 +47,7 @@ |
49 | 48 | terms = new HashSet<T>(); |
50 | 49 | } |
51 | 50 | |
52 | | - public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence) { |
| 51 | + public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) { |
53 | 52 | T t = node.getTermReference(); |
54 | 53 | if (t.getTerm().length()>0) terms.add(t); |
55 | 54 | } |
— | — | @@ -127,13 +126,17 @@ |
128 | 127 | |
129 | 128 | X t = root.getTermReference(); |
130 | 129 | if (t.getTerm().length()>0) seqence.add(t); //push |
| 130 | + else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes. |
131 | 131 | |
| 132 | + boolean terminal = (depth<=1); |
| 133 | + |
| 134 | + Collection<? extends PhraseNode<X>> successors = terminal ? null : root.getSuccessors(); |
| 135 | + if (successors==null || successors.isEmpty()) terminal = true; |
| 136 | + |
132 | 137 | if (nodeListener!=null) |
133 | | - nodeListener.onNode(root, seqence); |
| 138 | + nodeListener.onNode(root, seqence, terminal); |
134 | 139 | |
135 | | - Collection<? extends PhraseNode<X>> successors = root.getSuccessors(); |
136 | | - |
137 | | - if (depth>1 && successors!=null) { |
| 140 | + if (!terminal) { |
138 | 141 | for (PhraseNode<X> n: successors) { |
139 | 142 | walk(n, seqence, nodeListener, depth-1); |
140 | 143 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | if (to-from < 2) { |
59 | 59 | r = popularityDisambiguator.disambiguate(frame, meanings, context); |
60 | 60 | } else { |
61 | | - List<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(frame, interpretation, meanings); |
| 61 | + Collection<Disambiguator.Interpretation<X, LocalConcept>> interpretations = getInterpretations(frame, interpretation, meanings); |
62 | 62 | r = getBestInterpretation(node, meanings, context, interpretations, similarities, features); |
63 | 63 | } |
64 | 64 | |
— | — | @@ -97,7 +97,7 @@ |
98 | 98 | |
99 | 99 | if (initialWindow > 0) { //apply full coherence disambig to initial window size. initialWindow == 1 will trigger a popularity disambig. |
100 | 100 | Collection<List<X>> sequences = getSequences(root, initialWindow); |
101 | | - Result<X, LocalConcept> r = disambiguate(sequences, root, meanings, context); |
| 101 | + Result<X, LocalConcept> r = super.disambiguate(sequences, root, meanings, context); |
102 | 102 | |
103 | 103 | sequence.addAll(r.getSequence()); |
104 | 104 | currentNode = getLastNode(root, sequence); |
— | — | @@ -130,7 +130,7 @@ |
131 | 131 | return getScore(new Disambiguator.Interpretation<X, LocalConcept>(disambig, sequence), context, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates. |
132 | 132 | } |
133 | 133 | |
134 | | - protected <X extends TermReference>List<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(List<X> frame, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) { |
| 134 | + protected <X extends TermReference>Collection<Disambiguator.Interpretation<X, LocalConcept>> getInterpretations(List<X> frame, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) { |
135 | 135 | //strip out all terms with no known meaning |
136 | 136 | if (meanings.keySet().size() != frame.size()) { |
137 | 137 | List<X> t = new ArrayList<X>(frame.size()); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java |
— | — | @@ -1,9 +1,13 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Arrays; |
4 | 6 | import java.util.Collection; |
| 7 | +import java.util.HashMap; |
5 | 8 | import java.util.List; |
6 | 9 | import java.util.Map; |
7 | 10 | |
| 11 | +import de.brightbyte.data.Pair; |
8 | 12 | import de.brightbyte.io.Output; |
9 | 13 | import de.brightbyte.util.PersistenceException; |
10 | 14 | import de.brightbyte.wikiword.model.PhraseNode; |
— | — | @@ -11,12 +15,39 @@ |
12 | 16 | import de.brightbyte.wikiword.model.WikiWordConcept; |
13 | 17 | |
14 | 18 | public interface Disambiguator<T extends TermReference, C extends WikiWordConcept> { |
15 | | - |
| 19 | + |
16 | 20 | public static class Interpretation<T extends TermReference, C extends WikiWordConcept> { |
17 | 21 | private final Map<T, C> meanings; |
18 | 22 | private final List<T> sequence; |
19 | 23 | |
| 24 | + private static <T extends TermReference, C extends WikiWordConcept>Map<T, C> buildMeaningMap(List<Pair<T, C>> interpretation) { |
| 25 | + Map<T, C> sequence = new HashMap<T, C>(interpretation.size()); |
| 26 | + for (Pair<T, C> p: interpretation) { |
| 27 | + sequence.put(p.getA(), p.getB()); |
| 28 | + } |
| 29 | + return sequence; |
| 30 | + } |
| 31 | + |
| 32 | + private static <T extends TermReference, C extends WikiWordConcept>List<T> buildTermSequence(List<Pair<T, C>> interpretation) { |
| 33 | + List<T> sequence = new ArrayList<T>(interpretation.size()); |
| 34 | + for (Pair<T, C> p: interpretation) { |
| 35 | + sequence.add(p.getA()); |
| 36 | + } |
| 37 | + return sequence; |
| 38 | + } |
| 39 | + |
| 40 | + public Interpretation(Pair<T, C>... interpretation) { |
| 41 | + this(Arrays.asList(interpretation)); |
| 42 | + } |
| 43 | + |
| 44 | + public Interpretation(List<Pair<T, C>> interpretation) { |
| 45 | + this(buildMeaningMap(interpretation), buildTermSequence(interpretation)); |
| 46 | + } |
| 47 | + |
20 | 48 | public Interpretation(final Map<T, C> meanings, final List<T> sequence) { |
| 49 | + if (meanings==null) throw new NullPointerException(); |
| 50 | + if (sequence==null) throw new NullPointerException(); |
| 51 | + |
21 | 52 | this.meanings = meanings; |
22 | 53 | this.sequence = sequence; |
23 | 54 | } |
— | — | @@ -24,9 +55,61 @@ |
25 | 56 | public Map<T, C> getMeanings() { |
26 | 57 | return meanings; |
27 | 58 | } |
| 59 | + |
28 | 60 | public List<T> getSequence() { |
29 | 61 | return sequence; |
30 | 62 | } |
| 63 | + |
| 64 | + public String toString() { |
| 65 | + if (sequence.isEmpty()) return "()"; |
| 66 | + |
| 67 | + StringBuilder b = new StringBuilder(); |
| 68 | + b.append("("); |
| 69 | + |
| 70 | + for (T t: sequence) { |
| 71 | + C c = meanings.get(t); |
| 72 | + b.append(t); |
| 73 | + b.append("=>"); |
| 74 | + b.append(c); |
| 75 | + b.append("; "); |
| 76 | + } |
| 77 | + |
| 78 | + b.append(")"); |
| 79 | + return b.toString(); |
| 80 | + } |
| 81 | + |
| 82 | + @Override |
| 83 | + public int hashCode() { |
| 84 | + final int PRIME = 31; |
| 85 | + int result = 1; |
| 86 | + result = PRIME * result + ((meanings == null) ? 0 : meanings.hashCode()); |
| 87 | + result = PRIME * result + ((sequence == null) ? 0 : sequence.hashCode()); |
| 88 | + return result; |
| 89 | + } |
| 90 | + |
| 91 | + @Override |
| 92 | + public boolean equals(Object obj) { |
| 93 | + if (this == obj) |
| 94 | + return true; |
| 95 | + if (obj == null) |
| 96 | + return false; |
| 97 | + if (getClass() != obj.getClass()) |
| 98 | + return false; |
| 99 | + final Interpretation other = (Interpretation) obj; |
| 100 | + if (meanings == null) { |
| 101 | + if (other.meanings != null) |
| 102 | + return false; |
| 103 | + } else if (!meanings.equals(other.meanings)) |
| 104 | + return false; |
| 105 | + if (sequence == null) { |
| 106 | + if (other.sequence != null) |
| 107 | + return false; |
| 108 | + } else if (!sequence.equals(other.sequence)) |
| 109 | + return false; |
| 110 | + return true; |
| 111 | + } |
| 112 | + |
| 113 | + |
31 | 114 | } |
32 | 115 | |
33 | 116 | public static class Result<T extends TermReference, C extends WikiWordConcept> implements Comparable { |