Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest.java |
— | — | @@ -0,0 +1,236 @@ |
| 2 | +package de.brightbyte.wikiword.disambig; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.InputStream; |
| 6 | +import java.net.URL; |
| 7 | +import java.util.ArrayList; |
| 8 | +import java.util.Collection; |
| 9 | +import java.util.HashMap; |
| 10 | +import java.util.List; |
| 11 | +import java.util.Map; |
| 12 | + |
| 13 | +import de.brightbyte.abstraction.ListAbstractor; |
| 14 | +import de.brightbyte.data.LabeledVector; |
| 15 | +import de.brightbyte.data.MapLabeledVector; |
| 16 | +import de.brightbyte.data.cursor.DataCursor; |
| 17 | +import de.brightbyte.io.ChunkingCursor; |
| 18 | +import de.brightbyte.io.GroupingCursor; |
| 19 | +import de.brightbyte.io.LineCursor; |
| 20 | +import de.brightbyte.text.CsvLineChunker; |
| 21 | +import de.brightbyte.util.PersistenceException; |
| 22 | +import de.brightbyte.wikiword.ConceptType; |
| 23 | +import de.brightbyte.wikiword.Corpus; |
| 24 | +import de.brightbyte.wikiword.TweakSet; |
| 25 | +import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
| 26 | +import de.brightbyte.wikiword.model.ConceptFeatures; |
| 27 | +import de.brightbyte.wikiword.model.LocalConcept; |
| 28 | +import de.brightbyte.wikiword.model.PhraseOccurance; |
| 29 | +import de.brightbyte.wikiword.model.PhraseOccuranceSet; |
| 30 | +import de.brightbyte.wikiword.model.TermReference; |
| 31 | +import junit.framework.TestCase; |
| 32 | + |
| 33 | +public class SlidingCoherenceDisambiguatorTest extends TestCase { |
| 34 | + |
| 35 | + protected Map<String, List<? extends LocalConcept>> meanings = new HashMap<String, List<? extends LocalConcept>>(); |
| 36 | + protected Map<Integer, ConceptFeatures<LocalConcept, Integer>> features = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>(); |
| 37 | + /** Wraps the stream in a LineCursor (decoded with enc) and chunks every line with CsvLineChunker.tsv. */ |
| 38 | + protected static DataCursor<List<String>> openTableCursor(InputStream in, String enc) throws IOException { |
| 39 | + LineCursor lines = new LineCursor(in, enc); /* one element per input line */ |
| 40 | + return new ChunkingCursor(lines, CsvLineChunker.tsv); /* one list of fields per line */ |
| 41 | + } |
| 42 | + /** Opens the table via openTableCursor, optionally discards the first row, and wraps the rows in a GroupingCursor keyed on column groupBy. */ |
| 43 | + protected static DataCursor<List<List<String>>> openGroupedTableCursor(InputStream in, String enc, int groupBy, boolean skipHeader) throws IOException, PersistenceException { |
| 44 | + DataCursor<List<String>> rows = openTableCursor(in, enc); |
| 45 | + if (skipHeader) { rows.next(); } /* drop the header row */ |
| 46 | + ListAbstractor.Accessor<String> keyColumn = new ListAbstractor.Accessor<String>(groupBy); /* presumably picks field number groupBy out of each row */ |
| 47 | + return new GroupingCursor<List<String>, String>(rows, keyColumn); |
| 48 | + } |
| 49 | + /** Fills meanings from a table of (term_text, concept, concept_name, freq, rule) rows: one list of concepts per group of rows. */ |
| 50 | + protected static void readMeanings(Corpus corpus, InputStream in, Map<String, List<? extends LocalConcept>> meanings) throws IOException, PersistenceException { |
| 51 | + DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true); |
| 52 | + try { /* close the cursor even when a row fails to parse */ |
| 53 | + List<List<String>> group; |
| 54 | + while ((group = cursor.next()) != null) { |
| 55 | + List<LocalConcept> concepts = new ArrayList<LocalConcept>(group.size()); |
| 56 | + String term = null; |
| 57 | + |
| 58 | + for (List<String> row: group) { |
| 59 | + term = row.get(0); |
| 60 | + int id = Integer.parseInt(row.get(1)); |
| 61 | + String name = row.get(2); |
| 62 | + int freq = Integer.parseInt(row.get(3)); |
| 63 | + int rule = Integer.parseInt(row.get(4)); |
| 64 | + /* rules 10 and 30 score zero unless seen at least twice; everything else scores freq*rule */ |
| 65 | + int score = ((rule==10 || rule==30) && freq<2) ? 0 : freq*rule; |
| 66 | + |
| 67 | + LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name); |
| 68 | + c.setCardinality(freq); |
| 69 | + c.setRelevance(score); |
| 70 | + |
| 71 | + concepts.add(c); |
| 72 | + } |
| 73 | + /* term stays null only for an empty group; a later group with the same term replaces the earlier entry */ |
| 74 | + if (term!=null) meanings.put(term, concepts); |
| 75 | + } |
| 76 | + |
| 77 | + } finally { cursor.close(); } |
| 78 | + } |
| 79 | + /** Fills features from a table of (concept, concept_name, feature, weight) rows: one unit-length feature vector per group of rows, keyed by concept id. */ |
| 80 | + protected static void readFeatures(Corpus corpus, InputStream in, Map<Integer, ConceptFeatures<LocalConcept, Integer>> features) throws IOException, PersistenceException { |
| 81 | + DataCursor<List<List<String>>> cursor = openGroupedTableCursor(in, "UTF-8", 0, true); |
| 82 | + try { /* close the cursor even when a row fails to parse */ |
| 83 | + List<List<String>> group; |
| 84 | + while ((group = cursor.next()) != null) { |
| 85 | + LabeledVector<Integer> v = new MapLabeledVector<Integer>(); |
| 86 | + Integer id = null; |
| 87 | + String name = null; |
| 88 | + |
| 89 | + for (List<String> row: group) { |
| 90 | + id = Integer.valueOf(row.get(0)); |
| 91 | + name = row.get(1); |
| 92 | + |
| 93 | + int feature = Integer.parseInt(row.get(2)); |
| 94 | + double value = Double.parseDouble(row.get(3)); |
| 95 | + |
| 96 | + v.set(feature, value); |
| 97 | + } |
| 98 | + /* id stays null only for an empty group */ |
| 99 | + if (id!=null) { |
| 100 | + double len = v.getLength(); |
| 101 | + if (len>0) v = v.scaled(1/len); /* normalize to unit length; assumes scaled(f) multiplies each component by f - TODO confirm */ |
| 102 | + |
| 103 | + LocalConcept c = new LocalConcept(corpus, id, ConceptType.UNKNOWN, name); |
| 104 | + ConceptFeatures<LocalConcept, Integer> f = new ConceptFeatures<LocalConcept, Integer>(c, v); |
| 105 | + features.put(id, f); |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + } finally { cursor.close(); } |
| 110 | + } |
| 111 | + /** In-memory MeaningFetcher answering from the meanings map; terms missing from the map yield null. */ |
| 112 | + private MeaningFetcher<LocalConcept> meaningFetcher = new MeaningFetcher<LocalConcept>() { |
| 113 | + /* Looks up every term reference by its getTerm() text; values in the returned map may be null. */ |
| 114 | + public <X extends TermReference> Map<X, List<? extends LocalConcept>> getMeanings( |
| 115 | + Collection<X> terms) throws PersistenceException { |
| 116 | + Map<X, List<? extends LocalConcept>> m = new HashMap<X, List<? extends LocalConcept>>(); |
| 117 | + |
| 118 | + for (X t: terms) { |
| 119 | + List<? extends LocalConcept> n = getMeanings(t.getTerm()); |
| 120 | + m.put(t, n); |
| 121 | + } |
| 122 | + |
| 123 | + return m; |
| 124 | + } |
| 125 | + /* Direct lookup in the meanings map; returns null for a term that was never loaded. */ |
| 126 | + public List<? extends LocalConcept> getMeanings(String term) |
| 127 | + throws PersistenceException { |
| 128 | + return meanings.get(term); |
| 129 | + } |
| 130 | + |
| 131 | + }; |
| 132 | + /** In-memory FeatureFetcher answering from the features map; concepts without an entry yield null. */ |
| 133 | + private FeatureFetcher<LocalConcept, Integer> featureFetcher = new FeatureFetcher<LocalConcept, Integer>() { |
| 134 | + /* Declares the stored vectors as already normalized; readFeatures is expected to have normalized them. */ |
| 135 | + public boolean getFeaturesAreNormalized() { |
| 136 | + return true; |
| 137 | + } |
| 138 | + /* Bulk lookup keyed by concept id; a concept with no stored features is mapped to null. */ |
| 139 | + public Map<Integer, ConceptFeatures<LocalConcept, Integer>> getFeatures( |
| 140 | + Collection<? extends LocalConcept> concepts) throws PersistenceException { |
| 141 | + Map<Integer, ConceptFeatures<LocalConcept, Integer>> m = new HashMap<Integer, ConceptFeatures<LocalConcept, Integer>>(); |
| 142 | + |
| 143 | + for (LocalConcept c: concepts) { |
| 144 | + ConceptFeatures<LocalConcept, Integer> f = getFeatures(c); |
| 145 | + m.put(c.getId(), f); |
| 146 | + } |
| 147 | + |
| 148 | + return m; |
| 149 | + } |
| 150 | + /* Single lookup by the concept's id; returns null when nothing was loaded for that id. */ |
| 151 | + public ConceptFeatures<LocalConcept, Integer> getFeatures(LocalConcept c) |
| 152 | + throws PersistenceException { |
| 153 | + return features.get(c.getId()); |
| 154 | + } |
| 155 | + |
| 156 | + }; |
| 157 | + |
| 158 | + protected Corpus corpus; |
| 159 | + protected TweakSet tweaks; |
| 160 | + /** Creates the TEST corpus and loads the meanings and features fixtures, resolved as resources relative to this class. */ |
| 161 | + public SlidingCoherenceDisambiguatorTest() throws IOException, PersistenceException { |
| 162 | + this.tweaks = new TweakSet(); |
| 163 | + this.corpus = Corpus.forName("TEST", "en", this.tweaks); |
| 164 | + Class<?> self = getClass(); /* both fixture names are looked up through this class */ |
| 165 | + URL meaningsUrl = self.getResource("SlidingCoherenceDisambiguatorTest-meanings.csv"); |
| 166 | + URL featuresUrl = self.getResource("SlidingCoherenceDisambiguatorTest-features.csv"); |
| 167 | + |
| 168 | + readMeanings(this.corpus, meaningsUrl.openStream(), this.meanings); |
| 169 | + readFeatures(this.corpus, featuresUrl.openStream(), this.features); |
| 170 | + } |
| 171 | + /** Wraps each given string in a Term, keeping the argument order. */ |
| 172 | + protected List<Term> terms(String... terms) { |
| 173 | + List<Term> wrapped = new ArrayList<Term>(); |
| 174 | + for (int i = 0; i < terms.length; i++) wrapped.add(new Term(terms[i])); |
| 175 | + return wrapped; |
| 176 | + } |
| 177 | + /** Disambiguates the overlapping phrases of one sentence and checks both the chosen segmentation and the concept picked for each segment. */ |
| 178 | + public void testDisambiguatePhraseNode() throws PersistenceException { |
| 179 | + String text = "The Bank and Monument Underground station"; |
| 180 | + // 012345678901234567890123456789012345678901234567890 |
| 181 | + List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 182 | + |
| 183 | + phrases.add( new PhraseOccurance( text.substring( 0, 8 ), 1, 0, 8 ) ); //The Bank |
| 184 | + phrases.add( new PhraseOccurance( text.substring( 0, 21 ), 2, 0, 21 ) ); //The Bank and Monument |
| 185 | + phrases.add( new PhraseOccurance( text.substring( 0, 33 ), 3, 0, 33 ) ); //The Bank and Monument Underground |
| 186 | + |
| 187 | + phrases.add( new PhraseOccurance( text.substring( 4, 8 ), 1, 4, 8-4 ) ); //Bank |
| 188 | + phrases.add( new PhraseOccurance( text.substring( 4, 21 ), 2, 4, 21-4 ) ); //Bank and Monument |
| 189 | + phrases.add( new PhraseOccurance( text.substring( 4, 33 ), 3, 4, 33-4 ) ); //Bank and Monument Underground |
| 190 | + //phrases.add( new PhraseOccurance( text.substring( 4, 41 ), 4, 4, 41-4 ) ); //Bank and Monument Underground station |
| 191 | + |
| 192 | + phrases.add( new PhraseOccurance( text.substring( 13, 21 ), 1, 13, 21-13 ) ); //Monument |
| 193 | + phrases.add( new PhraseOccurance( text.substring( 13, 33 ), 2, 13, 33-13 ) ); //Monument Underground |
| 194 | + phrases.add( new PhraseOccurance( text.substring( 13, 41 ), 3, 13, 41-13 ) ); //Monument Underground station |
| 195 | + |
| 196 | + phrases.add( new PhraseOccurance( text.substring( 22, 33 ), 1, 22, 33-22 ) ); //Underground |
| 197 | + phrases.add( new PhraseOccurance( text.substring( 22, 41 ), 2, 22, 41-22 ) ); //Underground station |
| 198 | + |
| 199 | + phrases.add( new PhraseOccurance( text.substring( 34, 41 ), 1, 34, 41-34 ) ); //station |
| 200 | + |
| 201 | + PhraseOccuranceSet set = new PhraseOccuranceSet(text, phrases); |
| 202 | + |
| 203 | + SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher); |
| 204 | + disambiguator.setInitialWindow(1); |
| 205 | + disambiguator.setWindow(3); |
| 206 | + |
| 207 | + Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(set.getRootNode(), null); |
| 208 | + |
| 209 | + List<? extends PhraseOccurance> sequence = result.getSequence(); |
| 210 | + Map<? extends PhraseOccurance, ? extends LocalConcept> meanings = result.getMeanings(); /* keyed by occurrence, not by term text */ |
| 211 | + |
| 212 | + assertEquals("Bank and Monument", sequence.get(0).getTerm()); |
| 213 | + assertEquals("Underground", sequence.get(1).getTerm()); |
| 214 | + assertEquals("station", sequence.get(2).getTerm()); |
| 215 | + /* look up by the PhraseOccurance itself: a String key can never match the map's keys */ |
| 216 | + assertNotNull( meanings.get( sequence.get(0) ) ); |
| 217 | + assertNotNull( meanings.get( sequence.get(1) ) ); |
| 218 | + assertNotNull( meanings.get( sequence.get(2) ) ); |
| 219 | + /* expected names must match concept_name in the meanings fixture exactly */ |
| 220 | + assertEquals("Bank_and_Monument_Underground_stations", meanings.get( sequence.get(0) ).getName() ); |
| 221 | + assertEquals("Subway", meanings.get( sequence.get(1) ).getName() ); |
| 222 | + assertEquals("Metro_station", meanings.get( sequence.get(2) ).getName() ); |
| 223 | + } |
| 224 | + /** Runs the disambiguator over a plain list of terms; currently only checks that the call completes without throwing. */ |
| 225 | + public void testDisambiguateTerms() throws PersistenceException { |
| 226 | + SlidingCoherenceDisambiguator disambiguator = new SlidingCoherenceDisambiguator(meaningFetcher, featureFetcher); |
| 227 | + disambiguator.setInitialWindow(1); |
| 228 | + disambiguator.setWindow(3); |
| 229 | + |
| 230 | + String[] sequence = {"UK", "London", "Underground", "Bank"}; |
| 231 | + |
| 232 | + Result<Term, LocalConcept> result = disambiguator.disambiguate(terms(sequence), null); |
| 233 | + |
| 234 | + // TODO: no assertions yet - result is computed but never inspected |
| 235 | + } |
| 236 | + |
| 237 | +} |
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest-features.csv |
— | — | @@ -0,0 +1,121 @@ |
| 2 | +concept concept_name feature weight |
| 3 | +1001 United_Kingdom 1001 2 |
| 4 | +1001 United_Kingdom 1004 0.66 |
| 5 | +1001 United_Kingdom 1001001 0.222 |
| 6 | +1001 United_Kingdom 2001001 0.222 |
| 7 | + |
| 8 | +1002 Great_Britain 1002 2 |
| 9 | +1002 Great_Britain 1004 0.65 |
| 10 | +1002 Great_Britain 1001002 0.222 |
| 11 | + |
| 12 | +1004 England 1004 2 |
| 13 | +1004 England 2001 0.43 |
| 14 | +1004 England 2002 0.45 |
| 15 | +1004 England 4003 0.2 |
| 16 | +1004 England 4004 0.45 |
| 17 | +1004 England 1002 0.65 |
| 18 | +1004 England 1001 0.66 |
| 19 | + |
| 20 | +2001 City_of_London 2001 2 |
| 21 | +2001 City_of_London 2002 0.77 |
| 22 | +2001 City_of_London 1004 0.43 |
| 23 | +2001 City_of_London 2003 0.55 |
| 24 | +2001 City_of_London 3001 0.25 |
| 25 | +2001 City_of_London 4003 0.15 |
| 26 | +2001 City_of_London 4004 0.22 |
| 27 | +2001 City_of_London 5001 0.21 |
| 28 | +2001 City_of_London 6001 0.023 |
| 29 | + |
| 30 | +2002 Greater_London 2002 2 |
| 31 | +2002 Greater_London 2001 0.77 |
| 32 | +2002 Greater_London 1004 0.45 |
| 33 | +2002 Greater_London 2003 0.56 |
| 34 | +2002 Greater_London 3001 0.22 |
| 35 | +2002 Greater_London 5001 0.12 |
| 36 | +2002 Greater_London 6001 0.022 |
| 37 | + |
| 38 | +2003 London_city_council 2003 2 |
| 39 | +2003 London_city_council 2001 0.55 |
| 40 | +2003 London_city_council 2002 0.56 |
| 41 | +2003 London_city_council 1002003 0.22 |
| 42 | + |
| 43 | +3001 London_Underground 3001 2 |
| 44 | +3001 London_Underground 3002 0.47 |
| 45 | +3001 London_Underground 5001 0.66 |
| 46 | +3001 London_Underground 2001 0.25 |
| 47 | +3001 London_Underground 2002 0.22 |
| 48 | + |
| 49 | +3002 Subway 3002 2 |
| 50 | +3002 Subway 5001 0.47 |
| 51 | +3002 Subway 3001 0.55 |
| 52 | +3002 Subway 7001 0.178 |
| 53 | +3002 Subway 7002 0.33 |
| 54 | +3002 Subway 7003 0.25 |
| 55 | + |
| 56 | +4001 Bank 4001 2 |
| 57 | +4001 Bank 4004 0.55 |
| 58 | +4001 Bank 1004001 0.33 |
| 59 | + |
| 60 | +4002 Bank_(sitting) 4002 2 |
| 61 | +4002 Bank_(sitting) 1004002 0.22 |
| 62 | +4002 Bank_(sitting) 2004002 0.33 |
| 63 | +4002 Bank_(sitting) 7002 0.02 |
| 64 | + |
| 65 | +4003 Bank_(geology) 4003 2 |
| 66 | +4003 Bank_(geology) 1004 0.2 |
| 67 | +4003 Bank_(geology) 2001 0.15 |
| 68 | + |
| 69 | +4004 Bank_of_England 4004 2 |
| 70 | +4004 Bank_of_England 4001 0.55 |
| 71 | +4004 Bank_of_England 1004 0.45 |
| 72 | +4004 Bank_of_England 2001 0.22 |
| 73 | +4004 Bank_of_England 5001 0.34 |
| 74 | + |
| 75 | +5001 Bank_and_Monument_Underground_stations 5001 2 |
| 76 | +5001 Bank_and_Monument_Underground_stations 3001 0.66 |
| 77 | +5001 Bank_and_Monument_Underground_stations 3002 0.47 |
| 78 | +5001 Bank_and_Monument_Underground_stations 4004 0.34 |
| 79 | +5001 Bank_and_Monument_Underground_stations 2001 0.21 |
| 80 | +5001 Bank_and_Monument_Underground_stations 2002 0.12 |
| 81 | + |
| 82 | +6001 Monument 6001 2 |
| 83 | +6001 Monument 5001 0.017 |
| 84 | +6001 Monument 2001 0.023 |
| 85 | +6001 Monument 2002 0.022 |
| 86 | +6001 Monument 1006001 0.18 |
| 87 | +6001 Monument 2006001 0.33 |
| 88 | + |
| 89 | +6002 Some_silly_monument 6002 2 |
| 90 | +6002 Some_silly_monument 6001 0.32 |
| 91 | +6002 Some_silly_monument 7003 0.008 |
| 92 | +6002 Some_silly_monument 1006002 0.08 |
| 93 | +6002 Some_silly_monument 2006002 0.01 |
| 94 | + |
| 95 | +7001 Bus_station 7001 2 |
| 96 | +7001 Bus_station 1007001 0.1 |
| 97 | +7001 Bus_station 2007001 0.2 |
| 98 | +7001 Bus_station 7003 0.21 |
| 99 | +7001 Bus_station 7002 0.32 |
| 100 | +7001 Bus_station 5001 0.10 |
| 101 | +7001 Bus_station 3002 0.178 |
| 102 | + |
| 103 | +7002 Metro_station 7002 2 |
| 104 | +7002 Metro_station 1007002 0.1 |
| 105 | +7002 Metro_station 2007002 0.2 |
| 106 | +7002 Metro_station 7003 0.22 |
| 107 | +7002 Metro_station 7001 0.32 |
| 108 | +7002 Metro_station 5001 0.17 |
| 109 | +7002 Metro_station 3002 0.33 |
| 110 | +7002 Metro_station 4002 0.02 |
| 111 | + |
| 112 | +7003 Train_station 7003 2 |
| 113 | +7003 Train_station 1007003 0.1 |
| 114 | +7003 Train_station 2007003 0.2 |
| 115 | +7003 Train_station 7002 0.22 |
| 116 | +7003 Train_station 7001 0.21 |
| 117 | +7003 Train_station 5001 0.11 |
| 118 | +7003 Train_station 3002 0.25 |
| 119 | + |
| 120 | +7004 Social_status 7004 2 |
| 121 | +7004 Social_status 1007004 0.1 |
| 122 | +7004 Social_status 2007004 0.2 |
Index: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguatorTest-meanings.csv |
— | — | @@ -0,0 +1,24 @@ |
| 2 | +term_text concept concept_name freq rule |
| 3 | +UK 1001 United_Kingdom 1000 90 |
| 4 | +UK 1002 Great_Britain 670 60 |
| 5 | +UK 1004 England 16 10 |
| 6 | +London 2001 City_of_London 1000 90 |
| 7 | +London 2002 Greater_London 888 80 |
| 8 | +London 2003 London_city_council 12 10 |
| 9 | +Underground 3001 London_Underground 1000 90 |
| 10 | +Underground 3002 Subway 888 60 |
| 11 | +Bank 4001 Bank 1000 90 |
| 12 | +Bank 4002 Bank_(sitting) 666 80 |
| 13 | +Bank 4003 Bank_(geology) 230 60 |
| 14 | +Bank 4004 Bank_of_England 220 60 |
| 15 | +Bank and Monument 5001 Bank_and_Monument_Underground_stations 200 60 |
| 16 | +Bank and Monument Underground station 5001 Bank_and_Monument_Underground_stations 50 90 |
| 17 | +Bank and Monument stations 5001 Bank_and_Monument_Underground_stations 66 60 |
| 18 | +Bank and Monument 5001 Bank_and_Monument_Underground_stations 200 90 |
| 19 | +Monument 6001 Monument 1000 90 |
| 20 | +Monument 6002 Some_silly_monument 100 60 |
| 21 | +Monument 5001 Bank_and_Monument_Underground_stations 100 10 |
| 22 | +station 7001 Bus_station 1000 90 |
| 23 | +station 7002 Metro_station 888 80 |
| 24 | +station 7003 Train_station 666 60 |
| 25 | +station 7004 Social_status 300 10 |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -165,7 +165,7 @@ |
166 | 166 | //NOTE: pre-fetch all features in one go |
167 | 167 | List<LocalConcept> concepts = new ArrayList<LocalConcept>(meanings.size()*10); |
168 | 168 | for (List<? extends LocalConcept> m: meanings.values()) { |
169 | | - concepts.addAll(m); |
| 169 | + if (m!=null) concepts.addAll(m); |
170 | 170 | } |
171 | 171 | |
172 | 172 | if (context!=null) concepts.addAll(context); |
— | — | @@ -218,6 +218,7 @@ |
219 | 219 | while (eit.hasNext()) { |
220 | 220 | Entry<TermReference, List<? extends LocalConcept>> e = (Entry<TermReference, List<? extends LocalConcept>>) eit.next(); //XXX: ugly cast. got confused about generics. ugh. |
221 | 221 | List<? extends LocalConcept> m = e.getValue(); |
| 222 | + if (m==null) continue; |
222 | 223 | |
223 | 224 | Iterator<? extends LocalConcept> cit = m.iterator(); |
224 | 225 | while (cit.hasNext()) { |
— | — | @@ -327,13 +328,16 @@ |
328 | 329 | for (Map.Entry<? extends TermReference, LocalConcept> ea: concepts.entrySet()) { |
329 | 330 | LocalConcept a = ea.getValue(); |
330 | 331 | TermReference term = ea.getKey(); |
| 332 | + |
| 333 | + i++; |
| 334 | + if (a==null) continue; |
331 | 335 | |
332 | | - i++; |
333 | 336 | j=0; |
334 | 337 | for (Map.Entry<? extends TermReference, LocalConcept> eb: concepts.entrySet()) { |
335 | 338 | LocalConcept b = eb.getValue(); |
336 | 339 | j++; |
337 | 340 | if (i==j) break; |
| 341 | + if (b==null) continue; |
338 | 342 | |
339 | 343 | double d; |
340 | 344 | |
— | — | @@ -348,14 +352,18 @@ |
349 | 353 | ConceptFeatures<LocalConcept, Integer> fa = features.getFeatures(a); |
350 | 354 | ConceptFeatures<LocalConcept, Integer> fb = features.getFeatures(b); |
351 | 355 | |
352 | | - //force relevance/cardinality to the figures from the meaning lookup |
353 | | - //not strictly necessary, but nice to keep it consistent. |
354 | | - fa.getConcept().setCardinality(a.getCardinality()); |
355 | | - fa.getConcept().setRelevance(a.getRelevance()); |
356 | | - fb.getConcept().setCardinality(b.getCardinality()); |
357 | | - fb.getConcept().setRelevance(b.getRelevance()); |
| 356 | + if (fa==null || fb==null) d = 0; |
| 357 | + else { |
| 358 | + //force relevance/cardinality to the figures from the meaning lookup |
| 359 | + //not strictly necessary, but nice to keep it consistent. |
| 360 | + fa.getConcept().setCardinality(a.getCardinality()); |
| 361 | + fa.getConcept().setRelevance(a.getRelevance()); |
| 362 | + fb.getConcept().setCardinality(b.getCardinality()); |
| 363 | + fb.getConcept().setRelevance(b.getRelevance()); |
| 364 | + |
| 365 | + d = similarityMeasure.similarity(fa.getFeatureVector(), fb.getFeatureVector()); |
| 366 | + } |
358 | 367 | |
359 | | - d = similarityMeasure.similarity(fa.getFeatureVector(), fb.getFeatureVector()); |
360 | 368 | similarities.set(a, b, d); |
361 | 369 | } |
362 | 370 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredFeatureFetcher.java |
— | — | @@ -44,5 +44,9 @@ |
45 | 45 | protected void trace(String msg) { |
46 | 46 | if (trace!=null) trace.println(msg); |
47 | 47 | } |
| 48 | + /** Always reports true, i.e. this fetcher declares its feature vectors as normalized. NOTE(review): nothing visible here verifies that - confirm against the feature store. */ |
| 49 | + public boolean getFeaturesAreNormalized() { |
| 50 | + return true; |
| 51 | + } |
48 | 52 | |
49 | 53 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureCache.java |
— | — | @@ -98,4 +98,8 @@ |
99 | 99 | cache.clear(); |
100 | 100 | } |
101 | 101 | |
| 102 | + public boolean getFeaturesAreNormalized() { /* the cache makes no claim of its own */ |
| 103 | + return parent.getFeaturesAreNormalized(); /* answer is whatever parent reports */ |
| 104 | + } |
| 105 | + |
102 | 106 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -25,9 +25,9 @@ |
26 | 26 | protected int window; |
27 | 27 | protected int initialWindow; |
28 | 28 | |
29 | | - public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) { |
| 29 | + public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher) { |
30 | 30 | this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, |
31 | | - featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance(), //if pre-normalized, use scalar to calc cosin |
| 31 | + featureFetcher.getFeaturesAreNormalized() ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance(), //if pre-normalized, use scalar to calc cosin |
32 | 32 | 5, 5); |
33 | 33 | } |
34 | 34 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureFetcher.java |
— | — | @@ -10,4 +10,5 @@ |
11 | 11 | public interface FeatureFetcher<C extends WikiWordConcept, K> { |
12 | 12 | public ConceptFeatures<C, K> getFeatures(C c) throws PersistenceException; |
13 | 13 | public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<? extends C> c) throws PersistenceException; |
| 14 | + public boolean getFeaturesAreNormalized(); /* SlidingCoherenceDisambiguator uses this to choose between scalar and cosine similarity */ |
14 | 15 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/ConceptFeatures.java |
— | — | @@ -7,6 +7,8 @@ |
8 | 8 | protected WikiWordConcept concept; |
9 | 9 | |
10 | 10 | public ConceptFeatures(WikiWordConcept concept, LabeledVector<K> features) { |
| 11 | + if (features==null) throw new NullPointerException("features must not be null"); /* fail fast and say which argument was missing */ |
| 12 | + if (concept==null) throw new NullPointerException("concept must not be null"); |
11 | 13 | this.features = features; |
12 | 14 | this.concept = concept; |
13 | 15 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java |
— | — | @@ -451,7 +451,7 @@ |
452 | 452 | if (disambiguator==null) { |
453 | 453 | StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore()); |
454 | 454 | StoredFeatureFetcher<LocalConcept, Integer> featureFetcher = new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore()); |
455 | | - disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true ); |
| 455 | + disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher ); |
456 | 456 | |
457 | 457 | LeveledOutput.Trace trace = new LeveledOutput.Trace(out); |
458 | 458 | meaningFetcher.setTrace(trace); |