Index: trunk/WikiWord/WikiWordBuilder/attic/ZLibBenchmark.java |
— | — | @@ -0,0 +1,167 @@ |
| 2 | +package de.brightbyte.wikiword.builder; |
| 3 | + |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.File; |
| 6 | +import java.io.IOException; |
| 7 | +import java.io.InputStreamReader; |
| 8 | +import java.io.UnsupportedEncodingException; |
| 9 | + |
| 10 | +import com.jcraft.jzlib.JZlib; |
| 11 | +import com.jcraft.jzlib.ZStream; |
| 12 | + |
| 13 | +import de.brightbyte.data.ByteString; |
| 14 | +import de.brightbyte.io.ConsoleIO; |
| 15 | +import de.brightbyte.io.IOUtil; |
| 16 | +import de.brightbyte.util.StringUtils; |
| 17 | + |
| 18 | +public class ZLibBenchmark { |
| 19 | + protected byte[] dictionary; |
| 20 | + |
| 21 | + public ZLibBenchmark() { |
| 22 | + } |
| 23 | + |
| 24 | + public void setDictionary(byte[] data) { |
| 25 | + this.dictionary = data; |
| 26 | + } |
| 27 | + |
| 28 | + protected int level = JZlib.Z_BEST_COMPRESSION; |
| 29 | + protected int windowbits = 15; |
| 30 | + protected int strategy = JZlib.Z_DEFAULT_STRATEGY; |
| 31 | + protected int bufferSize = 32 * 1024; |
| 32 | + |
| 33 | + public ByteString compress(byte[] data) { |
| 34 | + int err; |
| 35 | + |
| 36 | + int comprLen = bufferSize; //FIXME: data.length; |
| 37 | + |
| 38 | + byte[] compr = new byte[comprLen]; |
| 39 | + |
| 40 | + ZStream c_stream = new ZStream(); |
| 41 | + |
| 42 | + err = c_stream.deflateInit(level, 15); |
| 43 | + CHECK_ERR(c_stream, err, "deflateInit"); |
| 44 | + |
| 45 | + err = c_stream.deflateParams(level, strategy); |
| 46 | + CHECK_ERR(c_stream, err, "deflateInit"); |
| 47 | + |
| 48 | + err = c_stream.deflateSetDictionary(dictionary, dictionary.length); |
| 49 | + CHECK_ERR(c_stream, err, "deflateSetDictionary"); |
| 50 | + |
| 51 | + long dictId = c_stream.adler; |
| 52 | + |
| 53 | + c_stream.next_out = compr; |
| 54 | + c_stream.next_out_index = 0; |
| 55 | + c_stream.avail_out = comprLen; |
| 56 | + |
| 57 | + c_stream.next_in = data; |
| 58 | + c_stream.next_in_index = 0; |
| 59 | + c_stream.avail_in = data.length; |
| 60 | + |
| 61 | + err = c_stream.deflate(JZlib.Z_FINISH); |
| 62 | + //FIXME: JZlib.Z_STREAM_END expected, getting JZlib.Z_OK |
| 63 | + |
| 64 | + if (err != JZlib.Z_STREAM_END && err != JZlib.Z_OK) { |
| 65 | + throw new RuntimeException("deflate should report Z_STREAM_END, found "+err); |
| 66 | + } |
| 67 | + err = c_stream.deflateEnd(); |
| 68 | + CHECK_ERR(c_stream, err, "deflateEnd"); |
| 69 | + |
| 70 | + return new ByteString(compr, 0, c_stream.next_out_index); |
| 71 | + } |
| 72 | + |
| 73 | + public ByteString uncompress(byte[] data, boolean ignoreChecksumm) { |
| 74 | + int uncomprLen = bufferSize; //FIXME: data.length * 10; |
| 75 | + byte[] uncompr = new byte[uncomprLen]; |
| 76 | + ZStream d_stream = new ZStream(); |
| 77 | + |
| 78 | + d_stream.next_in = data; |
| 79 | + d_stream.next_in_index = 0; |
| 80 | + d_stream.avail_in = data.length; |
| 81 | + |
| 82 | + int err = d_stream.inflateInit(windowbits); |
| 83 | + CHECK_ERR(d_stream, err, "inflateInit"); |
| 84 | + d_stream.next_out = uncompr; |
| 85 | + d_stream.next_out_index = 0; |
| 86 | + d_stream.avail_out = uncomprLen; |
| 87 | + |
| 88 | + while (true) { |
| 89 | + err = d_stream.inflate(JZlib.Z_NO_FLUSH); |
| 90 | + if (err == JZlib.Z_STREAM_END) { |
| 91 | + break; |
| 92 | + } |
| 93 | + if (err == JZlib.Z_NEED_DICT) { |
| 94 | + /*if ((int) d_stream.adler != (int) dictId) { |
| 95 | + System.out.println("unexpected dictionary"); |
| 96 | + System.exit(1); |
| 97 | + } */ |
| 98 | + err = d_stream.inflateSetDictionary(dictionary, |
| 99 | + dictionary.length); |
| 100 | + } |
| 101 | + |
| 102 | + if (ignoreChecksumm && err==JZlib.Z_DATA_ERROR) break; |
| 103 | + else CHECK_ERR(d_stream, err, "inflate with dict"); |
| 104 | + } |
| 105 | + |
| 106 | + err = d_stream.inflateEnd(); |
| 107 | + CHECK_ERR(d_stream, err, "inflateEnd"); |
| 108 | + |
| 109 | + int j = 0; |
| 110 | + for (; j < uncompr.length; j++) |
| 111 | + if (uncompr[j] == 0) |
| 112 | + break; |
| 113 | + |
| 114 | + return new ByteString(uncompr, 0, d_stream.next_out_index); |
| 115 | + } |
| 116 | + |
| 117 | + public ByteString getPrefix(ByteString b) { |
| 118 | + return b.subString(0, 6); |
| 119 | + } |
| 120 | + |
| 121 | + public ByteString strip(ByteString b) { |
| 122 | + return b.subString(6, b.length()-5); |
| 123 | + } |
| 124 | + |
| 125 | + public ByteString pad(ByteString prefix, ByteString b, ByteString suffix) { |
| 126 | + return ByteString.concat(prefix, b, suffix); |
| 127 | + } |
| 128 | + |
| 129 | + public static void main(String[] args) throws IOException { |
| 130 | + |
| 131 | + String d = args[0]; |
| 132 | + String denc = "UTF-8"; |
| 133 | + String enc = "UTF-8"; |
| 134 | + |
| 135 | + String dict = IOUtil.slurp(new File(d), denc); |
| 136 | + |
| 137 | + ZLibBenchmark app = new ZLibBenchmark(); |
| 138 | + app.setDictionary(dict.getBytes(enc)); |
| 139 | + |
| 140 | + ByteString b = app.compress("dummy".getBytes()); |
| 141 | + ByteString prefix = app.getPrefix(b); |
| 142 | + ByteString suffix = new ByteString( new byte[] {0, 0, 0, 0, 0} ); |
| 143 | + |
| 144 | + BufferedReader r = new BufferedReader( new InputStreamReader( System.in )); |
| 145 | + String s; |
| 146 | + while ((s = r.readLine()) != null) { |
| 147 | + s = s.trim(); |
| 148 | + if (s.length()==0) continue; |
| 149 | + |
| 150 | + byte[] data = s.getBytes(enc); |
| 151 | + System.out.println("UTF-16: "+s.length()*2+" bytes"); |
| 152 | + System.out.println(enc+": "+data.length+" bytes: "+StringUtils.hex(data)); |
| 153 | + b = app.compress(data); |
| 154 | + System.out.println("compressed: "+b.length()+" bytes: "+b.toString()); |
| 155 | + b = app.strip(b); |
| 156 | + System.out.println("stripped: "+b.length()+" bytes: "+b.toString()); |
| 157 | + b = app.pad(prefix, b, suffix); |
| 158 | + b = app.uncompress(b.getBytes(), true); |
| 159 | + System.out.println("uncompressed: "+b.length()+" bytes, "+new String(b.getBytes(), enc)); |
| 160 | + } |
| 161 | + } |
| 162 | + |
| 163 | + static void CHECK_ERR(ZStream z, int err, String msg) { |
| 164 | + if (err != JZlib.Z_OK) |
| 165 | + throw new RuntimeException(z.msg + "; code: " + err); |
| 166 | + } |
| 167 | + |
| 168 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccuranceSequence.java |
— | — | @@ -1,82 +0,0 @@ |
2 | | -package de.brightbyte.wikiword.analyzer; |
3 | | - |
4 | | -import java.util.AbstractList; |
5 | | -import java.util.ArrayList; |
6 | | -import java.util.Collections; |
7 | | -import java.util.List; |
8 | | -import java.util.RandomAccess; |
9 | | - |
10 | | -import de.brightbyte.data.filter.Filter; |
11 | | - |
12 | | -public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess { |
13 | | - |
14 | | - protected List<PhraseOccurance> phrases; |
15 | | - protected String text; |
16 | | - |
17 | | - public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) { |
18 | | - this.text = text; |
19 | | - |
20 | | - this.phrases = phrases; |
21 | | - Collections.sort(this.phrases); //essential! |
22 | | - } |
23 | | - |
24 | | - @Override |
25 | | - public PhraseOccurance get(int index) { |
26 | | - return phrases.get(index); |
27 | | - } |
28 | | - |
29 | | - @Override |
30 | | - public int size() { |
31 | | - return phrases.size(); |
32 | | - } |
33 | | - |
34 | | - public String getText() { |
35 | | - return text; |
36 | | - } |
37 | | - |
38 | | - public List<PhraseOccurance> getPhrasesAt(int offs) { |
39 | | - int i = 0; |
40 | | - while (i<size()) { |
41 | | - PhraseOccurance p = get(i); |
42 | | - if (p.getOffset() >= offs) { |
43 | | - offs = p.getOffset(); |
44 | | - break; |
45 | | - } |
46 | | - |
47 | | - i++; |
48 | | - } |
49 | | - |
50 | | - if (i>=size()) return null; |
51 | | - |
52 | | - int j = i; |
53 | | - while (j<size()) { |
54 | | - PhraseOccurance p = get(j); |
55 | | - if (p.getOffset() > offs) break; |
56 | | - j++; |
57 | | - } |
58 | | - |
59 | | - return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first. |
60 | | - } |
61 | | - |
62 | | - public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) { |
63 | | - List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
64 | | - |
65 | | - int i = 0; |
66 | | - |
67 | | - outer: |
68 | | - while (i<size()) { |
69 | | - List<PhraseOccurance> candidates = getPhrasesAt(i); |
70 | | - if (candidates == null) break; |
71 | | - |
72 | | - for (PhraseOccurance p: candidates) { |
73 | | - i = p.getEndOffset(); |
74 | | - if (filter.matches(p.getPhrase())) { |
75 | | - phrases.add(p); |
76 | | - continue outer; |
77 | | - } |
78 | | - } |
79 | | - } |
80 | | - |
81 | | - return phrases; |
82 | | - } |
83 | | -} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccurance.java |
— | — | @@ -1,98 +0,0 @@ |
2 | | -package de.brightbyte.wikiword.analyzer; |
3 | | - |
4 | | -import java.io.Serializable; |
5 | | - |
6 | | -public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance> { |
7 | | - |
8 | | - private static final long serialVersionUID = 241753475865301115L; |
9 | | - |
10 | | - protected String phrase; |
11 | | - protected int weight; |
12 | | - protected int offset; |
13 | | - protected int length; |
14 | | - |
15 | | - public PhraseOccurance(String phrase, int weight, int offset, int length) { |
16 | | - if (length <= 0) throw new IllegalArgumentException("bad length: "+length); |
17 | | - if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string"); |
18 | | - //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string"); |
19 | | - if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string"); |
20 | | - if (length < phrase.length()) phrase = phrase.substring(offset, offset+length); |
21 | | - |
22 | | - this.phrase = phrase; |
23 | | - this.weight = weight; |
24 | | - this.offset = offset; |
25 | | - this.length = length; |
26 | | - } |
27 | | - |
28 | | - public int getLength() { |
29 | | - return length; |
30 | | - } |
31 | | - |
32 | | - public int getOffset() { |
33 | | - return offset; |
34 | | - } |
35 | | - |
36 | | - public int getEndOffset() { |
37 | | - return getOffset() + getLength(); |
38 | | - } |
39 | | - |
40 | | - public String getPhrase() { |
41 | | - return phrase; |
42 | | - } |
43 | | - |
44 | | - public int getWeight() { |
45 | | - return weight; |
46 | | - } |
47 | | - |
48 | | - public String toString() { |
49 | | - return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]"; |
50 | | - } |
51 | | - |
52 | | - @Override |
53 | | - public int hashCode() { |
54 | | - final int PRIME = 31; |
55 | | - int result = 1; |
56 | | - result = PRIME * result + length; |
57 | | - result = PRIME * result + offset; |
58 | | - result = PRIME * result + ((phrase == null) ? 0 : phrase.hashCode()); |
59 | | - return result; |
60 | | - } |
61 | | - |
62 | | - @Override |
63 | | - public boolean equals(Object obj) { |
64 | | - if (this == obj) |
65 | | - return true; |
66 | | - if (obj == null) |
67 | | - return false; |
68 | | - if (getClass() != obj.getClass()) |
69 | | - return false; |
70 | | - final PhraseOccurance other = (PhraseOccurance) obj; |
71 | | - if (length != other.length) |
72 | | - return false; |
73 | | - if (offset != other.offset) |
74 | | - return false; |
75 | | - if (phrase == null) { |
76 | | - if (other.phrase != null) |
77 | | - return false; |
78 | | - } else if (!phrase.equals(other.phrase)) |
79 | | - return false; |
80 | | - return true; |
81 | | - } |
82 | | - |
83 | | - public boolean overlaps(PhraseOccurance other) { |
84 | | - if (getEndOffset() <= other.getOffset()) return false; |
85 | | - if (getOffset() >= other.getEndOffset()) return false; |
86 | | - |
87 | | - return true; |
88 | | - } |
89 | | - |
90 | | - public int compareTo(PhraseOccurance other) { |
91 | | - int o = getOffset() - other.getOffset(); |
92 | | - if (o!=0) return o; //by offset... |
93 | | - |
94 | | - int e = getEndOffset() - other.getEndOffset(); |
95 | | - if (e!=0) return -e; //but longest first! |
96 | | - |
97 | | - return 0; |
98 | | - } |
99 | | -} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java |
— | — | @@ -5,6 +5,8 @@ |
6 | 6 | import java.util.Iterator; |
7 | 7 | import java.util.regex.Matcher; |
8 | 8 | |
| 9 | +import de.brightbyte.wikiword.model.PhraseOccurance; |
| 10 | + |
9 | 11 | public class PhraseAggregator { |
10 | 12 | public class PhraseBuilder { |
11 | 13 | protected StringBuilder phrase; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java |
— | — | @@ -20,6 +20,8 @@ |
21 | 21 | import de.brightbyte.io.ConsoleIO; |
22 | 22 | import de.brightbyte.wikiword.Corpus; |
23 | 23 | import de.brightbyte.wikiword.TweakSet; |
| 24 | +import de.brightbyte.wikiword.model.PhraseOccurance; |
| 25 | +import de.brightbyte.wikiword.model.PhraseOccuranceSequence; |
24 | 26 | |
25 | 27 | public class PlainTextAnalyzer extends AbstractAnalyzer { |
26 | 28 | private LanguageConfiguration config; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java |
— | — | @@ -1,29 +1,31 @@ |
2 | 2 | package de.brightbyte.wikiword.extract; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.util.List; |
5 | 6 | |
6 | | -import sun.net.dns.ResolverConfiguration.Options; |
7 | | - |
8 | 7 | import de.brightbyte.data.cursor.DataCursor; |
9 | 8 | import de.brightbyte.data.cursor.DataSink; |
10 | 9 | import de.brightbyte.io.ConsoleIO; |
11 | 10 | import de.brightbyte.io.LineCursor; |
12 | 11 | import de.brightbyte.io.OutputSink; |
13 | 12 | import de.brightbyte.util.PersistenceException; |
14 | | -import de.brightbyte.wikiword.analyzer.PhraseOccuranceSequence; |
15 | 13 | import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer; |
16 | 14 | import de.brightbyte.wikiword.disambig.Disambiguator; |
17 | 15 | import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator; |
18 | 16 | import de.brightbyte.wikiword.disambig.StoredFeatureFetcher; |
19 | 17 | import de.brightbyte.wikiword.disambig.StoredMeaningFetcher; |
| 18 | +import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
20 | 19 | import de.brightbyte.wikiword.model.LocalConcept; |
| 20 | +import de.brightbyte.wikiword.model.PhraseOccurance; |
| 21 | +import de.brightbyte.wikiword.model.PhraseOccuranceSequence; |
| 22 | +import de.brightbyte.wikiword.model.TermReference; |
21 | 23 | import de.brightbyte.wikiword.store.DatabaseConceptStores; |
22 | 24 | import de.brightbyte.wikiword.store.FeatureStore; |
23 | 25 | import de.brightbyte.wikiword.store.LocalConceptStore; |
24 | 26 | import de.brightbyte.wikiword.store.WikiWordConceptStore; |
25 | 27 | |
26 | 28 | public class WordSenseIndexer extends StreamProcessorApp<String, String, WikiWordConceptStore> { |
27 | | - protected Disambiguator disambiguator; |
| 29 | + protected Disambiguator<TermReference, LocalConcept> disambiguator; |
28 | 30 | protected PlainTextAnalyzer analyzer; |
29 | 31 | private int phraseLength; |
30 | 32 | |
— | — | @@ -58,7 +60,7 @@ |
59 | 61 | protected void init() throws PersistenceException, InstantiationException { |
60 | 62 | StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore()); |
61 | 63 | StoredFeatureFetcher<LocalConcept, Integer> featureFetcher = new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore()); |
62 | | - disambiguator = new SlidingCoherenceDisambiguator<Integer>( meaningFetcher, featureFetcher, true ); |
| 64 | + disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true ); |
63 | 65 | |
64 | 66 | analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(getCorpus(), tweaks); |
65 | 67 | |
— | — | @@ -66,9 +68,11 @@ |
67 | 69 | } |
68 | 70 | |
69 | 71 | @Override |
70 | | - protected String process(String line) { |
71 | | - PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); |
72 | | - return null; |
| 72 | + protected String process(String line) throws PersistenceException { |
| 73 | + PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first. |
| 74 | + List<PhraseOccurance> phrases = sequence.getDisjointPhraseSequence(null); |
| 75 | + Disambiguator.Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(phrases); |
| 76 | + return result.toString(); //FIXME: annotate! |
73 | 77 | } |
74 | 78 | |
75 | 79 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/StreamProcessorApp.java |
— | — | @@ -49,6 +49,6 @@ |
50 | 50 | } |
51 | 51 | } |
52 | 52 | |
53 | | - protected abstract O process(I rec); |
| 53 | + protected abstract O process(I rec) throws Exception; |
54 | 54 | |
55 | 55 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/LocalStatisticsStore.java |
— | — | @@ -2,6 +2,7 @@ |
3 | 3 | |
4 | 4 | import de.brightbyte.data.cursor.DataSet; |
5 | 5 | import de.brightbyte.util.PersistenceException; |
| 6 | +import de.brightbyte.wikiword.model.TermMeaning; |
6 | 7 | import de.brightbyte.wikiword.model.TermReference; |
7 | 8 | import de.brightbyte.wikiword.model.WikiWordConcept; |
8 | 9 | |
— | — | @@ -9,11 +10,11 @@ |
10 | 11 | |
11 | 12 | public int getNumberOfTerms() throws PersistenceException; |
12 | 13 | |
13 | | - public DataSet<TermReference> getAllTerms() |
| 14 | + public DataSet<TermMeaning> getAllTerms() |
14 | 15 | throws PersistenceException; |
15 | 16 | |
16 | 17 | /** |
17 | | - * Returns a TermReference for a random term from the top-n |
| 18 | + * Returns a TermMeaning for a random term from the top-n |
18 | 19 | * terms with respect to the frequency of occurrence. |
19 | 20 | * @param top the maximum rank of the terms to be returned. If top is 0, |
20 | 21 | * any terms from the full range may be returned. If it is negative, |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/LocalConceptStore.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import de.brightbyte.data.cursor.DataSet; |
5 | 5 | import de.brightbyte.util.PersistenceException; |
6 | 6 | import de.brightbyte.wikiword.model.LocalConcept; |
| 7 | +import de.brightbyte.wikiword.model.TermMeaning; |
7 | 8 | import de.brightbyte.wikiword.model.TermReference; |
8 | 9 | import de.brightbyte.wikiword.store.WikiWordConceptStore.ConceptQuerySpec; |
9 | 10 | |
— | — | @@ -16,7 +17,7 @@ |
17 | 18 | |
18 | 19 | public int getNumberOfTerms() throws PersistenceException; |
19 | 20 | |
20 | | - public abstract DataSet<TermReference> getAllTerms() throws PersistenceException; |
| 21 | + public abstract DataSet<TermMeaning> getAllTerms() throws PersistenceException; |
21 | 22 | //public abstract DataSet<ResourceReference> getAllResources() throws PersistenceException; |
22 | 23 | |
23 | 24 | //public abstract DataSet<LocalConcept> getLocalConcepts(DataSet<LocalConceptReference> refs) throws PersistenceException ; |
— | — | @@ -24,7 +25,7 @@ |
25 | 26 | //public abstract LocalConcept getLocalConcept(int id) throws PersistenceException ; |
26 | 27 | |
27 | 28 | /** |
28 | | - * Returns a TermReference for a random term from the top-n |
| 29 | + * Returns a TermMeaning for a random term from the top-n |
29 | 30 | * terms with respect to the frequency of occurrence. |
30 | 31 | * @param top the maximum rank of the terms to be returned. If top is 0, |
31 | 32 | * any terms from the full range may be returned. If it is negative, |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseLocalConceptStore.java |
— | — | @@ -25,6 +25,7 @@ |
26 | 26 | import de.brightbyte.wikiword.TweakSet; |
27 | 27 | import de.brightbyte.wikiword.model.ConceptRelations; |
28 | 28 | import de.brightbyte.wikiword.model.LocalConcept; |
| 29 | +import de.brightbyte.wikiword.model.TermMeaning; |
29 | 30 | import de.brightbyte.wikiword.model.TermReference; |
30 | 31 | import de.brightbyte.wikiword.model.WikiWordResource; |
31 | 32 | import de.brightbyte.wikiword.schema.ConceptInfoStoreSchema; |
— | — | @@ -167,7 +168,7 @@ |
168 | 169 | return ((LocalStatisticsStore<LocalConcept>)getStatisticsStore()).pickRandomTerm(top); |
169 | 170 | } |
170 | 171 | |
171 | | - public DataSet<TermReference> getAllTerms() throws PersistenceException { |
| 172 | + public DataSet<TermMeaning> getAllTerms() throws PersistenceException { |
172 | 173 | return ((LocalStatisticsStore<LocalConcept>)getStatisticsStore()).getAllTerms(); |
173 | 174 | } |
174 | 175 | |
— | — | @@ -177,20 +178,20 @@ |
178 | 179 | |
179 | 180 | ///////////////////////////////////////////////////////////////////////////////////////////// |
180 | 181 | |
181 | | - protected final DatabaseDataSet.Factory<TermReference> termFactory = new DatabaseDataSet.Factory<TermReference>() { |
182 | | - public TermReference newInstance(ResultSet row) throws SQLException, PersistenceException { |
| 182 | + protected final DatabaseDataSet.Factory<TermMeaning> termFactory = new DatabaseDataSet.Factory<TermMeaning>() { |
| 183 | + public TermMeaning newInstance(ResultSet row) throws SQLException, PersistenceException { |
183 | 184 | return newTerm(row); |
184 | 185 | } |
185 | 186 | }; |
186 | 187 | |
187 | | - protected TermReference newTerm(ResultSet row) throws SQLException, PersistenceException { |
| 188 | + protected TermMeaning newTerm(ResultSet row) throws SQLException, PersistenceException { |
188 | 189 | int id = row.getInt("id"); |
189 | 190 | String name = asString(row.getObject("name")); |
190 | 191 | int card = row.getInt("cardinality"); |
191 | 192 | double relevance = row.getInt("relevance"); |
192 | 193 | |
193 | 194 | LocalConcept concept = newConcept(id, name, null, card, relevance); |
194 | | - return new TermReference(name, concept, relevance); |
| 195 | + return new TermMeaning(name, concept, relevance); |
195 | 196 | } |
196 | 197 | |
197 | 198 | protected class DatabaseLocalStatisticsStore extends DatabaseStatisticsStore implements LocalStatisticsStore<LocalConcept> { |
— | — | @@ -202,10 +203,10 @@ |
203 | 204 | termTable = (EntityTable)database.getTable("term"); |
204 | 205 | } |
205 | 206 | |
206 | | - public DataSet<TermReference> getAllTerms() throws PersistenceException { |
| 207 | + public DataSet<TermMeaning> getAllTerms() throws PersistenceException { |
207 | 208 | try { |
208 | 209 | String sql = "SELECT rank as id, term, freq as cardinality, -1 as relevance FROM "+termTable.getSQLName()+" as T"; |
209 | | - return new ChunkedQueryDataSet<TermReference>(database, termFactory, "getAllTerms", "query", sql, null, null, termTable, "rank", queryChunkSize); |
| 210 | + return new ChunkedQueryDataSet<TermMeaning>(database, termFactory, "getAllTerms", "query", sql, null, null, termTable, "rank", queryChunkSize); |
210 | 211 | } catch (SQLException e) { |
211 | 212 | throw new PersistenceException(e); |
212 | 213 | } |
— | — | @@ -355,7 +356,7 @@ |
356 | 357 | } |
357 | 358 | |
358 | 359 | if (spec!=null && spec.getIncludeTerms()) { |
359 | | - TermReference[] terms = TermReference.parseList( asString(m.get("dTerms")), getConceptFactory(), ((ConceptInfoStoreSchema)database).termReferenceListEntry ); |
| 360 | + TermReference[] terms = TermMeaning.parseList( asString(m.get("dTerms")), getConceptFactory(), ((ConceptInfoStoreSchema)database).termReferenceListEntry ); |
360 | 361 | concept.setTerms(terms); |
361 | 362 | } |
362 | 363 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java |
— | — | @@ -0,0 +1,102 @@ |
| 2 | +package de.brightbyte.wikiword.model; |
| 3 | + |
| 4 | +import java.io.Serializable; |
| 5 | + |
| 6 | +public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance>, TermReference { |
| 7 | + |
| 8 | + private static final long serialVersionUID = 241753475865301115L; |
| 9 | + |
| 10 | + protected String phrase; |
| 11 | + protected int weight; |
| 12 | + protected int offset; |
| 13 | + protected int length; |
| 14 | + |
| 15 | + public PhraseOccurance(String phrase, int weight, int offset, int length) { |
| 16 | + if (length <= 0) throw new IllegalArgumentException("bad length: "+length); |
| 17 | + if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string"); |
| 18 | + //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string"); |
| 19 | + if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string"); |
| 20 | + if (length < phrase.length()) phrase = phrase.substring(offset, offset+length); |
| 21 | + |
| 22 | + this.phrase = phrase; |
| 23 | + this.weight = weight; |
| 24 | + this.offset = offset; |
| 25 | + this.length = length; |
| 26 | + } |
| 27 | + |
| 28 | + public int getLength() { |
| 29 | + return length; |
| 30 | + } |
| 31 | + |
| 32 | + public int getOffset() { |
| 33 | + return offset; |
| 34 | + } |
| 35 | + |
| 36 | + public int getEndOffset() { |
| 37 | + return getOffset() + getLength(); |
| 38 | + } |
| 39 | + |
| 40 | + public String getPhrase() { |
| 41 | + return phrase; |
| 42 | + } |
| 43 | + |
| 44 | + public String getTerm() { |
| 45 | + return getPhrase(); |
| 46 | + } |
| 47 | + |
| 48 | + public int getWeight() { |
| 49 | + return weight; |
| 50 | + } |
| 51 | + |
| 52 | + public String toString() { |
| 53 | + return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]"; |
| 54 | + } |
| 55 | + |
| 56 | + @Override |
| 57 | + public int hashCode() { |
| 58 | + final int PRIME = 31; |
| 59 | + int result = 1; |
| 60 | + result = PRIME * result + length; |
| 61 | + result = PRIME * result + offset; |
| 62 | + result = PRIME * result + ((phrase == null) ? 0 : phrase.hashCode()); |
| 63 | + return result; |
| 64 | + } |
| 65 | + |
| 66 | + @Override |
| 67 | + public boolean equals(Object obj) { |
| 68 | + if (this == obj) |
| 69 | + return true; |
| 70 | + if (obj == null) |
| 71 | + return false; |
| 72 | + if (getClass() != obj.getClass()) |
| 73 | + return false; |
| 74 | + final PhraseOccurance other = (PhraseOccurance) obj; |
| 75 | + if (length != other.length) |
| 76 | + return false; |
| 77 | + if (offset != other.offset) |
| 78 | + return false; |
| 79 | + if (phrase == null) { |
| 80 | + if (other.phrase != null) |
| 81 | + return false; |
| 82 | + } else if (!phrase.equals(other.phrase)) |
| 83 | + return false; |
| 84 | + return true; |
| 85 | + } |
| 86 | + |
| 87 | + public boolean overlaps(PhraseOccurance other) { |
| 88 | + if (getEndOffset() <= other.getOffset()) return false; |
| 89 | + if (getOffset() >= other.getEndOffset()) return false; |
| 90 | + |
| 91 | + return true; |
| 92 | + } |
| 93 | + |
| 94 | + public int compareTo(PhraseOccurance other) { |
| 95 | + int o = getOffset() - other.getOffset(); |
| 96 | + if (o!=0) return o; //by offset... |
| 97 | + |
| 98 | + int e = getEndOffset() - other.getEndOffset(); |
| 99 | + if (e!=0) return -e; //but longest first! |
| 100 | + |
| 101 | + return 0; |
| 102 | + } |
| 103 | +} |
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java |
___________________________________________________________________ |
Added: svn:mergeinfo |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermMeaning.java |
— | — | @@ -0,0 +1,93 @@ |
| 2 | +package de.brightbyte.wikiword.model; |
| 3 | + |
| 4 | +import java.io.Serializable; |
| 5 | + |
| 6 | +import de.brightbyte.util.PersistenceException; |
| 7 | +import de.brightbyte.wikiword.model.WikiWordConcept.Factory; |
| 8 | +import de.brightbyte.wikiword.schema.ConceptInfoStoreSchema.ConceptListEntrySpec; |
| 9 | + |
| 10 | + |
| 11 | + |
| 12 | +public class TermMeaning implements TermReference, Serializable { |
| 13 | + |
| 14 | + private String term; |
| 15 | + private double score; |
| 16 | + private WikiWordConcept concept; |
| 17 | + |
| 18 | + public TermMeaning(String term, WikiWordConcept concept, double score) { |
| 19 | + this.term = term; |
| 20 | + this.concept = concept; |
| 21 | + this.score = score; |
| 22 | + } |
| 23 | + |
| 24 | + public WikiWordConcept getConcept() { |
| 25 | + return concept; |
| 26 | + } |
| 27 | + |
| 28 | + public double getScore() { |
| 29 | + return score; |
| 30 | + } |
| 31 | + |
| 32 | + public String getTerm() { |
| 33 | + return term; |
| 34 | + } |
| 35 | + |
| 36 | + public String toString() { |
| 37 | + return "\""+term+"\" -> "+getConcept(); |
| 38 | + } |
| 39 | + |
| 40 | + /* |
| 41 | + public static TermMeaning[] parseList(String s, WikiWordConcept.ListFormatSpec spec, WikiWordConcept.Factory factory) { |
| 42 | + return WikiWordConcept.parseList(s, factory, spec); |
| 43 | + } |
| 44 | + */ |
| 45 | + |
| 46 | + @Override |
| 47 | + public int hashCode() { |
| 48 | + final int PRIME = 31; |
| 49 | + int result = 1; |
| 50 | + result = PRIME * result + ((concept == null) ? 0 : concept.hashCode()); |
| 51 | + result = PRIME * result + ((term == null) ? 0 : term.hashCode()); |
| 52 | + return result; |
| 53 | + } |
| 54 | + |
| 55 | + @Override |
| 56 | + public boolean equals(Object obj) { |
| 57 | + if (this == obj) |
| 58 | + return true; |
| 59 | + if (obj == null) |
| 60 | + return false; |
| 61 | + if (getClass() != obj.getClass()) |
| 62 | + return false; |
| 63 | + final TermMeaning other = (TermMeaning) obj; |
| 64 | + if (concept == null) { |
| 65 | + if (other.concept != null) |
| 66 | + return false; |
| 67 | + } else if (!concept.equals(other.concept)) |
| 68 | + return false; |
| 69 | + if (term == null) { |
| 70 | + if (other.term != null) |
| 71 | + return false; |
| 72 | + } else if (!term.equals(other.term)) |
| 73 | + return false; |
| 74 | + return true; |
| 75 | + } |
| 76 | + |
| 77 | + public static TermReference[] parseList(String s, Factory<LocalConcept> factory, ConceptListEntrySpec spec) throws PersistenceException { |
| 78 | + LocalConcept[] concepts = WikiWordConcept.parseList(s, factory, spec); //XXX: this is a terrible, terrible hack. |
| 79 | + TermReference[] terms = new TermReference[concepts.length]; |
| 80 | + |
| 81 | + for (int i=0; i<terms.length; i++) { |
| 82 | + WikiWordConcept dummy = concepts[i]; |
| 83 | + |
| 84 | + String term = dummy.getName(); //UGHA! |
| 85 | + double score = dummy.getCardinality(); |
| 86 | + |
| 87 | + WikiWordConcept target = factory.newInstance(dummy.getId(), null, dummy.getType()); |
| 88 | + terms[i] = new TermMeaning(term, target, score); |
| 89 | + } |
| 90 | + |
| 91 | + return terms; |
| 92 | + } |
| 93 | + |
| 94 | +} |
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermMeaning.java |
___________________________________________________________________ |
Added: svn:mergeinfo |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermReference.java |
— | — | @@ -1,91 +1,7 @@ |
2 | 2 | package de.brightbyte.wikiword.model; |
3 | 3 | |
4 | | -import de.brightbyte.util.PersistenceException; |
5 | | -import de.brightbyte.wikiword.model.WikiWordConcept.Factory; |
6 | | -import de.brightbyte.wikiword.schema.ConceptInfoStoreSchema.ConceptListEntrySpec; |
| 4 | +public interface TermReference { |
7 | 5 | |
| 6 | + public String getTerm(); |
8 | 7 | |
9 | | - |
10 | | -public class TermReference { |
11 | | - |
12 | | - private String term; |
13 | | - private double score; |
14 | | - private WikiWordConcept concept; |
15 | | - |
16 | | - public TermReference(String term, WikiWordConcept concept, double score) { |
17 | | - this.term = term; |
18 | | - this.concept = concept; |
19 | | - this.score = score; |
20 | | - } |
21 | | - |
22 | | - public WikiWordConcept getConcept() { |
23 | | - return concept; |
24 | | - } |
25 | | - |
26 | | - public double getScore() { |
27 | | - return score; |
28 | | - } |
29 | | - |
30 | | - public String getTerm() { |
31 | | - return term; |
32 | | - } |
33 | | - |
34 | | - public String toString() { |
35 | | - return "\""+term+"\" -> "+getConcept(); |
36 | | - } |
37 | | - |
38 | | - /* |
39 | | - public static TermReference[] parseList(String s, WikiWordConcept.ListFormatSpec spec, WikiWordConcept.Factory factory) { |
40 | | - return WikiWordConcept.parseList(s, factory, spec); |
41 | | - } |
42 | | - */ |
43 | | - |
44 | | - @Override |
45 | | - public int hashCode() { |
46 | | - final int PRIME = 31; |
47 | | - int result = 1; |
48 | | - result = PRIME * result + ((concept == null) ? 0 : concept.hashCode()); |
49 | | - result = PRIME * result + ((term == null) ? 0 : term.hashCode()); |
50 | | - return result; |
51 | | - } |
52 | | - |
53 | | - @Override |
54 | | - public boolean equals(Object obj) { |
55 | | - if (this == obj) |
56 | | - return true; |
57 | | - if (obj == null) |
58 | | - return false; |
59 | | - if (getClass() != obj.getClass()) |
60 | | - return false; |
61 | | - final TermReference other = (TermReference) obj; |
62 | | - if (concept == null) { |
63 | | - if (other.concept != null) |
64 | | - return false; |
65 | | - } else if (!concept.equals(other.concept)) |
66 | | - return false; |
67 | | - if (term == null) { |
68 | | - if (other.term != null) |
69 | | - return false; |
70 | | - } else if (!term.equals(other.term)) |
71 | | - return false; |
72 | | - return true; |
73 | | - } |
74 | | - |
75 | | - public static TermReference[] parseList(String s, Factory<LocalConcept> factory, ConceptListEntrySpec spec) throws PersistenceException { |
76 | | - LocalConcept[] concepts = WikiWordConcept.parseList(s, factory, spec); //XXX: this is a terrible, terrible hack. |
77 | | - TermReference[] terms = new TermReference[concepts.length]; |
78 | | - |
79 | | - for (int i=0; i<terms.length; i++) { |
80 | | - WikiWordConcept dummy = concepts[i]; |
81 | | - |
82 | | - String term = dummy.getName(); //UGHA! |
83 | | - double score = dummy.getCardinality(); |
84 | | - |
85 | | - WikiWordConcept target = factory.newInstance(dummy.getId(), null, dummy.getType()); |
86 | | - terms[i] = new TermReference(term, target, score); |
87 | | - } |
88 | | - |
89 | | - return terms; |
90 | | - } |
91 | | - |
92 | | -} |
| 8 | +} |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java |
— | — | @@ -0,0 +1,82 @@ |
| 2 | +package de.brightbyte.wikiword.model; |
| 3 | + |
| 4 | +import java.util.AbstractList; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collections; |
| 7 | +import java.util.List; |
| 8 | +import java.util.RandomAccess; |
| 9 | + |
| 10 | +import de.brightbyte.data.filter.Filter; |
| 11 | + |
| 12 | +public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess { |
| 13 | +    // List view over the phrase occurrences found in a text, held in their natural sort order; no mutators are overridden. |
| 14 | +    protected List<PhraseOccurance> phrases; // backing list; sorted in place by the constructor |
| 15 | +    protected String text; // the text handed to the constructor |
| 16 | +    /** Wraps {@code phrases}, the occurrences found in {@code text}. NOTE: the given list is sorted in place and retained, not copied. */ |
| 17 | +    public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) { |
| 18 | +        this.text = text; |
| 19 | +        this.phrases = phrases; |
| 20 | +        // essential: getPhrasesAt() scans linearly and relies on the order established here |
| 21 | +        Collections.sort(this.phrases); |
| 22 | +    } |
| 23 | +    /** Returns the occurrence at the given list position (in sort order). */ |
| 24 | +    @Override |
| 25 | +    public PhraseOccurance get(int index) { |
| 26 | +        return phrases.get(index); |
| 27 | +    } |
| 28 | +    /** Returns the number of phrase occurrences in this sequence. */ |
| 29 | +    @Override |
| 30 | +    public int size() { |
| 31 | +        return phrases.size(); |
| 32 | +    } |
| 33 | +    /** Returns the text that was passed to the constructor. */ |
| 34 | +    public String getText() { |
| 35 | +        return text; |
| 36 | +    } |
| 37 | +    /** Returns, as a sub-list view, the occurrences starting at the first phrase offset at or after {@code offs}; null if no occurrence starts there or later. */ |
| 38 | +    public List<PhraseOccurance> getPhrasesAt(int offs) { |
| 39 | +        final int n = size(); |
| 40 | + |
| 41 | +        // find the first occurrence that starts at or after the requested offset |
| 42 | +        int i = 0; |
| 43 | +        while (i < n && get(i).getOffset() < offs) { |
| 44 | +            i++; |
| 45 | +        } |
| 46 | +        if (i >= n) return null; // nothing starts at or after offs |
| 47 | + |
| 48 | +        // snap to the offset actually found, then extend over every occurrence sharing that start offset |
| 49 | +        final int start = get(i).getOffset(); |
| 50 | +        int j = i; |
| 51 | +        while (j < n && get(j).getOffset() <= start) { |
| 52 | +            j++; |
| 53 | +        } |
| 54 | + |
| 55 | +        // NOTE: PhraseOccurance.compareTo is expected to put the longest phrase first among equal offsets (not verifiable from here). |
| 56 | +        return subList(i, j); |
| 57 | +    } |
| 58 | +    /** Scans the text from offset 0 and collects, per start offset, the first candidate (in sort order) whose phrase {@code filter} accepts; the scan resumes at the end offset of the last candidate examined. */ |
| 59 | +    public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) { |
| 60 | +        List<PhraseOccurance> result = new ArrayList<PhraseOccurance>(); |
| 61 | +        int offs = 0; // a TEXT offset, not a list index |
| 62 | + |
| 63 | +        // FIX: the loop used to be guarded by comparing this text offset against size() (the number of occurrences), which cut the scan short. |
| 64 | +        while (true) { |
| 65 | +            List<PhraseOccurance> candidates = getPhrasesAt(offs); |
| 66 | +            if (candidates == null) break; // no occurrence starts at or after offs: done |
| 67 | + |
| 68 | +            // all candidates share one start offset; take the first one the filter accepts |
| 69 | +            int next = offs; |
| 70 | +            for (PhraseOccurance p : candidates) { |
| 71 | +                next = p.getEndOffset(); // resume after p, whether or not it is accepted |
| 72 | +                if (filter.matches(p.getPhrase())) { |
| 73 | +                    result.add(p); |
| 74 | +                    break; |
| 75 | +                } |
| 76 | +            } |
| 77 | + |
| 78 | +            // always move forward, so a degenerate occurrence that ends where it starts cannot stall the scan |
| 79 | +            offs = Math.max(next, offs + 1); |
| 80 | +        } |
| 81 | +        return result; |
| 82 | +    } |
| 83 | +} |
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java |
___________________________________________________________________ |
Added: svn:mergeinfo |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java |
— | — | @@ -415,7 +415,7 @@ |
416 | 416 | if (disambiguator==null) { |
417 | 417 | StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore()); |
418 | 418 | StoredFeatureFetcher<LocalConcept, Integer> featureFetcher = new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore()); |
419 | | - disambiguator = new SlidingCoherenceDisambiguator<Integer>( meaningFetcher, featureFetcher, true ); |
| 419 | + disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true ); |
420 | 420 | |
421 | 421 | LeveledOutput.Trace trace = new LeveledOutput.Trace(out); |
422 | 422 | meaningFetcher.setTrace(trace); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureFetcher.java |
— | — | @@ -1,7 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
4 | 4 | import java.util.Collection; |
5 | | -import java.util.List; |
6 | 5 | import java.util.Map; |
7 | 6 | |
8 | 7 | import de.brightbyte.util.PersistenceException; |
— | — | @@ -10,5 +9,5 @@ |
11 | 10 | |
12 | 11 | public interface FeatureFetcher<C extends WikiWordConcept, K> { |
13 | 12 | public ConceptFeatures<C, K> getFeatures(C c) throws PersistenceException; |
14 | | - public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<C> c) throws PersistenceException; |
| 13 | + public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<? extends C> c) throws PersistenceException; |
15 | 14 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredFeatureFetcher.java |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | return store.getConceptFeatures(c.getId()); |
26 | 26 | } |
27 | 27 | |
28 | | - public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<C> concepts) throws PersistenceException { |
| 28 | + public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<? extends C> concepts) throws PersistenceException { |
29 | 29 | trace("fetching features for "+concepts); |
30 | 30 | |
31 | 31 | int[] ids = new int[concepts.size()]; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java |
— | — | @@ -6,35 +6,36 @@ |
7 | 7 | |
8 | 8 | import de.brightbyte.io.Output; |
9 | 9 | import de.brightbyte.util.PersistenceException; |
10 | | -import de.brightbyte.wikiword.model.LocalConcept; |
| 10 | +import de.brightbyte.wikiword.model.TermReference; |
| 11 | +import de.brightbyte.wikiword.model.WikiWordConcept; |
11 | 12 | |
12 | | -public abstract class AbstractDisambiguator implements Disambiguator { |
| 13 | +public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> { |
13 | 14 | |
14 | | - protected MeaningFetcher<LocalConcept> meaningFetcher; |
| 15 | + protected MeaningFetcher<? extends C> meaningFetcher; |
15 | 16 | protected Output trace; |
16 | 17 | |
17 | | - public AbstractDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher) { |
| 18 | + public AbstractDisambiguator(MeaningFetcher<? extends C> meaningFetcher) { |
18 | 19 | if (meaningFetcher==null) throw new NullPointerException(); |
19 | 20 | this.meaningFetcher = meaningFetcher; |
20 | 21 | } |
21 | 22 | |
22 | | - protected Map<String, List<LocalConcept>> fetchMeanings(List<String> terms) throws PersistenceException { |
23 | | - Map<String, List<LocalConcept>> meanings = new HashMap<String, List<LocalConcept>>(); |
| 23 | + protected <X extends T>Map<X, List<? extends C>> fetchMeanings(List<X> terms) throws PersistenceException { |
| 24 | + Map<X, List<? extends C>> meanings = new HashMap<X, List<? extends C>>(); |
24 | 25 | |
25 | | - for (String t: terms) { |
26 | | - List<LocalConcept> m = meaningFetcher.getMeanings(t); |
| 26 | + for (X t: terms) { |
| 27 | + List<? extends C> m = meaningFetcher.getMeanings(t.getTerm()); |
27 | 28 | if (m!=null && m.size()>0) meanings.put(t, m); |
28 | 29 | } |
29 | 30 | |
30 | 31 | return meanings; |
31 | 32 | } |
32 | 33 | |
33 | | - public Result disambiguate(List<String> terms) throws PersistenceException { |
34 | | - Map<String, List<LocalConcept>> meanings = fetchMeanings(terms); |
| 34 | + public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException { |
| 35 | + Map<X, List<? extends C>> meanings = fetchMeanings(terms); |
35 | 36 | return disambiguate(terms, meanings); |
36 | 37 | } |
37 | 38 | |
38 | | - public abstract Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException; |
| 39 | + public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings) throws PersistenceException; |
39 | 40 | |
40 | 41 | public Output getTrace() { |
41 | 42 | return trace; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureCache.java |
— | — | @@ -34,7 +34,7 @@ |
35 | 35 | return f; |
36 | 36 | } |
37 | 37 | |
38 | | - public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<C> concepts) throws PersistenceException { |
| 38 | + public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<? extends C> concepts) throws PersistenceException { |
39 | 39 | Map<Integer, ConceptFeatures<C, K>> features = new HashMap<Integer, ConceptFeatures<C, K>> (); |
40 | 40 | List<C> todo = new ArrayList<C>(concepts.size()); |
41 | 41 | for (C c: concepts) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java |
— | — | @@ -15,19 +15,20 @@ |
16 | 16 | import de.brightbyte.data.measure.Similarity; |
17 | 17 | import de.brightbyte.util.PersistenceException; |
18 | 18 | import de.brightbyte.wikiword.model.LocalConcept; |
| 19 | +import de.brightbyte.wikiword.model.TermReference; |
19 | 20 | import de.brightbyte.wikiword.model.WikiWordConcept; |
20 | 21 | |
21 | | -public class SlidingCoherenceDisambiguator<K> extends CoherenceDisambiguator<K> { |
| 22 | +public class SlidingCoherenceDisambiguator extends CoherenceDisambiguator { |
22 | 23 | |
23 | 24 | protected int window ; |
24 | 25 | |
25 | | - public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, boolean featuresAreNormalized) { |
| 26 | + public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) { |
26 | 27 | this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, |
27 | | - featuresAreNormalized ? ScalarVectorSimilarity.<K>getInstance() : CosineVectorSimilarity.<K>getInstance(), //if pre-normalized, use scalar to calc cosin |
| 28 | + featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance(), //if pre-normalized, use scalar to calc cosin |
28 | 29 | 5); |
29 | 30 | } |
30 | 31 | |
31 | | - public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<K>> sim, int window) { |
| 32 | + public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<Integer>> sim, int window) { |
32 | 33 | super(meaningFetcher, featureFetcher, popularityMeasure, sim); |
33 | 34 | |
34 | 35 | this.window = window; |
— | — | @@ -36,7 +37,7 @@ |
37 | 38 | /* (non-Javadoc) |
38 | 39 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
39 | 40 | */ |
40 | | - public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException { |
| 41 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException { |
41 | 42 | if (window < 2 || terms.size()<2 || meanings.size()<2) |
42 | 43 | return popularityDisambiguator.disambiguate(terms, meanings); |
43 | 44 | |
— | — | @@ -47,10 +48,10 @@ |
48 | 49 | |
49 | 50 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
50 | 51 | |
51 | | - Map<String, LocalConcept> disambig = new HashMap<String, LocalConcept>(meanings.size()); |
| 52 | + Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(meanings.size()); |
52 | 53 | |
53 | 54 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
54 | | - FeatureCache<LocalConcept, K> features = getFeatureCache(meanings); |
| 55 | + FeatureCache<LocalConcept, Integer> features = getFeatureCache(meanings); |
55 | 56 | |
56 | 57 | for (int i= window; ; i++) { |
57 | 58 | int from = i-window; |
— | — | @@ -64,12 +65,12 @@ |
65 | 66 | if (to-from < 2) { |
66 | 67 | r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings); |
67 | 68 | } else { |
68 | | - List<Map<String, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings); |
| 69 | + List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings); |
69 | 70 | r = getBestInterpretation(terms, meanings, interpretations, similarities, features); |
70 | 71 | } |
71 | 72 | |
72 | 73 | for (int j=from; j<to; j++) { |
73 | | - String t = terms.get(j); |
| 74 | + X t = terms.get(j); |
74 | 75 | if (disambig.containsKey(t)) continue; |
75 | 76 | |
76 | 77 | LocalConcept m; |
— | — | @@ -84,23 +85,23 @@ |
85 | 86 | return getScore(disambig, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates. |
86 | 87 | } |
87 | 88 | |
88 | | - protected List<Map<String, LocalConcept>> getInterpretations(int from, int to, List<String> terms, Map<String, LocalConcept> known, Map<String, List<LocalConcept>> meanings) { |
| 89 | + protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(int from, int to, List<X> terms, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) { |
89 | 90 | //strip out all terms with no known meaning |
90 | 91 | if (meanings.keySet().size() != terms.size()) { |
91 | | - List<String> t = new ArrayList<String>(terms.size()); |
| 92 | + List<X> t = new ArrayList<X>(terms.size()); |
92 | 93 | t.addAll(terms); |
93 | 94 | t.retainAll(meanings.keySet()); |
94 | 95 | terms = t; |
95 | 96 | } |
96 | 97 | |
97 | | - Map<String, List<LocalConcept>> mset = new HashMap<String, List<LocalConcept>>(); |
| 98 | + Map<X, List<? extends LocalConcept>> mset = new HashMap<X, List<? extends LocalConcept>>(); |
98 | 99 | |
99 | 100 | if (to>terms.size()) to = terms.size(); |
100 | 101 | |
101 | 102 | for (int i=from; i<to; i++) { |
102 | | - List<LocalConcept> m; |
| 103 | + List<? extends LocalConcept> m; |
103 | 104 | |
104 | | - String t = terms.get(i); |
| 105 | + X t = terms.get(i); |
105 | 106 | LocalConcept c = known.get(t); |
106 | 107 | |
107 | 108 | if (c!=null) m = Collections.singletonList(c); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/TermRelatedness.java |
— | — | @@ -1,10 +1,9 @@ |
2 | 2 | package de.brightbyte.wikiword.disambig; |
3 | 3 | |
4 | | -import java.util.Arrays; |
5 | | - |
6 | 4 | import de.brightbyte.data.measure.Similarity; |
7 | 5 | import de.brightbyte.util.PersistenceException; |
8 | 6 | import de.brightbyte.util.UncheckedPersistenceException; |
| 7 | +import de.brightbyte.wikiword.model.TermReference; |
9 | 8 | import de.brightbyte.wikiword.model.WikiWordConcept; |
10 | 9 | |
11 | 10 | public class TermRelatedness implements Similarity<String> { |
— | — | @@ -28,13 +27,13 @@ |
29 | 28 | } |
30 | 29 | |
31 | 30 | protected Similarity<WikiWordConcept> relatedness; |
32 | | - protected Disambiguator disambig; |
| 31 | + protected Disambiguator<TermReference, ? extends WikiWordConcept> disambig; |
33 | 32 | |
34 | | - public TermRelatedness(Disambiguator disambig) { |
| 33 | + public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig) { |
35 | 34 | this(disambig, null); |
36 | 35 | } |
37 | 36 | |
38 | | - public TermRelatedness(Disambiguator disambig, Similarity<WikiWordConcept> relatedness) { |
| 37 | + public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig, Similarity<WikiWordConcept> relatedness) { |
39 | 38 | this.relatedness = relatedness; |
40 | 39 | this.disambig = disambig; |
41 | 40 | } |
— | — | @@ -48,7 +47,7 @@ |
49 | 48 | |
50 | 49 | public Relatedness relatedness(String a, String b) { |
51 | 50 | try { |
52 | | - Disambiguator.Result r = disambig.disambiguate(Arrays.asList(new String[] {a, b})); |
| 51 | + Disambiguator.Result<Term, ? extends WikiWordConcept> r = disambig.<Term>disambiguate(Term.asTerms(a, b)); |
53 | 52 | if (r==null || r.getMeanings().size()!=2) return null; |
54 | 53 | |
55 | 54 | double d; |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java |
— | — | @@ -8,9 +8,10 @@ |
9 | 9 | import de.brightbyte.data.measure.Measure; |
10 | 10 | import de.brightbyte.data.measure.Measure.Comparator; |
11 | 11 | import de.brightbyte.wikiword.model.LocalConcept; |
| 12 | +import de.brightbyte.wikiword.model.TermReference; |
12 | 13 | import de.brightbyte.wikiword.model.WikiWordConcept; |
13 | 14 | |
14 | | -public class PopularityDisambiguator extends AbstractDisambiguator { |
| 15 | +public class PopularityDisambiguator extends AbstractDisambiguator<TermReference, LocalConcept> { |
15 | 16 | |
16 | 17 | protected Measure<WikiWordConcept> popularityMeasure; |
17 | 18 | protected Comparator<WikiWordConcept> popularityComparator; |
— | — | @@ -26,11 +27,11 @@ |
27 | 28 | this.popularityComparator = new Measure.Comparator<WikiWordConcept>(popularityMeasure, true); |
28 | 29 | } |
29 | 30 | |
30 | | - public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) { |
31 | | - Map<String, LocalConcept> disambig = new HashMap<String, LocalConcept>(); |
| 31 | + public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) { |
| 32 | + Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(); |
32 | 33 | int pop = 0; |
33 | | - for (String t: terms) { |
34 | | - List<LocalConcept> m = meanings.get(t); |
| 34 | + for (X t: terms) { |
| 35 | + List<? extends LocalConcept> m = meanings.get(t); |
35 | 36 | if (m==null || m.size()==0) continue; |
36 | 37 | |
37 | 38 | if (m.size()>0) Collections.sort(m, popularityComparator); |
— | — | @@ -43,7 +44,7 @@ |
44 | 45 | |
45 | 46 | pop = pop / disambig.size(); |
46 | 47 | |
47 | | - Result r = new Result(disambig, pop, "pop="+pop); |
| 48 | + Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, pop, "pop="+pop); |
48 | 49 | return r; |
49 | 50 | } |
50 | 51 | |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java |
— | — | @@ -19,9 +19,10 @@ |
20 | 20 | import de.brightbyte.util.PersistenceException; |
21 | 21 | import de.brightbyte.wikiword.model.ConceptFeatures; |
22 | 22 | import de.brightbyte.wikiword.model.LocalConcept; |
| 23 | +import de.brightbyte.wikiword.model.TermReference; |
23 | 24 | import de.brightbyte.wikiword.model.WikiWordConcept; |
24 | 25 | |
25 | | -public class CoherenceDisambiguator<K> extends AbstractDisambiguator { |
| 26 | +public class CoherenceDisambiguator extends AbstractDisambiguator<TermReference, LocalConcept> { |
26 | 27 | |
27 | 28 | protected int minPopularity = 2; //FIXME: use complex cutoff specifier! |
28 | 29 | protected int maxMeanings = 8; //FIXME: magic... |
— | — | @@ -29,8 +30,8 @@ |
30 | 31 | protected double minScore = 0.1; //FIXME: magic number. should "somehow" match popularityFactor and similarityFactor |
31 | 32 | protected double popularityBias = 0.2; //FIXME: magic number. should "somehow" match popularityFactor and similarityFactor |
32 | 33 | |
33 | | - protected Similarity<LabeledVector<K>> similarityMeasure; |
34 | | - protected FeatureFetcher<LocalConcept, K> featureFetcher; |
| 34 | + protected Similarity<LabeledVector<Integer>> similarityMeasure; |
| 35 | + protected FeatureFetcher<LocalConcept, Integer> featureFetcher; |
35 | 36 | protected Measure<WikiWordConcept> popularityMeasure; |
36 | 37 | protected PopularityDisambiguator popularityDisambiguator; |
37 | 38 | |
— | — | @@ -48,12 +49,12 @@ |
49 | 50 | } |
50 | 51 | }; |
51 | 52 | |
52 | | - public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, boolean featuresAreNormalized) { |
| 53 | + public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) { |
53 | 54 | this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality, |
54 | | - featuresAreNormalized ? ScalarVectorSimilarity.<K>getInstance() : CosineVectorSimilarity.<K>getInstance()); //if pre-normalized, use scalar to calc cosin |
| 55 | + featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance()); //if pre-normalized, use scalar to calc cosin |
55 | 56 | } |
56 | 57 | |
57 | | - public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<K>> sim) { |
| 58 | + public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<Integer>> sim) { |
58 | 59 | super(meaningFetcher); |
59 | 60 | |
60 | 61 | if (popularityMeasure==null) throw new NullPointerException(); |
— | — | @@ -69,16 +70,16 @@ |
70 | 71 | return featureFetcher; |
71 | 72 | } |
72 | 73 | |
73 | | - public void setFeatureFetcher(FeatureFetcher<LocalConcept, K> featureFetcher) { |
| 74 | + public void setFeatureFetcher(FeatureFetcher<LocalConcept, Integer> featureFetcher) { |
74 | 75 | this.featureFetcher = featureFetcher; |
75 | 76 | } |
76 | 77 | |
77 | | - public Similarity<LabeledVector<K>> getSimilarityMeasure() { |
| 78 | + public Similarity<LabeledVector<Integer>> getSimilarityMeasure() { |
78 | 79 | return similarityMeasure; |
79 | 80 | } |
80 | 81 | |
81 | 82 | public void setSimilarityMeasure( |
82 | | - Similarity<LabeledVector<K>> similarityMeasure) { |
| 83 | + Similarity<LabeledVector<Integer>> similarityMeasure) { |
83 | 84 | if (similarityMeasure==null) throw new NullPointerException(); |
84 | 85 | this.similarityMeasure = similarityMeasure; |
85 | 86 | } |
— | — | @@ -115,13 +116,13 @@ |
116 | 117 | this.maxMeanings = maxMeanings; |
117 | 118 | } |
118 | 119 | |
119 | | - protected FeatureCache<LocalConcept, K> getFeatureCache(Map<String, List<LocalConcept>> meanings) throws PersistenceException { |
| 120 | + protected FeatureCache<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings) throws PersistenceException { |
120 | 121 | //TODO: keep a chain of n caches, resulting in LRU logic. |
121 | | - FeatureCache<LocalConcept, K> features = new FeatureCache<LocalConcept, K>(featureFetcher); |
| 122 | + FeatureCache<LocalConcept, Integer> features = new FeatureCache<LocalConcept, Integer>(featureFetcher); |
122 | 123 | |
123 | 124 | //NOTE: pre-fetch all features in one go |
124 | 125 | List<LocalConcept> concepts = new ArrayList<LocalConcept>(meanings.size()*10); |
125 | | - for (List<LocalConcept> m: meanings.values()) { |
| 126 | + for (List<? extends LocalConcept> m: meanings.values()) { |
126 | 127 | concepts.addAll(m); |
127 | 128 | } |
128 | 129 | |
— | — | @@ -133,7 +134,7 @@ |
134 | 135 | /* (non-Javadoc) |
135 | 136 | * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List) |
136 | 137 | */ |
137 | | - public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException { |
| 138 | + public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException { |
138 | 139 | if (terms.size()<2 || meanings.size()<2) |
139 | 140 | return popularityDisambiguator.disambiguate(terms, meanings); |
140 | 141 | |
— | — | @@ -145,22 +146,22 @@ |
146 | 147 | //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence. |
147 | 148 | |
148 | 149 | LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true); |
149 | | - FeatureCache<LocalConcept, K> features = getFeatureCache(meanings); |
| 150 | + FeatureCache<LocalConcept, Integer> features = getFeatureCache(meanings); |
150 | 151 | |
151 | | - List<Map<String, LocalConcept>> interpretations = getInterpretations(terms, meanings); |
| 152 | + List<Map<X, LocalConcept>> interpretations = getInterpretations(terms, meanings); |
152 | 153 | |
153 | 154 | return getBestInterpretation(terms, meanings, interpretations, similarities, features); |
154 | 155 | } |
155 | 156 | |
156 | | - protected void pruneMeanings(Map<String, List<LocalConcept>> meanings) { |
| 157 | + protected void pruneMeanings(Map<? extends TermReference, List<? extends LocalConcept>> meanings) { |
157 | 158 | if (minPopularity<=1) return; //nothing to do |
158 | 159 | |
159 | | - Iterator<Map.Entry<String, List<LocalConcept>>> eit = meanings.entrySet().iterator(); |
| 160 | + Iterator<?> eit = meanings.entrySet().iterator(); |
160 | 161 | while (eit.hasNext()) { |
161 | | - Entry<String, List<LocalConcept>> e = eit.next(); |
162 | | - List<LocalConcept> m = e.getValue(); |
| 162 | + Entry<TermReference, List<? extends LocalConcept>> e = (Entry<TermReference, List<? extends LocalConcept>>) eit.next(); //XXX: ugly cast. got confused about generics. ugh. |
| 163 | + List<? extends LocalConcept> m = e.getValue(); |
163 | 164 | |
164 | | - Iterator<LocalConcept> cit = m.iterator(); |
| 165 | + Iterator<? extends LocalConcept> cit = m.iterator(); |
165 | 166 | while (cit.hasNext()) { |
166 | 167 | LocalConcept c = cit.next(); |
167 | 168 | double p = popularityMeasure.measure(c); |
— | — | @@ -179,15 +180,15 @@ |
180 | 181 | } |
181 | 182 | } |
182 | 183 | |
183 | | - protected Result getBestInterpretation(List<String> terms, Map<String, List<LocalConcept>> meanings, |
184 | | - List<Map<String, LocalConcept>> interpretations, |
185 | | - LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, K> features) throws PersistenceException { |
| 184 | + protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(List<X> terms, Map<X, List<? extends LocalConcept>> meanings, |
| 185 | + List<Map<X, LocalConcept>> interpretations, |
| 186 | + LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, Integer> features) throws PersistenceException { |
186 | 187 | |
187 | | - List<Result> rankings = new ArrayList<Result>(); |
| 188 | + List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>(); |
188 | 189 | |
189 | 190 | double traceLimit = -1; |
190 | | - for (Map<String, LocalConcept> interp: interpretations) { |
191 | | - Result r = getScore(interp, similarities, features); |
| 191 | + for (Map<X, LocalConcept> interp: interpretations) { |
| 192 | + Result<X, LocalConcept> r = getScore(interp, similarities, features); |
192 | 193 | |
193 | 194 | if (r.getScore() >= minScore) { |
194 | 195 | rankings.add(r); |
— | — | @@ -205,27 +206,27 @@ |
206 | 207 | Collections.reverse(rankings); |
207 | 208 | |
208 | 209 | //TODO: if result is tight (less than 50% distance), use more popularity score! |
209 | | - Result r = rankings.get(0); |
| 210 | + Result<X, LocalConcept> r = rankings.get(0); |
210 | 211 | return r; |
211 | 212 | } |
212 | 213 | |
213 | | - protected List<Map<String, LocalConcept>> getInterpretations(List<String> terms, Map<String, List<LocalConcept>> meanings) { |
| 214 | + protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) { |
214 | 215 | if (terms.size()==0) { |
215 | | - return Collections.singletonList(Collections.<String, LocalConcept>emptyMap()); |
| 216 | + return Collections.singletonList(Collections.<X, LocalConcept>emptyMap()); |
216 | 217 | } |
217 | 218 | |
218 | | - String t = terms.get(0); |
219 | | - List<LocalConcept> m = meanings.get(t); |
| 219 | + X t = terms.get(0); |
| 220 | + List<? extends LocalConcept> m = meanings.get(t); |
220 | 221 | |
221 | | - List<Map<String, LocalConcept>> base = getInterpretations(terms.subList(1, terms.size()), meanings); |
| 222 | + List<Map<X, LocalConcept>> base = getInterpretations(terms.subList(1, terms.size()), meanings); |
222 | 223 | |
223 | 224 | if (m==null || m.size()==0) return base; |
224 | 225 | |
225 | | - List<Map<String, LocalConcept>> interpretations = new ArrayList<Map<String, LocalConcept>>(); |
| 226 | + List<Map<X, LocalConcept>> interpretations = new ArrayList<Map<X, LocalConcept>>(); |
226 | 227 | |
227 | | - for (Map<String, LocalConcept> be: base) { |
| 228 | + for (Map<X, LocalConcept> be: base) { |
228 | 229 | for (LocalConcept c: m) { |
229 | | - Map<String, LocalConcept> e = new HashMap<String, LocalConcept>(); |
| 230 | + Map<X, LocalConcept> e = new HashMap<X, LocalConcept>(); |
230 | 231 | e.putAll(be); |
231 | 232 | e.put(t, c); |
232 | 233 | |
— | — | @@ -237,7 +238,7 @@ |
238 | 239 | return interpretations; |
239 | 240 | } |
240 | 241 | |
241 | | - protected Result getScore(Map<String, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, K> features) throws PersistenceException { |
| 242 | + protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, Integer> features) throws PersistenceException { |
242 | 243 | double sim = 0; |
243 | 244 | double pop = 0; |
244 | 245 | |
— | — | @@ -259,8 +260,8 @@ |
260 | 261 | d = similarities.get(a, b); |
261 | 262 | } |
262 | 263 | else { |
263 | | - ConceptFeatures<LocalConcept, K> fa = features.getFeatures(a); |
264 | | - ConceptFeatures<LocalConcept, K> fb = features.getFeatures(b); |
| 264 | + ConceptFeatures<LocalConcept, Integer> fa = features.getFeatures(a); |
| 265 | + ConceptFeatures<LocalConcept, Integer> fb = features.getFeatures(b); |
265 | 266 | |
266 | 267 | //force relevance/cardinality to the figures from the meaning lookup |
267 | 268 | //not strictly necessary, but nice to keep it consistent. |
— | — | @@ -296,7 +297,7 @@ |
297 | 298 | double score = popf * popularityBias + simf * ( 1 - popularityBias ); |
298 | 299 | //double score = Math.sqrt( popf * simf ); //FIXME: functor! |
299 | 300 | |
300 | | - return new Result(interp, score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop); |
| 301 | + return new Result<X, LocalConcept>(interp, score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop); |
301 | 302 | } |
302 | | - |
| 303 | + |
303 | 304 | } |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Term.java |
— | — | @@ -0,0 +1,64 @@ |
| 2 | +package de.brightbyte.wikiword.disambig; |
| 3 | + |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Arrays; |
| 6 | +import java.util.List; |
| 7 | + |
| 8 | +import de.brightbyte.wikiword.model.TermReference; |
| 9 | + |
| 10 | +public class Term implements TermReference { |
| 11 | + |
| 12 | + private final String term; |
| 13 | + |
| 14 | + public Term(final String term) { |
| 15 | + super(); |
| 16 | + this.term = term; |
| 17 | + } |
| 18 | + |
| 19 | + public String getTerm() { |
| 20 | + return term; |
| 21 | + } |
| 22 | + |
| 23 | + public String toString() { |
| 24 | + return getTerm(); |
| 25 | + } |
| 26 | + |
| 27 | + @Override |
| 28 | + public int hashCode() { |
| 29 | + final int PRIME = 31; |
| 30 | + int result = 1; |
| 31 | + result = PRIME * result + ((term == null) ? 0 : term.hashCode()); |
| 32 | + return result; |
| 33 | + } |
| 34 | + |
| 35 | + @Override |
| 36 | + public boolean equals(Object obj) { |
| 37 | + if (this == obj) |
| 38 | + return true; |
| 39 | + if (obj == null) |
| 40 | + return false; |
| 41 | + if (getClass() != obj.getClass()) |
| 42 | + return false; |
| 43 | + final Term other = (Term) obj; |
| 44 | + if (term == null) { |
| 45 | + if (other.term != null) |
| 46 | + return false; |
| 47 | + } else if (!term.equals(other.term)) |
| 48 | + return false; |
| 49 | + return true; |
| 50 | + } |
| 51 | + |
| 52 | + public static List<Term> asTerms(String... terms) { |
| 53 | + return asTerms(Arrays.asList(terms)); |
| 54 | + } |
| 55 | + |
| 56 | + public static List<Term> asTerms(List<String> terms) { |
| 57 | + List<Term> tt = new ArrayList<Term>(); |
| 58 | + for (String t: terms) { |
| 59 | + tt.add(new Term(t)); |
| 60 | + } |
| 61 | + |
| 62 | + return tt; |
| 63 | + } |
| 64 | + |
| 65 | +} |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java |
— | — | @@ -5,23 +5,24 @@ |
6 | 6 | |
7 | 7 | import de.brightbyte.io.Output; |
8 | 8 | import de.brightbyte.util.PersistenceException; |
| 9 | +import de.brightbyte.wikiword.model.TermReference; |
9 | 10 | import de.brightbyte.wikiword.model.WikiWordConcept; |
10 | 11 | |
11 | | -public interface Disambiguator { |
| 12 | +public interface Disambiguator<T extends TermReference, C extends WikiWordConcept> { |
12 | 13 | |
13 | | - public static class Result implements Comparable { |
14 | | - private Map<String, ? extends WikiWordConcept> meanings; |
| 14 | + public static class Result<T extends TermReference, C extends WikiWordConcept> implements Comparable { |
| 15 | + private Map<? extends T, ? extends C> meanings; |
15 | 16 | private double score; |
16 | 17 | private String description; |
17 | 18 | |
18 | | - public Result(Map<String, ? extends WikiWordConcept> meanings, double score, String description) { |
| 19 | + public Result(Map<? extends T, ? extends C> meanings, double score, String description) { |
19 | 20 | super(); |
20 | 21 | this.meanings = meanings; |
21 | 22 | this.score = score; |
22 | 23 | this.description = description; |
23 | 24 | } |
24 | 25 | |
25 | | - public Map<String, ? extends WikiWordConcept> getMeanings() { |
| 26 | + public Map<? extends T, ? extends C> getMeanings() { |
26 | 27 | return meanings; |
27 | 28 | } |
28 | 29 | |
— | — | @@ -51,6 +52,6 @@ |
52 | 53 | |
53 | 54 | public void setTrace(Output trace); |
54 | 55 | |
55 | | - public Result disambiguate(List<String> terms) throws PersistenceException; |
| 56 | + public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException; |
56 | 57 | |
57 | 58 | } |
\ No newline at end of file |