r64386 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r64385 | r64386 | r64387 >
Date:11:46, 30 March 2010
Author:daniel
Status:deferred
Tags:
Comment:
make Disambiguator use generic key type, so it can be applied to PhraseOccurance objects
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureCache.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureFetcher.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredFeatureFetcher.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Term.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/TermRelatedness.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermMeaning.java (added) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermReference.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseLocalConceptStore.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/LocalConceptStore.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/LocalStatisticsStore.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/attic/ZLibBenchmark.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccurance.java (deleted) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccuranceSequence.java (deleted) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/StreamProcessorApp.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/attic/ZLibBenchmark.java
@@ -0,0 +1,167 @@
 2+package de.brightbyte.wikiword.builder;
 3+
 4+import java.io.BufferedReader;
 5+import java.io.File;
 6+import java.io.IOException;
 7+import java.io.InputStreamReader;
 8+import java.io.UnsupportedEncodingException;
 9+
 10+import com.jcraft.jzlib.JZlib;
 11+import com.jcraft.jzlib.ZStream;
 12+
 13+import de.brightbyte.data.ByteString;
 14+import de.brightbyte.io.ConsoleIO;
 15+import de.brightbyte.io.IOUtil;
 16+import de.brightbyte.util.StringUtils;
 17+
 18+public class ZLibBenchmark {
 19+ protected byte[] dictionary;
 20+
 21+ public ZLibBenchmark() {
 22+ }
 23+
 24+ public void setDictionary(byte[] data) {
 25+ this.dictionary = data;
 26+ }
 27+
 28+ protected int level = JZlib.Z_BEST_COMPRESSION;
 29+ protected int windowbits = 15;
 30+ protected int strategy = JZlib.Z_DEFAULT_STRATEGY;
 31+ protected int bufferSize = 32 * 1024;
 32+
 33+ public ByteString compress(byte[] data) {
 34+ int err;
 35+
 36+ int comprLen = bufferSize; //FIXME: data.length;
 37+
 38+ byte[] compr = new byte[comprLen];
 39+
 40+ ZStream c_stream = new ZStream();
 41+
 42+ err = c_stream.deflateInit(level, 15);
 43+ CHECK_ERR(c_stream, err, "deflateInit");
 44+
 45+ err = c_stream.deflateParams(level, strategy);
 46+ CHECK_ERR(c_stream, err, "deflateInit");
 47+
 48+ err = c_stream.deflateSetDictionary(dictionary, dictionary.length);
 49+ CHECK_ERR(c_stream, err, "deflateSetDictionary");
 50+
 51+ long dictId = c_stream.adler;
 52+
 53+ c_stream.next_out = compr;
 54+ c_stream.next_out_index = 0;
 55+ c_stream.avail_out = comprLen;
 56+
 57+ c_stream.next_in = data;
 58+ c_stream.next_in_index = 0;
 59+ c_stream.avail_in = data.length;
 60+
 61+ err = c_stream.deflate(JZlib.Z_FINISH);
 62+ //FIXME: JZlib.Z_STREAM_END expected, getting JZlib.Z_OK
 63+
 64+ if (err != JZlib.Z_STREAM_END && err != JZlib.Z_OK) {
 65+ throw new RuntimeException("deflate should report Z_STREAM_END, found "+err);
 66+ }
 67+ err = c_stream.deflateEnd();
 68+ CHECK_ERR(c_stream, err, "deflateEnd");
 69+
 70+ return new ByteString(compr, 0, c_stream.next_out_index);
 71+ }
 72+
 73+ public ByteString uncompress(byte[] data, boolean ignoreChecksumm) {
 74+ int uncomprLen = bufferSize; //FIXME: data.length * 10;
 75+ byte[] uncompr = new byte[uncomprLen];
 76+ ZStream d_stream = new ZStream();
 77+
 78+ d_stream.next_in = data;
 79+ d_stream.next_in_index = 0;
 80+ d_stream.avail_in = data.length;
 81+
 82+ int err = d_stream.inflateInit(windowbits);
 83+ CHECK_ERR(d_stream, err, "inflateInit");
 84+ d_stream.next_out = uncompr;
 85+ d_stream.next_out_index = 0;
 86+ d_stream.avail_out = uncomprLen;
 87+
 88+ while (true) {
 89+ err = d_stream.inflate(JZlib.Z_NO_FLUSH);
 90+ if (err == JZlib.Z_STREAM_END) {
 91+ break;
 92+ }
 93+ if (err == JZlib.Z_NEED_DICT) {
 94+ /*if ((int) d_stream.adler != (int) dictId) {
 95+ System.out.println("unexpected dictionary");
 96+ System.exit(1);
 97+ } */
 98+ err = d_stream.inflateSetDictionary(dictionary,
 99+ dictionary.length);
 100+ }
 101+
 102+ if (ignoreChecksumm && err==JZlib.Z_DATA_ERROR) break;
 103+ else CHECK_ERR(d_stream, err, "inflate with dict");
 104+ }
 105+
 106+ err = d_stream.inflateEnd();
 107+ CHECK_ERR(d_stream, err, "inflateEnd");
 108+
 109+ int j = 0;
 110+ for (; j < uncompr.length; j++)
 111+ if (uncompr[j] == 0)
 112+ break;
 113+
 114+ return new ByteString(uncompr, 0, d_stream.next_out_index);
 115+ }
 116+
 117+ public ByteString getPrefix(ByteString b) {
 118+ return b.subString(0, 6);
 119+ }
 120+
 121+ public ByteString strip(ByteString b) {
 122+ return b.subString(6, b.length()-5);
 123+ }
 124+
 125+ public ByteString pad(ByteString prefix, ByteString b, ByteString suffix) {
 126+ return ByteString.concat(prefix, b, suffix);
 127+ }
 128+
 129+ public static void main(String[] args) throws IOException {
 130+
 131+ String d = args[0];
 132+ String denc = "UTF-8";
 133+ String enc = "UTF-8";
 134+
 135+ String dict = IOUtil.slurp(new File(d), denc);
 136+
 137+ ZLibBenchmark app = new ZLibBenchmark();
 138+ app.setDictionary(dict.getBytes(enc));
 139+
 140+ ByteString b = app.compress("dummy".getBytes());
 141+ ByteString prefix = app.getPrefix(b);
 142+ ByteString suffix = new ByteString( new byte[] {0, 0, 0, 0, 0} );
 143+
 144+ BufferedReader r = new BufferedReader( new InputStreamReader( System.in ));
 145+ String s;
 146+ while ((s = r.readLine()) != null) {
 147+ s = s.trim();
 148+ if (s.length()==0) continue;
 149+
 150+ byte[] data = s.getBytes(enc);
 151+ System.out.println("UTF-16: "+s.length()*2+" bytes");
 152+ System.out.println(enc+": "+data.length+" bytes: "+StringUtils.hex(data));
 153+ b = app.compress(data);
 154+ System.out.println("compressed: "+b.length()+" bytes: "+b.toString());
 155+ b = app.strip(b);
 156+ System.out.println("stripped: "+b.length()+" bytes: "+b.toString());
 157+ b = app.pad(prefix, b, suffix);
 158+ b = app.uncompress(b.getBytes(), true);
 159+ System.out.println("uncompressed: "+b.length()+" bytes, "+new String(b.getBytes(), enc));
 160+ }
 161+ }
 162+
 163+ static void CHECK_ERR(ZStream z, int err, String msg) {
 164+ if (err != JZlib.Z_OK)
 165+ throw new RuntimeException(z.msg + "; code: " + err);
 166+ }
 167+
 168+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccuranceSequence.java
@@ -1,82 +0,0 @@
2 -package de.brightbyte.wikiword.analyzer;
3 -
4 -import java.util.AbstractList;
5 -import java.util.ArrayList;
6 -import java.util.Collections;
7 -import java.util.List;
8 -import java.util.RandomAccess;
9 -
10 -import de.brightbyte.data.filter.Filter;
11 -
12 -public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess {
13 -
14 - protected List<PhraseOccurance> phrases;
15 - protected String text;
16 -
17 - public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) {
18 - this.text = text;
19 -
20 - this.phrases = phrases;
21 - Collections.sort(this.phrases); //essential!
22 - }
23 -
24 - @Override
25 - public PhraseOccurance get(int index) {
26 - return phrases.get(index);
27 - }
28 -
29 - @Override
30 - public int size() {
31 - return phrases.size();
32 - }
33 -
34 - public String getText() {
35 - return text;
36 - }
37 -
38 - public List<PhraseOccurance> getPhrasesAt(int offs) {
39 - int i = 0;
40 - while (i<size()) {
41 - PhraseOccurance p = get(i);
42 - if (p.getOffset() >= offs) {
43 - offs = p.getOffset();
44 - break;
45 - }
46 -
47 - i++;
48 - }
49 -
50 - if (i>=size()) return null;
51 -
52 - int j = i;
53 - while (j<size()) {
54 - PhraseOccurance p = get(j);
55 - if (p.getOffset() > offs) break;
56 - j++;
57 - }
58 -
59 - return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first.
60 - }
61 -
62 - public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) {
63 - List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
64 -
65 - int i = 0;
66 -
67 - outer:
68 - while (i<size()) {
69 - List<PhraseOccurance> candidates = getPhrasesAt(i);
70 - if (candidates == null) break;
71 -
72 - for (PhraseOccurance p: candidates) {
73 - i = p.getEndOffset();
74 - if (filter.matches(p.getPhrase())) {
75 - phrases.add(p);
76 - continue outer;
77 - }
78 - }
79 - }
80 -
81 - return phrases;
82 - }
83 -}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccurance.java
@@ -1,98 +0,0 @@
2 -package de.brightbyte.wikiword.analyzer;
3 -
4 -import java.io.Serializable;
5 -
6 -public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance> {
7 -
8 - private static final long serialVersionUID = 241753475865301115L;
9 -
10 - protected String phrase;
11 - protected int weight;
12 - protected int offset;
13 - protected int length;
14 -
15 - public PhraseOccurance(String phrase, int weight, int offset, int length) {
16 - if (length <= 0) throw new IllegalArgumentException("bad length: "+length);
17 - if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string");
18 - //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string");
19 - if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string");
20 - if (length < phrase.length()) phrase = phrase.substring(offset, offset+length);
21 -
22 - this.phrase = phrase;
23 - this.weight = weight;
24 - this.offset = offset;
25 - this.length = length;
26 - }
27 -
28 - public int getLength() {
29 - return length;
30 - }
31 -
32 - public int getOffset() {
33 - return offset;
34 - }
35 -
36 - public int getEndOffset() {
37 - return getOffset() + getLength();
38 - }
39 -
40 - public String getPhrase() {
41 - return phrase;
42 - }
43 -
44 - public int getWeight() {
45 - return weight;
46 - }
47 -
48 - public String toString() {
49 - return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";
50 - }
51 -
52 - @Override
53 - public int hashCode() {
54 - final int PRIME = 31;
55 - int result = 1;
56 - result = PRIME * result + length;
57 - result = PRIME * result + offset;
58 - result = PRIME * result + ((phrase == null) ? 0 : phrase.hashCode());
59 - return result;
60 - }
61 -
62 - @Override
63 - public boolean equals(Object obj) {
64 - if (this == obj)
65 - return true;
66 - if (obj == null)
67 - return false;
68 - if (getClass() != obj.getClass())
69 - return false;
70 - final PhraseOccurance other = (PhraseOccurance) obj;
71 - if (length != other.length)
72 - return false;
73 - if (offset != other.offset)
74 - return false;
75 - if (phrase == null) {
76 - if (other.phrase != null)
77 - return false;
78 - } else if (!phrase.equals(other.phrase))
79 - return false;
80 - return true;
81 - }
82 -
83 - public boolean overlaps(PhraseOccurance other) {
84 - if (getEndOffset() <= other.getOffset()) return false;
85 - if (getOffset() >= other.getEndOffset()) return false;
86 -
87 - return true;
88 - }
89 -
90 - public int compareTo(PhraseOccurance other) {
91 - int o = getOffset() - other.getOffset();
92 - if (o!=0) return o; //by offset...
93 -
94 - int e = getEndOffset() - other.getEndOffset();
95 - if (e!=0) return -e; //but longest first!
96 -
97 - return 0;
98 - }
99 -}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java
@@ -5,6 +5,8 @@
66 import java.util.Iterator;
77 import java.util.regex.Matcher;
88
 9+import de.brightbyte.wikiword.model.PhraseOccurance;
 10+
911 public class PhraseAggregator {
1012 public class PhraseBuilder {
1113 protected StringBuilder phrase;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
@@ -20,6 +20,8 @@
2121 import de.brightbyte.io.ConsoleIO;
2222 import de.brightbyte.wikiword.Corpus;
2323 import de.brightbyte.wikiword.TweakSet;
 24+import de.brightbyte.wikiword.model.PhraseOccurance;
 25+import de.brightbyte.wikiword.model.PhraseOccuranceSequence;
2426
2527 public class PlainTextAnalyzer extends AbstractAnalyzer {
2628 private LanguageConfiguration config;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java
@@ -1,29 +1,31 @@
22 package de.brightbyte.wikiword.extract;
33
44 import java.io.IOException;
 5+import java.util.List;
56
6 -import sun.net.dns.ResolverConfiguration.Options;
7 -
87 import de.brightbyte.data.cursor.DataCursor;
98 import de.brightbyte.data.cursor.DataSink;
109 import de.brightbyte.io.ConsoleIO;
1110 import de.brightbyte.io.LineCursor;
1211 import de.brightbyte.io.OutputSink;
1312 import de.brightbyte.util.PersistenceException;
14 -import de.brightbyte.wikiword.analyzer.PhraseOccuranceSequence;
1513 import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer;
1614 import de.brightbyte.wikiword.disambig.Disambiguator;
1715 import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator;
1816 import de.brightbyte.wikiword.disambig.StoredFeatureFetcher;
1917 import de.brightbyte.wikiword.disambig.StoredMeaningFetcher;
 18+import de.brightbyte.wikiword.disambig.Disambiguator.Result;
2019 import de.brightbyte.wikiword.model.LocalConcept;
 20+import de.brightbyte.wikiword.model.PhraseOccurance;
 21+import de.brightbyte.wikiword.model.PhraseOccuranceSequence;
 22+import de.brightbyte.wikiword.model.TermReference;
2123 import de.brightbyte.wikiword.store.DatabaseConceptStores;
2224 import de.brightbyte.wikiword.store.FeatureStore;
2325 import de.brightbyte.wikiword.store.LocalConceptStore;
2426 import de.brightbyte.wikiword.store.WikiWordConceptStore;
2527
2628 public class WordSenseIndexer extends StreamProcessorApp<String, String, WikiWordConceptStore> {
27 - protected Disambiguator disambiguator;
 29+ protected Disambiguator<TermReference, LocalConcept> disambiguator;
2830 protected PlainTextAnalyzer analyzer;
2931 private int phraseLength;
3032
@@ -58,7 +60,7 @@
5961 protected void init() throws PersistenceException, InstantiationException {
6062 StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore());
6163 StoredFeatureFetcher<LocalConcept, Integer> featureFetcher = new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore());
62 - disambiguator = new SlidingCoherenceDisambiguator<Integer>( meaningFetcher, featureFetcher, true );
 64+ disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true );
6365
6466 analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(getCorpus(), tweaks);
6567
@@ -66,9 +68,11 @@
6769 }
6870
6971 @Override
70 - protected String process(String line) {
71 - PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength);
72 - return null;
 72+ protected String process(String line) throws PersistenceException {
 73+ PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first.
 74+ List<PhraseOccurance> phrases = sequence.getDisjointPhraseSequence(null);
 75+ Disambiguator.Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(phrases);
 76+ return result.toString(); //FIXME: annotate!
7377 }
7478
7579 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/StreamProcessorApp.java
@@ -49,6 +49,6 @@
5050 }
5151 }
5252
53 - protected abstract O process(I rec);
 53+ protected abstract O process(I rec) throws Exception;
5454
5555 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/LocalStatisticsStore.java
@@ -2,6 +2,7 @@
33
44 import de.brightbyte.data.cursor.DataSet;
55 import de.brightbyte.util.PersistenceException;
 6+import de.brightbyte.wikiword.model.TermMeaning;
67 import de.brightbyte.wikiword.model.TermReference;
78 import de.brightbyte.wikiword.model.WikiWordConcept;
89
@@ -9,11 +10,11 @@
1011
1112 public int getNumberOfTerms() throws PersistenceException;
1213
13 - public DataSet<TermReference> getAllTerms()
 14+ public DataSet<TermMeaning> getAllTerms()
1415 throws PersistenceException;
1516
1617 /**
17 - * Returns a TermReference for a random term from the top-n
 18+ * Returns a TermMeaning for a random term from the top-n
1819 * terms with repect to the frequency of occurance.
1920 * @param top the maximum rank of the terms to be returned. If top is 0,
2021 * any terms from the full range may be returned. If it is negative,
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/LocalConceptStore.java
@@ -3,6 +3,7 @@
44 import de.brightbyte.data.cursor.DataSet;
55 import de.brightbyte.util.PersistenceException;
66 import de.brightbyte.wikiword.model.LocalConcept;
 7+import de.brightbyte.wikiword.model.TermMeaning;
78 import de.brightbyte.wikiword.model.TermReference;
89 import de.brightbyte.wikiword.store.WikiWordConceptStore.ConceptQuerySpec;
910
@@ -16,7 +17,7 @@
1718
1819 public int getNumberOfTerms() throws PersistenceException;
1920
20 - public abstract DataSet<TermReference> getAllTerms() throws PersistenceException;
 21+ public abstract DataSet<TermMeaning> getAllTerms() throws PersistenceException;
2122 //public abstract DataSet<ResourceReference> getAllResources() throws PersistenceException;
2223
2324 //public abstract DataSet<LocalConcept> getLocalConcepts(DataSet<LocalConceptReference> refs) throws PersistenceException ;
@@ -24,7 +25,7 @@
2526 //public abstract LocalConcept getLocalConcept(int id) throws PersistenceException ;
2627
2728 /**
28 - * Returns a TermReference for a random term from the top-n
 29+ * Returns a TermMeaning for a random term from the top-n
2930 * terms with repect to the frequency of occurance.
3031 * @param top the maximum rank of the terms to be returned. If top is 0,
3132 * any terms from the full range may be returned. If it is negative,
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseLocalConceptStore.java
@@ -25,6 +25,7 @@
2626 import de.brightbyte.wikiword.TweakSet;
2727 import de.brightbyte.wikiword.model.ConceptRelations;
2828 import de.brightbyte.wikiword.model.LocalConcept;
 29+import de.brightbyte.wikiword.model.TermMeaning;
2930 import de.brightbyte.wikiword.model.TermReference;
3031 import de.brightbyte.wikiword.model.WikiWordResource;
3132 import de.brightbyte.wikiword.schema.ConceptInfoStoreSchema;
@@ -167,7 +168,7 @@
168169 return ((LocalStatisticsStore<LocalConcept>)getStatisticsStore()).pickRandomTerm(top);
169170 }
170171
171 - public DataSet<TermReference> getAllTerms() throws PersistenceException {
 172+ public DataSet<TermMeaning> getAllTerms() throws PersistenceException {
172173 return ((LocalStatisticsStore<LocalConcept>)getStatisticsStore()).getAllTerms();
173174 }
174175
@@ -177,20 +178,20 @@
178179
179180 /////////////////////////////////////////////////////////////////////////////////////////////
180181
181 - protected final DatabaseDataSet.Factory<TermReference> termFactory = new DatabaseDataSet.Factory<TermReference>() {
182 - public TermReference newInstance(ResultSet row) throws SQLException, PersistenceException {
 182+ protected final DatabaseDataSet.Factory<TermMeaning> termFactory = new DatabaseDataSet.Factory<TermMeaning>() {
 183+ public TermMeaning newInstance(ResultSet row) throws SQLException, PersistenceException {
183184 return newTerm(row);
184185 }
185186 };
186187
187 - protected TermReference newTerm(ResultSet row) throws SQLException, PersistenceException {
 188+ protected TermMeaning newTerm(ResultSet row) throws SQLException, PersistenceException {
188189 int id = row.getInt("id");
189190 String name = asString(row.getObject("name"));
190191 int card = row.getInt("cardinality");
191192 double relevance = row.getInt("relevance");
192193
193194 LocalConcept concept = newConcept(id, name, null, card, relevance);
194 - return new TermReference(name, concept, relevance);
 195+ return new TermMeaning(name, concept, relevance);
195196 }
196197
197198 protected class DatabaseLocalStatisticsStore extends DatabaseStatisticsStore implements LocalStatisticsStore<LocalConcept> {
@@ -202,10 +203,10 @@
203204 termTable = (EntityTable)database.getTable("term");
204205 }
205206
206 - public DataSet<TermReference> getAllTerms() throws PersistenceException {
 207+ public DataSet<TermMeaning> getAllTerms() throws PersistenceException {
207208 try {
208209 String sql = "SELECT rank as id, term, freq as cardinality, -1 as relevance FROM "+termTable.getSQLName()+" as T";
209 - return new ChunkedQueryDataSet<TermReference>(database, termFactory, "getAllTerms", "query", sql, null, null, termTable, "rank", queryChunkSize);
 210+ return new ChunkedQueryDataSet<TermMeaning>(database, termFactory, "getAllTerms", "query", sql, null, null, termTable, "rank", queryChunkSize);
210211 } catch (SQLException e) {
211212 throw new PersistenceException(e);
212213 }
@@ -355,7 +356,7 @@
356357 }
357358
358359 if (spec!=null && spec.getIncludeTerms()) {
359 - TermReference[] terms = TermReference.parseList( asString(m.get("dTerms")), getConceptFactory(), ((ConceptInfoStoreSchema)database).termReferenceListEntry );
 360+ TermReference[] terms = TermMeaning.parseList( asString(m.get("dTerms")), getConceptFactory(), ((ConceptInfoStoreSchema)database).termReferenceListEntry );
360361 concept.setTerms(terms);
361362 }
362363
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java
@@ -0,0 +1,102 @@
 2+package de.brightbyte.wikiword.model;
 3+
 4+import java.io.Serializable;
 5+
 6+public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance>, TermReference {
 7+
 8+ private static final long serialVersionUID = 241753475865301115L;
 9+
 10+ protected String phrase;
 11+ protected int weight;
 12+ protected int offset;
 13+ protected int length;
 14+
 15+ public PhraseOccurance(String phrase, int weight, int offset, int length) {
 16+ if (length <= 0) throw new IllegalArgumentException("bad length: "+length);
 17+ if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string");
 18+ //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string");
 19+ if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string");
 20+ if (length < phrase.length()) phrase = phrase.substring(offset, offset+length);
 21+
 22+ this.phrase = phrase;
 23+ this.weight = weight;
 24+ this.offset = offset;
 25+ this.length = length;
 26+ }
 27+
 28+ public int getLength() {
 29+ return length;
 30+ }
 31+
 32+ public int getOffset() {
 33+ return offset;
 34+ }
 35+
 36+ public int getEndOffset() {
 37+ return getOffset() + getLength();
 38+ }
 39+
 40+ public String getPhrase() {
 41+ return phrase;
 42+ }
 43+
 44+ public String getTerm() {
 45+ return getPhrase();
 46+ }
 47+
 48+ public int getWeight() {
 49+ return weight;
 50+ }
 51+
 52+ public String toString() {
 53+ return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";
 54+ }
 55+
 56+ @Override
 57+ public int hashCode() {
 58+ final int PRIME = 31;
 59+ int result = 1;
 60+ result = PRIME * result + length;
 61+ result = PRIME * result + offset;
 62+ result = PRIME * result + ((phrase == null) ? 0 : phrase.hashCode());
 63+ return result;
 64+ }
 65+
 66+ @Override
 67+ public boolean equals(Object obj) {
 68+ if (this == obj)
 69+ return true;
 70+ if (obj == null)
 71+ return false;
 72+ if (getClass() != obj.getClass())
 73+ return false;
 74+ final PhraseOccurance other = (PhraseOccurance) obj;
 75+ if (length != other.length)
 76+ return false;
 77+ if (offset != other.offset)
 78+ return false;
 79+ if (phrase == null) {
 80+ if (other.phrase != null)
 81+ return false;
 82+ } else if (!phrase.equals(other.phrase))
 83+ return false;
 84+ return true;
 85+ }
 86+
 87+ public boolean overlaps(PhraseOccurance other) {
 88+ if (getEndOffset() <= other.getOffset()) return false;
 89+ if (getOffset() >= other.getEndOffset()) return false;
 90+
 91+ return true;
 92+ }
 93+
 94+ public int compareTo(PhraseOccurance other) {
 95+ int o = getOffset() - other.getOffset();
 96+ if (o!=0) return o; //by offset...
 97+
 98+ int e = getEndOffset() - other.getEndOffset();
 99+ if (e!=0) return -e; //but longest first!
 100+
 101+ return 0;
 102+ }
 103+}
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java
___________________________________________________________________
Added: svn:mergeinfo
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermMeaning.java
@@ -0,0 +1,93 @@
 2+package de.brightbyte.wikiword.model;
 3+
 4+import java.io.Serializable;
 5+
 6+import de.brightbyte.util.PersistenceException;
 7+import de.brightbyte.wikiword.model.WikiWordConcept.Factory;
 8+import de.brightbyte.wikiword.schema.ConceptInfoStoreSchema.ConceptListEntrySpec;
 9+
 10+
 11+
 12+public class TermMeaning implements TermReference, Serializable {
 13+
 14+ private String term;
 15+ private double score;
 16+ private WikiWordConcept concept;
 17+
 18+ public TermMeaning(String term, WikiWordConcept concept, double score) {
 19+ this.term = term;
 20+ this.concept = concept;
 21+ this.score = score;
 22+ }
 23+
 24+ public WikiWordConcept getConcept() {
 25+ return concept;
 26+ }
 27+
 28+ public double getScore() {
 29+ return score;
 30+ }
 31+
 32+ public String getTerm() {
 33+ return term;
 34+ }
 35+
 36+ public String toString() {
 37+ return "\""+term+"\" -> "+getConcept();
 38+ }
 39+
 40+ /*
 41+ public static TermMeaning[] parseList(String s, WikiWordConcept.ListFormatSpec spec, WikiWordConcept.Factory factory) {
 42+ return WikiWordConcept.parseList(s, factory, spec);
 43+ }
 44+ */
 45+
 46+ @Override
 47+ public int hashCode() {
 48+ final int PRIME = 31;
 49+ int result = 1;
 50+ result = PRIME * result + ((concept == null) ? 0 : concept.hashCode());
 51+ result = PRIME * result + ((term == null) ? 0 : term.hashCode());
 52+ return result;
 53+ }
 54+
 55+ @Override
 56+ public boolean equals(Object obj) {
 57+ if (this == obj)
 58+ return true;
 59+ if (obj == null)
 60+ return false;
 61+ if (getClass() != obj.getClass())
 62+ return false;
 63+ final TermMeaning other = (TermMeaning) obj;
 64+ if (concept == null) {
 65+ if (other.concept != null)
 66+ return false;
 67+ } else if (!concept.equals(other.concept))
 68+ return false;
 69+ if (term == null) {
 70+ if (other.term != null)
 71+ return false;
 72+ } else if (!term.equals(other.term))
 73+ return false;
 74+ return true;
 75+ }
 76+
 77+ public static TermReference[] parseList(String s, Factory<LocalConcept> factory, ConceptListEntrySpec spec) throws PersistenceException {
 78+ LocalConcept[] concepts = WikiWordConcept.parseList(s, factory, spec); //XXX: this is a terrible, terrible hack.
 79+ TermReference[] terms = new TermReference[concepts.length];
 80+
 81+ for (int i=0; i<terms.length; i++) {
 82+ WikiWordConcept dummy = concepts[i];
 83+
 84+ String term = dummy.getName(); //UGHA!
 85+ double score = dummy.getCardinality();
 86+
 87+ WikiWordConcept target = factory.newInstance(dummy.getId(), null, dummy.getType());
 88+ terms[i] = new TermMeaning(term, target, score);
 89+ }
 90+
 91+ return terms;
 92+ }
 93+
 94+}
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermMeaning.java
___________________________________________________________________
Added: svn:mergeinfo
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/TermReference.java
@@ -1,91 +1,7 @@
22 package de.brightbyte.wikiword.model;
33
4 -import de.brightbyte.util.PersistenceException;
5 -import de.brightbyte.wikiword.model.WikiWordConcept.Factory;
6 -import de.brightbyte.wikiword.schema.ConceptInfoStoreSchema.ConceptListEntrySpec;
 4+public interface TermReference {
75
 6+ public String getTerm();
87
9 -
10 -public class TermReference {
11 -
12 - private String term;
13 - private double score;
14 - private WikiWordConcept concept;
15 -
16 - public TermReference(String term, WikiWordConcept concept, double score) {
17 - this.term = term;
18 - this.concept = concept;
19 - this.score = score;
20 - }
21 -
22 - public WikiWordConcept getConcept() {
23 - return concept;
24 - }
25 -
26 - public double getScore() {
27 - return score;
28 - }
29 -
30 - public String getTerm() {
31 - return term;
32 - }
33 -
34 - public String toString() {
35 - return "\""+term+"\" -> "+getConcept();
36 - }
37 -
38 - /*
39 - public static TermReference[] parseList(String s, WikiWordConcept.ListFormatSpec spec, WikiWordConcept.Factory factory) {
40 - return WikiWordConcept.parseList(s, factory, spec);
41 - }
42 - */
43 -
44 - @Override
45 - public int hashCode() {
46 - final int PRIME = 31;
47 - int result = 1;
48 - result = PRIME * result + ((concept == null) ? 0 : concept.hashCode());
49 - result = PRIME * result + ((term == null) ? 0 : term.hashCode());
50 - return result;
51 - }
52 -
53 - @Override
54 - public boolean equals(Object obj) {
55 - if (this == obj)
56 - return true;
57 - if (obj == null)
58 - return false;
59 - if (getClass() != obj.getClass())
60 - return false;
61 - final TermReference other = (TermReference) obj;
62 - if (concept == null) {
63 - if (other.concept != null)
64 - return false;
65 - } else if (!concept.equals(other.concept))
66 - return false;
67 - if (term == null) {
68 - if (other.term != null)
69 - return false;
70 - } else if (!term.equals(other.term))
71 - return false;
72 - return true;
73 - }
74 -
75 - public static TermReference[] parseList(String s, Factory<LocalConcept> factory, ConceptListEntrySpec spec) throws PersistenceException {
76 - LocalConcept[] concepts = WikiWordConcept.parseList(s, factory, spec); //XXX: this is a terrible, terrible hack.
77 - TermReference[] terms = new TermReference[concepts.length];
78 -
79 - for (int i=0; i<terms.length; i++) {
80 - WikiWordConcept dummy = concepts[i];
81 -
82 - String term = dummy.getName(); //UGHA!
83 - double score = dummy.getCardinality();
84 -
85 - WikiWordConcept target = factory.newInstance(dummy.getId(), null, dummy.getType());
86 - terms[i] = new TermReference(term, target, score);
87 - }
88 -
89 - return terms;
90 - }
91 -
92 -}
 8+}
\ No newline at end of file
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java
@@ -0,0 +1,82 @@
 2+package de.brightbyte.wikiword.model;
 3+
 4+import java.util.AbstractList;
 5+import java.util.ArrayList;
 6+import java.util.Collections;
 7+import java.util.List;
 8+import java.util.RandomAccess;
 9+
 10+import de.brightbyte.data.filter.Filter;
 11+
 12+public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess {
 13+
 14+ protected List<PhraseOccurance> phrases;
 15+ protected String text;
 16+
 17+ public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) {
 18+ this.text = text;
 19+
 20+ this.phrases = phrases;
 21+ Collections.sort(this.phrases); //essential!
 22+ }
 23+
 24+ @Override
 25+ public PhraseOccurance get(int index) {
 26+ return phrases.get(index);
 27+ }
 28+
 29+ @Override
 30+ public int size() {
 31+ return phrases.size();
 32+ }
 33+
 34+ public String getText() {
 35+ return text;
 36+ }
 37+
 38+ public List<PhraseOccurance> getPhrasesAt(int offs) {
 39+ int i = 0;
 40+ while (i<size()) {
 41+ PhraseOccurance p = get(i);
 42+ if (p.getOffset() >= offs) {
 43+ offs = p.getOffset();
 44+ break;
 45+ }
 46+
 47+ i++;
 48+ }
 49+
 50+ if (i>=size()) return null;
 51+
 52+ int j = i;
 53+ while (j<size()) {
 54+ PhraseOccurance p = get(j);
 55+ if (p.getOffset() > offs) break;
 56+ j++;
 57+ }
 58+
 59+ return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first.
 60+ }
 61+
 62+ public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) {
 63+ List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
 64+
 65+ int i = 0;
 66+
 67+ outer:
 68+ while (i<size()) {
 69+ List<PhraseOccurance> candidates = getPhrasesAt(i);
 70+ if (candidates == null) break;
 71+
 72+ for (PhraseOccurance p: candidates) {
 73+ i = p.getEndOffset();
 74+ if (filter.matches(p.getPhrase())) {
 75+ phrases.add(p);
 76+ continue outer;
 77+ }
 78+ }
 79+ }
 80+
 81+ return phrases;
 82+ }
 83+}
Property changes on: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSequence.java
___________________________________________________________________
Added: svn:mergeinfo
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/query/QueryConsole.java
@@ -415,7 +415,7 @@
416416 if (disambiguator==null) {
417417 StoredMeaningFetcher meaningFetcher = new StoredMeaningFetcher(getLocalConceptStore());
418418 StoredFeatureFetcher<LocalConcept, Integer> featureFetcher = new StoredFeatureFetcher<LocalConcept, Integer>(getFeatureStore());
419 - disambiguator = new SlidingCoherenceDisambiguator<Integer>( meaningFetcher, featureFetcher, true );
 419+ disambiguator = new SlidingCoherenceDisambiguator( meaningFetcher, featureFetcher, true );
420420
421421 LeveledOutput.Trace trace = new LeveledOutput.Trace(out);
422422 meaningFetcher.setTrace(trace);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureFetcher.java
@@ -1,7 +1,6 @@
22 package de.brightbyte.wikiword.disambig;
33
44 import java.util.Collection;
5 -import java.util.List;
65 import java.util.Map;
76
87 import de.brightbyte.util.PersistenceException;
@@ -10,5 +9,5 @@
1110
1211 public interface FeatureFetcher<C extends WikiWordConcept, K> {
1312 public ConceptFeatures<C, K> getFeatures(C c) throws PersistenceException;
14 - public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<C> c) throws PersistenceException;
 13+ public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<? extends C> c) throws PersistenceException;
1514 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredFeatureFetcher.java
@@ -24,7 +24,7 @@
2525 return store.getConceptFeatures(c.getId());
2626 }
2727
28 - public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<C> concepts) throws PersistenceException {
 28+ public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<? extends C> concepts) throws PersistenceException {
2929 trace("fetching features for "+concepts);
3030
3131 int[] ids = new int[concepts.size()];
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
@@ -6,35 +6,36 @@
77
88 import de.brightbyte.io.Output;
99 import de.brightbyte.util.PersistenceException;
10 -import de.brightbyte.wikiword.model.LocalConcept;
 10+import de.brightbyte.wikiword.model.TermReference;
 11+import de.brightbyte.wikiword.model.WikiWordConcept;
1112
12 -public abstract class AbstractDisambiguator implements Disambiguator {
 13+public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> {
1314
14 - protected MeaningFetcher<LocalConcept> meaningFetcher;
 15+ protected MeaningFetcher<? extends C> meaningFetcher;
1516 protected Output trace;
1617
17 - public AbstractDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher) {
 18+ public AbstractDisambiguator(MeaningFetcher<? extends C> meaningFetcher) {
1819 if (meaningFetcher==null) throw new NullPointerException();
1920 this.meaningFetcher = meaningFetcher;
2021 }
2122
22 - protected Map<String, List<LocalConcept>> fetchMeanings(List<String> terms) throws PersistenceException {
23 - Map<String, List<LocalConcept>> meanings = new HashMap<String, List<LocalConcept>>();
 23+ protected <X extends T>Map<X, List<? extends C>> fetchMeanings(List<X> terms) throws PersistenceException {
 24+ Map<X, List<? extends C>> meanings = new HashMap<X, List<? extends C>>();
2425
25 - for (String t: terms) {
26 - List<LocalConcept> m = meaningFetcher.getMeanings(t);
 26+ for (X t: terms) {
 27+ List<? extends C> m = meaningFetcher.getMeanings(t.getTerm());
2728 if (m!=null && m.size()>0) meanings.put(t, m);
2829 }
2930
3031 return meanings;
3132 }
3233
33 - public Result disambiguate(List<String> terms) throws PersistenceException {
34 - Map<String, List<LocalConcept>> meanings = fetchMeanings(terms);
 34+ public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException {
 35+ Map<X, List<? extends C>> meanings = fetchMeanings(terms);
3536 return disambiguate(terms, meanings);
3637 }
3738
38 - public abstract Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException;
 39+ public abstract <X extends T>Result<X, C> disambiguate(List<X> terms, Map<X, List<? extends C>> meanings) throws PersistenceException;
3940
4041 public Output getTrace() {
4142 return trace;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/FeatureCache.java
@@ -34,7 +34,7 @@
3535 return f;
3636 }
3737
38 - public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<C> concepts) throws PersistenceException {
 38+ public Map<Integer, ConceptFeatures<C, K>> getFeatures(Collection<? extends C> concepts) throws PersistenceException {
3939 Map<Integer, ConceptFeatures<C, K>> features = new HashMap<Integer, ConceptFeatures<C, K>> ();
4040 List<C> todo = new ArrayList<C>(concepts.size());
4141 for (C c: concepts) {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/SlidingCoherenceDisambiguator.java
@@ -15,19 +15,20 @@
1616 import de.brightbyte.data.measure.Similarity;
1717 import de.brightbyte.util.PersistenceException;
1818 import de.brightbyte.wikiword.model.LocalConcept;
 19+import de.brightbyte.wikiword.model.TermReference;
1920 import de.brightbyte.wikiword.model.WikiWordConcept;
2021
21 -public class SlidingCoherenceDisambiguator<K> extends CoherenceDisambiguator<K> {
 22+public class SlidingCoherenceDisambiguator extends CoherenceDisambiguator {
2223
2324 protected int window ;
2425
25 - public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, boolean featuresAreNormalized) {
 26+ public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) {
2627 this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality,
27 - featuresAreNormalized ? ScalarVectorSimilarity.<K>getInstance() : CosineVectorSimilarity.<K>getInstance(), //if pre-normalized, use scalar to calc cosin
 28+ featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance(), //if pre-normalized, use scalar to calc cosin
2829 5);
2930 }
3031
31 - public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<K>> sim, int window) {
 32+ public SlidingCoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<Integer>> sim, int window) {
3233 super(meaningFetcher, featureFetcher, popularityMeasure, sim);
3334
3435 this.window = window;
@@ -36,7 +37,7 @@
3738 /* (non-Javadoc)
3839 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
3940 */
40 - public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException {
 41+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException {
4142 if (window < 2 || terms.size()<2 || meanings.size()<2)
4243 return popularityDisambiguator.disambiguate(terms, meanings);
4344
@@ -47,10 +48,10 @@
4849
4950 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
5051
51 - Map<String, LocalConcept> disambig = new HashMap<String, LocalConcept>(meanings.size());
 52+ Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>(meanings.size());
5253
5354 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
54 - FeatureCache<LocalConcept, K> features = getFeatureCache(meanings);
 55+ FeatureCache<LocalConcept, Integer> features = getFeatureCache(meanings);
5556
5657 for (int i= window; ; i++) {
5758 int from = i-window;
@@ -64,12 +65,12 @@
6566 if (to-from < 2) {
6667 r = popularityDisambiguator.disambiguate(terms.subList(from, to), meanings);
6768 } else {
68 - List<Map<String, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings);
 69+ List<Map<X, LocalConcept>> interpretations = getInterpretations(from, to, terms, disambig, meanings);
6970 r = getBestInterpretation(terms, meanings, interpretations, similarities, features);
7071 }
7172
7273 for (int j=from; j<to; j++) {
73 - String t = terms.get(j);
 74+ X t = terms.get(j);
7475 if (disambig.containsKey(t)) continue;
7576
7677 LocalConcept m;
@@ -84,23 +85,23 @@
8586 return getScore(disambig, similarities, features); //FIXME: this is unnecessarily expensive, we usually don't need the scores this calculates.
8687 }
8788
88 - protected List<Map<String, LocalConcept>> getInterpretations(int from, int to, List<String> terms, Map<String, LocalConcept> known, Map<String, List<LocalConcept>> meanings) {
 89+ protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(int from, int to, List<X> terms, Map<X, ? extends LocalConcept> known, Map<? extends TermReference, List<? extends LocalConcept>> meanings) {
8990 //strip out all terms with no known meaning
9091 if (meanings.keySet().size() != terms.size()) {
91 - List<String> t = new ArrayList<String>(terms.size());
 92+ List<X> t = new ArrayList<X>(terms.size());
9293 t.addAll(terms);
9394 t.retainAll(meanings.keySet());
9495 terms = t;
9596 }
9697
97 - Map<String, List<LocalConcept>> mset = new HashMap<String, List<LocalConcept>>();
 98+ Map<X, List<? extends LocalConcept>> mset = new HashMap<X, List<? extends LocalConcept>>();
9899
99100 if (to>terms.size()) to = terms.size();
100101
101102 for (int i=from; i<to; i++) {
102 - List<LocalConcept> m;
 103+ List<? extends LocalConcept> m;
103104
104 - String t = terms.get(i);
 105+ X t = terms.get(i);
105106 LocalConcept c = known.get(t);
106107
107108 if (c!=null) m = Collections.singletonList(c);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/TermRelatedness.java
@@ -1,10 +1,9 @@
22 package de.brightbyte.wikiword.disambig;
33
4 -import java.util.Arrays;
5 -
64 import de.brightbyte.data.measure.Similarity;
75 import de.brightbyte.util.PersistenceException;
86 import de.brightbyte.util.UncheckedPersistenceException;
 7+import de.brightbyte.wikiword.model.TermReference;
98 import de.brightbyte.wikiword.model.WikiWordConcept;
109
1110 public class TermRelatedness implements Similarity<String> {
@@ -28,13 +27,13 @@
2928 }
3029
3130 protected Similarity<WikiWordConcept> relatedness;
32 - protected Disambiguator disambig;
 31+ protected Disambiguator<TermReference, ? extends WikiWordConcept> disambig;
3332
34 - public TermRelatedness(Disambiguator disambig) {
 33+ public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig) {
3534 this(disambig, null);
3635 }
3736
38 - public TermRelatedness(Disambiguator disambig, Similarity<WikiWordConcept> relatedness) {
 37+ public TermRelatedness(Disambiguator<TermReference, ? extends WikiWordConcept> disambig, Similarity<WikiWordConcept> relatedness) {
3938 this.relatedness = relatedness;
4039 this.disambig = disambig;
4140 }
@@ -48,7 +47,7 @@
4948
5049 public Relatedness relatedness(String a, String b) {
5150 try {
52 - Disambiguator.Result r = disambig.disambiguate(Arrays.asList(new String[] {a, b}));
 51+ Disambiguator.Result<Term, ? extends WikiWordConcept> r = disambig.<Term>disambiguate(Term.asTerms(a, b));
5352 if (r==null || r.getMeanings().size()!=2) return null;
5453
5554 double d;
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
@@ -8,9 +8,10 @@
99 import de.brightbyte.data.measure.Measure;
1010 import de.brightbyte.data.measure.Measure.Comparator;
1111 import de.brightbyte.wikiword.model.LocalConcept;
 12+import de.brightbyte.wikiword.model.TermReference;
1213 import de.brightbyte.wikiword.model.WikiWordConcept;
1314
14 -public class PopularityDisambiguator extends AbstractDisambiguator {
 15+public class PopularityDisambiguator extends AbstractDisambiguator<TermReference, LocalConcept> {
1516
1617 protected Measure<WikiWordConcept> popularityMeasure;
1718 protected Comparator<WikiWordConcept> popularityComparator;
@@ -26,11 +27,11 @@
2728 this.popularityComparator = new Measure.Comparator<WikiWordConcept>(popularityMeasure, true);
2829 }
2930
30 - public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) {
31 - Map<String, LocalConcept> disambig = new HashMap<String, LocalConcept>();
 31+ public <X extends TermReference>Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) {
 32+ Map<X, LocalConcept> disambig = new HashMap<X, LocalConcept>();
3233 int pop = 0;
33 - for (String t: terms) {
34 - List<LocalConcept> m = meanings.get(t);
 34+ for (X t: terms) {
 35+ List<? extends LocalConcept> m = meanings.get(t);
3536 if (m==null || m.size()==0) continue;
3637
3738 if (m.size()>0) Collections.sort(m, popularityComparator);
@@ -43,7 +44,7 @@
4445
4546 pop = pop / disambig.size();
4647
47 - Result r = new Result(disambig, pop, "pop="+pop);
 48+ Result<X, LocalConcept> r = new Result<X, LocalConcept>(disambig, pop, "pop="+pop);
4849 return r;
4950 }
5051
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
@@ -19,9 +19,10 @@
2020 import de.brightbyte.util.PersistenceException;
2121 import de.brightbyte.wikiword.model.ConceptFeatures;
2222 import de.brightbyte.wikiword.model.LocalConcept;
 23+import de.brightbyte.wikiword.model.TermReference;
2324 import de.brightbyte.wikiword.model.WikiWordConcept;
2425
25 -public class CoherenceDisambiguator<K> extends AbstractDisambiguator {
 26+public class CoherenceDisambiguator extends AbstractDisambiguator<TermReference, LocalConcept> {
2627
2728 protected int minPopularity = 2; //FIXME: use complex cutoff specifier!
2829 protected int maxMeanings = 8; //FIXME: magic...
@@ -29,8 +30,8 @@
3031 protected double minScore = 0.1; //FIXME: magic number. should "somehow" match popularityFactor and similarityFactor
3132 protected double popularityBias = 0.2; //FIXME: magic number. should "somehow" match popularityFactor and similarityFactor
3233
33 - protected Similarity<LabeledVector<K>> similarityMeasure;
34 - protected FeatureFetcher<LocalConcept, K> featureFetcher;
 34+ protected Similarity<LabeledVector<Integer>> similarityMeasure;
 35+ protected FeatureFetcher<LocalConcept, Integer> featureFetcher;
3536 protected Measure<WikiWordConcept> popularityMeasure;
3637 protected PopularityDisambiguator popularityDisambiguator;
3738
@@ -48,12 +49,12 @@
4950 }
5051 };
5152
52 - public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, boolean featuresAreNormalized) {
 53+ public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, boolean featuresAreNormalized) {
5354 this(meaningFetcher, featureFetcher, WikiWordConcept.theCardinality,
54 - featuresAreNormalized ? ScalarVectorSimilarity.<K>getInstance() : CosineVectorSimilarity.<K>getInstance()); //if pre-normalized, use scalar to calc cosin
 55+ featuresAreNormalized ? ScalarVectorSimilarity.<Integer>getInstance() : CosineVectorSimilarity.<Integer>getInstance()); //if pre-normalized, use scalar to calc cosin
5556 }
5657
57 - public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, K> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<K>> sim) {
 58+ public CoherenceDisambiguator(MeaningFetcher<LocalConcept> meaningFetcher, FeatureFetcher<LocalConcept, Integer> featureFetcher, Measure<WikiWordConcept> popularityMeasure, Similarity<LabeledVector<Integer>> sim) {
5859 super(meaningFetcher);
5960
6061 if (popularityMeasure==null) throw new NullPointerException();
@@ -69,16 +70,16 @@
7071 return featureFetcher;
7172 }
7273
73 - public void setFeatureFetcher(FeatureFetcher<LocalConcept, K> featureFetcher) {
 74+ public void setFeatureFetcher(FeatureFetcher<LocalConcept, Integer> featureFetcher) {
7475 this.featureFetcher = featureFetcher;
7576 }
7677
77 - public Similarity<LabeledVector<K>> getSimilarityMeasure() {
 78+ public Similarity<LabeledVector<Integer>> getSimilarityMeasure() {
7879 return similarityMeasure;
7980 }
8081
8182 public void setSimilarityMeasure(
82 - Similarity<LabeledVector<K>> similarityMeasure) {
 83+ Similarity<LabeledVector<Integer>> similarityMeasure) {
8384 if (similarityMeasure==null) throw new NullPointerException();
8485 this.similarityMeasure = similarityMeasure;
8586 }
@@ -115,13 +116,13 @@
116117 this.maxMeanings = maxMeanings;
117118 }
118119
119 - protected FeatureCache<LocalConcept, K> getFeatureCache(Map<String, List<LocalConcept>> meanings) throws PersistenceException {
 120+ protected FeatureCache<LocalConcept, Integer> getFeatureCache(Map<? extends TermReference, List<? extends LocalConcept>> meanings) throws PersistenceException {
120121 //TODO: keep a chain of n caches, resulting in LRU logic.
121 - FeatureCache<LocalConcept, K> features = new FeatureCache<LocalConcept, K>(featureFetcher);
 122+ FeatureCache<LocalConcept, Integer> features = new FeatureCache<LocalConcept, Integer>(featureFetcher);
122123
123124 //NOTE: pre-fetch all features in one go
124125 List<LocalConcept> concepts = new ArrayList<LocalConcept>(meanings.size()*10);
125 - for (List<LocalConcept> m: meanings.values()) {
 126+ for (List<? extends LocalConcept> m: meanings.values()) {
126127 concepts.addAll(m);
127128 }
128129
@@ -133,7 +134,7 @@
134135 /* (non-Javadoc)
135136 * @see de.brightbyte.wikiword.disambig.Disambiguator#disambiguate(java.util.List)
136137 */
137 - public Result disambiguate(List<String> terms, Map<String, List<LocalConcept>> meanings) throws PersistenceException {
 138+ public <X extends TermReference>Disambiguator.Result<X, LocalConcept> disambiguate(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) throws PersistenceException {
138139 if (terms.size()<2 || meanings.size()<2)
139140 return popularityDisambiguator.disambiguate(terms, meanings);
140141
@@ -145,22 +146,22 @@
146147 //CAVEAT: because the map disambig can contain only one meaning per term, the same term can not occur with two meanings within the same term sequence.
147148
148149 LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
149 - FeatureCache<LocalConcept, K> features = getFeatureCache(meanings);
 150+ FeatureCache<LocalConcept, Integer> features = getFeatureCache(meanings);
150151
151 - List<Map<String, LocalConcept>> interpretations = getInterpretations(terms, meanings);
 152+ List<Map<X, LocalConcept>> interpretations = getInterpretations(terms, meanings);
152153
153154 return getBestInterpretation(terms, meanings, interpretations, similarities, features);
154155 }
155156
156 - protected void pruneMeanings(Map<String, List<LocalConcept>> meanings) {
 157+ protected void pruneMeanings(Map<? extends TermReference, List<? extends LocalConcept>> meanings) {
157158 if (minPopularity<=1) return; //nothing to do
158159
159 - Iterator<Map.Entry<String, List<LocalConcept>>> eit = meanings.entrySet().iterator();
 160+ Iterator<?> eit = meanings.entrySet().iterator();
160161 while (eit.hasNext()) {
161 - Entry<String, List<LocalConcept>> e = eit.next();
162 - List<LocalConcept> m = e.getValue();
 162+ Entry<TermReference, List<? extends LocalConcept>> e = (Entry<TermReference, List<? extends LocalConcept>>) eit.next(); //XXX: ugly cast. got confused about generics. ugh.
 163+ List<? extends LocalConcept> m = e.getValue();
163164
164 - Iterator<LocalConcept> cit = m.iterator();
 165+ Iterator<? extends LocalConcept> cit = m.iterator();
165166 while (cit.hasNext()) {
166167 LocalConcept c = cit.next();
167168 double p = popularityMeasure.measure(c);
@@ -179,15 +180,15 @@
180181 }
181182 }
182183
183 - protected Result getBestInterpretation(List<String> terms, Map<String, List<LocalConcept>> meanings,
184 - List<Map<String, LocalConcept>> interpretations,
185 - LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, K> features) throws PersistenceException {
 184+ protected <X extends TermReference>Result<X, LocalConcept> getBestInterpretation(List<X> terms, Map<X, List<? extends LocalConcept>> meanings,
 185+ List<Map<X, LocalConcept>> interpretations,
 186+ LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, Integer> features) throws PersistenceException {
186187
187 - List<Result> rankings = new ArrayList<Result>();
 188+ List<Result<X, LocalConcept>> rankings = new ArrayList<Result<X, LocalConcept>>();
188189
189190 double traceLimit = -1;
190 - for (Map<String, LocalConcept> interp: interpretations) {
191 - Result r = getScore(interp, similarities, features);
 191+ for (Map<X, LocalConcept> interp: interpretations) {
 192+ Result<X, LocalConcept> r = getScore(interp, similarities, features);
192193
193194 if (r.getScore() >= minScore) {
194195 rankings.add(r);
@@ -205,27 +206,27 @@
206207 Collections.reverse(rankings);
207208
208209 //TODO: if result is tight (less than 50% distance), use more popularity score!
209 - Result r = rankings.get(0);
 210+ Result<X, LocalConcept> r = rankings.get(0);
210211 return r;
211212 }
212213
213 - protected List<Map<String, LocalConcept>> getInterpretations(List<String> terms, Map<String, List<LocalConcept>> meanings) {
 214+ protected <X extends TermReference>List<Map<X, LocalConcept>> getInterpretations(List<X> terms, Map<X, List<? extends LocalConcept>> meanings) {
214215 if (terms.size()==0) {
215 - return Collections.singletonList(Collections.<String, LocalConcept>emptyMap());
 216+ return Collections.singletonList(Collections.<X, LocalConcept>emptyMap());
216217 }
217218
218 - String t = terms.get(0);
219 - List<LocalConcept> m = meanings.get(t);
 219+ X t = terms.get(0);
 220+ List<? extends LocalConcept> m = meanings.get(t);
220221
221 - List<Map<String, LocalConcept>> base = getInterpretations(terms.subList(1, terms.size()), meanings);
 222+ List<Map<X, LocalConcept>> base = getInterpretations(terms.subList(1, terms.size()), meanings);
222223
223224 if (m==null || m.size()==0) return base;
224225
225 - List<Map<String, LocalConcept>> interpretations = new ArrayList<Map<String, LocalConcept>>();
 226+ List<Map<X, LocalConcept>> interpretations = new ArrayList<Map<X, LocalConcept>>();
226227
227 - for (Map<String, LocalConcept> be: base) {
 228+ for (Map<X, LocalConcept> be: base) {
228229 for (LocalConcept c: m) {
229 - Map<String, LocalConcept> e = new HashMap<String, LocalConcept>();
 230+ Map<X, LocalConcept> e = new HashMap<X, LocalConcept>();
230231 e.putAll(be);
231232 e.put(t, c);
232233
@@ -237,7 +238,7 @@
238239 return interpretations;
239240 }
240241
241 - protected Result getScore(Map<String, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, K> features) throws PersistenceException {
 242+ protected <X extends TermReference>Result<X, LocalConcept> getScore(Map<X, LocalConcept> interp, LabeledMatrix<LocalConcept, LocalConcept> similarities, FeatureCache<LocalConcept, Integer> features) throws PersistenceException {
242243 double sim = 0;
243244 double pop = 0;
244245
@@ -259,8 +260,8 @@
260261 d = similarities.get(a, b);
261262 }
262263 else {
263 - ConceptFeatures<LocalConcept, K> fa = features.getFeatures(a);
264 - ConceptFeatures<LocalConcept, K> fb = features.getFeatures(b);
 264+ ConceptFeatures<LocalConcept, Integer> fa = features.getFeatures(a);
 265+ ConceptFeatures<LocalConcept, Integer> fb = features.getFeatures(b);
265266
266267 //force relevance/cardinality to the figures from the meaning lookup
267268 //not strictly necessary, but nice to keep it consistent.
@@ -296,7 +297,7 @@
297298 double score = popf * popularityBias + simf * ( 1 - popularityBias );
298299 //double score = Math.sqrt( popf * simf ); //FIXME: functor!
299300
300 - return new Result(interp, score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop);
 301+ return new Result<X, LocalConcept>(interp, score, "simf="+simf+", popf="+popf+", sim="+sim+", pop="+pop);
301302 }
302 -
 303+
303304 }
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Term.java
@@ -0,0 +1,64 @@
 2+package de.brightbyte.wikiword.disambig;
 3+
 4+import java.util.ArrayList;
 5+import java.util.Arrays;
 6+import java.util.List;
 7+
 8+import de.brightbyte.wikiword.model.TermReference;
 9+
 10+public class Term implements TermReference {
 11+
 12+ private final String term;
 13+
 14+ public Term(final String term) {
 15+ super();
 16+ this.term = term;
 17+ }
 18+
 19+ public String getTerm() {
 20+ return term;
 21+ }
 22+
 23+ public String toString() {
 24+ return getTerm();
 25+ }
 26+
 27+ @Override
 28+ public int hashCode() {
 29+ final int PRIME = 31;
 30+ int result = 1;
 31+ result = PRIME * result + ((term == null) ? 0 : term.hashCode());
 32+ return result;
 33+ }
 34+
 35+ @Override
 36+ public boolean equals(Object obj) {
 37+ if (this == obj)
 38+ return true;
 39+ if (obj == null)
 40+ return false;
 41+ if (getClass() != obj.getClass())
 42+ return false;
 43+ final Term other = (Term) obj;
 44+ if (term == null) {
 45+ if (other.term != null)
 46+ return false;
 47+ } else if (!term.equals(other.term))
 48+ return false;
 49+ return true;
 50+ }
 51+
 52+ public static List<Term> asTerms(String... terms) {
 53+ return asTerms(Arrays.asList(terms));
 54+ }
 55+
 56+ public static List<Term> asTerms(List<String> terms) {
 57+ List<Term> tt = new ArrayList<Term>();
 58+ for (String t: terms) {
 59+ tt.add(new Term(t));
 60+ }
 61+
 62+ return tt;
 63+ }
 64+
 65+}
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Disambiguator.java
@@ -5,23 +5,24 @@
66
77 import de.brightbyte.io.Output;
88 import de.brightbyte.util.PersistenceException;
 9+import de.brightbyte.wikiword.model.TermReference;
910 import de.brightbyte.wikiword.model.WikiWordConcept;
1011
11 -public interface Disambiguator {
 12+public interface Disambiguator<T extends TermReference, C extends WikiWordConcept> {
1213
13 - public static class Result implements Comparable {
14 - private Map<String, ? extends WikiWordConcept> meanings;
 14+ public static class Result<T extends TermReference, C extends WikiWordConcept> implements Comparable {
 15+ private Map<? extends T, ? extends C> meanings;
1516 private double score;
1617 private String description;
1718
18 - public Result(Map<String, ? extends WikiWordConcept> meanings, double score, String description) {
 19+ public Result(Map<? extends T, ? extends C> meanings, double score, String description) {
1920 super();
2021 this.meanings = meanings;
2122 this.score = score;
2223 this.description = description;
2324 }
2425
25 - public Map<String, ? extends WikiWordConcept> getMeanings() {
 26+ public Map<? extends T, ? extends C> getMeanings() {
2627 return meanings;
2728 }
2829
@@ -51,6 +52,6 @@
5253
5354 public void setTrace(Output trace);
5455
55 - public Result disambiguate(List<String> terms) throws PersistenceException;
 56+ public <X extends T>Result<X, C> disambiguate(List<X> terms) throws PersistenceException;
5657
5758 }
\ No newline at end of file

Status & tagging log