Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Term.java |
— | — | @@ -52,7 +52,7 @@ |
53 | 53 | return asTerms(Arrays.asList(terms)); |
54 | 54 | } |
55 | 55 | |
56 | | - public static List<Term> asTerms(List<String> terms) { |
| 56 | + public static List<Term> asTerms(Iterable<String> terms) { |
57 | 57 | List<Term> tt = new ArrayList<Term>(); |
58 | 58 | for (String t: terms) { |
59 | 59 | tt.add(new Term(t)); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseFeatureStore.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import java.sql.ResultSet; |
5 | 5 | import java.sql.SQLException; |
6 | 6 | import java.util.ArrayList; |
| 7 | +import java.util.Collections; |
7 | 8 | import java.util.HashMap; |
8 | 9 | import java.util.List; |
9 | 10 | import java.util.Map; |
— | — | @@ -83,6 +84,8 @@ |
84 | 85 | } |
85 | 86 | |
86 | 87 | public Map<Integer, ConceptFeatures<T, Integer>> getConceptsFeatures(int[] concepts) throws PersistenceException { |
| 88 | + if (concepts.length==0) return Collections.emptyMap(); |
| 89 | + |
87 | 90 | try { |
88 | 91 | String sql = "SELECT concept, feature, normal_weight FROM " +featureTable.getSQLName()+" as F "; |
89 | 92 | sql += " WHERE concept IN "+database.encodeSet(concepts); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/AbstractTsvOutput.java |
— | — | @@ -4,11 +4,13 @@ |
5 | 5 | import java.util.regex.Matcher; |
6 | 6 | import java.util.regex.Pattern; |
7 | 7 | |
| 8 | +import de.brightbyte.data.cursor.DataCursor; |
| 9 | +import de.brightbyte.data.cursor.DataSink; |
8 | 10 | import de.brightbyte.util.PersistenceException; |
9 | 11 | import de.brightbyte.wikiword.Corpus; |
10 | 12 | import de.brightbyte.wikiword.DatasetIdentifier; |
11 | 13 | |
12 | | -public abstract class AbstractTsvOutput extends AbstractWriterOutput { |
| 14 | +public abstract class AbstractTsvOutput extends AbstractWriterOutput implements DataSink<String[]> { |
13 | 15 | |
14 | 16 | private CharSequence terminator = "\r\n"; |
15 | 17 | private CharSequence separator = "\t"; |
— | — | @@ -34,6 +36,22 @@ |
35 | 37 | |
36 | 38 | protected StringBuilder buffer = new StringBuilder(); |
37 | 39 | |
| 40 | + |
| 41 | + public int transfer(DataCursor<String[]> cursor) throws PersistenceException { /* Passes every record read from the cursor to commit() and returns the number of records passed on. */ |
| 42 | + String[] rec; |
| 43 | + int c = 0; /* number of records committed so far */ |
| 44 | + while ((rec = cursor.next()) != null) { /* a null from cursor.next() ends the loop */ |
| 45 | + commit(rec); |
| 46 | + c++; |
| 47 | + } |
| 48 | + |
| 49 | + return c; |
| 50 | + } |
| 51 | + |
| 52 | + public void commit(String[] values) throws PersistenceException { /* Writes a single record by delegating to writeRow(values). */ |
| 53 | + writeRow(values); |
| 54 | + } |
| 55 | + |
38 | 56 | protected void writeRow(String... values) throws PersistenceException { |
39 | 57 | buffer.setLength(0); |
40 | 58 | boolean first = true; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java |
— | — | @@ -1,22 +1,27 @@ |
2 | 2 | package de.brightbyte.wikiword.extract; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.text.ParseException; |
| 6 | +import java.util.Collections; |
5 | 7 | import java.util.List; |
| 8 | +import java.util.regex.Pattern; |
6 | 9 | |
7 | 10 | import de.brightbyte.data.cursor.DataCursor; |
8 | 11 | import de.brightbyte.data.cursor.DataSink; |
9 | 12 | import de.brightbyte.io.ConsoleIO; |
10 | 13 | import de.brightbyte.io.LineCursor; |
11 | 14 | import de.brightbyte.io.OutputSink; |
| 15 | +import de.brightbyte.text.Chunker; |
| 16 | +import de.brightbyte.text.RegularExpressionChunker; |
12 | 17 | import de.brightbyte.util.PersistenceException; |
13 | 18 | import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer; |
14 | 19 | import de.brightbyte.wikiword.disambig.Disambiguator; |
15 | 20 | import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator; |
16 | 21 | import de.brightbyte.wikiword.disambig.StoredFeatureFetcher; |
17 | 22 | import de.brightbyte.wikiword.disambig.StoredMeaningFetcher; |
| 23 | +import de.brightbyte.wikiword.disambig.Term; |
| 24 | +import de.brightbyte.wikiword.disambig.Disambiguator.Result; |
18 | 25 | import de.brightbyte.wikiword.model.LocalConcept; |
19 | | -import de.brightbyte.wikiword.model.PhraseOccurance; |
20 | | -import de.brightbyte.wikiword.model.PhraseOccuranceSequence; |
21 | 26 | import de.brightbyte.wikiword.model.TermReference; |
22 | 27 | import de.brightbyte.wikiword.store.DatabaseConceptStores; |
23 | 28 | import de.brightbyte.wikiword.store.FeatureStore; |
— | — | @@ -27,6 +32,8 @@ |
28 | 33 | protected Disambiguator<TermReference, LocalConcept> disambiguator; |
29 | 34 | protected PlainTextAnalyzer analyzer; |
30 | 35 | private int phraseLength; |
| 36 | + protected Chunker chunker; |
| 37 | + protected boolean flip = false; |
31 | 38 | |
32 | 39 | public WordSenseIndexer() { |
33 | 40 | super(false, true); |
— | — | @@ -64,17 +71,47 @@ |
65 | 72 | analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(getCorpus(), tweaks); |
66 | 73 | analyzer.initialize(); |
67 | 74 | |
68 | | - phraseLength = args.getIntOption("phrase-length", tweaks.getTweak("wikiSenseIndexer.phraseLength", 6)); |
| 75 | + phraseLength = args.getIntOption("phrase-length", tweaks.getTweak("wikiSenseIndexer.phraseLength", 6)); |
| 76 | + |
| 77 | + chunker = new RegularExpressionChunker(Pattern.compile("\\s*[,;|]\\s*")); //TODO: configure! |
| 78 | + flip = true; //FIXME: parameter |
| 79 | + //TODO: parameter for limiting concept type |
69 | 80 | } |
70 | 81 | |
71 | 82 | @Override |
72 | | - protected String process(String line) throws PersistenceException { |
| 83 | + protected String process(String line) throws PersistenceException, ParseException { /* Splits the line into terms with the chunker, disambiguates them, and returns the string built by assembleMeanings(). */ |
| 84 | + //TODO: logic for handling overlapping phrases in a PhraseOccuranceSequence |
| 85 | + /* |
73 | 86 | PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first. |
74 | 87 | List<PhraseOccurance> phrases = sequence.getDisjointPhraseSequence(null); |
75 | 88 | Disambiguator.Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(phrases); |
76 | 89 | return result.toString(); //FIXME: annotate! |
| 90 | + */ |
| 91 | + |
| 92 | + List<Term> terms = Term.asTerms(chunker.chunk(line)); /* wrap each chunk of the line in a Term */ |
| 93 | + if (flip) Collections.reverse(terms); /* when flip is set, the disambiguator receives the terms in reverse order */ |
| 94 | + |
| 95 | + Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(terms); |
| 96 | + if (flip) Collections.reverse(terms); /* undo the reversal so the output follows the order of the input line */ |
| 97 | + |
| 98 | + return assembleMeanings(terms, result); |
77 | 99 | } |
78 | 100 | |
| 101 | + private String assembleMeanings(List<Term> terms, Result<Term, LocalConcept> result) { /* Builds a ';'-separated string of term=conceptName entries, one per term, in the order of the terms list. */ |
| 102 | + StringBuilder s = new StringBuilder(); |
| 103 | + |
| 104 | + for (Term t: terms) { |
| 105 | + LocalConcept concept = result.getMeanings().get(t); /* meaning looked up for this term; may be null (checked below) */ |
| 106 | + |
| 107 | + if (s.length()>0) s.append(';'); /* separator goes before every entry except the first */ |
| 108 | + s.append(t.getTerm()); //FIXME: escape! |
| 109 | + s.append('='); |
| 110 | + if (concept!=null) s.append(concept.getName()); /* a term without a meaning is emitted with an empty value after '=' */ //FIXME: escape! |
| 111 | + } |
| 112 | + |
| 113 | + return s.toString(); |
| 114 | + } |
| 115 | + |
79 | 116 | public static void main(String[] argv) throws Exception { |
80 | 117 | WordSenseIndexer q = new WordSenseIndexer(); |
81 | 118 | q.launch(argv); |