r64391 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r64390 | r64391 | r64392 >
Date:15:50, 30 March 2010
Author:daniel
Status:deferred
Tags:
Comment:
chunker-based word sense indexer
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Term.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseFeatureStore.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/AbstractTsvOutput.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/Term.java
@@ -52,7 +52,7 @@
53 53 return asTerms(Arrays.asList(terms));
54 54 }
55 55
56 - public static List<Term> asTerms(List<String> terms) {
 56+ public static List<Term> asTerms(Iterable<String> terms) {
57 57 List<Term> tt = new ArrayList<Term>();
58 58 for (String t: terms) {
59 59 tt.add(new Term(t));
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/store/DatabaseFeatureStore.java
@@ -3,6 +3,7 @@
4 4 import java.sql.ResultSet;
5 5 import java.sql.SQLException;
6 6 import java.util.ArrayList;
 7+import java.util.Collections;
7 8 import java.util.HashMap;
8 9 import java.util.List;
9 10 import java.util.Map;
@@ -83,6 +84,8 @@
84 85 }
85 86
86 87 public Map<Integer, ConceptFeatures<T, Integer>> getConceptsFeatures(int[] concepts) throws PersistenceException {
 88+ if (concepts.length==0) return Collections.emptyMap();
 89+
87 90 try {
88 91 String sql = "SELECT concept, feature, normal_weight FROM " +featureTable.getSQLName()+" as F ";
89 92 sql += " WHERE concept IN "+database.encodeSet(concepts);
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/AbstractTsvOutput.java
@@ -4,11 +4,13 @@
5 5 import java.util.regex.Matcher;
6 6 import java.util.regex.Pattern;
7 7
 8+import de.brightbyte.data.cursor.DataCursor;
 9+import de.brightbyte.data.cursor.DataSink;
8 10 import de.brightbyte.util.PersistenceException;
9 11 import de.brightbyte.wikiword.Corpus;
10 12 import de.brightbyte.wikiword.DatasetIdentifier;
11 13
12 -public abstract class AbstractTsvOutput extends AbstractWriterOutput {
 14+public abstract class AbstractTsvOutput extends AbstractWriterOutput implements DataSink<String[]> {
13 15
14 16 private CharSequence terminator = "\r\n";
15 17 private CharSequence separator = "\t";
@@ -34,6 +36,22 @@
35 37
36 38 protected StringBuilder buffer = new StringBuilder();
37 39
 40+
 41+ public int transfer(DataCursor<String[]> cursor) throws PersistenceException {
 42+ String[] rec;
 43+ int c = 0;
 44+ while ((rec = cursor.next()) != null) {
 45+ commit(rec);
 46+ c++;
 47+ }
 48+
 49+ return c;
 50+ }
 51+
 52+ public void commit(String[] values) throws PersistenceException {
 53+ writeRow(values);
 54+ }
 55+
38 56 protected void writeRow(String... values) throws PersistenceException {
39 57 buffer.setLength(0);
40 58 boolean first = true;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java
@@ -1,22 +1,27 @@
2 2 package de.brightbyte.wikiword.extract;
3 3
4 4 import java.io.IOException;
 5+import java.text.ParseException;
 6+import java.util.Collections;
5 7 import java.util.List;
 8+import java.util.regex.Pattern;
6 9
7 10 import de.brightbyte.data.cursor.DataCursor;
8 11 import de.brightbyte.data.cursor.DataSink;
9 12 import de.brightbyte.io.ConsoleIO;
10 13 import de.brightbyte.io.LineCursor;
11 14 import de.brightbyte.io.OutputSink;
 15+import de.brightbyte.text.Chunker;
 16+import de.brightbyte.text.RegularExpressionChunker;
12 17 import de.brightbyte.util.PersistenceException;
13 18 import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer;
14 19 import de.brightbyte.wikiword.disambig.Disambiguator;
15 20 import de.brightbyte.wikiword.disambig.SlidingCoherenceDisambiguator;
16 21 import de.brightbyte.wikiword.disambig.StoredFeatureFetcher;
17 22 import de.brightbyte.wikiword.disambig.StoredMeaningFetcher;
 23+import de.brightbyte.wikiword.disambig.Term;
 24+import de.brightbyte.wikiword.disambig.Disambiguator.Result;
18 25 import de.brightbyte.wikiword.model.LocalConcept;
19 -import de.brightbyte.wikiword.model.PhraseOccurance;
20 -import de.brightbyte.wikiword.model.PhraseOccuranceSequence;
21 26 import de.brightbyte.wikiword.model.TermReference;
22 27 import de.brightbyte.wikiword.store.DatabaseConceptStores;
23 28 import de.brightbyte.wikiword.store.FeatureStore;
@@ -27,6 +32,8 @@
28 33 protected Disambiguator<TermReference, LocalConcept> disambiguator;
29 34 protected PlainTextAnalyzer analyzer;
30 35 private int phraseLength;
 36+ protected Chunker chunker;
 37+ protected boolean flip = false;
31 38
32 39 public WordSenseIndexer() {
33 40 super(false, true);
@@ -64,17 +71,47 @@
65 72 analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(getCorpus(), tweaks);
66 73 analyzer.initialize();
67 74
68 - phraseLength = args.getIntOption("phrase-length", tweaks.getTweak("wikiSenseIndexer.phraseLength", 6));
 75+ phraseLength = args.getIntOption("phrase-length", tweaks.getTweak("wikiSenseIndexer.phraseLength", 6));
 76+
 77+ chunker = new RegularExpressionChunker(Pattern.compile("\\s*[,;|]\\s*")); //TODO: configure!
 78+ flip = true; //FIXME: parameter
 79+ //TODO: parameter for limiting concept type
69 80 }
70 81
71 82 @Override
72 - protected String process(String line) throws PersistenceException {
 83+ protected String process(String line) throws PersistenceException, ParseException {
 84+ //TODO: logic for handling overlapping phrases in a PhraseOccuranceSequence
 85+ /*
73 86 PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first.
74 87 List<PhraseOccurance> phrases = sequence.getDisjointPhraseSequence(null);
75 88 Disambiguator.Result<PhraseOccurance, LocalConcept> result = disambiguator.disambiguate(phrases);
76 89 return result.toString(); //FIXME: annotate!
 90+ */
 91+
 92+ List<Term> terms = Term.asTerms(chunker.chunk(line));
 93+ if (flip) Collections.reverse(terms);
 94+
 95+ Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(terms);
 96+ if (flip) Collections.reverse(terms);
 97+
 98+ return assembleMeanings(terms, result);
77 99 }
78 100
 101+ private String assembleMeanings(List<Term> terms, Result<Term, LocalConcept> result) {
 102+ StringBuilder s = new StringBuilder();
 103+
 104+ for (Term t: terms) {
 105+ LocalConcept concept = result.getMeanings().get(t);
 106+
 107+ if (s.length()>0) s.append(';');
 108+ s.append(t.getTerm()); //FIXME: escape!
 109+ s.append('=');
 110+ if (concept!=null) s.append(concept.getName()); //FIXME: escape!
 111+ }
 112+
 113+ return s.toString();
 114+ }
 115+
79 116 public static void main(String[] argv) throws Exception {
80 117 WordSenseIndexer q = new WordSenseIndexer();
81 118 q.launch(argv);

Status & tagging log