r109956 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r109955‎ | r109956 | r109957 >
Date:21:29, 24 January 2012
Author:oren
Status:deferred
Tags:
Comment:
a slower version of the FastWikiTokenizer, but one that can work with streams
Modified paths:
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -1,5 +1,7 @@
22 package org.wikimedia.lsearch.analyzers;
33
 4+import java.io.IOException;
 5+import java.io.Reader;
46 import java.util.ArrayList;
57 import java.util.HashMap;
68 import java.util.HashSet;
@@ -14,25 +16,33 @@
1517 import org.wikimedia.lsearch.util.UnicodeDecomposer;
1618
1719 /**
18 - * Wiki Tokenizer. Tokens are words and numbers. All letters are
19 - * lowercased and diacritics deleted using Unicode compatibility
20 - * decomposition (i.e. č -> c). Parses some basic wiki syntax,
21 - * template names are skipped, from images captions are extracted,
22 - * categories and interwiki links are extracted...
 20+ * Wiki Tokenizer. Tokens are words and numbers. All letters are lower cased and
 21+ * diacritics deleted using Unicode compatibility decomposition (i.e. č -> c).
 22+ * Parses some basic wiki syntax, template names are skipped, from images
 23+ * captions are extracted, categories and interwiki links are extracted...
2324 *
24 - * Tokenizer will not take a Reader as input, but a String (for
25 - * optimal performance)
 25+ * This Tokenizer's performance is optimized for String input, not Reader as
 26+ * input.
2627 *
 28+ * Problems: does not use a filter chain and does lots of char-level
 29+ * processing on the chars
 30+ *
2731 * @author rainman
28 - *
 32+ * @author OrenBochman
 33+ *
2934 */
3035 public class FastWikiTokenizerEngine {
3136 private static final int MAX_WORD_LEN = 255;
32 - private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text, e.g. gödel
 37+ private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text,
 38+ // e.g. gödel
3339 private final char[] aliasBuffer = new char[MAX_WORD_LEN]; // buffer for aliases, e.g. goedel
34 - private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for dedomposed text e.g. godel
 40+ private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for
 41+ // decomposed
 42+ // text e.g.
 43+ // godel
3544 private final char[] glueBuffer = new char[MAX_WORD_LEN-1]; // buffer of spaces, etc.. that glues tokens together to produce the original (for highlight)
3645 private final char[] tempBuffer = new char[MAX_WORD_LEN-1]; // buffer for temp stuff
 46+ private Reader rdr;
3747 private char[] text;
3848 private String textString; // original text in string format
3949 private int textLength;
@@ -80,7 +90,7 @@
8191 /** valid protocols for external links */
8292 private final static String[] PROTOCOLS = {"http://","https://","ftp://","mailto://","news://","gopher://"};
8393
84 - /** This many tokens from begining of text are eligable for keywords */
 94+ /** This many tokens from beginning of text are eligible for keywords */
8595 public static int KEYWORD_TOKEN_LIMIT = 250;
8696
8797 /** Token gap at first section break */
@@ -143,6 +153,37 @@
144154 init();
145155 }
146156
 157+ public FastWikiTokenizerEngine(Reader rdr, IndexId iid,
 158+ TokenizerOptions options) {
 159+
 160+ StringBuilder sb = new StringBuilder();
 161+
 162+ // use standard Reader-reading techniques to access the Reader
 163+ int len = -1; // Number of chars read
 164+ char[] buf = new char[256]; // Characters read from Reader
 165+ try {
 166+ while ((len = rdr.read(buf, 0, 256)) != -1) {
 167+ // System.out.println("Next chunk from Reader is " + (new
 168+ // String(buf, 0, len)));
 169+ sb.append(buf, 0, len);
 170+
 171+ }
 172+ } catch (IOException ioe) {
 173+ System.err
 174+ .println("Error reading from Reader :" + ioe.getMessage());
 175+ }
 176+
 177+ this.rdr = rdr;
 178+ this.text = new char[sb.length()];
 179+ sb.getChars(0, sb.length(), this.text, 0);
 180+ this.textString = String.valueOf(text);
 181+ this.language = iid.getLangCode();
 182+ this.iid = iid;
 183+ this.options = options;
 184+ textLength = sb.length();
 185+ init();
 186+ }
 187+
147188 /**
148189 * Strip accents
149190 * @param c
@@ -1439,6 +1480,9 @@
14401481 return tokens;
14411482 }
14421483
 1484+ /**
 1485+ * removes comments from the input.
 1486+ */
14431487 private void stripComments() {
14441488 char[] stripped = new char[textLength];
14451489 int slen = 0;

Status & tagging log