Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -1,5 +1,7 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.Reader; |
4 | 6 | import java.util.ArrayList; |
5 | 7 | import java.util.HashMap; |
6 | 8 | import java.util.HashSet; |
— | — | @@ -14,25 +16,33 @@ |
15 | 17 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
16 | 18 | |
17 | 19 | /** |
18 | | - * Wiki Tokenizer. Tokens are words and numbers. All letters are |
19 | | - * lowercased and diacritics deleted using Unicode compatibility |
20 | | - * decomposition (i.e. č -> c). Parses some basic wiki syntax, |
21 | | - * template names are skipped, from images captions are extracted, |
22 | | - * categories and interwiki links are extracted... |
| 20 | + * Wiki Tokenizer. Tokens are words and numbers. All letters are lowercased and
| 21 | + * diacritics deleted using Unicode compatibility decomposition (i.e. č -> c).
| 22 | + * Parses some basic wiki syntax: template names are skipped, captions are
| 23 | + * extracted from images, categories and interwiki links are extracted...
23 | 24 | * |
24 | | - * Tokenizer will not take a Reader as input, but a String (for |
25 | | - * optimal performance) |
| 25 | + * This tokenizer is optimized for String input rather than Reader input;
| 26 | + * the Reader constructor below simply drains the Reader into a String first.
26 | 27 | * |
| 28 | + * Known problems: it does not use a filter chain, and it does a lot of
| 29 | + * char-level processing directly on the underlying char array.
| 30 | + * |
27 | 31 | * @author rainman |
28 | | - * |
| 32 | + * @author OrenBochman |
| 33 | + * |
29 | 34 | */ |
30 | 35 | public class FastWikiTokenizerEngine { |
31 | 36 | private static final int MAX_WORD_LEN = 255; |
32 | | - private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text, e.g. gödel |
| 37 | + private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text, e.g. gödel
33 | 39 | private final char[] aliasBuffer = new char[MAX_WORD_LEN]; // buffer for aliases, e.g. goedel |
34 | | - private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for dedomposed text e.g. godel |
| 40 | + private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for decomposed text, e.g. godel
35 | 44 | private final char[] glueBuffer = new char[MAX_WORD_LEN-1]; // buffer of spaces, etc.. that glues tokens together to produce the original (for highlight) |
36 | 45 | private final char[] tempBuffer = new char[MAX_WORD_LEN-1]; // buffer for temp stuff |
| 46 | + private Reader rdr; |
37 | 47 | private char[] text; |
38 | 48 | private String textString; // original text in string format |
39 | 49 | private int textLength; |
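The rewritten Javadoc above promises lowercasing plus Unicode compatibility decomposition (č -> c). The class implements this itself via the imported UnicodeDecomposer and the alias/decomp buffers; purely as a sketch of the documented behaviour, and not what this patch actually calls, the standard java.text.Normalizer can reproduce the same folding:

```java
import java.text.Normalizer;

public class FoldingSketch {
    // Lowercase, apply compatibility decomposition (NFKD), then strip the
    // combining marks the decomposition produced: "č" -> "c", "ö" -> "o".
    static String fold(String s) {
        String decomposed = Normalizer.normalize(s.toLowerCase(), Normalizer.Form.NFKD);
        return decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    }

    public static void main(String[] args) {
        System.out.println(fold("Gödel")); // prints "godel"
    }
}
```

The engine's buffer-based approach avoids the String and regex allocations this sketch makes on every call, which is the performance point the Javadoc stresses; it also keeps un-decomposed variants as aliases (the goedel-style forms in aliasBuffer).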
— | — | @@ -80,7 +90,7 @@ |
81 | 91 | /** valid protocols for external links */ |
82 | 92 | private final static String[] PROTOCOLS = {"http://","https://","ftp://","mailto://","news://","gopher://"}; |
83 | 93 | |
84 | | - /** This many tokens from begining of text are eligable for keywords */ |
| 94 | + /** This many tokens from beginning of text are eligible for keywords */ |
85 | 95 | public static int KEYWORD_TOKEN_LIMIT = 250; |
86 | 96 | |
87 | 97 | /** Token gap at first section break */ |
— | — | @@ -143,6 +153,37 @@ |
144 | 154 | init(); |
145 | 155 | } |
146 | 156 | |
| 157 | + public FastWikiTokenizerEngine(Reader rdr, IndexId iid, |
| 158 | + TokenizerOptions options) { |
| 159 | + |
| 160 | + StringBuilder sb = new StringBuilder(); |
| 161 | + |
 | 162 | + // drain the Reader into the StringBuilder in fixed-size chunks
 | 163 | + int len; // number of chars read per call
 | 164 | + char[] buf = new char[256]; // chunk buffer for rdr.read()
 | 165 | + try {
 | 166 | + while ((len = rdr.read(buf, 0, buf.length)) != -1) {
 | 167 | + sb.append(buf, 0, len);
 | 168 | + }
 | 169 | + } catch (IOException ioe) {
 | 170 | + System.err.println("Error reading from Reader: " + ioe.getMessage());
 | 171 | + }
| 176 | + |
| 177 | + this.rdr = rdr; |
| 178 | + this.text = new char[sb.length()]; |
| 179 | + sb.getChars(0, sb.length(), this.text, 0); |
 | 180 | + this.textString = sb.toString();
| 181 | + this.language = iid.getLangCode(); |
| 182 | + this.iid = iid; |
| 183 | + this.options = options; |
| 184 | + textLength = sb.length(); |
| 185 | + init(); |
| 186 | + } |
| 187 | + |
147 | 188 | /** |
148 | 189 | * Strip accents |
149 | 190 | * @param c |
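The new Reader constructor above drains the whole Reader into a StringBuilder but swallows the IOException, printing it to stderr and leaving the engine with whatever text was read up to the failure. A minimal alternative sketch that propagates the exception to the caller instead (the helper name `drain` is assumed here, not part of the patch):

```java
import java.io.IOException;
import java.io.Reader;

final class ReaderUtil {
    // Assumed helper, not part of the patch: read the Reader to exhaustion
    // and return its contents, letting the IOException reach the caller
    // instead of printing it to stderr.
    static String drain(Reader rdr) throws IOException {
        StringBuilder sb = new StringBuilder();
        char[] buf = new char[4096]; // larger chunks than the patch's 256
        int len;
        while ((len = rdr.read(buf, 0, buf.length)) != -1) {
            sb.append(buf, 0, len);
        }
        return sb.toString();
    }
}
```

With such a helper, the Reader constructor could delegate to the existing String-based constructor, keeping a single initialization path.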
— | — | @@ -1439,6 +1480,9 @@ |
1440 | 1481 | return tokens; |
1441 | 1482 | } |
1442 | 1483 | |
 | 1484 | + /**
 | 1485 | + * Removes comments from the input text.
 | 1486 | + */
1443 | 1487 | private void stripComments() { |
1444 | 1488 | char[] stripped = new char[textLength]; |
1445 | 1489 | int slen = 0; |
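The body of stripComments() lies mostly outside this hunk. Assuming the comments being removed are HTML-style <!-- ... --> comments (standard in wikitext, though the diff does not show it), a hypothetical sketch of the technique:

```java
public class StripCommentsSketch {
    // Hypothetical sketch, not the patch's implementation: copy every
    // character except the spans between "<!--" and "-->".
    static String strip(String text) {
        StringBuilder out = new StringBuilder(text.length());
        int i = 0;
        while (i < text.length()) {
            if (text.startsWith("<!--", i)) {
                int end = text.indexOf("-->", i + 4);
                if (end == -1)
                    break; // unterminated comment: drop the rest
                i = end + 3; // resume after "-->"
            } else {
                out.append(text.charAt(i++));
            }
        }
        return out.toString();
    }

    public static void main(String[] args) {
        System.out.println(strip("foo <!-- hidden --> bar")); // "foo  bar"
    }
}
```

The real method writes into a preallocated `stripped` char array of `textLength` with a running length `slen`, consistent with the class's buffer-oriented, allocation-averse style.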