Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -1,5 +1,7 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.Reader; |
4 | 6 | import java.util.ArrayList; |
5 | 7 | import java.util.HashMap; |
6 | 8 | import java.util.HashSet; |
— | — | @@ -14,25 +16,33 @@ |
15 | 17 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
16 | 18 | |
17 | 19 | /** |
18 | | - * Wiki Tokenizer. Tokens are words and numbers. All letters are |
19 | | - * lowercased and diacritics deleted using Unicode compatibility |
20 | | - * decomposition (i.e. č -> c). Parses some basic wiki syntax, |
21 | | - * template names are skipped, from images captions are extracted, |
22 | | - * categories and interwiki links are extracted... |
| 20 | + * Wiki Tokenizer. Tokens are words and numbers. All letters are lowercased and
| 21 | + * diacritics deleted using Unicode compatibility decomposition (i.e. č -> c).
| 22 | + * Parses some basic wiki syntax: template names are skipped, captions are
| 23 | + * extracted from images, categories and interwiki links are extracted...
23 | 24 | * |
24 | | - * Tokenizer will not take a Reader as input, but a String (for |
25 | | - * optimal performance) |
| 25 | + * This tokenizer is optimized for String input rather than Reader input;
| 26 | + * the Reader constructor below simply drains the Reader into a String first.
26 | 27 | * |
| 28 | + * Known problems: it does not use a filter chain, and it does a lot of
| 29 | + * char-level processing directly on the underlying char array.
| 30 | + * |
27 | 31 | * @author rainman |
28 | | - * |
| 32 | + * @author OrenBochman |
| 33 | + * |
29 | 34 | */ |
30 | 35 | public class FastWikiTokenizerEngine { |
31 | 36 | private static final int MAX_WORD_LEN = 255; |
32 | | - private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text, e.g. gödel |
| 37 | + private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text, e.g. gödel
33 | 39 | private final char[] aliasBuffer = new char[MAX_WORD_LEN]; // buffer for aliases, e.g. goedel |
34 | | - private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for dedomposed text e.g. godel |
| 40 | + private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for decomposed text, e.g. godel
35 | 44 | private final char[] glueBuffer = new char[MAX_WORD_LEN-1]; // buffer of spaces, etc.. that glues tokens together to produce the original (for highlight) |
36 | 45 | private final char[] tempBuffer = new char[MAX_WORD_LEN-1]; // buffer for temp stuff |
| 46 | + private Reader rdr; |
37 | 47 | private char[] text; |
38 | 48 | private String textString; // original text in string format |
39 | 49 | private int textLength; |
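The rewritten Javadoc above promises lowercasing plus Unicode compatibility decomposition (č -> c). The class implements this itself via the imported UnicodeDecomposer and the alias/decomp buffers; purely as a sketch of the documented behaviour, and not what this patch actually calls, the standard java.text.Normalizer can reproduce the same folding:

```java
import java.text.Normalizer;

public class FoldingSketch {
    // Lowercase, apply compatibility decomposition (NFKD), then strip the
    // combining marks the decomposition produced: "č" -> "c", "ö" -> "o".
    static String fold(String s) {
        String decomposed = Normalizer.normalize(s.toLowerCase(), Normalizer.Form.NFKD);
        return decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    }

    public static void main(String[] args) {
        System.out.println(fold("Gödel")); // prints "godel"
    }
}
```

The engine's buffer-based approach avoids the String and regex allocations this sketch makes on every call, which is the performance point the Javadoc stresses; it also keeps un-decomposed variants as aliases (the goedel-style forms in aliasBuffer).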
— | — | @@ -80,7 +90,7 @@ |
81 | 91 | /** valid protocols for external links */ |
82 | 92 | private final static String[] PROTOCOLS = {"http://","https://","ftp://","mailto://","news://","gopher://"}; |
83 | 93 | |
84 | | - /** This many tokens from begining of text are eligable for keywords */ |
| 94 | + /** This many tokens from beginning of text are eligible for keywords */ |
85 | 95 | public static int KEYWORD_TOKEN_LIMIT = 250; |
86 | 96 | |
87 | 97 | /** Token gap at first section break */ |
— | — | @@ -143,6 +153,37 @@ |
144 | 154 | init(); |
145 | 155 | } |
146 | 156 | |
| 157 | + public FastWikiTokenizerEngine(Reader rdr, IndexId iid, |
| 158 | + TokenizerOptions options) { |
| 159 | + |
| 160 | + StringBuilder sb = new StringBuilder(); |
| 161 | + |
 | 162 | + // drain the Reader into the StringBuilder in fixed-size chunks
 | 163 | + int len; // number of chars read per call
 | 164 | + char[] buf = new char[256]; // chunk buffer for rdr.read()
 | 165 | + try {
 | 166 | + while ((len = rdr.read(buf, 0, buf.length)) != -1) {
 | 167 | + sb.append(buf, 0, len);
 | 168 | + }
 | 169 | + } catch (IOException ioe) {
 | 170 | + System.err.println("Error reading from Reader: " + ioe.getMessage());
 | 171 | + }
| 176 | + |
| 177 | + this.rdr = rdr; |
| 178 | + this.text = new char[sb.length()]; |
| 179 | + sb.getChars(0, sb.length(), this.text, 0); |
 | 180 | + this.textString = sb.toString();
| 181 | + this.language = iid.getLangCode(); |
| 182 | + this.iid = iid; |
| 183 | + this.options = options; |
| 184 | + textLength = sb.length(); |
| 185 | + init(); |
| 186 | + } |
| 187 | + |
147 | 188 | /** |
148 | 189 | * Strip accents |
149 | 190 | * @param c |
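The new Reader constructor above drains the whole Reader into a StringBuilder but swallows the IOException, printing it to stderr and leaving the engine with whatever text was read up to the failure. A minimal alternative sketch that propagates the exception to the caller instead (the helper name `drain` is assumed here, not part of the patch):

```java
import java.io.IOException;
import java.io.Reader;

final class ReaderUtil {
    // Assumed helper, not part of the patch: read the Reader to exhaustion
    // and return its contents, letting the IOException reach the caller
    // instead of printing it to stderr.
    static String drain(Reader rdr) throws IOException {
        StringBuilder sb = new StringBuilder();
        char[] buf = new char[4096]; // larger chunks than the patch's 256
        int len;
        while ((len = rdr.read(buf, 0, buf.length)) != -1) {
            sb.append(buf, 0, len);
        }
        return sb.toString();
    }
}
```

With such a helper, the Reader constructor could delegate to the existing String-based constructor, keeping a single initialization path.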
— | — | @@ -1439,6 +1480,9 @@ |
1440 | 1481 | return tokens; |
1441 | 1482 | } |
1442 | 1483 | |
 | 1484 | + /**
 | 1485 | + * Removes comments from the input text.
 | 1486 | + */
1443 | 1487 | private void stripComments() { |
1444 | 1488 | char[] stripped = new char[textLength]; |
1445 | 1489 | int slen = 0; |
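The body of stripComments() lies mostly outside this hunk. Assuming the comments being removed are HTML-style <!-- ... --> comments (standard in wikitext, though the diff does not show it), a hypothetical sketch of the technique:

```java
public class StripCommentsSketch {
    // Hypothetical sketch, not the patch's implementation: copy every
    // character except the spans between "<!--" and "-->".
    static String strip(String text) {
        StringBuilder out = new StringBuilder(text.length());
        int i = 0;
        while (i < text.length()) {
            if (text.startsWith("<!--", i)) {
                int end = text.indexOf("-->", i + 4);
                if (end == -1)
                    break; // unterminated comment: drop the rest
                i = end + 3; // resume after "-->"
            } else {
                out.append(text.charAt(i++));
            }
        }
        return out.toString();
    }

    public static void main(String[] args) {
        System.out.println(strip("foo <!-- hidden --> bar")); // "foo  bar"
    }
}
```

The real method writes into a preallocated `stripped` char array of `textLength` with a running length `slen`, consistent with the class's buffer-oriented, allocation-averse style.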