Index: trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java |
— | — | @@ -1,16 +1,11 @@ |
2 | 2 | package org.wikimedia.lsearch.util; |
3 | 3 | |
4 | | -import java.net.URL; |
5 | | - |
6 | 4 | import org.wikimedia.lsearch.config.Configuration; |
7 | 5 | import org.wikimedia.lsearch.config.IndexId; |
8 | 6 | import org.wikimedia.lsearch.util.Localization; |
9 | 7 | |
10 | 8 | public class LocalizationTest { |
11 | 9 | |
12 | | - /** |
13 | | - * @param args |
14 | | - */ |
15 | 10 | public static void main(String[] args) { |
16 | 11 | Configuration.open(); |
17 | 12 | String text = "#redirect [[mw]]"; |
Index: trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java |
— | — | @@ -21,7 +21,7 @@ |
22 | 22 | * |
23 | 23 | * <p>This class tries to handle <code>null</code> input gracefully. |
24 | 24 | * An exception will not be thrown for a <code>null</code> input. |
25 | | - * Each method documents its behaviour in more detail.</p> |
| 25 | + * Each method documents its behavior in more detail.</p> |
26 | 26 | * |
27 | 27 | * @author Apache Jakarta Velocity |
28 | 28 | * @author Stephen Colebourne |
Index: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java |
— | — | @@ -44,13 +44,14 @@ |
45 | 45 | import java.io.*; |
46 | 46 | |
47 | 47 | /** |
48 | | - This class implements the Kstem algorithm |
| 48 | + * This class implements the Kstem algorithm |
49 | 49 | */ |
50 | 50 | public class KStemmer { |
51 | | - /** Default size of the cache that stores <code>(word,stem)</code> pairs. |
52 | | - <p>This speeds up processing since Kstem works by |
53 | | - sucessive "transformations" to the input word until a |
54 | | - suitable stem is found. |
| 51 | + /** |
| 52 | + * Default size of the cache that stores <code>(word,stem)</code> pairs. |
| 53 | + * |
| 54 | + * This speeds up processing since Kstem works by successive |
| 55 | + * "transformations" to the input word until a suitable stem is found. |
55 | 56 | */ |
56 | 57 | static public int DEFAULT_CACHE_SIZE = 20000; |
57 | 58 | static private final int MaxWordLen = 100; |
— | — | @@ -203,9 +204,9 @@ |
204 | 205 | } |
205 | 206 | } |
206 | 207 | |
207 | | - private static Hashtable dict_ht = null; |
| 208 | + private static Hashtable<String, DictEntry> dict_ht = null; |
208 | 209 | private int MaxCacheSize; |
209 | | - private Hashtable stem_ht = null; |
| 210 | + private Hashtable<String, String> stem_ht = null; |
210 | 211 | private StringBuffer word; |
211 | 212 | private int j; /* index of final letter in stem (within word) */ |
212 | 213 | private int k; /* INDEX of final letter in word. |
— | — | @@ -214,7 +215,7 @@ |
215 | 216 | wordLength, which returns (k+1). */ |
216 | 217 | |
217 | 218 | private void initializeStemHash() { |
218 | | - stem_ht = new Hashtable(); |
| 219 | + stem_ht = new Hashtable<String, String>(); |
219 | 220 | } |
220 | 221 | |
221 | 222 | private char finalChar() { |
— | — | @@ -249,7 +250,7 @@ |
250 | 251 | if (dict_ht != null) |
251 | 252 | return; |
252 | 253 | |
253 | | - dict_ht = new Hashtable(); |
| 254 | + dict_ht = new Hashtable<String, DictEntry>(); |
254 | 255 | for (int i=0;i<exceptionWords.length;i++) { |
255 | 256 | if (!dict_ht.containsKey(exceptionWords[i])) { |
256 | 257 | entry = new DictEntry(exceptionWords[i],true); |
— | — | @@ -282,110 +283,28 @@ |
283 | 284 | } |
284 | 285 | |
285 | 286 | defaultEntry = new DictEntry(null,false); |
286 | | - |
287 | | - String[] array; |
288 | | - array = KStemData1.data; |
289 | | - |
290 | | - for (int i=0;i<array.length;i++) { |
291 | | - if (!dict_ht.containsKey(array[i])) { |
292 | | - dict_ht.put(array[i],defaultEntry); |
| 287 | + |
| 288 | + appendStems( dict_ht, defaultEntry, KStemData1.data, "4" ); |
| 289 | + appendStems( dict_ht, defaultEntry, KStemData2.data, "4" ); |
| 290 | + appendStems( dict_ht, defaultEntry, KStemData3.data, "4" ); |
| 291 | + appendStems( dict_ht, defaultEntry, KStemData4.data, "4" ); |
| 292 | + appendStems( dict_ht, defaultEntry, KStemData5.data, "4" ); |
| 293 | + appendStems( dict_ht, defaultEntry, KStemData6.data, "4" ); |
| 294 | + appendStems( dict_ht, defaultEntry, KStemData7.data, "4" ); |
| 295 | + appendStems( dict_ht, defaultEntry, KStemData8.data, "4" ); |
| 296 | + appendStems( dict_ht, defaultEntry, supplementDict, "5" ); |
| 297 | + appendStems( dict_ht, defaultEntry, properNouns, "6" ); |
| 298 | + } |
| 299 | + |
| 300 | + private static void appendStems( Hashtable<String, DictEntry> stems, DictEntry defaultEntry, String[] array, String dict ) { |
| 301 | + for (int i=0; i < array.length; i++) { |
| 302 | + if (!stems.containsKey(array[i])) { |
| 303 | + stems.put(array[i],defaultEntry); |
293 | 304 | } else { |
294 | 305 | System.out.println("Warning: Entry ["+array[i]+ |
295 | | - "] already in dictionary 4"); |
| 306 | + "] already in dictionary " + dict); |
296 | 307 | } |
297 | 308 | } |
298 | | - |
299 | | - |
300 | | - array = KStemData2.data; |
301 | | - for (int i=0;i<array.length;i++) { |
302 | | - if (!dict_ht.containsKey(array[i])) { |
303 | | - dict_ht.put(array[i],defaultEntry); |
304 | | - } else { |
305 | | - System.out.println("Warning: Entry ["+array[i]+ |
306 | | - "] already in dictionary 4"); |
307 | | - } |
308 | | - } |
309 | | - |
310 | | - array = KStemData3.data; |
311 | | - for (int i=0;i<array.length;i++) { |
312 | | - if (!dict_ht.containsKey(array[i])) { |
313 | | - dict_ht.put(array[i],defaultEntry); |
314 | | - } else { |
315 | | - System.out.println("Warning: Entry ["+array[i]+ |
316 | | - "] already in dictionary 4"); |
317 | | - } |
318 | | - } |
319 | | - |
320 | | - array = KStemData4.data; |
321 | | - for (int i=0;i<array.length;i++) { |
322 | | - if (!dict_ht.containsKey(array[i])) { |
323 | | - dict_ht.put(array[i],defaultEntry); |
324 | | - } else { |
325 | | - System.out.println("Warning: Entry ["+array[i]+ |
326 | | - "] already in dictionary 4"); |
327 | | - } |
328 | | - } |
329 | | - |
330 | | - |
331 | | - array = KStemData5.data; |
332 | | - for (int i=0;i<array.length;i++) { |
333 | | - if (!dict_ht.containsKey(array[i])) { |
334 | | - dict_ht.put(array[i],defaultEntry); |
335 | | - } else { |
336 | | - System.out.println("Warning: Entry ["+array[i]+ |
337 | | - "] already in dictionary 4"); |
338 | | - } |
339 | | - } |
340 | | - |
341 | | - |
342 | | - array = KStemData6.data; |
343 | | - for (int i=0;i<array.length;i++) { |
344 | | - if (!dict_ht.containsKey(array[i])) { |
345 | | - dict_ht.put(array[i],defaultEntry); |
346 | | - } else { |
347 | | - System.out.println("Warning: Entry ["+array[i]+ |
348 | | - "] already in dictionary 4"); |
349 | | - } |
350 | | - } |
351 | | - |
352 | | - array = KStemData7.data; |
353 | | - for (int i=0;i<array.length;i++) { |
354 | | - if (!dict_ht.containsKey(array[i])) { |
355 | | - dict_ht.put(array[i],defaultEntry); |
356 | | - } else { |
357 | | - System.out.println("Warning: Entry ["+array[i]+ |
358 | | - "] already in dictionary 4"); |
359 | | - } |
360 | | - } |
361 | | - |
362 | | - for (int i=0;i<KStemData8.data.length;i++) { |
363 | | - if (!dict_ht.containsKey(KStemData8.data[i])) { |
364 | | - dict_ht.put(KStemData8.data[i],defaultEntry); |
365 | | - } else { |
366 | | - System.out.println("Warning: Entry ["+KStemData8.data[i]+ |
367 | | - "] already in dictionary 4"); |
368 | | - } |
369 | | - } |
370 | | - |
371 | | - for (int i=0;i<supplementDict.length;i++) { |
372 | | - if (!dict_ht.containsKey(supplementDict[i])) { |
373 | | - dict_ht.put(supplementDict[i],defaultEntry); |
374 | | - } else { |
375 | | - System.out.println("Warning: Entry ["+ |
376 | | - supplementDict[i]+ |
377 | | - "] already in dictionary 5"); |
378 | | - } |
379 | | - } |
380 | | - |
381 | | - for (int i=0;i<properNouns.length;i++) { |
382 | | - if (!dict_ht.containsKey(properNouns[i])) { |
383 | | - dict_ht.put(properNouns[i],defaultEntry); |
384 | | - } else { |
385 | | - System.out.println("Warning: Entry ["+ |
386 | | - properNouns[i]+ |
387 | | - "] already in dictionary 6"); |
388 | | - } |
389 | | - } |
390 | 309 | } |
391 | 310 | |
392 | 311 | private boolean isAlpha(char ch) { |
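For reference, a minimal direct-usage sketch of the stemmer refactored above, built only from calls that appear elsewhere in this diff (the KStemmer(int) constructor and the stem(String) method used by KStemFilter); the input word is illustrative:

    KStemmer stemmer = new KStemmer(KStemmer.DEFAULT_CACHE_SIZE);
    // returns the stemmed form; (word, stem) pairs are cached for later lookups
    String stem = stemmer.stem("transformations");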
Index: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java |
— | — | @@ -45,56 +45,66 @@ |
46 | 46 | |
47 | 47 | import java.io.IOException; |
48 | 48 | |
49 | | -/** Transforms the token stream according to the KStem stemming algorithm. |
50 | | - * For more information about KStem see <a href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf"> |
51 | | - "Viewing Morphology as an Inference Process"</a> |
52 | | - (Krovetz, R., Proceedings of the Sixteenth Annual International ACM SIGIR |
53 | | - Conference on Research and Development in Information Retrieval, 191-203, 1993). |
54 | | - |
55 | | - Note: the input to the stemming filter must already be in lower case, |
56 | | - so you will need to use LowerCaseFilter or LowerCaseTokenizer farther |
57 | | - down the Tokenizer chain in order for this to work properly! |
58 | | - <P> |
59 | | - To use this filter with other analyzers, you'll want to write an |
60 | | - Analyzer class that sets up the TokenStream chain as you want it. |
61 | | - To use this with LowerCaseTokenizer, for example, you'd write an |
62 | | - analyzer like this: |
63 | | - <P> |
64 | | - <PRE> |
65 | | - class MyAnalyzer extends Analyzer { |
66 | | - public final TokenStream tokenStream(String fieldName, Reader reader) { |
67 | | - return new KStemStemFilter(new LowerCaseTokenizer(reader)); |
68 | | - } |
69 | | - } |
70 | | - </PRE> |
71 | | - |
| 49 | +/** |
| 50 | + * Transforms the token stream according to the KStem stemming algorithm. For |
| 51 | + * more information about KStem see <a |
| 52 | + * href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf"> |
| 53 | + * "Viewing Morphology as an Inference Process"</a> (Krovetz, R., Proceedings of |
| 54 | + * the Sixteenth Annual International ACM SIGIR Conference on Research and |
| 55 | + * Development in Information Retrieval, 191-203, 1993). |
| 56 | + * |
| 57 | + * Note: the input to the stemming filter must already be in lower case, so you |
| 58 | + * will need to use LowerCaseFilter or LowerCaseTokenizer farther down the |
| 59 | + * Tokenizer chain in order for this to work properly! |
| 60 | + * <P> |
| 61 | + * To use this filter with other analyzers, you'll want to write an Analyzer |
| 62 | + * class that sets up the TokenStream chain as you want it. To use this with |
| 63 | + * LowerCaseTokenizer, for example, you'd write an analyzer like this: |
| 64 | + * <P> |
| 65 | + * |
| 66 | + * <PRE> |
| 67 | + * class MyAnalyzer extends Analyzer { |
| 68 | + * public final TokenStream tokenStream(String fieldName, Reader reader) { |
| 69 | + * return new KStemFilter(new LowerCaseTokenizer(reader));
| 70 | + * } |
| 71 | + * } |
| 72 | + * </PRE> |
72 | 73 | */ |
73 | 74 | |
74 | 75 | public final class KStemFilter extends TokenFilter { |
75 | 76 | private KStemmer stemmer; |
76 | 77 | |
77 | | - /** Create a KStemmer with the given cache size. |
78 | | - * @param in The TokenStream whose output will be the input to KStemFilter. |
79 | | - * @param cacheSize Maximum number of entries to store in the |
80 | | - * Stemmer's cache (stems stored in this cache do not need to be |
81 | | - * recomputed, speeding up the stemming process). |
| 78 | + /** |
| 79 | + * Create a KStemmer with the given cache size. |
| 80 | + * |
| 81 | + * @param in |
| 82 | + * The TokenStream whose output will be the input to KStemFilter. |
| 83 | + * @param cacheSize |
| 84 | + * Maximum number of entries to store in the Stemmer's cache |
| 85 | + * (stems stored in this cache do not need to be recomputed, |
| 86 | + * speeding up the stemming process). |
82 | 87 | */ |
83 | 88 | public KStemFilter(TokenStream in, int cacheSize) { |
84 | 89 | super(in); |
85 | 90 | stemmer = new KStemmer(cacheSize); |
86 | 91 | } |
87 | 92 | |
88 | | - /** Create a KStemmer with the default cache size of 20 000 entries. |
89 | | - * @param in The TokenStream whose output will be the input to KStemFilter. |
| 93 | + /** |
| 94 | + * Create a KStemmer with the default cache size of 20 000 entries. |
| 95 | + * |
| 96 | + * @param in |
| 97 | + * The TokenStream whose output will be the input to KStemFilter. |
90 | 98 | */ |
91 | 99 | public KStemFilter(TokenStream in) { |
92 | 100 | super(in); |
93 | 101 | stemmer = new KStemmer(); |
94 | 102 | } |
95 | 103 | |
96 | | - /** Returns the next, stemmed, input Token. |
97 | | - * @return The stemed form of a token. |
98 | | - * @throws IOException |
| 104 | + /** |
| 105 | + * Returns the next, stemmed, input Token. |
| 106 | + * |
| 107 | + * @return The stemmed form of a token. |
| 108 | + * @throws IOException |
99 | 109 | */ |
100 | 110 | public final Token next() throws IOException { |
101 | 111 | Token token = input.next(); |
— | — | @@ -103,7 +113,8 @@ |
104 | 114 | else { |
105 | 115 | String s = stemmer.stem(token.termText()); |
106 | 116 | if (!s.equals(token.termText())) |
107 | | - return new Token(s, token.startOffset, token.endOffset, token.type); |
| 117 | + return new Token(s, token.startOffset, token.endOffset, |
| 118 | + token.type); |
108 | 119 | return token; |
109 | 120 | } |
110 | 121 | } |
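A minimal analyzer sketch wiring up the cache-size constructor documented above; the class name and the 50000 cache size are illustrative assumptions, not taken from this change:

    class CachedKStemAnalyzer extends Analyzer {
        public final TokenStream tokenStream(String fieldName, Reader reader) {
            // use a larger stem cache than the 20000-entry default
            return new KStemFilter(new LowerCaseTokenizer(reader), 50000);
        }
    }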
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java |
— | — | @@ -4,18 +4,17 @@ |
5 | 5 | import java.util.Collections; |
6 | 6 | import java.util.Comparator; |
7 | 7 | import java.util.Hashtable; |
8 | | -import java.util.List; |
9 | 8 | import java.util.Map.Entry; |
10 | 9 | |
11 | 10 | import org.apache.log4j.Logger; |
12 | 11 | |
13 | 12 | public class HttpMonitor extends Thread { |
14 | 13 | static Logger log = Logger.getLogger(HttpMonitor.class); |
15 | | - protected static HttpMonitor instance=null; |
| 14 | + protected static HttpMonitor instance; |
16 | 15 | /** times when http request have been started */ |
17 | 16 | protected Hashtable<HttpHandler,Long> startTimes = new Hashtable<HttpHandler,Long>(); |
18 | 17 | |
19 | | - /** threshold for reporting 10s */ |
| 18 | + /** threshold in milliseconds for reporting */ |
20 | 19 | protected long threshold = 10000; |
21 | 20 | |
22 | 21 | private HttpMonitor(){} |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java |
— | — | @@ -436,7 +436,7 @@ |
437 | 437 | log.error("Error sending result line ("+score + " " + namespace + " " + title +"): "+e.getMessage(),e); |
438 | 438 | } |
439 | 439 | } |
440 | | - |
| 440 | + /** Unused? */ |
441 | 441 | private void sendResultLine(String namespace, String title) { |
442 | 442 | try{ |
443 | 443 | sendOutputLine(namespace + " " + encodeTitle(title)); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java |
— | — | @@ -72,7 +72,7 @@ |
73 | 73 | if (max != null) |
74 | 74 | maxThreads = Integer.parseInt(max); |
75 | 75 | |
76 | | - // Initialise statistics |
| 76 | + // Initialize statistics |
77 | 77 | stats = new Statistics(1000, statsPeriod); |
78 | 78 | if (config.getBoolean("Ganglia", "report")) { |
79 | 79 | log.info("Starting ganglia statistics thread..."); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java |
— | — | @@ -18,7 +18,7 @@ |
19 | 19 | |
20 | 20 | /** |
21 | 21 | * Simple HTTP 1.1 handler, used for Index and Search daemons |
22 | | - * for more info on protocole see handle() method |
| 22 | + * for more info about the protocol, see the handle() method
23 | 23 | * |
24 | 24 | * @author Brion Vibber |
25 | 25 | * |
— | — | @@ -136,7 +136,7 @@ |
137 | 137 | * URL path format: /operation/database/searchterm |
138 | 138 | * The path should be URL-encoded UTF-8 (standard IRI). |
139 | 139 | * |
140 | | - * Additional paramters may be specified in a query string: |
| 140 | + * Additional parameters may be specified in a query string: |
141 | 141 | * namespaces: comma-separated list of namespace numeric keys to subset results |
142 | 142 | * limit: maximum number of results to return |
143 | 143 | * offset: number of matches to skip before returning results |
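For illustration, a request following the URL path and query-string format documented above might look like the line below; the operation name, database, search term, and parameter values are hypothetical, not taken from this diff:

    GET /search/enwiki/cloud%20computing?namespaces=0,14&limit=20&offset=0 HTTP/1.1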
— | — | @@ -271,7 +271,7 @@ |
272 | 272 | return null; |
273 | 273 | } |
274 | 274 | |
275 | | - /** This method is to be used for header reads only (which is utf-8 free!) */ |
| 275 | + /** This method is to be used for header reads only (which is UTF-8 free!) */ |
276 | 276 | @SuppressWarnings("deprecation") |
277 | 277 | protected String readInputLine() { |
278 | 278 | String sin=""; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -83,7 +83,7 @@ |
84 | 84 | |
85 | 85 | /** Type of index, enumeration */ |
86 | 86 | protected IndexType type; |
87 | | - /** Part number in split repestnation, e.g. 1..N */ |
| 87 | + /** Part number in split representation, e.g. 1..N */ |
88 | 88 | protected int partNum; |
89 | 89 | |
90 | 90 | /** Namespace -> part (for nssplit indexes) */ |
— | — | @@ -137,10 +137,10 @@ |
138 | 138 | /** Namespaces that are searched by default */ |
139 | 139 | protected NamespaceFilter defaultNs = null; |
140 | 140 | |
141 | | - /** filter set to true for namespaces with subpages */ |
| 141 | + /** Filter set to true for namespaces with subpages */ |
142 | 142 | protected NamespaceFilter nsWithSubpages = null; |
143 | 143 | |
144 | | - /** namespaces with content (from initialise settings) */ |
| 144 | + /** Namespaces with content (from initialise settings) */ |
145 | 145 | protected NamespaceFilter contentNamespaces = null; |
146 | 146 | |
147 | 147 | /** If we should be using additional global rank for scores */ |
— | — | @@ -683,7 +683,6 @@ |
684 | 684 | /** |
685 | 685 | * Get all indexes parts for this iid except for logical names. |
686 | 686 | * I.e. for db of kind mainsplit, it will return db.mainpart, db.restpart |
687 | | - * @return |
688 | 687 | */ |
689 | 688 | public HashSet<String> getPhysicalIndexes() { |
690 | 689 | HashSet<String> ret = new HashSet<String>(); |
— | — | @@ -712,8 +711,6 @@ |
713 | 712 | |
714 | 713 | /** |
715 | 714 | * Wrapper for getPhysicalIndexes to get iid objects |
716 | | - * |
717 | | - * @return |
718 | 715 | */ |
719 | 716 | public ArrayList<IndexId> getPhysicalIndexIds(){ |
720 | 717 | HashSet<String> physical = getPhysicalIndexes(); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -119,13 +119,13 @@ |
120 | 120 | |
121 | 121 | protected static GlobalConfiguration instance = null; |
122 | 122 | |
123 | | - /** All the lang codes we encountered, used for "smart interwiki" */ |
| 123 | + /** All the language codes we encountered, used for "smart interwiki" */ |
124 | 124 | protected HashSet<String> smartInterwikiCodes = new HashSet<String>(); |
125 | 125 | protected boolean useSmartInterwiki = false; |
126 | 126 | protected int maxSearchLimit = 1000; |
127 | 127 | protected int maxSearchOffset = 1000000; |
128 | 128 | |
129 | | - /** Wether to report warnings and info */ |
| 129 | + /** Whether to report warnings and info */ |
130 | 130 | protected static boolean verbose = true; |
131 | 131 | |
132 | 132 | /** Sections in lsearch-config.conf */ |
— | — | @@ -145,14 +145,12 @@ |
146 | 146 | } |
147 | 147 | |
148 | 148 | protected GlobalConfiguration(){ |
149 | | - // try to determin this hosts IP address |
| 149 | + // try to determine this hosts IP address |
150 | 150 | determineInetAddress(); |
151 | 151 | } |
152 | 152 | |
153 | 153 | /** |
154 | 154 | * Get singleton instance of this class |
155 | | - * |
156 | | - * @return |
157 | 155 | */ |
158 | 156 | synchronized public static GlobalConfiguration getInstance() { |
159 | 157 | if (instance == null) |
— | — | @@ -382,7 +380,7 @@ |
383 | 381 | } |
384 | 382 | |
385 | 383 | /** |
386 | | - * Reads a config file from a bufferedreader, will |
| 384 | + * Reads a config file from a BufferedReader, will |
387 | 385 | * close the reader when done. |
388 | 386 | * |
389 | 387 | * @param in opened reader |
— | — | @@ -423,7 +421,7 @@ |
424 | 422 | prop.append("\n"); |
425 | 423 | } |
426 | 424 | globalProperties.load(new ByteArrayInputStream(prop.toString().getBytes("utf-8"))); |
427 | | - // get some predifined global properties |
| 425 | + // get some predefined global properties |
428 | 426 | this.databaseSuffixes = getArrayProperty("Database.suffix"); |
429 | 427 | this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix"); |
430 | 428 | this.exactCaseSuffix = getArrayProperty("ExactCase.suffix"); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -69,8 +69,19 @@ |
70 | 70 | public class SearchEngine { |
71 | 71 | static org.apache.log4j.Logger log = Logger.getLogger(SearchEngine.class); |
72 | 72 | |
| 73 | + /** |
| 74 | + * Maximum number of search results at once. |
| 75 | + */ |
73 | 76 | protected static int maxlimit = 1000; |
| 77 | + |
| 78 | + /** |
| 79 | + * Largest search result offset. |
| 80 | + */ |
74 | 81 | protected static int maxoffset = 100000; |
| 82 | + |
| 83 | + /** |
| 84 | + * Maximum number of search results for prefix query. |
| 85 | + */ |
75 | 86 | protected final int MAXPREFIX = 50; |
76 | 87 | protected static GlobalConfiguration global = null; |
77 | 88 | protected static Configuration config = null; |
— | — | @@ -518,7 +529,7 @@ |
519 | 530 | return res; |
520 | 531 | } |
521 | 532 | |
522 | | - /** Strip key using PrefixIndexBuilder stip function */ |
| 533 | + /** Strip key using PrefixIndexBuilder strip function */ |
523 | 534 | private String stripKey(String key){ |
524 | 535 | return PrefixIndexBuilder.stripKey(key); |
525 | 536 | } |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java |
— | — | @@ -32,7 +32,7 @@ |
33 | 33 | import org.apache.lucene.analysis.TokenStream; |
34 | 34 | import org.apache.lucene.analysis.TokenFilter; |
35 | 35 | |
36 | | -/** Stem filter for esperanto */ |
| 36 | +/** Stem filter for Esperanto */ |
37 | 37 | public class EsperantoStemFilter extends TokenFilter { |
38 | 38 | public EsperantoStemFilter(TokenStream tokenizer) { |
39 | 39 | super(tokenizer); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -18,10 +18,10 @@ |
19 | 19 | |
20 | 20 | /** |
21 | 21 | * Wiki Tokenizer. Tokens are words and numbers. All letters are |
22 | | - * lowercased and diacritics deleted using unicode compatibility |
| 22 | + * lowercased and diacritics deleted using Unicode compatibility |
23 | 23 | * decomposition (i.e. č -> c). Parses some basic wiki syntax, |
24 | 24 | * template names are skipped, from images captions are extracted, |
25 | | - * categories and interwiki links are extracted ... |
| 25 | + * categories and interwiki links are extracted... |
26 | 26 | * |
27 | 27 | * Tokenizer will not take a Reader as input, but a String (for |
28 | 28 | * optimal performance) |
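A short usage sketch matching how this tokenizer is invoked in KeywordsAnalyzer later in this diff; the variable names and the exactCase value are illustrative:

    TokenizerOptions options = new TokenizerOptions(false);
    ArrayList<Token> tokens = new FastWikiTokenizerEngine(wikiText, iid, options).parse();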
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | * This function is called at word boundaries, it is used to |
174 | 174 | * make a new token and add it to token stream |
175 | 175 | * |
176 | | - * Does unicode decomposition, and will make alias token with |
| 176 | + * Does Unicode decomposition, and will make alias token with |
177 | 177 | * alternative transliterations (e.g. ö -> oe) |
178 | 178 | */ |
179 | 179 | private final void addToken(){ |
— | — | @@ -203,7 +203,7 @@ |
204 | 204 | boolean addDecomposed = false; |
205 | 205 | boolean allUpperCase = true; |
206 | 206 | boolean titleCase = true; |
207 | | - boolean split = false; // if more tokens shold be produced, e.g. joe's -> joe + s |
| 207 | + boolean split = false; // if more tokens should be produced, e.g. joe's -> joe + s |
208 | 208 | for(int i=0;i<length;i++){ |
209 | 209 | if(decomposer.isCombiningChar(buffer[i])){ |
210 | 210 | addDecomposed = true; |
— | — | @@ -328,7 +328,7 @@ |
329 | 329 | else if(titleCase) |
330 | 330 | exact.setType("titlecase"); |
331 | 331 | } |
332 | | - // detect hyphenation (takes presedence over case detection) |
| 332 | + // detect hyphenation (takes precedence over case detection) |
333 | 333 | if(cur+1<textLength && text[cur]=='-' && (Character.isLetterOrDigit(text[cur+1]) || decomposer.isCombiningChar(text[cur+1]))) |
334 | 334 | exact.setType("with_hyphen"); |
335 | 335 | |
— | — | @@ -347,14 +347,14 @@ |
348 | 348 | if(decompLength!=0 && addDecomposed){ |
349 | 349 | Token t = makeToken(new String(decompBuffer, 0, decompLength), start, start + length, false); |
350 | 350 | t.setPositionIncrement(0); |
351 | | - t.setType(exact.type()); |
| 351 | + t.setType(exact.type() + "-decomposed"); |
352 | 352 | addToTokens(t); |
353 | 353 | } |
354 | 354 | // add alias (if any) token to stream |
355 | 355 | if(aliasLength>0){ |
356 | 356 | Token t = makeToken(new String(aliasBuffer, 0, aliasLength), start, start + length, false); |
357 | 357 | t.setPositionIncrement(0); |
358 | | - t.setType(exact.type()); |
| 358 | + t.setType(exact.type() + "-aliased"); |
359 | 359 | addToTokens(t); |
360 | 360 | } |
361 | 361 | } |
— | — | @@ -796,7 +796,7 @@ |
797 | 797 | if(lc == '\n' || lc =='\r') |
798 | 798 | break; |
799 | 799 | } |
800 | | - int start=0, end=0; // number of ='s at begining and end of line |
| 800 | + int start=0, end=0; // number of ='s at beginning and end of line |
801 | 801 | // find first sequence of = |
802 | 802 | for(lookup = cur ; lookup < textLength && lookup < endOfLine ; lookup++ ){ |
803 | 803 | if(text[lookup] == '=') |
— | — | @@ -804,7 +804,7 @@ |
805 | 805 | else |
806 | 806 | break; |
807 | 807 | } |
808 | | - // find the last squence of = |
| 808 | + // find the last sequence of = |
809 | 809 | for(lookup = endOfLine-1 ; lookup > cur ; lookup-- ){ |
810 | 810 | if(text[lookup] == '=') |
811 | 811 | end++; |
— | — | @@ -843,6 +843,7 @@ |
844 | 844 | } |
845 | 845 | return true; |
846 | 846 | } |
| 847 | + |
847 | 848 | /** Check if it's a reference tag starting at cur */ |
848 | 849 | protected boolean checkRefStart(){ |
849 | 850 | if(matchesString("<ref")){ |
— | — | @@ -894,7 +895,7 @@ |
895 | 896 | return tokens; |
896 | 897 | } |
897 | 898 | |
898 | | - // strip comments so we don't neded to complicate syntax parsing even more |
| 899 | + // strip comments so we don't need to complicate syntax parsing even more |
899 | 900 | stripComments(); |
900 | 901 | |
901 | 902 | // start parsing |
— | — | @@ -974,7 +975,7 @@ |
975 | 976 | } |
976 | 977 | } |
977 | 978 | } else if(cur > 0 && text[cur-1]=='\n' && text[cur+1] == '-'){ |
978 | | - // explicitely put '-' into the glue buffer |
| 979 | + // Explicitly put '-' into the glue buffer |
979 | 980 | if(options.highlightParsing){ |
980 | 981 | if(glueLength == 0) |
981 | 982 | glueStart = cur+1; |
— | — | @@ -1276,7 +1277,7 @@ |
1277 | 1278 | continue; |
1278 | 1279 | case LINK_FETCH: |
1279 | 1280 | if(length == 0 && c ==' ') |
1280 | | - continue; // ignore leading whitespaces |
| 1281 | + continue; // ignore leading whitespace |
1281 | 1282 | if(c == ']'){ |
1282 | 1283 | state = ParserState.LINK_END; |
1283 | 1284 | continue; |
— | — | @@ -1333,7 +1334,7 @@ |
1334 | 1335 | cur = fetchStart; |
1335 | 1336 | state = ParserState.CATEGORY_WORDS; |
1336 | 1337 | } else |
1337 | | - System.err.print("ERROR: Inconsistent parser state, attepmted category backtrace for uninitalized fetchStart."); |
| 1338 | + System.err.print("ERROR: Inconsistent parser state, attempted category backtrace for uninitialized fetchStart.");
1338 | 1339 | fetchStart = -1; |
1339 | 1340 | continue; |
1340 | 1341 | case INTERWIKI: |
— | — | @@ -1375,7 +1376,7 @@ |
1376 | 1377 | continue; |
1377 | 1378 | case TABLE_BEGIN: |
1378 | 1379 | tableLevel++; |
1379 | | - // ignore everything up to the newspace, since they are table display params |
| 1380 | + // ignore everything up to the newline, since they are table display params |
1380 | 1381 | while(cur < textLength && (text[cur]!='\r' && text[cur]!='\n')) |
1381 | 1382 | cur++; |
1382 | 1383 | state = ParserState.WORD; |
— | — | @@ -1422,7 +1423,7 @@ |
1423 | 1424 | flushGlue(); |
1424 | 1425 | if(nonContentTokens.size() != 0){ |
1425 | 1426 | boolean first = true; |
1426 | | - // flush any remaning tokens from initial templates, etc.. |
| 1427 | + // flush any remaining tokens from initial templates, etc.. |
1427 | 1428 | for(Token tt : nonContentTokens){ |
1428 | 1429 | if(first){ |
1429 | 1430 | tt.setPositionIncrement(FIRST_SECTION_GAP); |
— | — | @@ -1595,7 +1596,11 @@ |
1596 | 1597 | return new String(buf,0,len).trim(); |
1597 | 1598 | } |
1598 | 1599 | |
1599 | | - /** Delete all vowels from a word or phrase */ |
| 1600 | + /** |
| 1601 | + * Delete all vowels from a word or phrase |
| 1602 | + * |
| 1603 | + * Unused (except test)? |
| 1604 | + */ |
1600 | 1605 | public static String deleteVowels(String title){ |
1601 | 1606 | char[] buf = new char[256]; |
1602 | 1607 | |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java |
— | — | @@ -8,7 +8,7 @@ |
9 | 9 | import org.apache.lucene.analysis.TokenStream; |
10 | 10 | |
11 | 11 | /** |
12 | | - * Add english singular forms of words as aliases of |
| 12 | + * Add English singular forms of words as aliases of |
13 | 13 | * type "singular" |
14 | 14 | * |
15 | 15 | * @author rainman |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | public class EnglishSingularFilter extends TokenFilter{ |
19 | 19 | Singular singular = new EnglishKStemSingular(); |
20 | 20 | |
21 | | - Token next = null, next2=null; |
| 21 | + Token next = null, next2 = null;
22 | 22 | public EnglishSingularFilter(TokenStream input) { |
23 | 23 | super(input); |
24 | 24 | } |
— | — | @@ -53,7 +53,7 @@ |
54 | 54 | return t; |
55 | 55 | } |
56 | 56 | |
57 | | - /** Return token with sigular form of the noun, or null if none found */ |
| 57 | + /** Return token with singular form of the noun, or null if none found */ |
58 | 58 | protected final Token singular(Token t){ |
59 | 59 | String w = singular.getSingular(t.termText()); |
60 | 60 | if(w != null){ |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java |
— | — | @@ -90,7 +90,7 @@ |
91 | 91 | return ret; |
92 | 92 | } |
93 | 93 | |
94 | | - /** Get a brand new hash set of predifined stop words (i.e. not those generated from lucene indexes) */ |
| 94 | + /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */ |
95 | 95 | public static HashSet<String> getPredefinedSet(String langCode){ |
96 | 96 | loadPredefined(); |
97 | 97 | HashSet<String> ret = new HashSet<String>(); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -67,7 +67,7 @@ |
68 | 68 | for(int i=0;i<levels;i++) |
69 | 69 | keywordsBySize.add(new ArrayList<String>()); |
70 | 70 | TokenizerOptions options = new TokenizerOptions(exactCase); |
71 | | - // arange keywords into a list by token number |
| 71 | + // arrange keywords into a list by token number |
72 | 72 | for(String k : keywords){ |
73 | 73 | ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,options).parse(); |
74 | 74 | if(parsed.size() == 0) |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java |
— | — | @@ -6,12 +6,11 @@ |
7 | 7 | import java.util.Iterator; |
8 | 8 | |
9 | 9 | import org.apache.lucene.analysis.Analyzer; |
10 | | -import org.apache.lucene.analysis.LowerCaseFilter; |
11 | 10 | import org.apache.lucene.analysis.Token; |
12 | 11 | import org.apache.lucene.analysis.TokenStream; |
13 | 12 | |
14 | 13 | /** Produces a token stream for category field in the lucene index. |
15 | | - * Each token is a single category (category names themself are |
| 14 | + * Each token is a single category (category names themselves are |
16 | 15 | * not tokenized) */ |
17 | 16 | public class CategoryAnalyzer extends Analyzer { |
18 | 17 | public class ArrayTokenStream extends TokenStream { |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java |
— | — | @@ -7,7 +7,7 @@ |
8 | 8 | import org.apache.lucene.analysis.Token; |
9 | 9 | import org.apache.lucene.analysis.TokenStream; |
10 | 10 | /** |
11 | | - * Analyzer that just lowecases the text, doesn't split up anything, etc.. |
| 11 | + * Analyzer that just lowercases the text, doesn't split up anything, etc.. |
12 | 12 | * |
13 | 13 | * @author rainman |
14 | 14 | * |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java |
— | — | @@ -1,12 +1,9 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | | -import java.lang.reflect.Constructor; |
6 | | -import java.lang.reflect.InvocationTargetException; |
7 | 5 | |
8 | 6 | import org.apache.log4j.Logger; |
9 | 7 | import org.apache.lucene.analysis.Token; |
10 | | -import org.apache.lucene.analysis.TokenFilter; |
11 | 8 | import org.apache.lucene.analysis.TokenStream; |
12 | 9 | |
13 | 10 | /** |
— | — | @@ -30,7 +27,6 @@ |
31 | 28 | * 2) stemmers should never change tokens, if the text needs to be |
32 | 29 | * changed, return a new Token object |
33 | 30 | * |
34 | | - * @param language |
35 | 31 | */ |
36 | 32 | public AliasFilter(FilterFactory filters, TokenStream input, TokenStream duplicate){ |
37 | 33 | this.input = input; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java |
— | — | @@ -19,7 +19,7 @@ |
20 | 20 | |
21 | 21 | @Override |
22 | 22 | public Token next() throws IOException { |
23 | | - // return buferred |
| 23 | + // return buffered |
24 | 24 | if(inx < buffer.size()) |
25 | 25 | return buffer.get(inx++); |
26 | 26 | |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java |
— | — | @@ -3,7 +3,7 @@ |
4 | 4 | import org.apache.lucene.analysis.KStemmer; |
5 | 5 | |
6 | 6 | /** |
7 | | - * KStem-based singular-finding class for english |
| 7 | + * KStem-based singular-finding class for English |
8 | 8 | * |
9 | 9 | * @author rainman |
10 | 10 | * |
— | — | @@ -15,7 +15,7 @@ |
16 | 16 | if(!word.equals(ret)) |
17 | 17 | return ret; |
18 | 18 | else{ |
19 | | - // strip possesive |
| 19 | + // strip possessive suffix |
20 | 20 | if(word.endsWith("'s")) |
21 | 21 | return word.substring(0,word.length()-2); |
22 | 22 | return null; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | |
60 | 60 | } |
61 | 61 | /** |
62 | | - * Serialize alttitle for highlighting, serializies titles, redirects, sections. |
| 62 | + * Serialize alttitle for highlighting, serializes titles, redirects, sections. |
63 | 63 | * Writes original names + highlight tokens. |
64 | 64 | * |
65 | 65 | * @param article |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java |
— | — | @@ -7,8 +7,8 @@ |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
9 | 9 | |
10 | 10 | /** |
11 | | - * Vietnamese standard transliterations to ascii. Most of the stuff is done by unicode decomposed, |
12 | | - * we just additionaly convert Đ/đ -> D/d |
| 11 | + * Vietnamese standard transliterations to ASCII. Most of the work is done by Unicode decomposition.
| 12 | + * The only additional conversion here is: Đ/đ -> D/d
13 | 13 | * |
14 | 14 | * @author rainman |
15 | 15 | * |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -1,28 +1,12 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
4 | | -import java.util.ArrayList; |
5 | | -import java.util.HashMap; |
6 | 4 | import java.util.HashSet; |
7 | 5 | |
8 | 6 | import org.apache.log4j.Logger; |
9 | 7 | import org.apache.lucene.analysis.Analyzer; |
10 | 8 | import org.apache.lucene.analysis.PerFieldAnalyzerWrapper; |
11 | | -import org.apache.lucene.analysis.PorterStemFilter; |
12 | | -import org.apache.lucene.analysis.SimpleAnalyzer; |
13 | | -import org.apache.lucene.analysis.de.GermanStemFilter; |
14 | | -import org.apache.lucene.analysis.fr.FrenchStemFilter; |
15 | | -import org.apache.lucene.analysis.nl.DutchStemFilter; |
16 | | -import org.apache.lucene.analysis.ru.RussianStemFilter; |
17 | | -import org.apache.lucene.analysis.th.ThaiWordFilter; |
18 | | -import org.apache.lucene.search.FieldSortedHitQueue; |
19 | | -import org.wikimedia.lsearch.analyzers.FieldBuilder.BuilderSet; |
20 | | -import org.wikimedia.lsearch.beans.Article; |
21 | | -import org.wikimedia.lsearch.beans.Title; |
22 | 9 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
23 | 10 | import org.wikimedia.lsearch.config.IndexId; |
24 | | -import org.wikimedia.lsearch.index.WikiIndexModifier; |
25 | | -import org.wikimedia.lsearch.ranks.Links; |
26 | | -import org.wikimedia.lsearch.related.RelatedTitle; |
27 | 11 | |
28 | 12 | /** |
29 | 13 | * Global functions related to creation/usage of analyzers. |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java |
— | — | @@ -72,8 +72,8 @@ |
73 | 73 | (c >= 0x3300 && c <= 0x337f) || |
74 | 74 | (c >= 0x3400 && c <= 0x3d2d) || |
75 | 75 | (c >= 0x4e00 && c <= 0x9fff) || |
76 | | - (c >= 0xf900 && c <= 0xfaff) || |
77 | | - (c >= 0xac00 && c <= 0xd7af); |
| 76 | + (c >= 0xf900 && c <= 0xfaff) || |
| 77 | + (c >= 0xac00 && c <= 0xd7af); |
78 | 78 | } |
79 | 79 | |
80 | 80 | } |
\ No newline at end of file |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java |
— | — | @@ -6,6 +6,9 @@ |
7 | 7 | import org.apache.lucene.analysis.TokenFilter; |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
9 | 9 | |
| 10 | +/** |
| 11 | + * Removes dots from acronyms? |
| 12 | + */ |
10 | 13 | public class AcronymFilter extends TokenFilter { |
11 | 14 | Token buffered = null; |
12 | 15 | |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java |
— | — | @@ -38,7 +38,7 @@ |
39 | 39 | protected Token phrase1 = null, phrase2 = null; |
40 | 40 | protected boolean phraseReady = false; |
41 | 41 | protected String gap = "_"; |
42 | | - /** pairs of words, two adjecent words */ |
| 42 | + /** pairs of words, two adjacent words */ |
43 | 43 | protected Token pair1 = null, pair2 = null; |
44 | 44 | protected boolean pairReady = false; |
45 | 45 | protected Token nextToken = null; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java |
— | — | @@ -3,18 +3,18 @@ |
4 | 4 | import org.wikimedia.lsearch.config.IndexId; |
5 | 5 | |
6 | 6 | /** |
7 | | - * Agregate class for FilterFactory and FieldNameFactory. This class |
8 | | - * contains methods used to build various fields of the index, |
9 | | - * it contains field names to be used, filter that are to be applied... |
| 7 | + * Aggregate class for FilterFactory and FieldNameFactory. This class contains |
| 8 | + * methods used to build various fields of the index; it contains the field names to
| 9 | + * be used, the filters that are to be applied...
10 | 10 | * |
11 | 11 | * @author rainman |
12 | | - * |
| 12 | + * |
13 | 13 | */ |
14 | 14 | public class FieldBuilder { |
15 | 15 | public class BuilderSet{ |
16 | 16 | FilterFactory filters; |
17 | 17 | FieldNameFactory fields; |
18 | | - boolean addKeywords; // wether to add keywords from beginning of article |
| 18 | + boolean addKeywords; // whether to add keywords from beginning of article |
19 | 19 | |
20 | 20 | public BuilderSet(FilterFactory filters, FieldNameFactory fields) { |
21 | 21 | this.filters = filters; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java |
— | — | @@ -1,9 +1,11 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
| 4 | +import java.util.Arrays; |
4 | 5 | import java.util.HashMap; |
| 6 | +import java.util.HashSet; |
5 | 7 | |
6 | 8 | /** |
7 | | - * Porter-based singular filter for english |
| 9 | + * Porter-based singular filter for English |
8 | 10 | * |
9 | 11 | * @author rainman |
10 | 12 | * |
— | — | @@ -18,10 +20,11 @@ |
19 | 21 | if(w.length() <= 3 || w.charAt(w.length()-1) != 's') |
20 | 22 | return null; |
21 | 23 | // exceptions (from porter2) |
22 | | - if("news".equals(w) || "atlas".equals(w) || "cosmos".equals(w) |
23 | | - || "bias".equals(w) || "andes".equals(w) || "aries".equals(w)) |
| 24 | + String[] exceptions = { "news", "atlas", "cosmos", "bias", "andes", "aries" }; |
| 25 | + HashSet<String> set = new HashSet<String>(Arrays.asList(exceptions)); |
| 26 | + if( set.contains(w) ) |
24 | 27 | return null; |
25 | | - // don't strip posssesive form |
| 28 | + // don't strip possessive form |
26 | 29 | if(w.endsWith("'s")){ |
27 | 30 | //if(w.length() > 2) |
28 | 31 | // return w.substring(0,w.length()-2); |