r82929 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r82929 (previous: r82928, next: r82930)
Date: 10:11, 28 February 2011
Author: nikerabbit
Status: deferred
Tags:
Comment:
Cleanups to spelling, comments, imports and code duplication
Modified paths:
  • /trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java (modified) (history)
  • /trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java (modified) (history)
  • /trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java (modified) (history)

Diff

Index: trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java
@@ -1,16 +1,11 @@
22 package org.wikimedia.lsearch.util;
33
4 -import java.net.URL;
5 -
64 import org.wikimedia.lsearch.config.Configuration;
75 import org.wikimedia.lsearch.config.IndexId;
86 import org.wikimedia.lsearch.util.Localization;
97
108 public class LocalizationTest {
119
12 - /**
13 - * @param args
14 - */
1510 public static void main(String[] args) {
1611 Configuration.open();
1712 String text = "#redirect [[mw]]";
Index: trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java
@@ -21,7 +21,7 @@
2222 *
2323 * <p>This class tries to handle <code>null</code> input gracefully.
2424 * An exception will not be thrown for a <code>null</code> input.
25 - * Each method documents its behaviour in more detail.</p>
 25+ * Each method documents its behavior in more detail.</p>
2626 *
2727 * @author Apache Jakarta Velocity
2828 * @author Stephen Colebourne
Index: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java
@@ -44,13 +44,14 @@
4545 import java.io.*;
4646
4747 /**
48 - This class implements the Kstem algorithm
 48+ * This class implements the Kstem algorithm
4949 */
5050 public class KStemmer {
51 - /** Default size of the cache that stores <code>(word,stem)</code> pairs.
52 - <p>This speeds up processing since Kstem works by
53 - sucessive "transformations" to the input word until a
54 - suitable stem is found.
 51+ /**
 52+ * Default size of the cache that stores <code>(word,stem)</code> pairs.
 53+ *
 54+ * This speeds up processing since Kstem works by successive
 55+ * "transformations" to the input word until a suitable stem is found.
5556 */
5657 static public int DEFAULT_CACHE_SIZE = 20000;
5758 static private final int MaxWordLen = 100;
@@ -203,9 +204,9 @@
204205 }
205206 }
206207
207 - private static Hashtable dict_ht = null;
 208+ private static Hashtable<String, DictEntry> dict_ht = null;
208209 private int MaxCacheSize;
209 - private Hashtable stem_ht = null;
 210+ private Hashtable<String, String> stem_ht = null;
210211 private StringBuffer word;
211212 private int j; /* index of final letter in stem (within word) */
212213 private int k; /* INDEX of final letter in word.
@@ -214,7 +215,7 @@
215216 wordLength, which returns (k+1). */
216217
217218 private void initializeStemHash() {
218 - stem_ht = new Hashtable();
 219+ stem_ht = new Hashtable<String, String>();
219220 }
220221
221222 private char finalChar() {
@@ -249,7 +250,7 @@
250251 if (dict_ht != null)
251252 return;
252253
253 - dict_ht = new Hashtable();
 254+ dict_ht = new Hashtable<String, DictEntry>();
254255 for (int i=0;i<exceptionWords.length;i++) {
255256 if (!dict_ht.containsKey(exceptionWords[i])) {
256257 entry = new DictEntry(exceptionWords[i],true);
@@ -282,110 +283,28 @@
283284 }
284285
285286 defaultEntry = new DictEntry(null,false);
286 -
287 - String[] array;
288 - array = KStemData1.data;
289 -
290 - for (int i=0;i<array.length;i++) {
291 - if (!dict_ht.containsKey(array[i])) {
292 - dict_ht.put(array[i],defaultEntry);
 287+
 288+ appendStems( dict_ht, defaultEntry, KStemData1.data, "4" );
 289+ appendStems( dict_ht, defaultEntry, KStemData2.data, "4" );
 290+ appendStems( dict_ht, defaultEntry, KStemData3.data, "4" );
 291+ appendStems( dict_ht, defaultEntry, KStemData4.data, "4" );
 292+ appendStems( dict_ht, defaultEntry, KStemData5.data, "4" );
 293+ appendStems( dict_ht, defaultEntry, KStemData6.data, "4" );
 294+ appendStems( dict_ht, defaultEntry, KStemData7.data, "4" );
 295+ appendStems( dict_ht, defaultEntry, KStemData8.data, "4" );
 296+ appendStems( dict_ht, defaultEntry, supplementDict, "5" );
 297+ appendStems( dict_ht, defaultEntry, properNouns, "6" );
 298+ }
 299+
 300+ private static void appendStems( Hashtable<String, DictEntry> stems, DictEntry defaultEntry, String[] array, String dict ) {
 301+ for (int i=0; i < array.length; i++) {
 302+ if (!stems.containsKey(array[i])) {
 303+ stems.put(array[i],defaultEntry);
293304 } else {
294305 System.out.println("Warning: Entry ["+array[i]+
295 - "] already in dictionary 4");
 306+ "] already in dictionary " + dict);
296307 }
297308 }
298 -
299 -
300 - array = KStemData2.data;
301 - for (int i=0;i<array.length;i++) {
302 - if (!dict_ht.containsKey(array[i])) {
303 - dict_ht.put(array[i],defaultEntry);
304 - } else {
305 - System.out.println("Warning: Entry ["+array[i]+
306 - "] already in dictionary 4");
307 - }
308 - }
309 -
310 - array = KStemData3.data;
311 - for (int i=0;i<array.length;i++) {
312 - if (!dict_ht.containsKey(array[i])) {
313 - dict_ht.put(array[i],defaultEntry);
314 - } else {
315 - System.out.println("Warning: Entry ["+array[i]+
316 - "] already in dictionary 4");
317 - }
318 - }
319 -
320 - array = KStemData4.data;
321 - for (int i=0;i<array.length;i++) {
322 - if (!dict_ht.containsKey(array[i])) {
323 - dict_ht.put(array[i],defaultEntry);
324 - } else {
325 - System.out.println("Warning: Entry ["+array[i]+
326 - "] already in dictionary 4");
327 - }
328 - }
329 -
330 -
331 - array = KStemData5.data;
332 - for (int i=0;i<array.length;i++) {
333 - if (!dict_ht.containsKey(array[i])) {
334 - dict_ht.put(array[i],defaultEntry);
335 - } else {
336 - System.out.println("Warning: Entry ["+array[i]+
337 - "] already in dictionary 4");
338 - }
339 - }
340 -
341 -
342 - array = KStemData6.data;
343 - for (int i=0;i<array.length;i++) {
344 - if (!dict_ht.containsKey(array[i])) {
345 - dict_ht.put(array[i],defaultEntry);
346 - } else {
347 - System.out.println("Warning: Entry ["+array[i]+
348 - "] already in dictionary 4");
349 - }
350 - }
351 -
352 - array = KStemData7.data;
353 - for (int i=0;i<array.length;i++) {
354 - if (!dict_ht.containsKey(array[i])) {
355 - dict_ht.put(array[i],defaultEntry);
356 - } else {
357 - System.out.println("Warning: Entry ["+array[i]+
358 - "] already in dictionary 4");
359 - }
360 - }
361 -
362 - for (int i=0;i<KStemData8.data.length;i++) {
363 - if (!dict_ht.containsKey(KStemData8.data[i])) {
364 - dict_ht.put(KStemData8.data[i],defaultEntry);
365 - } else {
366 - System.out.println("Warning: Entry ["+KStemData8.data[i]+
367 - "] already in dictionary 4");
368 - }
369 - }
370 -
371 - for (int i=0;i<supplementDict.length;i++) {
372 - if (!dict_ht.containsKey(supplementDict[i])) {
373 - dict_ht.put(supplementDict[i],defaultEntry);
374 - } else {
375 - System.out.println("Warning: Entry ["+
376 - supplementDict[i]+
377 - "] already in dictionary 5");
378 - }
379 - }
380 -
381 - for (int i=0;i<properNouns.length;i++) {
382 - if (!dict_ht.containsKey(properNouns[i])) {
383 - dict_ht.put(properNouns[i],defaultEntry);
384 - } else {
385 - System.out.println("Warning: Entry ["+
386 - properNouns[i]+
387 - "] already in dictionary 6");
388 - }
389 - }
390309 }
391310
392311 private boolean isAlpha(char ch) {
Index: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java
@@ -45,56 +45,66 @@
4646
4747 import java.io.IOException;
4848
49 -/** Transforms the token stream according to the KStem stemming algorithm.
50 - * For more information about KStem see <a href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf">
51 - "Viewing Morphology as an Inference Process"</a>
52 - (Krovetz, R., Proceedings of the Sixteenth Annual International ACM SIGIR
53 - Conference on Research and Development in Information Retrieval, 191-203, 1993).
54 -
55 - Note: the input to the stemming filter must already be in lower case,
56 - so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
57 - down the Tokenizer chain in order for this to work properly!
58 - <P>
59 - To use this filter with other analyzers, you'll want to write an
60 - Analyzer class that sets up the TokenStream chain as you want it.
61 - To use this with LowerCaseTokenizer, for example, you'd write an
62 - analyzer like this:
63 - <P>
64 - <PRE>
65 - class MyAnalyzer extends Analyzer {
66 - public final TokenStream tokenStream(String fieldName, Reader reader) {
67 - return new KStemStemFilter(new LowerCaseTokenizer(reader));
68 - }
69 - }
70 - </PRE>
71 -
 49+/**
 50+ * Transforms the token stream according to the KStem stemming algorithm. For
 51+ * more information about KStem see <a
 52+ * href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf">
 53+ * "Viewing Morphology as an Inference Process"</a> (Krovetz, R., Proceedings of
 54+ * the Sixteenth Annual International ACM SIGIR Conference on Research and
 55+ * Development in Information Retrieval, 191-203, 1993).
 56+ *
 57+ * Note: the input to the stemming filter must already be in lower case, so you
 58+ * will need to use LowerCaseFilter or LowerCaseTokenizer farther down the
 59+ * Tokenizer chain in order for this to work properly!
 60+ * <P>
 61+ * To use this filter with other analyzers, you'll want to write an Analyzer
 62+ * class that sets up the TokenStream chain as you want it. To use this with
 63+ * LowerCaseTokenizer, for example, you'd write an analyzer like this:
 64+ * <P>
 65+ *
 66+ * <PRE>
 67+ * class MyAnalyzer extends Analyzer {
 68+ * public final TokenStream tokenStream(String fieldName, Reader reader) {
 69+ * return new KStemStemFilter(new LowerCaseTokenizer(reader));
 70+ * }
 71+ * }
 72+ * </PRE>
7273 */
7374
7475 public final class KStemFilter extends TokenFilter {
7576 private KStemmer stemmer;
7677
77 - /** Create a KStemmer with the given cache size.
78 - * @param in The TokenStream whose output will be the input to KStemFilter.
79 - * @param cacheSize Maximum number of entries to store in the
80 - * Stemmer's cache (stems stored in this cache do not need to be
81 - * recomputed, speeding up the stemming process).
 78+ /**
 79+ * Create a KStemmer with the given cache size.
 80+ *
 81+ * @param in
 82+ * The TokenStream whose output will be the input to KStemFilter.
 83+ * @param cacheSize
 84+ * Maximum number of entries to store in the Stemmer's cache
 85+ * (stems stored in this cache do not need to be recomputed,
 86+ * speeding up the stemming process).
8287 */
8388 public KStemFilter(TokenStream in, int cacheSize) {
8489 super(in);
8590 stemmer = new KStemmer(cacheSize);
8691 }
8792
88 - /** Create a KStemmer with the default cache size of 20 000 entries.
89 - * @param in The TokenStream whose output will be the input to KStemFilter.
 93+ /**
 94+ * Create a KStemmer with the default cache size of 20 000 entries.
 95+ *
 96+ * @param in
 97+ * The TokenStream whose output will be the input to KStemFilter.
9098 */
9199 public KStemFilter(TokenStream in) {
92100 super(in);
93101 stemmer = new KStemmer();
94102 }
95103
96 - /** Returns the next, stemmed, input Token.
97 - * @return The stemed form of a token.
98 - * @throws IOException
 104+ /**
 105+ * Returns the next, stemmed, input Token.
 106+ *
 107+ * @return The stemmed form of a token.
 108+ * @throws IOException
99109 */
100110 public final Token next() throws IOException {
101111 Token token = input.next();
@@ -103,7 +113,8 @@
104114 else {
105115 String s = stemmer.stem(token.termText());
106116 if (!s.equals(token.termText()))
107 - return new Token(s, token.startOffset, token.endOffset, token.type);
 117+ return new Token(s, token.startOffset, token.endOffset,
 118+ token.type);
108119 return token;
109120 }
110121 }
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java
@@ -4,18 +4,17 @@
55 import java.util.Collections;
66 import java.util.Comparator;
77 import java.util.Hashtable;
8 -import java.util.List;
98 import java.util.Map.Entry;
109
1110 import org.apache.log4j.Logger;
1211
1312 public class HttpMonitor extends Thread {
1413 static Logger log = Logger.getLogger(HttpMonitor.class);
15 - protected static HttpMonitor instance=null;
 14+ protected static HttpMonitor instance;
1615 /** times when http request have been started */
1716 protected Hashtable<HttpHandler,Long> startTimes = new Hashtable<HttpHandler,Long>();
1817
19 - /** threshold for reporting 10s */
 18+ /** threshold in milliseconds for reporting */
2019 protected long threshold = 10000;
2120
2221 private HttpMonitor(){}
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -436,7 +436,7 @@
437437 log.error("Error sending result line ("+score + " " + namespace + " " + title +"): "+e.getMessage(),e);
438438 }
439439 }
440 -
 440+ /** Unused? */
441441 private void sendResultLine(String namespace, String title) {
442442 try{
443443 sendOutputLine(namespace + " " + encodeTitle(title));
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java
@@ -72,7 +72,7 @@
7373 if (max != null)
7474 maxThreads = Integer.parseInt(max);
7575
76 - // Initialise statistics
 76+ // Initialize statistics
7777 stats = new Statistics(1000, statsPeriod);
7878 if (config.getBoolean("Ganglia", "report")) {
7979 log.info("Starting ganglia statistics thread...");
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java
@@ -18,7 +18,7 @@
1919
2020 /**
2121 * Simple HTTP 1.1 handler, used for Index and Search daemons
22 - * for more info on protocole see handle() method
 22+ * for more info about the protocol see handle() method
2323 *
2424 * @author Brion Vibber
2525 *
@@ -136,7 +136,7 @@
137137 * URL path format: /operation/database/searchterm
138138 * The path should be URL-encoded UTF-8 (standard IRI).
139139 *
140 - * Additional paramters may be specified in a query string:
 140+ * Additional parameters may be specified in a query string:
141141 * namespaces: comma-separated list of namespace numeric keys to subset results
142142 * limit: maximum number of results to return
143143 * offset: number of matches to skip before returning results
@@ -271,7 +271,7 @@
272272 return null;
273273 }
274274
275 - /** This method is to be used for header reads only (which is utf-8 free!) */
 275+ /** This method is to be used for header reads only (which is UTF-8 free!) */
276276 @SuppressWarnings("deprecation")
277277 protected String readInputLine() {
278278 String sin="";
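
The HttpHandler hunk above documents the daemon's request format: a /operation/database/searchterm path plus the namespaces, limit and offset query parameters. As a rough sketch of building such a request, where the host, port, operation name and database name are illustrative assumptions and only the path layout and parameter names come from the comment in the diff:

import java.net.URLEncoder;

public class LsearchRequestSketch {
    public static void main(String[] args) throws Exception {
        // Host, port, operation and database below are illustrative assumptions;
        // only the /operation/database/searchterm layout and the query parameters
        // (namespaces, limit, offset) come from the HttpHandler comment above.
        String operation = "search";
        String database = "enwiki";
        // Rough stand-in for the "URL-encoded UTF-8" path segment mentioned in the
        // javadoc (URLEncoder form-encodes spaces as '+', so a single word is used).
        String term = URLEncoder.encode("lucene", "UTF-8");

        String url = "http://localhost:8123/" + operation + "/" + database + "/" + term
                + "?namespaces=0,14"  // comma-separated numeric namespace keys
                + "&limit=20"         // maximum number of results to return
                + "&offset=0";        // matches to skip before returning results
        System.out.println(url);
    }
}
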
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java
@@ -83,7 +83,7 @@
8484
8585 /** Type of index, enumeration */
8686 protected IndexType type;
87 - /** Part number in split repestnation, e.g. 1..N */
 87+ /** Part number in split representation, e.g. 1..N */
8888 protected int partNum;
8989
9090 /** Namespace -> part (for nssplit indexes) */
@@ -137,10 +137,10 @@
138138 /** Namespaces that are searched by default */
139139 protected NamespaceFilter defaultNs = null;
140140
141 - /** filter set to true for namespaces with subpages */
 141+ /** Filter set to true for namespaces with subpages */
142142 protected NamespaceFilter nsWithSubpages = null;
143143
144 - /** namespaces with content (from initialise settings) */
 144+ /** Namespaces with content (from initialise settings) */
145145 protected NamespaceFilter contentNamespaces = null;
146146
147147 /** If we should be using additional global rank for scores */
@@ -683,7 +683,6 @@
684684 /**
685685 * Get all indexes parts for this iid except for logical names.
686686 * I.e. for db of kind mainsplit, it will return db.mainpart, db.restpart
687 - * @return
688687 */
689688 public HashSet<String> getPhysicalIndexes() {
690689 HashSet<String> ret = new HashSet<String>();
@@ -712,8 +711,6 @@
713712
714713 /**
715714 * Wrapper for getPhysicalIndexes to get iid objects
716 - *
717 - * @return
718715 */
719716 public ArrayList<IndexId> getPhysicalIndexIds(){
720717 HashSet<String> physical = getPhysicalIndexes();
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -119,13 +119,13 @@
120120
121121 protected static GlobalConfiguration instance = null;
122122
123 - /** All the lang codes we encountered, used for "smart interwiki" */
 123+ /** All the language codes we encountered, used for "smart interwiki" */
124124 protected HashSet<String> smartInterwikiCodes = new HashSet<String>();
125125 protected boolean useSmartInterwiki = false;
126126 protected int maxSearchLimit = 1000;
127127 protected int maxSearchOffset = 1000000;
128128
129 - /** Wether to report warnings and info */
 129+ /** Whether to report warnings and info */
130130 protected static boolean verbose = true;
131131
132132 /** Sections in lsearch-config.conf */
@@ -145,14 +145,12 @@
146146 }
147147
148148 protected GlobalConfiguration(){
149 - // try to determin this hosts IP address
 149+ // try to determine this hosts IP address
150150 determineInetAddress();
151151 }
152152
153153 /**
154154 * Get singleton instance of this class
155 - *
156 - * @return
157155 */
158156 synchronized public static GlobalConfiguration getInstance() {
159157 if (instance == null)
@@ -382,7 +380,7 @@
383381 }
384382
385383 /**
386 - * Reads a config file from a bufferedreader, will
 384+ * Reads a config file from a BufferedReader, will
387385 * close the reader when done.
388386 *
389387 * @param in opened reader
@@ -423,7 +421,7 @@
424422 prop.append("\n");
425423 }
426424 globalProperties.load(new ByteArrayInputStream(prop.toString().getBytes("utf-8")));
427 - // get some predifined global properties
 425+ // get some predefined global properties
428426 this.databaseSuffixes = getArrayProperty("Database.suffix");
429427 this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix");
430428 this.exactCaseSuffix = getArrayProperty("ExactCase.suffix");
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -69,8 +69,19 @@
7070 public class SearchEngine {
7171 static org.apache.log4j.Logger log = Logger.getLogger(SearchEngine.class);
7272
 73+ /**
 74+ * Maximum number of search results at once.
 75+ */
7376 protected static int maxlimit = 1000;
 77+
 78+ /**
 79+ * Largest search result offset.
 80+ */
7481 protected static int maxoffset = 100000;
 82+
 83+ /**
 84+ * Maximum number of search results for prefix query.
 85+ */
7586 protected final int MAXPREFIX = 50;
7687 protected static GlobalConfiguration global = null;
7788 protected static Configuration config = null;
@@ -518,7 +529,7 @@
519530 return res;
520531 }
521532
522 - /** Strip key using PrefixIndexBuilder stip function */
 533+ /** Strip key using PrefixIndexBuilder strip function */
523534 private String stripKey(String key){
524535 return PrefixIndexBuilder.stripKey(key);
525536 }
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java
@@ -32,7 +32,7 @@
3333 import org.apache.lucene.analysis.TokenStream;
3434 import org.apache.lucene.analysis.TokenFilter;
3535
36 -/** Stem filter for esperanto */
 36+/** Stem filter for Esperanto */
3737 public class EsperantoStemFilter extends TokenFilter {
3838 public EsperantoStemFilter(TokenStream tokenizer) {
3939 super(tokenizer);
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -18,10 +18,10 @@
1919
2020 /**
2121 * Wiki Tokenizer. Tokens are words and numbers. All letters are
22 - * lowercased and diacritics deleted using unicode compatibility
 22+ * lowercased and diacritics deleted using Unicode compatibility
2323 * decomposition (i.e. č -> c). Parses some basic wiki syntax,
2424 * template names are skipped, from images captions are extracted,
25 - * categories and interwiki links are extracted ...
 25+ * categories and interwiki links are extracted...
2626 *
2727 * Tokenizer will not take a Reader as input, but a String (for
2828 * optimal performance)
@@ -172,7 +172,7 @@
173173 * This function is called at word boundaries, it is used to
174174 * make a new token and add it to token stream
175175 *
176 - * Does unicode decomposition, and will make alias token with
 176+ * Does Unicode decomposition, and will make alias token with
177177 * alternative transliterations (e.g. ö -> oe)
178178 */
179179 private final void addToken(){
@@ -203,7 +203,7 @@
204204 boolean addDecomposed = false;
205205 boolean allUpperCase = true;
206206 boolean titleCase = true;
207 - boolean split = false; // if more tokens shold be produced, e.g. joe's -> joe + s
 207+ boolean split = false; // if more tokens should be produced, e.g. joe's -> joe + s
208208 for(int i=0;i<length;i++){
209209 if(decomposer.isCombiningChar(buffer[i])){
210210 addDecomposed = true;
@@ -328,7 +328,7 @@
329329 else if(titleCase)
330330 exact.setType("titlecase");
331331 }
332 - // detect hyphenation (takes presedence over case detection)
 332+ // detect hyphenation (takes precedence over case detection)
333333 if(cur+1<textLength && text[cur]=='-' && (Character.isLetterOrDigit(text[cur+1]) || decomposer.isCombiningChar(text[cur+1])))
334334 exact.setType("with_hyphen");
335335
@@ -347,14 +347,14 @@
348348 if(decompLength!=0 && addDecomposed){
349349 Token t = makeToken(new String(decompBuffer, 0, decompLength), start, start + length, false);
350350 t.setPositionIncrement(0);
351 - t.setType(exact.type());
 351+ t.setType(exact.type() + "-decomposed");
352352 addToTokens(t);
353353 }
354354 // add alias (if any) token to stream
355355 if(aliasLength>0){
356356 Token t = makeToken(new String(aliasBuffer, 0, aliasLength), start, start + length, false);
357357 t.setPositionIncrement(0);
358 - t.setType(exact.type());
 358+ t.setType(exact.type() + "-aliased");
359359 addToTokens(t);
360360 }
361361 }
@@ -796,7 +796,7 @@
797797 if(lc == '\n' || lc =='\r')
798798 break;
799799 }
800 - int start=0, end=0; // number of ='s at begining and end of line
 800+ int start=0, end=0; // number of ='s at beginning and end of line
801801 // find first sequence of =
802802 for(lookup = cur ; lookup < textLength && lookup < endOfLine ; lookup++ ){
803803 if(text[lookup] == '=')
@@ -804,7 +804,7 @@
805805 else
806806 break;
807807 }
808 - // find the last squence of =
 808+ // find the last sequence of =
809809 for(lookup = endOfLine-1 ; lookup > cur ; lookup-- ){
810810 if(text[lookup] == '=')
811811 end++;
@@ -843,6 +843,7 @@
844844 }
845845 return true;
846846 }
 847+
847848 /** Check if it's a reference tag starting at cur */
848849 protected boolean checkRefStart(){
849850 if(matchesString("<ref")){
@@ -894,7 +895,7 @@
895896 return tokens;
896897 }
897898
898 - // strip comments so we don't neded to complicate syntax parsing even more
 899+ // strip comments so we don't need to complicate syntax parsing even more
899900 stripComments();
900901
901902 // start parsing
@@ -974,7 +975,7 @@
975976 }
976977 }
977978 } else if(cur > 0 && text[cur-1]=='\n' && text[cur+1] == '-'){
978 - // explicitely put '-' into the glue buffer
 979+ // Explicitly put '-' into the glue buffer
979980 if(options.highlightParsing){
980981 if(glueLength == 0)
981982 glueStart = cur+1;
@@ -1276,7 +1277,7 @@
12771278 continue;
12781279 case LINK_FETCH:
12791280 if(length == 0 && c ==' ')
1280 - continue; // ignore leading whitespaces
 1281+ continue; // ignore leading whitespace
12811282 if(c == ']'){
12821283 state = ParserState.LINK_END;
12831284 continue;
@@ -1333,7 +1334,7 @@
13341335 cur = fetchStart;
13351336 state = ParserState.CATEGORY_WORDS;
13361337 } else
1337 - System.err.print("ERROR: Inconsistent parser state, attepmted category backtrace for uninitalized fetchStart.");
 1338+ System.err.print("ERROR: Inconsistent parser state, attempted category backtrace for uninitalized fetchStart.");
13381339 fetchStart = -1;
13391340 continue;
13401341 case INTERWIKI:
@@ -1375,7 +1376,7 @@
13761377 continue;
13771378 case TABLE_BEGIN:
13781379 tableLevel++;
1379 - // ignore everything up to the newspace, since they are table display params
 1380+ // ignore everything up to the newline, since they are table display params
13801381 while(cur < textLength && (text[cur]!='\r' && text[cur]!='\n'))
13811382 cur++;
13821383 state = ParserState.WORD;
@@ -1422,7 +1423,7 @@
14231424 flushGlue();
14241425 if(nonContentTokens.size() != 0){
14251426 boolean first = true;
1426 - // flush any remaning tokens from initial templates, etc..
 1427+ // flush any remaining tokens from initial templates, etc..
14271428 for(Token tt : nonContentTokens){
14281429 if(first){
14291430 tt.setPositionIncrement(FIRST_SECTION_GAP);
@@ -1595,7 +1596,11 @@
15961597 return new String(buf,0,len).trim();
15971598 }
15981599
1599 - /** Delete all vowels from a word or phrase */
 1600+ /**
 1601+ * Delete all vowels from a word or phrase
 1602+ *
 1603+ * Unused (except test)?
 1604+ */
16001605 public static String deleteVowels(String title){
16011606 char[] buf = new char[256];
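
The FastWikiTokenizerEngine javadoc above says letters are lowercased and diacritics removed via Unicode compatibility decomposition (e.g. č -> c). The engine ships its own decomposer class; purely as a sketch of that general idea using the JDK's java.text.Normalizer (an illustration, not the code this revision touches):

import java.text.Normalizer;

public class DecompositionSketch {
    public static void main(String[] args) {
        String word = "Čarolija";
        // Compatibility decomposition splits base letters from combining diacritics...
        String decomposed = Normalizer.normalize(word.toLowerCase(), Normalizer.Form.NFKD);
        // ...which can then be dropped by stripping the Unicode mark category.
        String stripped = decomposed.replaceAll("\\p{M}", "");
        System.out.println(stripped); // prints "carolija"
    }
}
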
16021607
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java
@@ -8,7 +8,7 @@
99 import org.apache.lucene.analysis.TokenStream;
1010
1111 /**
12 - * Add english singular forms of words as aliases of
 12+ * Add English singular forms of words as aliases of
1313 * type "singular"
1414 *
1515 * @author rainman
@@ -17,7 +17,7 @@
1818 public class EnglishSingularFilter extends TokenFilter{
1919 Singular singular = new EnglishKStemSingular();
2020
21 - Token next = null, next2=null;
 21+ Token next = null, next2= null;
2222 public EnglishSingularFilter(TokenStream input) {
2323 super(input);
2424 }
@@ -53,7 +53,7 @@
5454 return t;
5555 }
5656
57 - /** Return token with sigular form of the noun, or null if none found */
 57+ /** Return token with singular form of the noun, or null if none found */
5858 protected final Token singular(Token t){
5959 String w = singular.getSingular(t.termText());
6060 if(w != null){
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java
@@ -90,7 +90,7 @@
9191 return ret;
9292 }
9393
94 - /** Get a brand new hash set of predifined stop words (i.e. not those generated from lucene indexes) */
 94+ /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */
9595 public static HashSet<String> getPredefinedSet(String langCode){
9696 loadPredefined();
9797 HashSet<String> ret = new HashSet<String>();
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -67,7 +67,7 @@
6868 for(int i=0;i<levels;i++)
6969 keywordsBySize.add(new ArrayList<String>());
7070 TokenizerOptions options = new TokenizerOptions(exactCase);
71 - // arange keywords into a list by token number
 71+ // arrange keywords into a list by token number
7272 for(String k : keywords){
7373 ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,options).parse();
7474 if(parsed.size() == 0)
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java
@@ -6,12 +6,11 @@
77 import java.util.Iterator;
88
99 import org.apache.lucene.analysis.Analyzer;
10 -import org.apache.lucene.analysis.LowerCaseFilter;
1110 import org.apache.lucene.analysis.Token;
1211 import org.apache.lucene.analysis.TokenStream;
1312
1413 /** Produces a token stream for category field in the lucene index.
15 - * Each token is a single category (category names themself are
 14+ * Each token is a single category (category names themselves are
1615 * not tokenized) */
1716 public class CategoryAnalyzer extends Analyzer {
1817 public class ArrayTokenStream extends TokenStream {
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java
@@ -7,7 +7,7 @@
88 import org.apache.lucene.analysis.Token;
99 import org.apache.lucene.analysis.TokenStream;
1010 /**
11 - * Analyzer that just lowecases the text, doesn't split up anything, etc..
 11+ * Analyzer that just lowercases the text, doesn't split up anything, etc..
1212 *
1313 * @author rainman
1414 *
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java
@@ -1,12 +1,9 @@
22 package org.wikimedia.lsearch.analyzers;
33
44 import java.io.IOException;
5 -import java.lang.reflect.Constructor;
6 -import java.lang.reflect.InvocationTargetException;
75
86 import org.apache.log4j.Logger;
97 import org.apache.lucene.analysis.Token;
10 -import org.apache.lucene.analysis.TokenFilter;
118 import org.apache.lucene.analysis.TokenStream;
129
1310 /**
@@ -30,7 +27,6 @@
3128 * 2) stemmers should never change tokens, if the text needs to be
3229 * changed, return a new Token object
3330 *
34 - * @param language
3531 */
3632 public AliasFilter(FilterFactory filters, TokenStream input, TokenStream duplicate){
3733 this.input = input;
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java
@@ -19,7 +19,7 @@
2020
2121 @Override
2222 public Token next() throws IOException {
23 - // return buferred
 23+ // return buffered
2424 if(inx < buffer.size())
2525 return buffer.get(inx++);
2626
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java
@@ -3,7 +3,7 @@
44 import org.apache.lucene.analysis.KStemmer;
55
66 /**
7 - * KStem-based singular-finding class for english
 7+ * KStem-based singular-finding class for English
88 *
99 * @author rainman
1010 *
@@ -15,7 +15,7 @@
1616 if(!word.equals(ret))
1717 return ret;
1818 else{
19 - // strip possesive
 19+ // strip possessive suffix
2020 if(word.endsWith("'s"))
2121 return word.substring(0,word.length()-2);
2222 return null;
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java
@@ -58,7 +58,7 @@
5959
6060 }
6161 /**
62 - * Serialize alttitle for highlighting, serializies titles, redirects, sections.
 62+ * Serialize alttitle for highlighting, serializes titles, redirects, sections.
6363 * Writes original names + highlight tokens.
6464 *
6565 * @param article
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java
@@ -7,8 +7,8 @@
88 import org.apache.lucene.analysis.TokenStream;
99
1010 /**
11 - * Vietnamese standard transliterations to ascii. Most of the stuff is done by unicode decomposed,
12 - * we just additionaly convert Đ/đ -> D/d
 11+ * Vietnamese standard transliterations to ascii. Most of the stuff is done by Unicode decomposition.
 12+ * Additional conversions here are: Đ/đ -> D/d
1313 *
1414 * @author rainman
1515 *
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -1,28 +1,12 @@
22 package org.wikimedia.lsearch.analyzers;
33
4 -import java.util.ArrayList;
5 -import java.util.HashMap;
64 import java.util.HashSet;
75
86 import org.apache.log4j.Logger;
97 import org.apache.lucene.analysis.Analyzer;
108 import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
11 -import org.apache.lucene.analysis.PorterStemFilter;
12 -import org.apache.lucene.analysis.SimpleAnalyzer;
13 -import org.apache.lucene.analysis.de.GermanStemFilter;
14 -import org.apache.lucene.analysis.fr.FrenchStemFilter;
15 -import org.apache.lucene.analysis.nl.DutchStemFilter;
16 -import org.apache.lucene.analysis.ru.RussianStemFilter;
17 -import org.apache.lucene.analysis.th.ThaiWordFilter;
18 -import org.apache.lucene.search.FieldSortedHitQueue;
19 -import org.wikimedia.lsearch.analyzers.FieldBuilder.BuilderSet;
20 -import org.wikimedia.lsearch.beans.Article;
21 -import org.wikimedia.lsearch.beans.Title;
229 import org.wikimedia.lsearch.config.GlobalConfiguration;
2310 import org.wikimedia.lsearch.config.IndexId;
24 -import org.wikimedia.lsearch.index.WikiIndexModifier;
25 -import org.wikimedia.lsearch.ranks.Links;
26 -import org.wikimedia.lsearch.related.RelatedTitle;
2711
2812 /**
2913 * Global functions related to creation/usage of analyzers.
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
@@ -72,8 +72,8 @@
7373 (c >= 0x3300 && c <= 0x337f) ||
7474 (c >= 0x3400 && c <= 0x3d2d) ||
7575 (c >= 0x4e00 && c <= 0x9fff) ||
76 - (c >= 0xf900 && c <= 0xfaff) ||
77 - (c >= 0xac00 && c <= 0xd7af);
 76+ (c >= 0xf900 && c <= 0xfaff) ||
 77+ (c >= 0xac00 && c <= 0xd7af);
7878 }
7979
8080 }
\ No newline at end of file
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java
@@ -6,6 +6,9 @@
77 import org.apache.lucene.analysis.TokenFilter;
88 import org.apache.lucene.analysis.TokenStream;
99
 10+/**
 11+ * Removes dots from acronyms?
 12+ */
1013 public class AcronymFilter extends TokenFilter {
1114 Token buffered = null;
1215
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java
@@ -38,7 +38,7 @@
3939 protected Token phrase1 = null, phrase2 = null;
4040 protected boolean phraseReady = false;
4141 protected String gap = "_";
42 - /** pairs of words, two adjecent words */
 42+ /** pairs of words, two adjacent words */
4343 protected Token pair1 = null, pair2 = null;
4444 protected boolean pairReady = false;
4545 protected Token nextToken = null;
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
@@ -3,18 +3,18 @@
44 import org.wikimedia.lsearch.config.IndexId;
55
66 /**
7 - * Agregate class for FilterFactory and FieldNameFactory. This class
8 - * contains methods used to build various fields of the index,
9 - * it contains field names to be used, filter that are to be applied...
 7+ * Aggregate class for FilterFactory and FieldNameFactory. This class contains
 8+ * methods used to build various fields of the index, it contains field names to
 9+ * be used, filter that are to be applied...
1010 *
1111 * @author rainman
12 - *
 12+ *
1313 */
1414 public class FieldBuilder {
1515 public class BuilderSet{
1616 FilterFactory filters;
1717 FieldNameFactory fields;
18 - boolean addKeywords; // wether to add keywords from beginning of article
 18+ boolean addKeywords; // whether to add keywords from beginning of article
1919
2020 public BuilderSet(FilterFactory filters, FieldNameFactory fields) {
2121 this.filters = filters;
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java
@@ -1,9 +1,11 @@
22 package org.wikimedia.lsearch.analyzers;
33
 4+import java.util.Arrays;
45 import java.util.HashMap;
 6+import java.util.HashSet;
57
68 /**
7 - * Porter-based singular filter for english
 9+ * Porter-based singular filter for English
810 *
911 * @author rainman
1012 *
@@ -18,10 +20,11 @@
1921 if(w.length() <= 3 || w.charAt(w.length()-1) != 's')
2022 return null;
2123 // exceptions (from porter2)
22 - if("news".equals(w) || "atlas".equals(w) || "cosmos".equals(w)
23 - || "bias".equals(w) || "andes".equals(w) || "aries".equals(w))
 24+ String[] exceptions = { "news", "atlas", "cosmos", "bias", "andes", "aries" };
 25+ HashSet<String> set = new HashSet<String>(Arrays.asList(exceptions));
 26+ if( set.contains(w) )
2427 return null;
25 - // don't strip posssesive form
 28+ // don't strip possessive form
2629 if(w.endsWith("'s")){
2730 //if(w.length() > 2)
2831 // return w.substring(0,w.length()-2);
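
The EnglishSingular hunk above swaps a chain of equality checks for a HashSet lookup, though the set is rebuilt on each getSingular call. A minimal sketch of the same exception list held in a static field so it is constructed only once (a variation for illustration, not what this revision does):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class SingularExceptionsSketch {
    // Same porter2 exception words as in the diff above, built a single time.
    private static final Set<String> EXCEPTIONS = new HashSet<String>(
            Arrays.asList("news", "atlas", "cosmos", "bias", "andes", "aries"));

    /** True if the word should be left untouched by the singular filter. */
    public static boolean isException(String word) {
        return EXCEPTIONS.contains(word);
    }
}
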
