Index: trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java |
— | — | @@ -1,16 +1,11 @@ |
2 | 2 | package org.wikimedia.lsearch.util; |
3 | 3 | |
4 | | -import java.net.URL; |
5 | | - |
6 | 4 | import org.wikimedia.lsearch.config.Configuration; |
7 | 5 | import org.wikimedia.lsearch.config.IndexId; |
8 | 6 | import org.wikimedia.lsearch.util.Localization; |
9 | 7 | |
10 | 8 | public class LocalizationTest { |
11 | 9 | |
12 | | - /** |
13 | | - * @param args |
14 | | - */ |
15 | 10 | public static void main(String[] args) { |
16 | 11 | Configuration.open(); |
17 | 12 | String text = "#redirect [[mw]]"; |
Index: trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java |
— | — | @@ -21,7 +21,7 @@ |
22 | 22 | * |
23 | 23 | * <p>This class tries to handle <code>null</code> input gracefully. |
24 | 24 | * An exception will not be thrown for a <code>null</code> input. |
25 | | - * Each method documents its behaviour in more detail.</p> |
| 25 | + * Each method documents its behavior in more detail.</p> |
26 | 26 | * |
27 | 27 | * @author Apache Jakarta Velocity |
28 | 28 | * @author Stephen Colebourne |
Index: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java |
— | — | @@ -44,13 +44,14 @@ |
45 | 45 | import java.io.*; |
46 | 46 | |
47 | 47 | /** |
48 | | - This class implements the Kstem algorithm |
| 48 | + * This class implements the Kstem algorithm |
49 | 49 | */ |
50 | 50 | public class KStemmer { |
51 | | - /** Default size of the cache that stores <code>(word,stem)</code> pairs. |
52 | | - <p>This speeds up processing since Kstem works by |
53 | | - sucessive "transformations" to the input word until a |
54 | | - suitable stem is found. |
| 51 | + /** |
| 52 | + * Default size of the cache that stores <code>(word,stem)</code> pairs. |
| 53 | + * |
| 54 | + * This speeds up processing since Kstem works by successive |
| 55 | + * "transformations" to the input word until a suitable stem is found. |
55 | 56 | */ |
56 | 57 | static public int DEFAULT_CACHE_SIZE = 20000; |
57 | 58 | static private final int MaxWordLen = 100; |
— | — | @@ -203,9 +204,9 @@ |
204 | 205 | } |
205 | 206 | } |
206 | 207 | |
207 | | - private static Hashtable dict_ht = null; |
| 208 | + private static Hashtable<String, DictEntry> dict_ht = null; |
208 | 209 | private int MaxCacheSize; |
209 | | - private Hashtable stem_ht = null; |
| 210 | + private Hashtable<String, String> stem_ht = null; |
210 | 211 | private StringBuffer word; |
211 | 212 | private int j; /* index of final letter in stem (within word) */ |
212 | 213 | private int k; /* INDEX of final letter in word. |
— | — | @@ -214,7 +215,7 @@ |
215 | 216 | wordLength, which returns (k+1). */ |
216 | 217 | |
217 | 218 | private void initializeStemHash() { |
218 | | - stem_ht = new Hashtable(); |
| 219 | + stem_ht = new Hashtable<String, String>(); |
219 | 220 | } |
220 | 221 | |
221 | 222 | private char finalChar() { |
— | — | @@ -249,7 +250,7 @@ |
250 | 251 | if (dict_ht != null) |
251 | 252 | return; |
252 | 253 | |
253 | | - dict_ht = new Hashtable(); |
| 254 | + dict_ht = new Hashtable<String, DictEntry>(); |
254 | 255 | for (int i=0;i<exceptionWords.length;i++) { |
255 | 256 | if (!dict_ht.containsKey(exceptionWords[i])) { |
256 | 257 | entry = new DictEntry(exceptionWords[i],true); |
— | — | @@ -282,110 +283,28 @@ |
283 | 284 | } |
284 | 285 | |
285 | 286 | defaultEntry = new DictEntry(null,false); |
286 | | - |
287 | | - String[] array; |
288 | | - array = KStemData1.data; |
289 | | - |
290 | | - for (int i=0;i<array.length;i++) { |
291 | | - if (!dict_ht.containsKey(array[i])) { |
292 | | - dict_ht.put(array[i],defaultEntry); |
| 287 | + |
| 288 | + appendStems( dict_ht, defaultEntry, KStemData1.data, "4" ); |
| 289 | + appendStems( dict_ht, defaultEntry, KStemData2.data, "4" ); |
| 290 | + appendStems( dict_ht, defaultEntry, KStemData3.data, "4" ); |
| 291 | + appendStems( dict_ht, defaultEntry, KStemData4.data, "4" ); |
| 292 | + appendStems( dict_ht, defaultEntry, KStemData5.data, "4" ); |
| 293 | + appendStems( dict_ht, defaultEntry, KStemData6.data, "4" ); |
| 294 | + appendStems( dict_ht, defaultEntry, KStemData7.data, "4" ); |
| 295 | + appendStems( dict_ht, defaultEntry, KStemData8.data, "4" ); |
| 296 | + appendStems( dict_ht, defaultEntry, supplementDict, "5" ); |
| 297 | + appendStems( dict_ht, defaultEntry, properNouns, "6" ); |
| 298 | + } |
| 299 | + |
| 300 | + private static void appendStems( Hashtable<String, DictEntry> stems, DictEntry defaultEntry, String[] array, String dict ) { |
| 301 | + for (int i=0; i < array.length; i++) { |
| 302 | + if (!stems.containsKey(array[i])) { |
| 303 | + stems.put(array[i],defaultEntry); |
293 | 304 | } else { |
294 | 305 | System.out.println("Warning: Entry ["+array[i]+ |
295 | | - "] already in dictionary 4"); |
| 306 | + "] already in dictionary " + dict); |
296 | 307 | } |
297 | 308 | } |
298 | | - |
299 | | - |
300 | | - array = KStemData2.data; |
301 | | - for (int i=0;i<array.length;i++) { |
302 | | - if (!dict_ht.containsKey(array[i])) { |
303 | | - dict_ht.put(array[i],defaultEntry); |
304 | | - } else { |
305 | | - System.out.println("Warning: Entry ["+array[i]+ |
306 | | - "] already in dictionary 4"); |
307 | | - } |
308 | | - } |
309 | | - |
310 | | - array = KStemData3.data; |
311 | | - for (int i=0;i<array.length;i++) { |
312 | | - if (!dict_ht.containsKey(array[i])) { |
313 | | - dict_ht.put(array[i],defaultEntry); |
314 | | - } else { |
315 | | - System.out.println("Warning: Entry ["+array[i]+ |
316 | | - "] already in dictionary 4"); |
317 | | - } |
318 | | - } |
319 | | - |
320 | | - array = KStemData4.data; |
321 | | - for (int i=0;i<array.length;i++) { |
322 | | - if (!dict_ht.containsKey(array[i])) { |
323 | | - dict_ht.put(array[i],defaultEntry); |
324 | | - } else { |
325 | | - System.out.println("Warning: Entry ["+array[i]+ |
326 | | - "] already in dictionary 4"); |
327 | | - } |
328 | | - } |
329 | | - |
330 | | - |
331 | | - array = KStemData5.data; |
332 | | - for (int i=0;i<array.length;i++) { |
333 | | - if (!dict_ht.containsKey(array[i])) { |
334 | | - dict_ht.put(array[i],defaultEntry); |
335 | | - } else { |
336 | | - System.out.println("Warning: Entry ["+array[i]+ |
337 | | - "] already in dictionary 4"); |
338 | | - } |
339 | | - } |
340 | | - |
341 | | - |
342 | | - array = KStemData6.data; |
343 | | - for (int i=0;i<array.length;i++) { |
344 | | - if (!dict_ht.containsKey(array[i])) { |
345 | | - dict_ht.put(array[i],defaultEntry); |
346 | | - } else { |
347 | | - System.out.println("Warning: Entry ["+array[i]+ |
348 | | - "] already in dictionary 4"); |
349 | | - } |
350 | | - } |
351 | | - |
352 | | - array = KStemData7.data; |
353 | | - for (int i=0;i<array.length;i++) { |
354 | | - if (!dict_ht.containsKey(array[i])) { |
355 | | - dict_ht.put(array[i],defaultEntry); |
356 | | - } else { |
357 | | - System.out.println("Warning: Entry ["+array[i]+ |
358 | | - "] already in dictionary 4"); |
359 | | - } |
360 | | - } |
361 | | - |
362 | | - for (int i=0;i<KStemData8.data.length;i++) { |
363 | | - if (!dict_ht.containsKey(KStemData8.data[i])) { |
364 | | - dict_ht.put(KStemData8.data[i],defaultEntry); |
365 | | - } else { |
366 | | - System.out.println("Warning: Entry ["+KStemData8.data[i]+ |
367 | | - "] already in dictionary 4"); |
368 | | - } |
369 | | - } |
370 | | - |
371 | | - for (int i=0;i<supplementDict.length;i++) { |
372 | | - if (!dict_ht.containsKey(supplementDict[i])) { |
373 | | - dict_ht.put(supplementDict[i],defaultEntry); |
374 | | - } else { |
375 | | - System.out.println("Warning: Entry ["+ |
376 | | - supplementDict[i]+ |
377 | | - "] already in dictionary 5"); |
378 | | - } |
379 | | - } |
380 | | - |
381 | | - for (int i=0;i<properNouns.length;i++) { |
382 | | - if (!dict_ht.containsKey(properNouns[i])) { |
383 | | - dict_ht.put(properNouns[i],defaultEntry); |
384 | | - } else { |
385 | | - System.out.println("Warning: Entry ["+ |
386 | | - properNouns[i]+ |
387 | | - "] already in dictionary 6"); |
388 | | - } |
389 | | - } |
390 | 309 | } |
391 | 310 | |
392 | 311 | private boolean isAlpha(char ch) { |
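For reference, a minimal direct-usage sketch of the stemmer refactored above, built only from calls that appear elsewhere in this diff (the KStemmer(int) constructor and the stem(String) method used by KStemFilter); the input word is illustrative:

    KStemmer stemmer = new KStemmer(KStemmer.DEFAULT_CACHE_SIZE);
    // returns the stemmed form; (word, stem) pairs are cached for later lookups
    String stem = stemmer.stem("transformations");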
Index: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java |
— | — | @@ -45,56 +45,66 @@ |
46 | 46 | |
47 | 47 | import java.io.IOException; |
48 | 48 | |
49 | | -/** Transforms the token stream according to the KStem stemming algorithm. |
50 | | - * For more information about KStem see <a href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf"> |
51 | | - "Viewing Morphology as an Inference Process"</a> |
52 | | - (Krovetz, R., Proceedings of the Sixteenth Annual International ACM SIGIR |
53 | | - Conference on Research and Development in Information Retrieval, 191-203, 1993). |
54 | | - |
55 | | - Note: the input to the stemming filter must already be in lower case, |
56 | | - so you will need to use LowerCaseFilter or LowerCaseTokenizer farther |
57 | | - down the Tokenizer chain in order for this to work properly! |
58 | | - <P> |
59 | | - To use this filter with other analyzers, you'll want to write an |
60 | | - Analyzer class that sets up the TokenStream chain as you want it. |
61 | | - To use this with LowerCaseTokenizer, for example, you'd write an |
62 | | - analyzer like this: |
63 | | - <P> |
64 | | - <PRE> |
65 | | - class MyAnalyzer extends Analyzer { |
66 | | - public final TokenStream tokenStream(String fieldName, Reader reader) { |
67 | | - return new KStemStemFilter(new LowerCaseTokenizer(reader)); |
68 | | - } |
69 | | - } |
70 | | - </PRE> |
71 | | - |
| 49 | +/** |
| 50 | + * Transforms the token stream according to the KStem stemming algorithm. For |
| 51 | + * more information about KStem see <a |
| 52 | + * href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf"> |
| 53 | + * "Viewing Morphology as an Inference Process"</a> (Krovetz, R., Proceedings of |
| 54 | + * the Sixteenth Annual International ACM SIGIR Conference on Research and |
| 55 | + * Development in Information Retrieval, 191-203, 1993). |
| 56 | + * |
| 57 | + * Note: the input to the stemming filter must already be in lower case, so you |
| 58 | + * will need to use LowerCaseFilter or LowerCaseTokenizer farther down the |
| 59 | + * Tokenizer chain in order for this to work properly! |
| 60 | + * <P> |
| 61 | + * To use this filter with other analyzers, you'll want to write an Analyzer |
| 62 | + * class that sets up the TokenStream chain as you want it. To use this with |
| 63 | + * LowerCaseTokenizer, for example, you'd write an analyzer like this: |
| 64 | + * <P> |
| 65 | + * |
| 66 | + * <PRE> |
| 67 | + * class MyAnalyzer extends Analyzer { |
| 68 | + * public final TokenStream tokenStream(String fieldName, Reader reader) { |
| 69 | + * return new KStemFilter(new LowerCaseTokenizer(reader));
| 70 | + * } |
| 71 | + * } |
| 72 | + * </PRE> |
72 | 73 | */ |
73 | 74 | |
74 | 75 | public final class KStemFilter extends TokenFilter { |
75 | 76 | private KStemmer stemmer; |
76 | 77 | |
77 | | - /** Create a KStemmer with the given cache size. |
78 | | - * @param in The TokenStream whose output will be the input to KStemFilter. |
79 | | - * @param cacheSize Maximum number of entries to store in the |
80 | | - * Stemmer's cache (stems stored in this cache do not need to be |
81 | | - * recomputed, speeding up the stemming process). |
| 78 | + /** |
| 79 | + * Create a KStemmer with the given cache size. |
| 80 | + * |
| 81 | + * @param in |
| 82 | + * The TokenStream whose output will be the input to KStemFilter. |
| 83 | + * @param cacheSize |
| 84 | + * Maximum number of entries to store in the Stemmer's cache |
| 85 | + * (stems stored in this cache do not need to be recomputed, |
| 86 | + * speeding up the stemming process). |
82 | 87 | */ |
83 | 88 | public KStemFilter(TokenStream in, int cacheSize) { |
84 | 89 | super(in); |
85 | 90 | stemmer = new KStemmer(cacheSize); |
86 | 91 | } |
87 | 92 | |
88 | | - /** Create a KStemmer with the default cache size of 20 000 entries. |
89 | | - * @param in The TokenStream whose output will be the input to KStemFilter. |
| 93 | + /** |
| 94 | + * Create a KStemmer with the default cache size of 20 000 entries. |
| 95 | + * |
| 96 | + * @param in |
| 97 | + * The TokenStream whose output will be the input to KStemFilter. |
90 | 98 | */ |
91 | 99 | public KStemFilter(TokenStream in) { |
92 | 100 | super(in); |
93 | 101 | stemmer = new KStemmer(); |
94 | 102 | } |
95 | 103 | |
96 | | - /** Returns the next, stemmed, input Token. |
97 | | - * @return The stemed form of a token. |
98 | | - * @throws IOException |
| 104 | + /** |
| 105 | + * Returns the next, stemmed, input Token. |
| 106 | + * |
| 107 | + * @return The stemmed form of a token. |
| 108 | + * @throws IOException |
99 | 109 | */ |
100 | 110 | public final Token next() throws IOException { |
101 | 111 | Token token = input.next(); |
— | — | @@ -103,7 +113,8 @@ |
104 | 114 | else { |
105 | 115 | String s = stemmer.stem(token.termText()); |
106 | 116 | if (!s.equals(token.termText())) |
107 | | - return new Token(s, token.startOffset, token.endOffset, token.type); |
| 117 | + return new Token(s, token.startOffset, token.endOffset, |
| 118 | + token.type); |
108 | 119 | return token; |
109 | 120 | } |
110 | 121 | } |
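A minimal analyzer sketch wiring up the cache-size constructor documented above; the class name and the 50000 cache size are illustrative assumptions, not taken from this change:

    class CachedKStemAnalyzer extends Analyzer {
        public final TokenStream tokenStream(String fieldName, Reader reader) {
            // use a larger stem cache than the 20000-entry default
            return new KStemFilter(new LowerCaseTokenizer(reader), 50000);
        }
    }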
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java |
— | — | @@ -4,18 +4,17 @@ |
5 | 5 | import java.util.Collections; |
6 | 6 | import java.util.Comparator; |
7 | 7 | import java.util.Hashtable; |
8 | | -import java.util.List; |
9 | 8 | import java.util.Map.Entry; |
10 | 9 | |
11 | 10 | import org.apache.log4j.Logger; |
12 | 11 | |
13 | 12 | public class HttpMonitor extends Thread { |
14 | 13 | static Logger log = Logger.getLogger(HttpMonitor.class); |
15 | | - protected static HttpMonitor instance=null; |
| 14 | + protected static HttpMonitor instance; |
16 | 15 | /** times when http request have been started */ |
17 | 16 | protected Hashtable<HttpHandler,Long> startTimes = new Hashtable<HttpHandler,Long>(); |
18 | 17 | |
19 | | - /** threshold for reporting 10s */ |
| 18 | + /** threshold in milliseconds for reporting */ |
20 | 19 | protected long threshold = 10000; |
21 | 20 | |
22 | 21 | private HttpMonitor(){} |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java |
— | — | @@ -436,7 +436,7 @@ |
437 | 437 | log.error("Error sending result line ("+score + " " + namespace + " " + title +"): "+e.getMessage(),e); |
438 | 438 | } |
439 | 439 | } |
440 | | - |
| 440 | + /** Unused? */ |
441 | 441 | private void sendResultLine(String namespace, String title) { |
442 | 442 | try{ |
443 | 443 | sendOutputLine(namespace + " " + encodeTitle(title)); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java |
— | — | @@ -72,7 +72,7 @@ |
73 | 73 | if (max != null) |
74 | 74 | maxThreads = Integer.parseInt(max); |
75 | 75 | |
76 | | - // Initialise statistics |
| 76 | + // Initialize statistics |
77 | 77 | stats = new Statistics(1000, statsPeriod); |
78 | 78 | if (config.getBoolean("Ganglia", "report")) { |
79 | 79 | log.info("Starting ganglia statistics thread..."); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java |
— | — | @@ -18,7 +18,7 @@ |
19 | 19 | |
20 | 20 | /** |
21 | 21 | * Simple HTTP 1.1 handler, used for Index and Search daemons |
22 | | - * for more info on protocole see handle() method |
| 22 | + * for more info about the protocol, see the handle() method
23 | 23 | * |
24 | 24 | * @author Brion Vibber |
25 | 25 | * |
— | — | @@ -136,7 +136,7 @@ |
137 | 137 | * URL path format: /operation/database/searchterm |
138 | 138 | * The path should be URL-encoded UTF-8 (standard IRI). |
139 | 139 | * |
140 | | - * Additional paramters may be specified in a query string: |
| 140 | + * Additional parameters may be specified in a query string: |
141 | 141 | * namespaces: comma-separated list of namespace numeric keys to subset results |
142 | 142 | * limit: maximum number of results to return |
143 | 143 | * offset: number of matches to skip before returning results |
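For illustration, a request following the URL path and query-string format documented above might look like the line below; the operation name, database, search term, and parameter values are hypothetical, not taken from this diff:

    GET /search/enwiki/cloud%20computing?namespaces=0,14&limit=20&offset=0 HTTP/1.1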
— | — | @@ -271,7 +271,7 @@ |
272 | 272 | return null; |
273 | 273 | } |
274 | 274 | |
275 | | - /** This method is to be used for header reads only (which is utf-8 free!) */ |
| 275 | + /** This method is to be used for header reads only (which is UTF-8 free!) */ |
276 | 276 | @SuppressWarnings("deprecation") |
277 | 277 | protected String readInputLine() { |
278 | 278 | String sin=""; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -83,7 +83,7 @@ |
84 | 84 | |
85 | 85 | /** Type of index, enumeration */ |
86 | 86 | protected IndexType type; |
87 | | - /** Part number in split repestnation, e.g. 1..N */ |
| 87 | + /** Part number in split representation, e.g. 1..N */ |
88 | 88 | protected int partNum; |
89 | 89 | |
90 | 90 | /** Namespace -> part (for nssplit indexes) */ |
— | — | @@ -137,10 +137,10 @@ |
138 | 138 | /** Namespaces that are searched by default */ |
139 | 139 | protected NamespaceFilter defaultNs = null; |
140 | 140 | |
141 | | - /** filter set to true for namespaces with subpages */ |
| 141 | + /** Filter set to true for namespaces with subpages */ |
142 | 142 | protected NamespaceFilter nsWithSubpages = null; |
143 | 143 | |
144 | | - /** namespaces with content (from initialise settings) */ |
| 144 | + /** Namespaces with content (from initialise settings) */ |
145 | 145 | protected NamespaceFilter contentNamespaces = null; |
146 | 146 | |
147 | 147 | /** If we should be using additional global rank for scores */ |
— | — | @@ -683,7 +683,6 @@ |
684 | 684 | /** |
685 | 685 | * Get all indexes parts for this iid except for logical names. |
686 | 686 | * I.e. for db of kind mainsplit, it will return db.mainpart, db.restpart |
687 | | - * @return |
688 | 687 | */ |
689 | 688 | public HashSet<String> getPhysicalIndexes() { |
690 | 689 | HashSet<String> ret = new HashSet<String>(); |
— | — | @@ -712,8 +711,6 @@ |
713 | 712 | |
714 | 713 | /** |
715 | 714 | * Wrapper for getPhysicalIndexes to get iid objects |
716 | | - * |
717 | | - * @return |
718 | 715 | */ |
719 | 716 | public ArrayList<IndexId> getPhysicalIndexIds(){ |
720 | 717 | HashSet<String> physical = getPhysicalIndexes(); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -119,13 +119,13 @@ |
120 | 120 | |
121 | 121 | protected static GlobalConfiguration instance = null; |
122 | 122 | |
123 | | - /** All the lang codes we encountered, used for "smart interwiki" */ |
| 123 | + /** All the language codes we encountered, used for "smart interwiki" */ |
124 | 124 | protected HashSet<String> smartInterwikiCodes = new HashSet<String>(); |
125 | 125 | protected boolean useSmartInterwiki = false; |
126 | 126 | protected int maxSearchLimit = 1000; |
127 | 127 | protected int maxSearchOffset = 1000000; |
128 | 128 | |
129 | | - /** Wether to report warnings and info */ |
| 129 | + /** Whether to report warnings and info */ |
130 | 130 | protected static boolean verbose = true; |
131 | 131 | |
132 | 132 | /** Sections in lsearch-config.conf */ |
— | — | @@ -145,14 +145,12 @@ |
146 | 146 | } |
147 | 147 | |
148 | 148 | protected GlobalConfiguration(){ |
149 | | - // try to determin this hosts IP address |
| 149 | + // try to determine this hosts IP address |
150 | 150 | determineInetAddress(); |
151 | 151 | } |
152 | 152 | |
153 | 153 | /** |
154 | 154 | * Get singleton instance of this class |
155 | | - * |
156 | | - * @return |
157 | 155 | */ |
158 | 156 | synchronized public static GlobalConfiguration getInstance() { |
159 | 157 | if (instance == null) |
— | — | @@ -382,7 +380,7 @@ |
383 | 381 | } |
384 | 382 | |
385 | 383 | /** |
386 | | - * Reads a config file from a bufferedreader, will |
| 384 | + * Reads a config file from a BufferedReader, will |
387 | 385 | * close the reader when done. |
388 | 386 | * |
389 | 387 | * @param in opened reader |
— | — | @@ -423,7 +421,7 @@ |
424 | 422 | prop.append("\n"); |
425 | 423 | } |
426 | 424 | globalProperties.load(new ByteArrayInputStream(prop.toString().getBytes("utf-8"))); |
427 | | - // get some predifined global properties |
| 425 | + // get some predefined global properties |
428 | 426 | this.databaseSuffixes = getArrayProperty("Database.suffix"); |
429 | 427 | this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix"); |
430 | 428 | this.exactCaseSuffix = getArrayProperty("ExactCase.suffix"); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -69,8 +69,19 @@ |
70 | 70 | public class SearchEngine { |
71 | 71 | static org.apache.log4j.Logger log = Logger.getLogger(SearchEngine.class); |
72 | 72 | |
| 73 | + /** |
| 74 | + * Maximum number of search results at once. |
| 75 | + */ |
73 | 76 | protected static int maxlimit = 1000; |
| 77 | + |
| 78 | + /** |
| 79 | + * Largest search result offset. |
| 80 | + */ |
74 | 81 | protected static int maxoffset = 100000; |
| 82 | + |
| 83 | + /** |
| 84 | + * Maximum number of search results for prefix query. |
| 85 | + */ |
75 | 86 | protected final int MAXPREFIX = 50; |
76 | 87 | protected static GlobalConfiguration global = null; |
77 | 88 | protected static Configuration config = null; |
— | — | @@ -518,7 +529,7 @@ |
519 | 530 | return res; |
520 | 531 | } |
521 | 532 | |
522 | | - /** Strip key using PrefixIndexBuilder stip function */ |
| 533 | + /** Strip key using PrefixIndexBuilder strip function */ |
523 | 534 | private String stripKey(String key){ |
524 | 535 | return PrefixIndexBuilder.stripKey(key); |
525 | 536 | } |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java |
— | — | @@ -32,7 +32,7 @@ |
33 | 33 | import org.apache.lucene.analysis.TokenStream; |
34 | 34 | import org.apache.lucene.analysis.TokenFilter; |
35 | 35 | |
36 | | -/** Stem filter for esperanto */ |
| 36 | +/** Stem filter for Esperanto */ |
37 | 37 | public class EsperantoStemFilter extends TokenFilter { |
38 | 38 | public EsperantoStemFilter(TokenStream tokenizer) { |
39 | 39 | super(tokenizer); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -18,10 +18,10 @@ |
19 | 19 | |
20 | 20 | /** |
21 | 21 | * Wiki Tokenizer. Tokens are words and numbers. All letters are |
22 | | - * lowercased and diacritics deleted using unicode compatibility |
| 22 | + * lowercased and diacritics deleted using Unicode compatibility |
23 | 23 | * decomposition (i.e. č -> c). Parses some basic wiki syntax, |
24 | 24 | * template names are skipped, from images captions are extracted, |
25 | | - * categories and interwiki links are extracted ... |
| 25 | + * categories and interwiki links are extracted... |
26 | 26 | * |
27 | 27 | * Tokenizer will not take a Reader as input, but a String (for |
28 | 28 | * optimal performance) |
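A short usage sketch matching how this tokenizer is invoked in KeywordsAnalyzer later in this diff; the variable names and the exactCase value are illustrative:

    TokenizerOptions options = new TokenizerOptions(false);
    ArrayList<Token> tokens = new FastWikiTokenizerEngine(wikiText, iid, options).parse();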
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | * This function is called at word boundaries, it is used to |
174 | 174 | * make a new token and add it to token stream |
175 | 175 | * |
176 | | - * Does unicode decomposition, and will make alias token with |
| 176 | + * Does Unicode decomposition, and will make alias token with |
177 | 177 | * alternative transliterations (e.g. ö -> oe) |
178 | 178 | */ |
179 | 179 | private final void addToken(){ |
— | — | @@ -203,7 +203,7 @@ |
204 | 204 | boolean addDecomposed = false; |
205 | 205 | boolean allUpperCase = true; |
206 | 206 | boolean titleCase = true; |
207 | | - boolean split = false; // if more tokens shold be produced, e.g. joe's -> joe + s |
| 207 | + boolean split = false; // if more tokens should be produced, e.g. joe's -> joe + s |
208 | 208 | for(int i=0;i<length;i++){ |
209 | 209 | if(decomposer.isCombiningChar(buffer[i])){ |
210 | 210 | addDecomposed = true; |
— | — | @@ -328,7 +328,7 @@ |
329 | 329 | else if(titleCase) |
330 | 330 | exact.setType("titlecase"); |
331 | 331 | } |
332 | | - // detect hyphenation (takes presedence over case detection) |
| 332 | + // detect hyphenation (takes precedence over case detection) |
333 | 333 | if(cur+1<textLength && text[cur]=='-' && (Character.isLetterOrDigit(text[cur+1]) || decomposer.isCombiningChar(text[cur+1]))) |
334 | 334 | exact.setType("with_hyphen"); |
335 | 335 | |
— | — | @@ -347,14 +347,14 @@ |
348 | 348 | if(decompLength!=0 && addDecomposed){ |
349 | 349 | Token t = makeToken(new String(decompBuffer, 0, decompLength), start, start + length, false); |
350 | 350 | t.setPositionIncrement(0); |
351 | | - t.setType(exact.type()); |
| 351 | + t.setType(exact.type() + "-decomposed"); |
352 | 352 | addToTokens(t); |
353 | 353 | } |
354 | 354 | // add alias (if any) token to stream |
355 | 355 | if(aliasLength>0){ |
356 | 356 | Token t = makeToken(new String(aliasBuffer, 0, aliasLength), start, start + length, false); |
357 | 357 | t.setPositionIncrement(0); |
358 | | - t.setType(exact.type()); |
| 358 | + t.setType(exact.type() + "-aliased"); |
359 | 359 | addToTokens(t); |
360 | 360 | } |
361 | 361 | } |
— | — | @@ -796,7 +796,7 @@ |
797 | 797 | if(lc == '\n' || lc =='\r') |
798 | 798 | break; |
799 | 799 | } |
800 | | - int start=0, end=0; // number of ='s at begining and end of line |
| 800 | + int start=0, end=0; // number of ='s at beginning and end of line |
801 | 801 | // find first sequence of = |
802 | 802 | for(lookup = cur ; lookup < textLength && lookup < endOfLine ; lookup++ ){ |
803 | 803 | if(text[lookup] == '=') |
— | — | @@ -804,7 +804,7 @@ |
805 | 805 | else |
806 | 806 | break; |
807 | 807 | } |
808 | | - // find the last squence of = |
| 808 | + // find the last sequence of = |
809 | 809 | for(lookup = endOfLine-1 ; lookup > cur ; lookup-- ){ |
810 | 810 | if(text[lookup] == '=') |
811 | 811 | end++; |
— | — | @@ -843,6 +843,7 @@ |
844 | 844 | } |
845 | 845 | return true; |
846 | 846 | } |
| 847 | + |
847 | 848 | /** Check if it's a reference tag starting at cur */ |
848 | 849 | protected boolean checkRefStart(){ |
849 | 850 | if(matchesString("<ref")){ |
— | — | @@ -894,7 +895,7 @@ |
895 | 896 | return tokens; |
896 | 897 | } |
897 | 898 | |
898 | | - // strip comments so we don't neded to complicate syntax parsing even more |
| 899 | + // strip comments so we don't need to complicate syntax parsing even more |
899 | 900 | stripComments(); |
900 | 901 | |
901 | 902 | // start parsing |
— | — | @@ -974,7 +975,7 @@ |
975 | 976 | } |
976 | 977 | } |
977 | 978 | } else if(cur > 0 && text[cur-1]=='\n' && text[cur+1] == '-'){ |
978 | | - // explicitely put '-' into the glue buffer |
| 979 | + // Explicitly put '-' into the glue buffer |
979 | 980 | if(options.highlightParsing){ |
980 | 981 | if(glueLength == 0) |
981 | 982 | glueStart = cur+1; |
— | — | @@ -1276,7 +1277,7 @@ |
1277 | 1278 | continue; |
1278 | 1279 | case LINK_FETCH: |
1279 | 1280 | if(length == 0 && c ==' ') |
1280 | | - continue; // ignore leading whitespaces |
| 1281 | + continue; // ignore leading whitespace |
1281 | 1282 | if(c == ']'){ |
1282 | 1283 | state = ParserState.LINK_END; |
1283 | 1284 | continue; |
— | — | @@ -1333,7 +1334,7 @@ |
1334 | 1335 | cur = fetchStart; |
1335 | 1336 | state = ParserState.CATEGORY_WORDS; |
1336 | 1337 | } else |
1337 | | - System.err.print("ERROR: Inconsistent parser state, attepmted category backtrace for uninitalized fetchStart."); |
| 1338 | + System.err.print("ERROR: Inconsistent parser state, attempted category backtrace for uninitialized fetchStart.");
1338 | 1339 | fetchStart = -1; |
1339 | 1340 | continue; |
1340 | 1341 | case INTERWIKI: |
— | — | @@ -1375,7 +1376,7 @@ |
1376 | 1377 | continue; |
1377 | 1378 | case TABLE_BEGIN: |
1378 | 1379 | tableLevel++; |
1379 | | - // ignore everything up to the newspace, since they are table display params |
| 1380 | + // ignore everything up to the newline, since they are table display params |
1380 | 1381 | while(cur < textLength && (text[cur]!='\r' && text[cur]!='\n')) |
1381 | 1382 | cur++; |
1382 | 1383 | state = ParserState.WORD; |
— | — | @@ -1422,7 +1423,7 @@ |
1423 | 1424 | flushGlue(); |
1424 | 1425 | if(nonContentTokens.size() != 0){ |
1425 | 1426 | boolean first = true; |
1426 | | - // flush any remaning tokens from initial templates, etc.. |
| 1427 | + // flush any remaining tokens from initial templates, etc.. |
1427 | 1428 | for(Token tt : nonContentTokens){ |
1428 | 1429 | if(first){ |
1429 | 1430 | tt.setPositionIncrement(FIRST_SECTION_GAP); |
— | — | @@ -1595,7 +1596,11 @@ |
1596 | 1597 | return new String(buf,0,len).trim(); |
1597 | 1598 | } |
1598 | 1599 | |
1599 | | - /** Delete all vowels from a word or phrase */ |
| 1600 | + /** |
| 1601 | + * Delete all vowels from a word or phrase |
| 1602 | + * |
| 1603 | + * Unused (except test)? |
| 1604 | + */ |
1600 | 1605 | public static String deleteVowels(String title){ |
1601 | 1606 | char[] buf = new char[256]; |
1602 | 1607 | |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java |
— | — | @@ -8,7 +8,7 @@ |
9 | 9 | import org.apache.lucene.analysis.TokenStream; |
10 | 10 | |
11 | 11 | /** |
12 | | - * Add english singular forms of words as aliases of |
| 12 | + * Add English singular forms of words as aliases of |
13 | 13 | * type "singular" |
14 | 14 | * |
15 | 15 | * @author rainman |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | public class EnglishSingularFilter extends TokenFilter{ |
19 | 19 | Singular singular = new EnglishKStemSingular(); |
20 | 20 | |
21 | | - Token next = null, next2=null; |
| 21 | + Token next = null, next2 = null;
22 | 22 | public EnglishSingularFilter(TokenStream input) { |
23 | 23 | super(input); |
24 | 24 | } |
— | — | @@ -53,7 +53,7 @@ |
54 | 54 | return t; |
55 | 55 | } |
56 | 56 | |
57 | | - /** Return token with sigular form of the noun, or null if none found */ |
| 57 | + /** Return token with singular form of the noun, or null if none found */ |
58 | 58 | protected final Token singular(Token t){ |
59 | 59 | String w = singular.getSingular(t.termText()); |
60 | 60 | if(w != null){ |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java |
— | — | @@ -90,7 +90,7 @@ |
91 | 91 | return ret; |
92 | 92 | } |
93 | 93 | |
94 | | - /** Get a brand new hash set of predifined stop words (i.e. not those generated from lucene indexes) */ |
| 94 | + /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */ |
95 | 95 | public static HashSet<String> getPredefinedSet(String langCode){ |
96 | 96 | loadPredefined(); |
97 | 97 | HashSet<String> ret = new HashSet<String>(); |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -67,7 +67,7 @@ |
68 | 68 | for(int i=0;i<levels;i++) |
69 | 69 | keywordsBySize.add(new ArrayList<String>()); |
70 | 70 | TokenizerOptions options = new TokenizerOptions(exactCase); |
71 | | - // arange keywords into a list by token number |
| 71 | + // arrange keywords into a list by token number |
72 | 72 | for(String k : keywords){ |
73 | 73 | ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,options).parse(); |
74 | 74 | if(parsed.size() == 0) |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java |
— | — | @@ -6,12 +6,11 @@ |
7 | 7 | import java.util.Iterator; |
8 | 8 | |
9 | 9 | import org.apache.lucene.analysis.Analyzer; |
10 | | -import org.apache.lucene.analysis.LowerCaseFilter; |
11 | 10 | import org.apache.lucene.analysis.Token; |
12 | 11 | import org.apache.lucene.analysis.TokenStream; |
13 | 12 | |
14 | 13 | /** Produces a token stream for category field in the lucene index. |
15 | | - * Each token is a single category (category names themself are |
| 14 | + * Each token is a single category (category names themselves are |
16 | 15 | * not tokenized) */ |
17 | 16 | public class CategoryAnalyzer extends Analyzer { |
18 | 17 | public class ArrayTokenStream extends TokenStream { |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java |
— | — | @@ -7,7 +7,7 @@ |
8 | 8 | import org.apache.lucene.analysis.Token; |
9 | 9 | import org.apache.lucene.analysis.TokenStream; |
10 | 10 | /** |
11 | | - * Analyzer that just lowecases the text, doesn't split up anything, etc.. |
| 11 | + * Analyzer that just lowercases the text, doesn't split up anything, etc.. |
12 | 12 | * |
13 | 13 | * @author rainman |
14 | 14 | * |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java |
— | — | @@ -1,12 +1,9 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | | -import java.lang.reflect.Constructor; |
6 | | -import java.lang.reflect.InvocationTargetException; |
7 | 5 | |
8 | 6 | import org.apache.log4j.Logger; |
9 | 7 | import org.apache.lucene.analysis.Token; |
10 | | -import org.apache.lucene.analysis.TokenFilter; |
11 | 8 | import org.apache.lucene.analysis.TokenStream; |
12 | 9 | |
13 | 10 | /** |
— | — | @@ -30,7 +27,6 @@ |
31 | 28 | * 2) stemmers should never change tokens, if the text needs to be |
32 | 29 | * changed, return a new Token object |
33 | 30 | * |
34 | | - * @param language |
35 | 31 | */ |
36 | 32 | public AliasFilter(FilterFactory filters, TokenStream input, TokenStream duplicate){ |
37 | 33 | this.input = input; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java |
— | — | @@ -19,7 +19,7 @@ |
20 | 20 | |
21 | 21 | @Override |
22 | 22 | public Token next() throws IOException { |
23 | | - // return buferred |
| 23 | + // return buffered |
24 | 24 | if(inx < buffer.size()) |
25 | 25 | return buffer.get(inx++); |
26 | 26 | |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java |
— | — | @@ -3,7 +3,7 @@ |
4 | 4 | import org.apache.lucene.analysis.KStemmer; |
5 | 5 | |
6 | 6 | /** |
7 | | - * KStem-based singular-finding class for english |
| 7 | + * KStem-based singular-finding class for English |
8 | 8 | * |
9 | 9 | * @author rainman |
10 | 10 | * |
— | — | @@ -15,7 +15,7 @@ |
16 | 16 | if(!word.equals(ret)) |
17 | 17 | return ret; |
18 | 18 | else{ |
19 | | - // strip possesive |
| 19 | + // strip possessive suffix |
20 | 20 | if(word.endsWith("'s")) |
21 | 21 | return word.substring(0,word.length()-2); |
22 | 22 | return null; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | |
60 | 60 | } |
61 | 61 | /** |
62 | | - * Serialize alttitle for highlighting, serializies titles, redirects, sections. |
| 62 | + * Serialize alttitle for highlighting, serializes titles, redirects, sections. |
63 | 63 | * Writes original names + highlight tokens. |
64 | 64 | * |
65 | 65 | * @param article |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java |
— | — | @@ -7,8 +7,8 @@ |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
9 | 9 | |
10 | 10 | /** |
11 | | - * Vietnamese standard transliterations to ascii. Most of the stuff is done by unicode decomposed, |
12 | | - * we just additionaly convert Đ/đ -> D/d |
| 11 | + * Vietnamese standard transliterations to ASCII. Most of the work is done by Unicode decomposition.
| 12 | + * The only additional conversion here is: Đ/đ -> D/d
13 | 13 | * |
14 | 14 | * @author rainman |
15 | 15 | * |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -1,28 +1,12 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
4 | | -import java.util.ArrayList; |
5 | | -import java.util.HashMap; |
6 | 4 | import java.util.HashSet; |
7 | 5 | |
8 | 6 | import org.apache.log4j.Logger; |
9 | 7 | import org.apache.lucene.analysis.Analyzer; |
10 | 8 | import org.apache.lucene.analysis.PerFieldAnalyzerWrapper; |
11 | | -import org.apache.lucene.analysis.PorterStemFilter; |
12 | | -import org.apache.lucene.analysis.SimpleAnalyzer; |
13 | | -import org.apache.lucene.analysis.de.GermanStemFilter; |
14 | | -import org.apache.lucene.analysis.fr.FrenchStemFilter; |
15 | | -import org.apache.lucene.analysis.nl.DutchStemFilter; |
16 | | -import org.apache.lucene.analysis.ru.RussianStemFilter; |
17 | | -import org.apache.lucene.analysis.th.ThaiWordFilter; |
18 | | -import org.apache.lucene.search.FieldSortedHitQueue; |
19 | | -import org.wikimedia.lsearch.analyzers.FieldBuilder.BuilderSet; |
20 | | -import org.wikimedia.lsearch.beans.Article; |
21 | | -import org.wikimedia.lsearch.beans.Title; |
22 | 9 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
23 | 10 | import org.wikimedia.lsearch.config.IndexId; |
24 | | -import org.wikimedia.lsearch.index.WikiIndexModifier; |
25 | | -import org.wikimedia.lsearch.ranks.Links; |
26 | | -import org.wikimedia.lsearch.related.RelatedTitle; |
27 | 11 | |
28 | 12 | /** |
29 | 13 | * Global functions related to creation/usage of analyzers. |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java |
— | — | @@ -72,8 +72,8 @@ |
73 | 73 | (c >= 0x3300 && c <= 0x337f) || |
74 | 74 | (c >= 0x3400 && c <= 0x3d2d) || |
75 | 75 | (c >= 0x4e00 && c <= 0x9fff) || |
76 | | - (c >= 0xf900 && c <= 0xfaff) || |
77 | | - (c >= 0xac00 && c <= 0xd7af); |
| 76 | + (c >= 0xf900 && c <= 0xfaff) || |
| 77 | + (c >= 0xac00 && c <= 0xd7af); |
78 | 78 | } |
79 | 79 | |
80 | 80 | } |
\ No newline at end of file |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java |
— | — | @@ -6,6 +6,9 @@ |
7 | 7 | import org.apache.lucene.analysis.TokenFilter; |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
9 | 9 | |
| 10 | +/** |
| 11 | + * Removes dots from acronyms? |
| 12 | + */ |
10 | 13 | public class AcronymFilter extends TokenFilter { |
11 | 14 | Token buffered = null; |
12 | 15 | |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java |
— | — | @@ -38,7 +38,7 @@ |
39 | 39 | protected Token phrase1 = null, phrase2 = null; |
40 | 40 | protected boolean phraseReady = false; |
41 | 41 | protected String gap = "_"; |
42 | | - /** pairs of words, two adjecent words */ |
| 42 | + /** pairs of words, two adjacent words */ |
43 | 43 | protected Token pair1 = null, pair2 = null; |
44 | 44 | protected boolean pairReady = false; |
45 | 45 | protected Token nextToken = null; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java |
— | — | @@ -3,18 +3,18 @@ |
4 | 4 | import org.wikimedia.lsearch.config.IndexId; |
5 | 5 | |
6 | 6 | /** |
7 | | - * Agregate class for FilterFactory and FieldNameFactory. This class |
8 | | - * contains methods used to build various fields of the index, |
9 | | - * it contains field names to be used, filter that are to be applied... |
| 7 | + * Aggregate class for FilterFactory and FieldNameFactory. This class contains |
| 8 | + * methods used to build various fields of the index; it contains the field names to
| 9 | + * be used, the filters that are to be applied...
10 | 10 | * |
11 | 11 | * @author rainman |
12 | | - * |
| 12 | + * |
13 | 13 | */ |
14 | 14 | public class FieldBuilder { |
15 | 15 | public class BuilderSet{ |
16 | 16 | FilterFactory filters; |
17 | 17 | FieldNameFactory fields; |
18 | | - boolean addKeywords; // wether to add keywords from beginning of article |
| 18 | + boolean addKeywords; // whether to add keywords from beginning of article |
19 | 19 | |
20 | 20 | public BuilderSet(FilterFactory filters, FieldNameFactory fields) { |
21 | 21 | this.filters = filters; |
Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java |
— | — | @@ -1,9 +1,11 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
| 4 | +import java.util.Arrays; |
4 | 5 | import java.util.HashMap; |
| 6 | +import java.util.HashSet; |
5 | 7 | |
6 | 8 | /** |
7 | | - * Porter-based singular filter for english |
| 9 | + * Porter-based singular filter for English |
8 | 10 | * |
9 | 11 | * @author rainman |
10 | 12 | * |
— | — | @@ -18,10 +20,11 @@ |
19 | 21 | if(w.length() <= 3 || w.charAt(w.length()-1) != 's') |
20 | 22 | return null; |
21 | 23 | // exceptions (from porter2) |
22 | | - if("news".equals(w) || "atlas".equals(w) || "cosmos".equals(w) |
23 | | - || "bias".equals(w) || "andes".equals(w) || "aries".equals(w)) |
| 24 | + String[] exceptions = { "news", "atlas", "cosmos", "bias", "andes", "aries" }; |
| 25 | + HashSet<String> set = new HashSet<String>(Arrays.asList(exceptions)); |
| 26 | + if( set.contains(w) ) |
24 | 27 | return null; |
25 | | - // don't strip posssesive form |
| 28 | + // don't strip possessive form |
26 | 29 | if(w.endsWith("'s")){ |
27 | 30 | //if(w.length() > 2) |
28 | 31 | // return w.substring(0,w.length()-2); |