Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -130,12 +130,12 @@ |
131 | 131 | Analyzer analyzer = Analyzers.getSearcherAnalyzer("en"); |
132 | 132 | bs = new FieldBuilder("en").getBuilder(); |
133 | 133 | parser = new WikiQueryParser(bs.getFields().title(),"0",analyzer,bs,NamespacePolicy.IGNORE,stopWords); |
134 | | - assertEquals("[how, do, you, do]",parser.extractPhrases(parser.parseRaw("how do you do")).toString()); |
135 | | - assertEquals("[making, something, rest]",parser.extractPhrases(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString()); |
136 | | - assertEquals("[godel, theorem]",parser.extractPhrases(parser.parseRaw("gödel theorem")).toString()); |
137 | | - assertEquals("[some, text, and, some, phrase]",parser.extractPhrases(parser.parseRaw("some_text and \"some phrase\"")).toString()); |
| 134 | + assertEquals("[how, do, you, do]",parser.extractWords(parser.parseRaw("how do you do")).toString()); |
| 135 | + assertEquals("[making, something, rest]",parser.extractWords(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString()); |
| 136 | + assertEquals("[godel, theorem]",parser.extractWords(parser.parseRaw("gödel theorem")).toString()); |
| 137 | + assertEquals("[some, text, and, some, phrase]",parser.extractWords(parser.parseRaw("some_text and \"some phrase\"")).toString()); |
138 | 138 | |
139 | | - ArrayList<String> words = parser.extractPhrases(parser.parseRaw("the who band is something nobody knows about")); |
| 139 | + ArrayList<String> words = parser.extractWords(parser.parseRaw("the who band is something nobody knows about")); |
140 | 140 | assertEquals("contents:\"the who band\"~10 contents:\"band is something\"~10 contents:\"something nobody\"~10 contents:\"nobody knows\"~10 contents:\"knows about\"~10",parser.makePhraseQueries(words,"contents",10,1).toString()); |
141 | 141 | |
142 | 142 | // namespace policies |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java |
— | — | @@ -0,0 +1,100 @@ |
| 2 | +package org.wikimedia.lsearch.test; |
| 3 | + |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.IOException; |
| 6 | +import java.io.InputStreamReader; |
| 7 | +import java.net.MalformedURLException; |
| 8 | +import java.net.URL; |
| 9 | +import java.net.URLEncoder; |
| 10 | + |
| 11 | +/** |
| 12 | + * Remotely test a spell-checker host |
| 13 | + * |
| 14 | + * @author rainman |
| 15 | + * |
| 16 | + */ |
| 17 | +public class SpellCheckTest { |
| 18 | + static String host = "localhost"; |
| 19 | + static int port = 8123; |
| 20 | + static String db = "enwiki"; |
| 21 | + |
| 22 | + public static String getSuggestion(String query) throws IOException{ |
| 23 | + query = query.replace(" ","%20"); |
| 24 | + String urlString = "http://"+host+":"+port+"/search/"+db+"/"+query+"?case=ignore&limit=20&namespaces=0&offset=0"; |
| 25 | + URL url = new URL(urlString); |
| 26 | + BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream())); |
| 27 | + String line; |
| 28 | + int lineNum = 0; |
| 29 | + while ( (line = br.readLine()) != null ) { |
| 30 | + if(lineNum == 1){ |
| 31 | + if(line.startsWith("#suggest")){ |
| 32 | + br.close(); |
| 33 | + return line.substring(9).replaceAll("<[^>]+>",""); |
| 34 | + } |
| 35 | + } |
| 36 | + lineNum ++ ; |
| 37 | + } |
| 38 | + br.close(); |
| 39 | + return ""; |
| 40 | + } |
| 41 | + |
| 42 | + /** |
| 43 | + * @param args |
| 44 | + * @throws IOException |
| 45 | + */ |
| 46 | + public static void main(String[] args) throws IOException { |
| 47 | + int len = CHECK.length; |
| 48 | + System.out.println("Running "+len+" tests"); |
| 49 | + int good = 0, failed = 0; |
| 50 | + int count = 1; |
| 51 | + for(String[] c : CHECK){ |
| 52 | + String sug = getSuggestion(c[0]); |
| 53 | + if(!sug.equals(c[1])){ |
| 54 | + System.out.println("["+count+"/"+len+"] FAILED {"+sug+"} EXPECTED ["+c[1]+"] FOR ["+c[0]+"]"); |
| 55 | + failed++; |
| 56 | + } else{ |
| 57 | + System.out.println("["+count+"/"+len+"] OK"); |
| 58 | + good++; |
| 59 | + } |
| 60 | + count ++; |
| 61 | + } |
| 62 | + System.out.println("Good tests: "+good+", failed tests: "+failed); |
| 63 | + } |
| 64 | + |
| 65 | + // wrong -> right |
| 66 | + private static final String[][] CHECK = { |
| 67 | + {"annul of improbably research", "annals of improbable research" }, |
| 68 | + {"los angles", "los angeles" }, |
| 69 | + {"what is the type of engineers thats deal with various depth of the eart crust", "what is the type of engineers thats deal with various depths of the earth crust"}, |
| 70 | + {"argentina cilmage", "argentina climate"}, |
| 71 | + {"Vista Compatibly", "Vista Compatible"}, |
| 72 | + {"sarah thomson", "sarah thompson"}, |
| 73 | + {"attribution (finance)", ""}, |
| 74 | + {"SOUTH PARK EPISDOE LIST", "SOUTH PARK EPISODE LIST"}, |
| 75 | + {"the grnd canyon", "the grand canyon"}, |
| 76 | + {"ron burgand","ron burgundy"}, |
| 77 | + {"fullmetal achemist ep 1","fullmetal alchemist ep 1"}, |
| 78 | + {"fullmetal alchemist ep 1",""}, |
| 79 | + {"enerst shackleton", "ernest shackleton"}, |
| 80 | + {"los angles lakers", "los angeles lakers"}, |
| 81 | + {"crab fisher","crab fishing"}, |
| 82 | + {"discovery channe;", "discovery channel"}, |
| 83 | + {"Young Cuties", ""}, |
| 84 | + {"fire australia", ""}, |
| 85 | + {"platoon film", ""}, |
| 86 | + {"basillar artery","basilar artery"}, |
| 87 | + {"franki vallie","frankie valli"}, |
| 88 | + {"cuties",""}, |
| 89 | + {"teh",""}, |
| 90 | + {"21st ammendment", "21st amendment"}, |
| 91 | + {"stargate junior",""}, |
| 92 | + {"fire australia",""}, |
| 93 | + {"ISO crack", ""}, |
| 94 | + {"The James Gang (band)",""}, |
| 95 | + {"cource", "course"}, |
| 96 | + {"carolene products",""}, |
| 97 | + {"orvileWright","overnight"}, |
| 98 | + |
| 99 | + }; |
| 100 | + |
| 101 | +} |
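Note: the test talks to a live daemon using the static defaults above (localhost:8123, enwiki); assuming the compiled classes are on the classpath, it is launched directly via its main method:

    java org.wikimedia.lsearch.test.SpellCheckTest

Each CHECK pair is a query and the suggestion expected from the host; an empty second string means no suggestion should be returned.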
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java |
— | — | @@ -65,8 +65,20 @@ |
66 | 66 | HashMap query = new QueryStringMap(uri); |
67 | 67 | SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query); |
68 | 68 | contentType = "text/plain"; |
69 | | - if(res!=null && res.isSuccess()){ |
| 69 | + // format: |
| 70 | + // <namespace> <title> (resNum-times) |
| 71 | + if(what.equals("prefix")){ |
70 | 72 | sendHeaders(200, "OK"); |
| 73 | + for(ResultSet rs : res.getResults()){ |
| 74 | + sendResultLine(rs.namespace, rs.title); |
| 75 | + } |
| 76 | + } |
| 77 | + // format: |
| 78 | + // <num of hits> |
| 79 | + // #suggest <query> or #no suggestion |
| 80 | + // <score> <ns> <title> (resNum-times) |
| 81 | + else if(res!=null && res.isSuccess()){ |
| 82 | + sendHeaders(200, "OK"); |
71 | 83 | sendOutputLine(Integer.toString(res.getNumHits())); |
72 | 84 | if(res.getSuggest() != null) |
73 | 85 | sendOutputLine("#suggest "+res.getSuggest()); |
— | — | @@ -122,4 +134,12 @@ |
123 | 135 | } |
124 | 136 | } |
125 | 137 | |
| 138 | + private void sendResultLine(String namespace, String title) { |
| 139 | + try{ |
| 140 | + sendOutputLine(namespace + " " + URLEncoder.encode(title.replaceAll(" ", "_"), "UTF-8")); |
| 141 | + } catch(Exception e){ |
| 142 | + log.error("Error sending prefix result line (" + namespace + " " + title +"): "+e.getMessage()); |
| 143 | + } |
| 144 | + } |
| 145 | + |
126 | 146 | } |
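To make the two formats above concrete, hypothetical response bodies (all titles, scores and counts invented for illustration): a prefix request yields bare result lines,

    0 Douglas_Adams
    0 Douglas_Fir

while a regular search yields the hit count, an optional suggestion line, then scored results:

    1234
    #suggest douglas adams
    0.9871 0 Douglas_Adams
    0.3402 0 Dirk_Gently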
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -51,7 +51,7 @@ |
52 | 52 | protected Set<String> stopWords; |
53 | 53 | |
54 | 54 | /** Distance and metaphone metrics */
55 | | - static class Metric { |
| 55 | + static public class Metric { |
56 | 56 | protected DoubleMetaphone dmeta = new DoubleMetaphone(); |
57 | 57 | protected String meta1, meta2; |
58 | 58 | protected EditDistance sd; |
— | — | @@ -132,7 +132,7 @@ |
133 | 133 | this.type = type; |
134 | 134 | } |
135 | 135 | public String toString(){ |
136 | | - return "dist:"+dist+"-freq:"+freq+"-sub:"+substitutes+"-pres:"+preserves; |
| 136 | + return "["+type+" dist:"+dist+" freq:"+freq+" sub:"+substitutes+" pres:"+preserves+"]"; |
137 | 137 | } |
138 | 138 | } |
139 | 139 | |
— | — | @@ -176,15 +176,23 @@ |
177 | 177 | ArrayList<Change> suggestionsTitle = new ArrayList<Change>(); |
178 | 178 | |
179 | 179 | // add correct words |
180 | | - for(int i=0;i<tokens.size();i++){ |
| 180 | + /*for(int i=0;i<tokens.size();i++){ |
181 | 181 | Token t = tokens.get(i); |
182 | 182 | if(correctWords.contains(t.termText())){ |
183 | 183 | Change c = new Change(0,1,Change.Type.TITLE_WORD); |
184 | 184 | c.preserves.put(i,t.termText()); |
185 | 185 | suggestions.add(c); |
186 | 186 | } |
| 187 | + } */ |
| 188 | + |
| 189 | + // check for exact title match |
| 190 | + if(tokens.size() == 1){ |
| 191 | + String w = tokens.get(0).termText(); |
| 192 | + if(correctWords.contains(w) && reader.docFreq(new Term("title",w)) != 0) |
| 193 | + return null; |
187 | 194 | } |
188 | 195 | |
| 196 | + HashSet<String> stemmedCorrectWords = stemSet(correctWords,parser.getBuilder().getFilters()); |
189 | 197 | ArrayList<ArrayList<SuggestResult>> wordSug = new ArrayList<ArrayList<SuggestResult>>(); |
190 | 198 | HashSet<Integer> correctIndex = new HashSet<Integer>(); |
191 | 199 | ArrayList<SuggestResult> possibleStopWords = new ArrayList<SuggestResult>(); |
— | — | @@ -214,11 +222,7 @@ |
215 | 223 | if(w2 == null) |
216 | 224 | continue; |
217 | 225 | |
218 | | - String phrase = w+gap+w2; |
219 | | - if(reader.docFreq(new Term("phrase",phrase)) != 0){ |
220 | | - correctPhrases.add(i); |
221 | | - correctPhrases.add(i2); |
222 | | - } else if(correctWords.contains(w) && correctWords.contains(w2)){ |
| 226 | + if(correctWords.contains(w) && correctWords.contains(w2)){ |
223 | 227 | for(HashSet<String> title : titles){ |
224 | 228 | if(title.contains(w) && title.contains(w2)){ |
225 | 229 | correctPhrases.add(i); |
— | — | @@ -263,26 +267,18 @@ |
264 | 268 | } |
265 | 269 | } |
266 | 270 | possibleStopWords.add(maybeStopWord); |
267 | | - // detect common misspells |
268 | | - if(sug.size() > 1){ |
269 | | - SuggestResult r1 = sug.get(0); |
270 | | - SuggestResult r2 = sug.get(1); |
271 | | - if(r1.dist == 1 && r2.dist == 0 && r1.frequency > 100 * r2.frequency){ |
272 | | - Change c = new Change(r1.dist,r1.frequency,Change.Type.WORD); |
273 | | - c.substitutes.put(i,r1.word); |
274 | | - suggestions.add(c); |
275 | | - } |
276 | | - } |
277 | 271 | } else{ |
278 | 272 | wordSug.add(null); |
279 | 273 | possibleStopWords.add(null); |
280 | 274 | } |
281 | 275 | // suggest split |
282 | | - SuggestResult split = suggestSplit(w,minFreq); |
283 | | - if(split != null){ |
284 | | - Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT); |
285 | | - sc.substitutes.put(i,split.word.replace("_"," ")); |
286 | | - suggestions.add(sc); |
| 276 | + if(!correctWords.contains(w)){ |
| 277 | + SuggestResult split = suggestSplit(w,minFreq); |
| 278 | + if(split != null){ |
| 279 | + Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT); |
| 280 | + sc.substitutes.put(i,split.word.replace("_"," ")); |
| 281 | + suggestions.add(sc); |
| 282 | + } |
287 | 283 | } |
288 | 284 | // suggest join |
289 | 285 | if(i-1 >= 0 |
— | — | @@ -306,7 +302,8 @@ |
307 | 303 | ArrayList<SuggestResult> sug2 = null; |
308 | 304 | String w2 = null; |
309 | 305 | String gap = "_"; |
310 | | - boolean good1 = sug1.get(0).getDist() == 0; // w1 is spellchecked right |
| 306 | + // if w1 is spellchecked right |
| 307 | + boolean good1 = sug1.get(0).getDist() == 0; |
311 | 308 | int i2 = i; |
312 | 309 | boolean maybeStopWord = false; // the current i2 might be a stop word, try to find phrases with it as stop word
313 | 310 | int distOffset = 0; // if we spellchecked to a stop word, all phrases should have this initial dist
— | — | @@ -331,7 +328,8 @@ |
332 | 329 | } |
333 | 330 | if(sug2 == null) |
334 | 331 | continue; |
335 | | - boolean good2 = sug2.get(0).getDist() == 0; // w2 is spellchecked right |
| 332 | + // if second word is spelled right |
| 333 | + boolean good2 = sug2.get(0).getDist() == 0; |
336 | 334 | int maxdist = Math.min((w1.length() + w2.length()) / 3, 5); |
337 | 335 | int mindist = -1; |
338 | 336 | boolean forTitlesOnly = false; |
— | — | @@ -358,21 +356,30 @@ |
359 | 357 | } |
360 | 358 | //log.info("Checking "+phrase); |
361 | 359 | if(freq > 0){ |
| 360 | + // number of characters added/subtracted
| 361 | + int diff1 = Math.abs(s1.word.length()-w1.length()); |
| 362 | + int diff2 = Math.abs(s2.word.length()-w2.length()); |
362 | 363 | log.info("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle); |
363 | 364 | int dist = s1.dist + s2.dist + distOffset; |
364 | 365 | boolean accept = true; |
365 | 366 | Change c = new Change(dist,freq,Change.Type.PHRASE); |
366 | 367 | if(s1.word.equals(w1)) |
367 | 368 | c.preserves.put(i,w1); |
368 | | - else if(!good1 || inTitle) |
| 369 | + else if(!good1 || (inTitle && diff1 <= 2 && !correctWords.contains(w1))) |
369 | 370 | c.substitutes.put(i,s1.word); |
370 | | - else |
| 371 | + else if(!good1 || (inTitle && diff1 <=2)){ |
| 372 | + forTitlesOnly = true; |
| 373 | + c.substitutes.put(i,s1.word); |
| 374 | + } else |
371 | 375 | accept = false; |
372 | 376 | if(s2.word.equals(w2)) |
373 | 377 | c.preserves.put(i2,w2); |
374 | | - else if(!good2 || inTitle) |
| 378 | + else if(!good2 || (inTitle && diff2 <= 2 && !correctWords.contains(w2))) |
375 | 379 | c.substitutes.put(i2,s2.word); |
376 | | - else |
| 380 | + else if(!good2 || (inTitle && diff2 <= 2)){ |
| 381 | + forTitlesOnly = true; |
| 382 | + c.substitutes.put(i2,s2.word); |
| 383 | + } else |
377 | 384 | accept = false; |
378 | 385 | if(accept){ |
379 | 386 | if(mindist == -1) |
— | — | @@ -384,10 +391,11 @@ |
385 | 392 | } |
386 | 393 | } |
387 | 394 | } |
388 | | - } while(maybeStopWord); |
| 395 | + } while(maybeStopWord && i2+1<tokens.size()); |
389 | 396 | } |
390 | 397 | // try to construct a valid title by spell-checking all words |
391 | 398 | if(suggestionsTitle.size() > 0){ |
| 399 | + log.info("Trying exact-title matches"); |
392 | 400 | Object[] ret = calculateChanges(suggestionsTitle,searchterm.length()/2); |
393 | 401 | ArrayList<Entry<Integer,String>> proposedTitle = (ArrayList<Entry<Integer, String>>) ret[0]; |
394 | 402 | boolean madeChanges = false; |
— | — | @@ -395,8 +403,10 @@ |
396 | 404 | String formated = searchterm; |
397 | 405 | for(Entry<Integer,String> e : proposedTitle){ |
398 | 406 | Token t = tokens.get(e.getKey()); |
399 | | - String nt = e.getValue(); |
400 | | - if(!stemsToSame(t.termText(),nt,parser.getBuilder().getFilters())){ |
| 407 | + String nt = e.getValue(); |
| 408 | + // replace words if they don't stem to the same word, or if they stem to the same word but the original is misspelled
| 409 | + boolean stemNotSame = stemNotSameOrInSet(t.termText(),nt,parser.getBuilder().getFilters(),stemmedCorrectWords); |
| 410 | + if(stemNotSame || (!stemNotSame && reader.docFreq(new Term("word",t.termText())) == 0)){ |
401 | 411 | formated = markSuggestion(formated,t,nt); |
402 | 412 | title = applySuggestion(title,t,nt); |
403 | 413 | madeChanges = true; |
— | — | @@ -412,6 +422,7 @@ |
413 | 423 | } else if(tokens.size() == 1 && wordSug.get(0)!=null |
414 | 424 | && wordSug.get(0).size() > 0 && !correctWords.contains(tokens.get(0).termText())){ |
415 | 425 | // only one token, try different spell-checks for title |
| 426 | + log.info("Trying exact-title single word match"); |
416 | 427 | ArrayList<SuggestResult> sg = (ArrayList<SuggestResult>) wordSug.get(0).clone(); |
417 | 428 | Collections.sort(sg,new SuggestResult.ComparatorNoCommonMisspell()); |
418 | 429 | Token t = tokens.get(0); |
— | — | @@ -434,6 +445,7 @@ |
435 | 446 | ArrayList<Entry<Integer,String>> proposedChanges = new ArrayList<Entry<Integer,String>>(); |
436 | 447 | if(suggestions.size() > 0){ |
437 | 448 | // found some suggestions |
| 449 | + log.info("Trying phrases ..."); |
438 | 450 | Object[] ret = calculateChanges(suggestions,searchterm.length()/2); |
439 | 451 | proposedChanges = (ArrayList<Entry<Integer, String>>) ret[0]; |
440 | 452 | ArrayList<Entry<Integer,String>> preservedWords = (ArrayList<Entry<Integer, String>>) ret[1]; |
— | — | @@ -442,12 +454,13 @@ |
443 | 455 | for(Entry<Integer,String> e : proposedChanges) |
444 | 456 | preserveTokens.add(e.getKey()); |
445 | 457 | } |
446 | | - |
| 458 | + log.info("Adding words, preserve tokens: "+preserveTokens+", preserve correct phrases: "+correctPhrases); |
447 | 459 | // last resort: go with individual word suggestions |
448 | 460 | HashMap<Integer,String> wordChanges = new HashMap<Integer,String>(); |
449 | | - for(int i=0;i<tokens.size();i++){ |
450 | | - if(preserveTokens.contains(i)) |
| 461 | + for(int i=0;i<tokens.size();i++){ |
| 462 | + if(preserveTokens.contains(i) || correctPhrases.contains(i)) |
451 | 463 | continue; |
| 464 | + // TODO: maybe check for common misspells here?! |
452 | 465 | ArrayList<SuggestResult> sug = wordSug.get(i); |
453 | 466 | if(sug == null) |
454 | 467 | continue; |
— | — | @@ -457,7 +470,7 @@ |
458 | 471 | } |
459 | 472 | if(wordChanges.size() != 0) |
460 | 473 | proposedChanges.addAll(wordChanges.entrySet()); |
461 | | - |
| 474 | + |
462 | 475 | // sort in reverse order from that in query, i.e. first change in the last term |
463 | 476 | Collections.sort(proposedChanges,new Comparator<Entry<Integer,String>>() { |
464 | 477 | public int compare(Entry<Integer,String> o1, Entry<Integer,String> o2){ |
— | — | @@ -471,7 +484,9 @@ |
472 | 485 | for(Entry<Integer,String> e : proposedChanges){ |
473 | 486 | Token t = tokens.get(e.getKey()); |
474 | 487 | String nt = e.getValue(); |
475 | | - if(!stemsToSame(t.termText(),nt,parser.getBuilder().getFilters())){ |
| 488 | + // incorrect word, or doesn't stem to the same word
| 489 | + boolean stemNotSame = stemNotSameOrInSet(t.termText(),nt,parser.getBuilder().getFilters(),stemmedCorrectWords); |
| 490 | + if(stemNotSame || (!stemNotSame && reader.docFreq(new Term("word",t.termText())) == 0)){ |
476 | 491 | formated = markSuggestion(formated,t,nt); |
477 | 492 | searchterm = applySuggestion(searchterm,t,nt); |
478 | 493 | madeChanges = true; |
— | — | @@ -484,15 +499,27 @@ |
485 | 500 | return null; |
486 | 501 | } |
487 | 502 | |
| 503 | + /** try to figure out the case of the original spell-checked word, and output the new word in that case */
| 504 | + protected String simulateCase(String formated, Token t, String newWord) { |
| 505 | + String old = formated.substring(t.startOffset(),t.endOffset()); |
| 506 | + if(old.equals(old.toLowerCase())) |
| 507 | + return newWord.toLowerCase(); |
| 508 | + if(old.equals(old.toUpperCase())) |
| 509 | + return newWord.toUpperCase(); |
| 510 | + if(old.length()>1 && old.equals(old.substring(0,1).toUpperCase()+old.substring(1))) |
| 511 | + return newWord.substring(0,1).toUpperCase()+newWord.substring(1).toLowerCase(); |
| 512 | + return newWord; |
| 513 | + } |
| 514 | + |
488 | 515 | protected String markSuggestion(String formated, Token t, String newWord){ |
489 | 516 | return formated.substring(0,t.startOffset()) |
490 | | - + "<i>" + newWord + "</i>" |
| 517 | + + "<i>" + simulateCase(formated,t,newWord) + "</i>" |
491 | 518 | + formated.substring(t.endOffset()); |
492 | 519 | } |
493 | 520 | |
494 | 521 | protected String applySuggestion(String searchterm, Token t, String newWord){ |
495 | 522 | return searchterm.substring(0,t.startOffset()) |
496 | | - + newWord |
| 523 | + + simulateCase(searchterm,t,newWord) |
497 | 524 | + searchterm.substring(t.endOffset()); |
498 | 525 | } |
499 | 526 | |
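Illustrative behavior of simulateCase, mapping a suggestion onto the casing of the token it replaces (tokens hypothetical):

    // original token   suggestion    output
    // "ANGLES"         "angeles"  -> "ANGELES"  (all-uppercase original)
    // "Angles"         "angeles"  -> "Angeles"  (capitalized original)
    // "angles"         "angeles"  -> "angeles"  (all-lowercase original)
    // "aNgLes"         "angeles"  -> "angeles"  (mixed case: suggestion returned unchanged)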
— | — | @@ -575,7 +602,7 @@ |
576 | 603 | hr.addAll(r1); hr.addAll(r2); |
577 | 604 | ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
578 | 605 | res.addAll(hr); |
579 | | - Collections.sort(res,new SuggestResult.Comparator()); |
| 606 | + Collections.sort(res,new SuggestResult.ComparatorNoCommonMisspell()); |
580 | 607 | return res; |
581 | 608 | } |
582 | 609 | return r1; |
— | — | @@ -718,11 +745,46 @@ |
719 | 746 | if(t1 != null && t2 != null && t1.termText().equals(t2.termText())) |
720 | 747 | return true; |
721 | 748 | } catch (IOException e) { |
722 | | - log.error("Cannot stemm words "+word1+", "+word2+" : "+e.getMessage()); |
| 749 | + log.error("Cannot stem words "+word1+", "+word2+" : "+e.getMessage()); |
723 | 750 | } |
724 | 751 | return false; |
725 | 752 | } |
726 | 753 | |
| 754 | + /** check if stemmed newWord is 1) not the same as stemmed oldWord, OR 2) not in the stemmed set */
| 755 | + public boolean stemNotSameOrInSet(String oldWord, String newWord, FilterFactory filters, Set<String> stemmedSet){ |
| 756 | + if(!filters.hasStemmer()) |
| 757 | + return false; |
| 758 | + ArrayList<String> in = new ArrayList<String>(); |
| 759 | + in.add(oldWord); in.add(newWord); |
| 760 | + TokenStream ts = filters.makeStemmer(new StringsTokenStream(in)); |
| 761 | + try { |
| 762 | + Token t1 = ts.next(); |
| 763 | + Token t2 = ts.next(); |
| 764 | + if(t1 != null && t2 != null && (t1.termText().equals(t2.termText()) && stemmedSet.contains(t2.termText()))) |
| 765 | + return false; |
| 766 | + } catch (IOException e) { |
| 767 | + log.error("Cannot stem words "+oldWord+", "+newWord+" : "+e.getMessage());
| 768 | + } |
| 769 | + return true; |
| 770 | + } |
| 771 | + |
| 772 | + /** stem all words in the set */ |
| 773 | + public HashSet<String> stemSet(HashSet<String> set, FilterFactory filters){ |
| 774 | + if(!filters.hasStemmer()) |
| 775 | + return new HashSet<String>(); |
| 776 | + HashSet<String> ret = new HashSet<String>(); |
| 777 | + TokenStream ts = filters.makeStemmer(new StringsTokenStream(set)); |
| 778 | + try { |
| 779 | + Token t; |
| 780 | + while((t = ts.next()) != null) |
| 781 | + ret.add(t.termText()); |
| 782 | + return ret; |
| 783 | + } catch (IOException e) { |
| 784 | + log.error("Cannot stem set "+set+" : "+e.getMessage()); |
| 785 | + return new HashSet<String>(); |
| 786 | + } |
| 787 | + } |
| 788 | + |
727 | 789 | static class StringsTokenStream extends TokenStream { |
728 | 790 | Iterator<String> input; |
729 | 791 | int count = 0; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java |
— | — | @@ -62,7 +62,10 @@ |
63 | 63 | if(text.length()>=2){ |
64 | 64 | System.out.println("METAPHONES: "+dmeta.doubleMetaphone(text)+", "+dmeta.doubleMetaphone(text,true)); |
65 | 65 | System.out.println("SUGGEST: "); |
| 66 | + int count = 0; |
66 | 67 | for(SuggestResult r : sc.suggestWords(text,10)){ |
| 68 | + if(++count >= 10 ) |
| 69 | + break; |
67 | 70 | System.out.println(r); |
68 | 71 | } |
69 | 72 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java |
— | — | @@ -40,23 +40,37 @@ |
41 | 41 | private int count = 0; |
42 | 42 | private String field; |
43 | 43 | private boolean first = true; |
| 44 | + private String prefix = null; |
| 45 | + private boolean silent = false; // no report output |
44 | 46 | |
45 | 47 | public LuceneDictionary(IndexReader reader, String field) { |
46 | | - try { |
47 | | - this.field = field; |
48 | | - termEnum = reader.terms(new Term(field, "")); |
49 | | - } catch (IOException e) { |
50 | | - throw new RuntimeException(e); |
51 | | - } |
| 48 | + this(reader,field,""); |
52 | 49 | } |
53 | 50 | |
| 51 | + public LuceneDictionary(IndexReader reader, String field, String prefix) { |
| 52 | + if(!prefix.equals("")) |
| 53 | + this.prefix = prefix; |
| 54 | + |
| 55 | + try { |
| 56 | + this.field = field; |
| 57 | + termEnum = reader.terms(new Term(field, prefix)); |
| 58 | + } catch (IOException e) { |
| 59 | + throw new RuntimeException(e); |
| 60 | + } |
| 61 | + } |
| 62 | + |
| 63 | + /** Don't print progress */ |
| 64 | + public void setNoProgressReport(){ |
| 65 | + silent = true; |
| 66 | + } |
| 67 | + |
54 | 68 | public Word next() { |
55 | | - if(++count % REPORT == 0){ |
| 69 | + if(!silent && ++count % REPORT == 0){ |
56 | 70 | System.out.println("Processed "+count+" terms"); |
57 | 71 | } |
58 | 72 | try { |
59 | 73 | while(true){ |
60 | | - if(first){ |
| 74 | + if(first && termEnum.term() != null){ |
61 | 75 | first = false; |
62 | 76 | break; |
63 | 77 | } |
— | — | @@ -64,6 +78,8 @@ |
65 | 79 | return null; |
66 | 80 | else if(!termEnum.term().field().equals(field)) |
67 | 81 | return null; // end of our field |
| 82 | + else if(prefix != null && !termEnum.term().text().startsWith(prefix)) |
| 83 | + return null; // no longer same prefix |
68 | 84 | |
69 | 85 | break; |
70 | 86 | } |
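A minimal usage sketch of the prefix-constrained enumeration added above (index path and field name are assumptions for the example):

    IndexReader ir = IndexReader.open("/some/index/path"); // hypothetical path
    LuceneDictionary dict = new LuceneDictionary(ir, "phrase", "dou");
    dict.setNoProgressReport(); // suppress the periodic "Processed N terms" output
    Word w;
    while((w = dict.next()) != null){
        // only terms of field "phrase" starting with "dou" are returned;
        // next() yields null once terms stop matching the prefix
        System.out.println(w.getWord() + " " + w.getFrequency());
    }
    ir.close();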
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java |
— | — | @@ -32,18 +32,18 @@ |
33 | 33 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
34 | 34 | import org.wikimedia.lsearch.search.IndexSearcherMul; |
35 | 35 | import org.wikimedia.lsearch.search.WikiSearcher; |
| 36 | +import org.wikimedia.lsearch.spell.Suggest; |
36 | 37 | import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
37 | 38 | import org.wikimedia.lsearch.spell.dist.DoubleMetaphone; |
38 | 39 | import org.wikimedia.lsearch.util.HighFreqTerms; |
39 | 40 | |
40 | 41 | /** |
41 | | - * Index words and phrases from article titles. |
| 42 | + * Index words and phrases from articles. |
42 | 43 | * |
43 | 44 | * Fields: |
44 | 45 | * * word - word from title |
| 46 | + * * word_ngramN - word ngrams |
45 | 47 | * * phrase - phrase like douglas_adams |
46 | | - * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3) |
47 | | - * * namespace - namespaces where the word/phrase is present |
48 | 48 | * |
49 | 49 | * @author rainman |
50 | 50 | * |
— | — | @@ -146,10 +146,9 @@ |
147 | 147 | addPhrase(w,freq,true); |
148 | 148 | } |
149 | 149 | } |
150 | | - } |
| 150 | + } |
151 | 151 | ngramWriter.closeAndOptimize(); |
152 | | - ir.close(); |
153 | | - |
| 152 | + ir.close(); |
154 | 153 | } catch (IOException e) { |
155 | 154 | log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage()); |
156 | 155 | e.printStackTrace(); |
— | — | @@ -158,6 +157,24 @@ |
159 | 158 | |
160 | 159 | } |
161 | 160 | |
| 161 | + /** Check if there are common misspellings of this phrase */
| 162 | + protected boolean checkCommonPhraseMisspell(String phrase, int freq, IndexReader ir, String field) { |
| 163 | + LuceneDictionary d = new LuceneDictionary(ir,field,phrase.substring(0,1)); |
| 164 | + d.setNoProgressReport(); |
| 165 | + Suggest.Metric metric = new Suggest.Metric(phrase); |
| 166 | + Word word; |
| 167 | + while((word = d.next()) != null){ |
| 168 | + if(word.getFrequency() * 100 < freq && word.getWord().indexOf("_")!=-1 ){ |
| 169 | + String w = word.getWord(); |
| 170 | + if(metric.distance(w) == 1){ |
| 171 | + System.out.println("Detected common misspelling for "+w+" (correct: "+phrase+")");
| 172 | + return true; |
| 173 | + } |
| 174 | + } |
| 175 | + } |
| 176 | + return false; |
| 177 | + } |
| 178 | + |
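A worked trace of the heuristic above, with invented numbers: for phrase "douglas_adams" indexed with freq = 5000, a candidate term "duglas_adams" with frequency 30 passes all three tests (30 * 100 = 3000 < 5000, it contains an underscore, and it is at Metric distance 1 from the phrase), so it is reported as a common misspelling and the method returns true.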
162 | 179 | /** |
163 | 180 | * Register a title in the index, without tokenization, just lowercase. |
164 | 181 | * |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/LocalIndex.java |
— | — | @@ -49,5 +49,9 @@ |
50 | 50 | this.timestamp = timestamp; |
51 | 51 | } |
52 | 52 | |
| 53 | + public String toString(){ |
| 54 | + return path+" at "+timestamp+" for "+iid; |
| 55 | + } |
53 | 56 | |
| 57 | + |
54 | 58 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java |
— | — | @@ -10,6 +10,14 @@ |
11 | 11 | public String namespace; |
12 | 12 | public String title; |
13 | 13 | Explanation explanation; |
| 14 | + |
| 15 | + public ResultSet(String key) { |
| 16 | + int colon = key.indexOf(':'); |
| 17 | + this.score = 0; |
| 18 | + this.namespace = key.substring(0,colon); |
| 19 | + this.title = key.substring(colon+1); |
| 20 | + this.explanation = null; |
| 21 | + } |
14 | 22 | public ResultSet(double score, String namespace, String title) { |
15 | 23 | this.score = score; |
16 | 24 | this.namespace = namespace; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Benchmark.java |
— | — | @@ -107,10 +107,16 @@ |
108 | 108 | @SuppressWarnings("deprecation") |
109 | 109 | protected int search(){ |
110 | 110 | String query = ""; |
111 | | - for(int i=0;i<words;i++){ |
112 | | - if(!query.equals("")) |
113 | | - query += " OR "; |
114 | | - query += terms.next(); |
| 111 | + if(verb.equals("prefix")){ |
| 112 | + int num = (int)(Math.random()*8); |
| 113 | + String t = terms.next(); |
| 114 | + query = namespaceFilter+":"+t.substring(0,Math.min(num,t.length())); |
| 115 | + } else{ |
| 116 | + for(int i=0;i<words;i++){ |
| 117 | + if(!query.equals("")) |
| 118 | + query += " OR "; |
| 119 | + query += terms.next(); |
| 120 | + } |
115 | 121 | } |
116 | 122 | String urlString; |
117 | 123 | if(namespace.equals("")){ |
— | — | @@ -132,11 +138,13 @@ |
133 | 139 | new InputStreamReader( |
134 | 140 | conn.getInputStream())); |
135 | 141 | String inputLine; |
136 | | - int resCount = -1; |
| 142 | + int resCount = verb.equals("prefix")? 0 : -1; |
137 | 143 | |
138 | 144 | while ((inputLine = in.readLine()) != null){ |
139 | 145 | if(resCount == -1) |
140 | 146 | resCount = Integer.parseInt(inputLine); |
| 147 | + if(verb.equals("prefix")) |
| 148 | + resCount ++ ; |
141 | 149 | } |
142 | 150 | in.close(); |
143 | 151 | |
— | — | @@ -195,7 +203,7 @@ |
196 | 204 | } else if (args[i].equals("-c")) { |
197 | 205 | runs = Integer.parseInt(args[++i]); |
198 | 206 | } else if (args[i].equals("-v")) { |
199 | | - database = args[++i]; |
| 207 | + verb = args[++i]; |
200 | 208 | } else if (args[i].equals("-wf")) { |
201 | 209 | wordfile = args[++i]; |
202 | 210 | } else if (args[i].equals("-n") || args[i].equals("-ns")) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java |
— | — | @@ -0,0 +1,154 @@ |
| 2 | +package org.wikimedia.lsearch.prefix; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collections; |
| 7 | +import java.util.Comparator; |
| 8 | +import java.util.HashMap; |
| 9 | +import java.util.Iterator; |
| 10 | +import java.util.Map.Entry; |
| 11 | + |
| 12 | +import org.apache.log4j.Logger; |
| 13 | +import org.apache.lucene.analysis.SimpleAnalyzer; |
| 14 | +import org.apache.lucene.document.Document; |
| 15 | +import org.apache.lucene.document.Field; |
| 16 | +import org.apache.lucene.index.IndexReader; |
| 17 | +import org.apache.lucene.index.IndexWriter; |
| 18 | +import org.apache.lucene.index.Term; |
| 19 | +import org.apache.lucene.index.TermDocs; |
| 20 | +import org.wikimedia.lsearch.analyzers.LowercaseAnalyzer; |
| 21 | +import org.wikimedia.lsearch.analyzers.PrefixAnalyzer; |
| 22 | +import org.wikimedia.lsearch.config.Configuration; |
| 23 | +import org.wikimedia.lsearch.config.IndexId; |
| 24 | +import org.wikimedia.lsearch.index.IndexThread; |
| 25 | +import org.wikimedia.lsearch.ranks.StringList; |
| 26 | +import org.wikimedia.lsearch.spell.api.LuceneDictionary; |
| 27 | +import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
| 28 | +import org.wikimedia.lsearch.storage.ArticleAnalytics; |
| 29 | +import org.wikimedia.lsearch.storage.LinkAnalysisStorage; |
| 30 | + |
| 31 | +/** |
| 32 | + * Build an index of all title prefixes |
| 33 | + * |
| 34 | + * @author rainman |
| 35 | + * |
| 36 | + */ |
| 37 | +public class PrefixIndexBuilder { |
| 38 | + static Logger log = Logger.getLogger(PrefixIndexBuilder.class); |
| 39 | + |
| 40 | + public static void main(String[] args) throws IOException{ |
| 41 | + final int PER_PREFIX = 10; |
| 42 | + boolean usetemp = false; |
| 43 | + String dbname = null; |
| 44 | + |
| 45 | + Configuration.open(); |
| 46 | + if(args.length == 0){ |
| 47 | + System.out.println("Syntax: java PrefixIndexBuilder [-t] <dbname>"); |
| 48 | + return; |
| 49 | + } |
| 50 | + for(int i=0;i<args.length;i++){ |
| 51 | + if(args[i].equals("-t")) |
| 52 | + usetemp = true; |
| 53 | + else if(args[i].startsWith("-")){ |
| 54 | + System.out.println("Unrecognized option "+args[i]); |
| 55 | + return; |
| 56 | + } else |
| 57 | + dbname = args[i]; |
| 58 | + } |
| 59 | + |
| 60 | + IndexId iid = IndexId.get(dbname); |
| 61 | + IndexId pre = iid.getPrefix(); |
| 62 | + |
| 63 | + long start = System.currentTimeMillis(); |
| 64 | + |
| 65 | + if(!usetemp){ |
| 66 | + IndexWriter writer = new IndexWriter(pre.getTempPath(),new PrefixAnalyzer(),true); |
| 67 | + writer.setMergeFactor(20); |
| 68 | + writer.setMaxBufferedDocs(500); |
| 69 | + LinkAnalysisStorage st = new LinkAnalysisStorage(iid); |
| 70 | + log.info("Writing temp index"); |
| 71 | + int count = 0; |
| 72 | + Iterator<ArticleAnalytics> it = st.iterator(); |
| 73 | + while(it.hasNext()){ |
| 74 | + if(++count % 1000 == 0) |
| 75 | + System.out.println("Processed "+count); |
| 76 | + ArticleAnalytics aa = it.next(); |
| 77 | + String key = aa.getKey(); |
| 78 | + //String title = key.substring(key.indexOf(":")+1).toLowerCase(); |
| 79 | + String redirect = aa.getRedirectTarget(); |
| 80 | + if(redirect == null) |
| 81 | + redirect = ""; |
| 82 | + int ref = aa.getReferences(); |
| 83 | + Document d = new Document(); |
| 84 | + d.add(new Field("key",key,Field.Store.YES,Field.Index.TOKENIZED)); |
| 85 | + d.add(new Field("redirect",redirect,Field.Store.YES,Field.Index.NO)); |
| 86 | + d.add(new Field("ref",Integer.toString(ref),Field.Store.YES,Field.Index.NO)); |
| 87 | + writer.addDocument(d); |
| 88 | + } |
| 89 | + log.info("Optimizing temp index"); |
| 90 | + writer.optimize(); |
| 91 | + writer.close(); |
| 92 | + } |
| 93 | + log.info("Writing prefix index"); |
| 94 | + IndexWriter writer = new IndexWriter(pre.getImportPath(), new LowercaseAnalyzer(),true); |
| 95 | + writer.setMergeFactor(20); |
| 96 | + writer.setMaxBufferedDocs(1000); |
| 97 | + IndexReader ir = IndexReader.open(pre.getTempPath()); |
| 98 | + LuceneDictionary dict = new LuceneDictionary(ir,"key"); |
| 99 | + Word w; |
| 100 | + while((w = dict.next()) != null){ |
| 101 | + String prefix = w.getWord(); |
| 102 | + Term t = new Term("key",prefix); |
| 103 | + if(ir.docFreq(t) < 2) |
| 104 | + continue; |
| 105 | + TermDocs td = ir.termDocs(t); |
| 106 | + HashMap<String,Integer> refs = new HashMap<String,Integer>(); |
| 107 | + while(td.next()){ |
| 108 | + Document d = ir.document(td.doc()); |
| 109 | + refs.put(d.get("key"),Integer.parseInt(d.get("ref"))); |
| 110 | + } |
| 111 | + ArrayList<Entry<String,Integer>> sorted = new ArrayList<Entry<String,Integer>>(); |
| 112 | + sorted.addAll(refs.entrySet()); |
| 113 | + Collections.sort(sorted,new Comparator<Entry<String,Integer>>() { |
| 114 | + public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2){ |
| 115 | + return o2.getValue() - o1.getValue(); |
| 116 | + } |
| 117 | + }); |
| 118 | + ArrayList<String> selected = new ArrayList<String>(); |
| 119 | + for(int i=0;i<PER_PREFIX && i<sorted.size();i++){ |
| 120 | + selected.add(sorted.get(i).getKey()); |
| 121 | + } |
| 122 | + Document d = new Document(); |
| 123 | + d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.UN_TOKENIZED)); |
| 124 | + d.add(new Field("articles",new StringList(selected).toString(),Field.Store.YES,Field.Index.NO)); |
| 125 | + writer.addDocument(d); |
| 126 | + } |
| 127 | + log.info("Adding title keys ..."); |
| 128 | + int count = 0; |
| 129 | + for(int i=0;i<ir.maxDoc();i++){ |
| 130 | + if(++count % 1000 == 0) |
| 131 | + System.out.println("Added "+count); |
| 132 | + if(ir.isDeleted(i)) |
| 133 | + continue; |
| 134 | + Document d = new Document(); |
| 135 | + d.add(new Field("key",ir.document(i).get("key"),Field.Store.YES,Field.Index.TOKENIZED)); |
| 136 | + writer.addDocument(d); |
| 137 | + } |
| 138 | + ir.close(); |
| 139 | + log.info("Optimizing ..."); |
| 140 | + writer.optimize(); |
| 141 | + writer.close(); |
| 142 | + |
| 143 | + IndexThread.makeIndexSnapshot(pre,pre.getImportPath()); |
| 144 | + long delta = System.currentTimeMillis() - start; |
| 145 | + System.out.println("Finished in "+formatTime(delta)); |
| 146 | + } |
| 147 | + |
| 148 | + private static String formatTime(long l) { |
| 149 | + l /= 1000; |
| 150 | + if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s"; |
| 151 | + else if(l >= 60) return (l%3600)/60+"m "+(l%60)+"s"; |
| 152 | + else return l+"s"; |
| 153 | + } |
| 154 | + |
| 155 | +} |
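Sketch of the resulting index layout (keys invented for illustration): for every prefix term of the temporary "key" field matching at least two titles, one document is written with an untokenized "prefix" field (e.g. "0:doug") and a stored "articles" StringList of up to PER_PREFIX keys ordered by descending reference count (e.g. "0:Douglas Adams", "0:Douglas Fir"); afterwards each surviving title key is re-added as a tokenized "key" document, so a prefix matched by only one title can still be resolved to it.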
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java |
— | — | @@ -125,7 +125,7 @@ |
126 | 126 | } |
127 | 127 | |
128 | 128 | public class LinkAnalysisIterator implements Iterator<ArticleAnalytics>{ |
129 | | - int inx = 0, next = -1; |
| 129 | + int inx = -1, next = -1; |
130 | 130 | int maxdoc; |
131 | 131 | |
132 | 132 | public LinkAnalysisIterator() throws IOException{ |
— | — | @@ -137,7 +137,7 @@ |
138 | 138 | if(inx >= maxdoc) |
139 | 139 | return false; |
140 | 140 | if(next == -1){ |
141 | | - for(next=inx;next<maxdoc;next++) |
| 141 | + for(next=inx+1;next<maxdoc;next++) |
142 | 142 | if(!reader.isDeleted(next)) |
143 | 143 | return true; |
144 | 144 | return false; |
— | — | @@ -152,6 +152,8 @@ |
153 | 153 | inx = next; |
154 | 154 | next = -1; |
155 | 155 | } else{ |
| 156 | + if(inx == -1) |
| 157 | + inx = 0; |
156 | 158 | for(;inx<maxdoc;inx++){ |
157 | 159 | if(!reader.isDeleted(inx)) |
158 | 160 | break; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | /** If true, this machine is an indexer for this index */ |
60 | 60 | protected boolean myIndex; |
61 | 61 | |
62 | | - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED }; |
| 62 | + protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED, PREFIX }; |
63 | 63 | |
64 | 64 | /** Type of index, enumeration */ |
65 | 65 | protected IndexType type; |
— | — | @@ -162,6 +162,8 @@ |
163 | 163 | this.type = IndexType.LINK_ANALYSIS; |
164 | 164 | else if(type.equals("related")) |
165 | 165 | this.type = IndexType.RELATED; |
| 166 | + else if(type.equals("prefix")) |
| 167 | + this.type = IndexType.PREFIX; |
166 | 168 | |
167 | 169 | // parts |
168 | 170 | String[] parts = dbrole.split("\\."); |
— | — | @@ -265,6 +267,10 @@ |
266 | 268 | public boolean isRelated(){ |
267 | 269 | return type == IndexType.RELATED; |
268 | 270 | } |
| 271 | + /** If this is the index storing article list for specific prefixes */ |
| 272 | + public boolean isPrefix(){ |
| 273 | + return type == IndexType.PREFIX; |
| 274 | + } |
269 | 275 | |
270 | 276 | /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */ |
271 | 277 | public int getPartNum() { |
— | — | @@ -412,7 +418,7 @@ |
413 | 419 | |
414 | 420 | /** get all hosts that search db this iid belongs to */ |
415 | 421 | public HashSet<String> getDBSearchHosts(){ |
416 | | - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated()) |
| 422 | + if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix()) |
417 | 423 | return searchHosts; |
418 | 424 | else{ |
419 | 425 | // add all hosts that search: dbname and all parts |
— | — | @@ -463,7 +469,7 @@ |
464 | 470 | */ |
465 | 471 | public HashSet<String> getPhysicalIndexes() { |
466 | 472 | HashSet<String> ret = new HashSet<String>(); |
467 | | - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated()) |
| 473 | + if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix()) |
468 | 474 | ret.add(dbrole); |
469 | 475 | else if(isMainsplit() || isSplit() || isNssplit()){ |
470 | 476 | for(String p : splitParts) |
— | — | @@ -549,6 +555,11 @@ |
550 | 556 | return get(dbname+".related"); |
551 | 557 | } |
552 | 558 | |
| 559 | + /** Get the prefix index iid */ |
| 560 | + public IndexId getPrefix() { |
| 561 | + return get(dbname+".prefix"); |
| 562 | + } |
553 | 563 | |
| 564 | + |
554 | 565 | |
555 | 566 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -232,7 +232,7 @@ |
233 | 233 | } else if(typeid.matches("nspart[1-9][0-9]*")){ |
234 | 234 | type = "nssplit"; |
235 | 235 | dbrole = dbname + "." + typeid; |
236 | | - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related")){ |
| 236 | + } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){ |
237 | 237 | type = typeid; |
238 | 238 | dbrole = dbname + "." + typeid; |
239 | 239 | } else |
— | — | @@ -519,7 +519,7 @@ |
520 | 520 | } else if(typeid.matches("nspart[1-9][0-9]*")){ |
521 | 521 | type = "nssplit"; |
522 | 522 | dbrole = dbname + "." + typeid; |
523 | | - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related")){ |
| 523 | + } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){ |
524 | 524 | type = typeid; |
525 | 525 | dbrole = dbname + "." + typeid; |
526 | 526 | } else |
— | — | @@ -816,6 +816,12 @@ |
817 | 817 | System.out.println("Unrecognized suggest parameters in ("+role+")"); |
818 | 818 | |
819 | 819 | dbroles.put(type,params); |
| 820 | + } else if(type.equals("prefix")){ |
| 821 | + // no params |
| 822 | + if(tokens.length>1 && verbose) |
| 823 | + System.out.println("Unrecognized prefix parameters in ("+role+")"); |
| 824 | + |
| 825 | + dbroles.put(type,params); |
820 | 826 | } else{ |
821 | 827 | System.out.println("Warning: Unrecognized role \""+role+"\".Ignoring."); |
822 | 828 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -1,17 +1,22 @@ |
2 | 2 | package org.wikimedia.lsearch.search; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.io.Reader; |
5 | 6 | import java.net.URI; |
6 | 7 | import java.text.MessageFormat; |
7 | 8 | import java.util.ArrayList; |
8 | 9 | import java.util.HashMap; |
9 | 10 | import java.util.HashSet; |
10 | 11 | import java.util.Hashtable; |
| 12 | +import java.util.Iterator; |
11 | 13 | |
12 | 14 | import org.apache.log4j.Logger; |
13 | 15 | import org.apache.lucene.analysis.Analyzer; |
14 | 16 | import org.apache.lucene.document.Document; |
15 | 17 | import org.apache.lucene.index.IndexReader; |
| 18 | +import org.apache.lucene.index.Term; |
| 19 | +import org.apache.lucene.index.TermDocs; |
| 20 | +import org.apache.lucene.index.TermEnum; |
16 | 21 | import org.apache.lucene.queryParser.ParseException; |
17 | 22 | import org.apache.lucene.search.Hits; |
18 | 23 | import org.apache.lucene.search.Query; |
— | — | @@ -31,6 +36,7 @@ |
32 | 37 | import org.wikimedia.lsearch.frontend.SearchDaemon; |
33 | 38 | import org.wikimedia.lsearch.frontend.SearchServer; |
34 | 39 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 40 | +import org.wikimedia.lsearch.ranks.StringList; |
35 | 41 | import org.wikimedia.lsearch.spell.Suggest; |
36 | 42 | import org.wikimedia.lsearch.spell.SuggestQuery; |
37 | 43 | import org.wikimedia.lsearch.util.QueryStringMap; |
— | — | @@ -57,9 +63,7 @@ |
58 | 64 | /** Main search method, call this from the search frontend */ |
59 | 65 | public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) { |
60 | 66 | |
61 | | - if (what.equals("titlematch")) { |
62 | | - // TODO: return searchTitles(searchterm); |
63 | | - } else if (what.equals("search") || what.equals("explain")) { |
| 67 | + if (what.equals("search") || what.equals("explain")) { |
64 | 68 | int offset = 0, limit = 100; boolean exactCase = false; |
65 | 69 | if (query.containsKey("offset")) |
66 | 70 | offset = Math.max(Integer.parseInt((String)query.get("offset")), 0); |
— | — | @@ -94,16 +98,57 @@ |
95 | 99 | exactCase = true; |
96 | 100 | NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces")); |
97 | 101 | return search(iid, searchterm, offset, limit, namespaces, what.equals("rawexplain"), exactCase, true); |
| 102 | + } else if (what.equals("titlematch")) { |
| 103 | + // TODO: return searchTitles(searchterm); |
| 104 | + } else if (what.equals("prefix")){ |
| 105 | + return prefixSearch(iid, searchterm); |
98 | 106 | } else { |
99 | 107 | SearchResults res = new SearchResults(); |
100 | 108 | res.setErrorMsg("Unrecognized search type. Try one of: " + |
101 | | - "search, explain, raw, rawexplain."); |
| 109 | + "search, explain, raw, rawexplain, prefix."); |
102 | 110 | log.warn("Unknown request type [" + what + "]."); |
103 | 111 | return res; |
104 | 112 | } |
105 | 113 | return null; |
106 | 114 | } |
107 | 115 | |
| 116 | + private SearchResults prefixSearch(IndexId iid, String searchterm) { |
| 117 | + IndexId pre = iid.getPrefix(); |
| 118 | + SearcherCache cache = SearcherCache.getInstance(); |
| 119 | + SearchResults res = new SearchResults(); |
| 120 | + try { |
| 121 | + long start = System.currentTimeMillis(); |
| 122 | + searchterm = searchterm.toLowerCase(); |
| 123 | + IndexSearcherMul searcher = cache.getLocalSearcher(pre); |
| 124 | + IndexReader reader = searcher.getIndexReader(); |
| 125 | + TermDocs td = reader.termDocs(new Term("prefix",searchterm)); |
| 126 | + if(td.next()){ |
| 127 | + // found entry with a prefix, return |
| 128 | + StringList sl = new StringList(reader.document(td.doc()).get("articles")); |
| 129 | + Iterator<String> it = sl.iterator(); |
| 130 | + while(it.hasNext()) |
| 131 | + res.addResult(new ResultSet(it.next())); |
| 132 | + //logRequest(pre,"prefix",searchterm,null,res.getNumHits(),start,searcher); |
| 133 | + return res; |
| 134 | + } |
| 135 | + // check if it's an unique prefix |
| 136 | + TermEnum te = reader.terms(new Term("key",searchterm)); |
| 137 | + String r = te.term() == null ? "" : te.term().text(); // term() may be null past the last term
| 138 | + if(r.startsWith(searchterm)){ |
| 139 | + TermDocs td1 = reader.termDocs(new Term("key",r)); |
| 140 | + if(td1.next()){ |
| 141 | + res.addResult(new ResultSet(reader.document(td1.doc()).get("key"))); |
| 142 | + //logRequest(pre,"prefix",searchterm,null,res.getNumHits(),start,searcher); |
| 143 | + return res; |
| 144 | + } |
| 145 | + } |
| 146 | + } catch (IOException e) { |
| 147 | + // res.setErrorMsg("Internal error during prefix search: "+e.getMessage()); |
| 148 | + log.error("Internal error in SearchEngine::prefixSearch : "+e.getMessage()); |
| 149 | + } |
| 150 | + return res; |
| 151 | + } |
| 152 | + |
108 | 153 | /** Search mainpart or restpart of the split index */ |
109 | 154 | public SearchResults searchPart(IndexId iid, String searchterm, Query q, NamespaceFilterWrapper filter, int offset, int limit, boolean explain){ |
110 | 155 | if( ! (iid.isMainsplit() || iid.isNssplit())) |
— | — | @@ -390,6 +435,6 @@ |
391 | 436 | long delta = System.currentTimeMillis() - start; |
392 | 437 | SearchServer.stats.add(true, delta, SearchDaemon.getOpenCount()); |
393 | 438 | log.info(MessageFormat.format("{0} {1}: query=[{2}] parsed=[{3}] hit=[{4}] in {5}ms using {6}", |
394 | | - new Object[] {what, iid.toString(), searchterm, query.toString(), new Integer(numhits), new Long(delta), searcher.toString()})); |
| 439 | + new Object[] {what, iid.toString(), searchterm, query==null? "" : query.toString(), new Integer(numhits), new Long(delta), searcher.toString()})); |
395 | 440 | } |
396 | 441 | } |
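Read path of prefixSearch with a hypothetical query: for searchterm "0:doug" the code first looks up the term ("prefix","0:doug"); on a hit, the stored "articles" StringList is unpacked into one ResultSet per "ns:title" key. Otherwise the first term at or after ("key","0:doug") is examined: if it still starts with the search term, the prefix is unique and the single matching key document is returned as the result.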
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -40,7 +40,7 @@ |
41 | 41 | global = GlobalConfiguration.getInstance(); |
42 | 42 | |
43 | 43 | Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup"); |
44 | | - if(iid.isSpell()); // no warmup for spell-chekers |
| 44 | + if(iid.isSpell() || iid.isPrefix()); // no warmup for spell-checkers and prefixes (for now)
45 | 45 | else if(warmup == null){ |
46 | 46 | makeNamespaceFilters(is,iid); |
47 | 47 | simpleWarmup(is,iid); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -466,7 +466,15 @@ |
467 | 467 | p = makeRelated(doc,fields.related(),article,1); |
468 | 468 | |
469 | 469 | // anchors |
470 | | - makeKeywordField(doc,fields.anchor(),rankBoost); |
| 470 | + // makeKeywordField(doc,fields.anchor(),rankBoost); |
| 471 | + |
| 472 | + // add the whole title for exact-match boost
| 473 | + String wt = FastWikiTokenizerEngine.stipTitle(article.getTitle()); |
| 474 | + if(!bs.isExactCase()) |
| 475 | + wt = wt.toLowerCase(); |
| 476 | + Field wtitle = new Field(fields.wholetitle(),wt,Field.Store.NO, Field.Index.UN_TOKENIZED); |
| 477 | + wtitle.setBoost(rankBoost); |
| 478 | + doc.add(wtitle); |
471 | 479 | |
472 | 480 | } |
473 | 481 | // make analyzer |
— | — | @@ -522,7 +530,7 @@ |
523 | 531 | if(ranks.get(i) == 0) |
524 | 532 | break; // we don't want redirects with zero links |
525 | 533 | //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i)); |
526 | | - Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.YES, Field.Index.TOKENIZED); |
| 534 | + Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED); |
527 | 535 | alttitle.setBoost(calculateArticleRank(ranks.get(i))); |
528 | 536 | doc.add(alttitle); |
529 | 537 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -792,5 +792,50 @@ |
793 | 793 | return keywords; |
794 | 794 | } |
795 | 795 | |
796 | | - |
| 796 | + /** Delete everything that is not being indexes, decompose chars */ |
| 797 | + public static String stipTitle(String title){ |
| 798 | + UnicodeDecomposer decomposer = UnicodeDecomposer.getInstance(); |
| 799 | + char[] str = title.toCharArray(); |
| 800 | + char[] buf = new char[256]; |
| 801 | + int len = 0; |
| 802 | + for(int i=0;i<str.length;i++){ |
| 803 | + char ch = str[i]; |
| 804 | + if(ch == ':' || ch == '(' || ch == ')' || ch =='[' || ch == ']' || ch == '.' || ch == ',' |
| 805 | + || ch == ';' || ch == '"' || ch=='-' || ch=='+' || ch=='*' || ch=='!' || ch=='~' || ch=='$' |
| 806 | + || ch == '%' || ch == '^' || ch == '&' || ch == '_' || ch=='=' || ch=='|' || ch=='\\'){ |
| 807 | + if(len > 0 && buf[len-1]!=' '){ |
| 808 | + if(len >= buf.length){ // extend buf |
| 809 | + char[] n = new char[buf.length*2]; |
| 810 | + System.arraycopy(buf,0,n,0,buf.length); |
| 811 | + buf = n; |
| 812 | + } |
| 813 | + buf[len++] = ' '; // replace the special char with space |
| 814 | + } |
| 815 | + } else{ |
| 816 | + char[] decomp = decomposer.decompose(ch); |
| 817 | + if(decomp == null){ |
| 818 | + // no decomposition: add char, but don't double spaces
| 819 | + if(ch!=' ' || (len>0 && buf[len-1]!=' ')){ |
| 820 | + if(len >= buf.length){ |
| 821 | + char[] n = new char[buf.length*2]; |
| 822 | + System.arraycopy(buf,0,n,0,buf.length); |
| 823 | + buf = n; |
| 824 | + } |
| 825 | + buf[len++] = ch; |
| 826 | + } |
| 827 | + } else{ |
| 828 | + // add decomposed chars |
| 829 | + for(int j = 0; j < decomp.length; j++){ |
| 830 | + if(len >= buf.length){ |
| 831 | + char[] n = new char[buf.length*2]; |
| 832 | + System.arraycopy(buf,0,n,0,buf.length); |
| 833 | + buf = n; |
| 834 | + } |
| 835 | + buf[len++] = decomp[j]; |
| 836 | + } |
| 837 | + } |
| 838 | + } |
| 839 | + } |
| 840 | + return new String(buf,0,len); |
| 841 | + } |
797 | 842 | } |
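Illustrative inputs for stipTitle (assuming no Unicode decomposition applies to the examples): stipTitle("Douglas_Adams") gives "Douglas Adams" and stipTitle("E=mc^2") gives "E mc 2"; each run of stripped characters collapses to at most one space, and decomposable characters are replaced via UnicodeDecomposer (typically by their base forms).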
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PrefixAnalyzer.java |
— | — | @@ -0,0 +1,37 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.Reader; |
| 6 | + |
| 7 | +import org.apache.lucene.analysis.Analyzer; |
| 8 | +import org.apache.lucene.analysis.Token; |
| 9 | +import org.apache.lucene.analysis.TokenStream; |
| 10 | +import org.apache.lucene.analysis.Tokenizer; |
| 11 | + |
| 12 | +public class PrefixAnalyzer extends Analyzer { |
| 13 | + static public class PrefixTokenizer extends Tokenizer { |
| 14 | + String in; |
| 15 | + int count = 0; |
| 16 | + |
| 17 | + public PrefixTokenizer(String input){ |
| 18 | + in = input; |
| 19 | + } |
| 20 | + @Override |
| 21 | + public Token next() throws IOException { |
| 22 | + count++; |
| 23 | + if(count > in.length()) |
| 24 | + return null; |
| 25 | + else |
| 26 | + return new Token(in.substring(0,count),0,count); |
| 27 | + } |
| 28 | + } |
| 29 | + |
| 30 | + public TokenStream tokenStream(String fieldName, String str) { |
| 31 | + return new PrefixTokenizer(str.toLowerCase()); |
| 32 | + } |
| 33 | + |
| 34 | + @Override |
| 35 | + public TokenStream tokenStream(String fieldName, Reader reader) { |
| 36 | + throw new UnsupportedOperationException("Use tokenStream(String,String)"); |
| 37 | + } |
| 38 | +} |
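A tokenization example (input hypothetical): for the string "Doug" the stream emits the lowercased prefixes, one per call to next():

    // tokenStream(field, "Doug") emits, in order:
    //   "d" (0,1), "do" (0,2), "dou" (0,3), "doug" (0,4)

which is what lets PrefixIndexBuilder index a title key once and have it match every prefix of itself.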
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -86,15 +86,18 @@ |
87 | 87 | public static float ALT_TITLE_BOOST = 8; |
88 | 88 | public static float ALT_TITLE_ALIAS_BOOST = 0.4f; |
89 | 89 | public static float KEYWORD_BOOST = 0.02f; |
| 90 | + public static float CONTENTS_BOOST = 0.2f; |
90 | 91 | |
91 | 92 | public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 20; |
92 | | - public static float ADDITIONAL_BOOST_CONTENTS = 1; |
93 | | - public static int ADDITIONAL_PHRASE_SLOP_TITLE = 10; |
94 | | - public static float ADDITIONAL_BOOST_TITLE = 2; |
| 93 | + public static float ADDITIONAL_BOOST_CONTENTS = 0.5f; |
| 94 | + public static int ADDITIONAL_PHRASE_SLOP_TITLE = 1; |
| 95 | + public static float ADDITIONAL_BOOST_TITLE = 0.5f; |
95 | 96 | public static int ADDITIONAL_PHRASE_SLOP_RELATED = 10; |
96 | | - public static float ADDITIONAL_BOOST_RELATED = 1f; |
| 97 | + public static float ADDITIONAL_BOOST_RELATED = 0.04f; |
97 | 98 | |
98 | | - public static float ANCHOR_BOOST = 1f; |
| 99 | + public static float WHOLE_TITLE_BOOST = 8f; |
| 100 | + public static float EXACT_CONTENTS_BOOST = 1f; |
| 101 | + public static float ANCHOR_BOOST = 0.02f; |
99 | 102 | |
100 | 103 | public static boolean ADD_STEM_TITLE = true; |
101 | 104 | public static boolean ADD_TITLE_PHRASES = true; |
— | — | @@ -1070,7 +1073,7 @@ |
1071 | 1074 | } |
1072 | 1075 | |
1073 | 1076 | /** Extract all words from the query */ |
1074 | | - public ArrayList<String> extractPhrases(Query query){ |
| 1077 | + public ArrayList<String> extractWords(Query query){ |
1075 | 1078 | ArrayList<String> list = new ArrayList<String>(); |
1076 | 1079 | if(query == null) |
1077 | 1080 | return list; |
— | — | @@ -1106,7 +1109,7 @@ |
1107 | 1110 | else if(bcl.length == 1 && bcl[0].getOccur() != Occur.MUST_NOT) |
1108 | 1111 | addWords(list,bcl[0].getQuery()); |
1109 | 1112 | else if(bcl.length == 2){ |
1110 | | - // TODO: this might brake in some complex queries! (with some parenthesis and transliterations...) |
| 1113 | + // TODO: this might break in some complex queries! (with some parenthesis and transliterations...) |
1111 | 1114 | if(bcl[0].getOccur() == Occur.MUST && bcl[1].getOccur() == Occur.SHOULD) |
1112 | 1115 | // second is alias |
1113 | 1116 | addWords(list,bcl[0].getQuery()); |
— | — | @@ -1315,7 +1318,7 @@ |
1316 | 1319 | defaultBoost = olfDefaultBoost; |
1317 | 1320 | defaultAliasBoost = ALIAS_BOOST; |
1318 | 1321 | |
1319 | | - ArrayList<String> words = extractPhrases(qt); |
| 1322 | + ArrayList<String> words = extractWords(qt); |
1320 | 1323 | |
1321 | 1324 | if(qt == qs) // either null, or category query |
1322 | 1325 | return new Object[] {qt,words}; |
— | — | @@ -1470,6 +1473,20 @@ |
1471 | 1474 | return bq; |
1472 | 1475 | } |
1473 | 1476 | return null; |
| 1477 | + } |
| 1478 | + |
| 1479 | + /** Join a collection via a char/string */ |
| 1480 | + protected String join(Collection<String> col, String sep){ |
| 1481 | + StringBuffer sb = new StringBuffer(); |
| 1482 | + boolean first = true; |
| 1483 | + for(String s : col){ |
| 1484 | + if(!first){ |
| 1485 | + sb.append(sep); |
| 1486 | + } else |
| 1487 | + first = false; |
| 1488 | + sb.append(s); |
| 1489 | + } |
| 1490 | + return sb.toString(); |
1474 | 1491 | } |
1475 | 1492 | |
1476 | 1493 | /** |
— | — | @@ -1485,7 +1502,7 @@ |
1486 | 1503 | queryText = quoteCJK(queryText); |
1487 | 1504 | if(policy != null) |
1488 | 1505 | this.namespacePolicy = policy; |
1489 | | - defaultBoost = 1; |
| 1506 | + defaultBoost = CONTENTS_BOOST; |
1490 | 1507 | defaultAliasBoost = ALIAS_BOOST; |
1491 | 1508 | Query qc = parseRaw(queryText); |
1492 | 1509 | Object[] qtwords = makeTitleQuery(queryText); |
— | — | @@ -1497,7 +1514,7 @@ |
1498 | 1515 | if(qc.equals(qt)) |
1499 | 1516 | return qc; // don't duplicate (probably a query for categories only) |
1500 | 1517 | |
1501 | | - BooleanQuery bq = new BooleanQuery(); |
| 1518 | + BooleanQuery bq = new BooleanQuery(true); |
1502 | 1519 | bq.add(qc,BooleanClause.Occur.SHOULD); |
1503 | 1520 | bq.add(qt,BooleanClause.Occur.SHOULD); |
1504 | 1521 | |
— | — | @@ -1522,9 +1539,14 @@ |
1523 | 1540 | bq.add(qk,BooleanClause.Occur.SHOULD); |
1524 | 1541 | } |
1525 | 1542 | |
| 1543 | + // whole title |
| 1544 | + Query wt = new TermQuery(new Term(fields.wholetitle(),join(words," "))); |
| 1545 | + wt.setBoost(WHOLE_TITLE_BOOST); |
| 1546 | + Query wc = makePhrase(words,fields.contents(),0); |
| 1547 | + wc.setBoost(EXACT_CONTENTS_BOOST); |
1526 | 1548 | // add additional score queries! |
1527 | | - Query pqc = makePhraseQueries(words,"contents",ADDITIONAL_PHRASE_SLOP_CONTENTS,ADDITIONAL_BOOST_CONTENTS); |
1528 | | - Query pqt = makePhraseQueries(words,"stemtitle",ADDITIONAL_PHRASE_SLOP_TITLE,ADDITIONAL_BOOST_TITLE); |
| 1549 | + Query pqc = makePhraseQueries(words,fields.contents(),ADDITIONAL_PHRASE_SLOP_CONTENTS,ADDITIONAL_BOOST_CONTENTS); |
| 1550 | + Query pqt = makePhraseQueries(words,fields.stemtitle(),ADDITIONAL_PHRASE_SLOP_TITLE,ADDITIONAL_BOOST_TITLE); |
1529 | 1551 | // skip last related group |
1530 | 1552 | Query[] pqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1]; |
1531 | 1553 | for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){ |
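Roughly what the two new clauses amount to for the query "douglas adams", assuming makePhrase builds a plain PhraseQuery with the given slop (a sketch, not the parser's actual code):

    Query wt = new TermQuery(new Term("wholetitle", "douglas adams"));
    wt.setBoost(8f);                         // WHOLE_TITLE_BOOST
    PhraseQuery wc = new PhraseQuery();      // default slop 0 = exact phrase
    wc.add(new Term("contents", "douglas"));
    wc.add(new Term("contents", "adams"));
    wc.setBoost(1f);                         // EXACT_CONTENTS_BOOST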
— | — | @@ -1534,16 +1556,20 @@ |
1535 | 1557 | for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){ |
1536 | 1558 | wqr[i-1] = makeWordQueries(words,"related"+i,ADDITIONAL_BOOST_RELATED / 4); |
1537 | 1559 | } |
1538 | | - if(pqc == null && pqt == null && pqr[0] == null && wqr[0] == null) |
| 1560 | + if(wt==null && pqc == null && pqt == null && pqr[0] == null && wqr[0] == null) |
1539 | 1561 | return bq; |
1540 | 1562 | // build the final query |
1541 | 1563 | BooleanQuery finalQuery = new BooleanQuery(true); |
1542 | 1564 | BooleanQuery additional = new BooleanQuery(true); |
1543 | | - |
| 1565 | + |
1544 | 1566 | if(pqc != null) |
1545 | 1567 | additional.add(pqc,Occur.MUST); |
1546 | 1568 | if(pqt != null) |
1547 | 1569 | additional.add(pqt,Occur.SHOULD); |
| 1570 | + if(wt != null) |
| 1571 | + additional.add(wt,Occur.SHOULD); |
| 1572 | + if(wc != null) |
| 1573 | + additional.add(wc,Occur.SHOULD); |
1548 | 1574 | for(Query q : pqr){ |
1549 | 1575 | if(q != null) |
1550 | 1576 | additional.add(q,Occur.SHOULD); |
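Both additional and finalQuery (below) are built with new BooleanQuery(true), which disables Lucene's coordination factor, so a document is not penalized for matching only a few of these optional rescoring clauses. The final shape, condensed:

    BooleanQuery finalQuery = new BooleanQuery(true);        // true = no coord()
    finalQuery.add(bq, BooleanClause.Occur.MUST);            // base match required
    finalQuery.add(additional, BooleanClause.Occur.SHOULD);  // rescoring optional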
— | — | @@ -1554,12 +1580,12 @@ |
1555 | 1581 | } |
1556 | 1582 | |
1557 | 1583 | // anchors |
1558 | | - Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST); |
| 1584 | + //Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST); |
1559 | 1585 | |
1560 | 1586 | finalQuery.add(bq,Occur.MUST); |
1561 | 1587 | finalQuery.add(additional,Occur.SHOULD); |
1562 | | - if(anchors != null) |
1563 | | - finalQuery.add(anchors,Occur.SHOULD); |
| 1588 | + //if(anchors != null) |
| 1589 | + // finalQuery.add(anchors,Occur.SHOULD); |
1564 | 1590 | |
1565 | 1591 | return finalQuery; |
1566 | 1592 | |
— | — | @@ -1617,8 +1643,6 @@ |
1618 | 1644 | } |
1619 | 1645 | public void setBuilder(FieldBuilder.BuilderSet builder) { |
1620 | 1646 | this.builder = builder; |
1621 | | - } |
1622 | | - |
| 1647 | + } |
1623 | 1648 | |
1624 | | - |
1625 | 1649 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java |
— | — | @@ -66,6 +66,13 @@ |
67 | 67 | else |
68 | 68 | return "anchor"; |
69 | 69 | } |
| 70 | + |
| 71 | + public String wholetitle(){ |
| 72 | + if(exactCase) |
| 73 | + return "wholetitle_exact"; |
| 74 | + else |
| 75 | + return "wholetitle"; |
| 76 | + } |
70 | 77 | |
71 | 78 | |
72 | 79 | public boolean isExactCase() { |
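Usage sketch for the new accessor (construction of FieldNameFactory is assumed here; only wholetitle() comes from this hunk):

    FieldNameFactory fields = new FieldNameFactory();
    String f = fields.wholetitle(); // "wholetitle", or "wholetitle_exact" when isExactCase()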
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java |
— | — | @@ -0,0 +1,44 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.Reader; |
| 6 | + |
| 7 | +import org.apache.lucene.analysis.Analyzer; |
| 8 | +import org.apache.lucene.analysis.Token; |
| 9 | +import org.apache.lucene.analysis.TokenStream; |
| 10 | +/** |
| 11 | + * Analyzer that just lowercases the text; doesn't split anything up, etc. 
| 12 | + * |
| 13 | + * @author rainman |
| 14 | + * |
| 15 | + */ |
| 16 | +public class LowercaseAnalyzer extends Analyzer { |
| 17 | + public static class LowercaseTokenizer extends TokenStream { |
| 18 | + String text; // the entire input, lowercased 
| 19 | + boolean sent = false; // true once the single token has been emitted 
| 20 | + LowercaseTokenizer(String in){ |
| 21 | + text = in.toLowerCase(); |
| 22 | + } |
| 23 | + @Override |
| 24 | + public Token next() throws IOException { |
| 25 | + if(sent) |
| 26 | + return null; |
| 27 | + else{ |
| 28 | + sent = true; |
| 29 | + return new Token(text,0,text.length()); |
| 30 | + } |
| 31 | + } |
| 32 | + |
| 33 | + } |
| 34 | + |
| 35 | + // not an @Override: Analyzer has no tokenStream(String,String) to override 
| 36 | + public TokenStream tokenStream(String fieldName, String text) { |
| 37 | + return new LowercaseTokenizer(text); |
| 38 | + } |
| 39 | + @Override |
| 40 | + public TokenStream tokenStream(String fieldName, Reader reader) { |
| 41 | + throw new UnsupportedOperationException("Use tokenStream(String,String)"); |
| 42 | + } |
| 43 | + |
| 44 | + |
| 45 | +} |
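Usage sketch: the analyzer emits the entire field value as a single lowercased token, which is exactly what indexing and querying the wholetitle field requires:

    TokenStream ts = new LowercaseAnalyzer().tokenStream("wholetitle", "Douglas Adams");
    Token t = ts.next();       // termText() == "douglas adams", offsets 0..13
    // a second ts.next() returns null: exactly one token per field value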
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -232,7 +232,71 @@ |
233 | 233 | writer.addDocument(doc,an); |
234 | 234 | state = State.MODIFIED_ARTICLES; |
235 | 235 | } |
| 236 | + public static HashSet<Character> separators = new HashSet<Character>(); |
| 237 | + static{ |
| 238 | + separators.add(' '); |
| 239 | + separators.add('\r'); |
| 240 | + separators.add('\n'); |
| 241 | + separators.add('\t'); |
| 242 | + separators.add(':'); |
| 243 | + separators.add('('); |
| 244 | + separators.add(')'); |
| 245 | + separators.add('['); |
| 246 | + separators.add(']'); |
| 247 | + separators.add('.'); |
| 248 | + separators.add(','); |
| 250 | + separators.add(';'); |
| 251 | + separators.add('"'); |
| 252 | + separators.add('+'); |
| 253 | + separators.add('*'); |
| 254 | + separators.add('!'); |
| 255 | + separators.add('~'); |
| 256 | + separators.add('$'); |
| 257 | + separators.add('%'); |
| 258 | + separators.add('^'); |
| 259 | + separators.add('&'); |
| 260 | + separators.add('_'); |
| 261 | + separators.add('='); |
| 262 | + separators.add('|'); |
| 263 | + separators.add('\\'); |
| 264 | + } |
236 | 265 | |
| 266 | + /** |
| 267 | + * Find a sentence boundary 
| 268 | + * 
| 269 | + * @param text - raw text 
| 270 | + * @param start - start index to search from 
| 271 | + * @param reverse - if true, search in reverse 
| 272 | + * @param max - radius of search (if no boundary is found, return the last word break) 
| 273 | + * @return index of the boundary, or of the last word break within range 
| 274 | + */ |
| 275 | + protected int findSentance(char[] text, int start, boolean reverse, int max){ |
| 276 | + int inc = (reverse)? -1 : 1; |
| 277 | + int count = 0; |
| 278 | + int wordbreak = start; |
| 279 | + int i = start; |
| 280 | + for(;i>=0 && i<text.length;i+=inc){ 
| 281 | + char c = text[i]; |
| 282 | + if(c == '.') |
| 283 | + return i; |
| 284 | + else if(c == '*' && ((i>=1 && text[i-1]=='\n') || i==0)) 
| 285 | + return i; |
| 286 | + else if(separators.contains(c)) |
| 287 | + wordbreak = i; |
| 288 | + if(count >= max) |
| 289 | + return wordbreak; // more than max chars away, return the latest wordbreak |
| 290 | + count ++; |
| 291 | + } |
| 292 | + return (i < 0)? 0 : i; 
| 293 | + } |
| 294 | + |
| 295 | + /** Find the surrounding context for a link - extract sentences, list items, etc. */ 
| 296 | + protected String findContext(char[] text, int start, int end){ |
| 297 | + // TODO: implement |
| 298 | + return null; |
| 299 | + } |
| 300 | + |
237 | 301 | /** Find the target key to title (ns:title) to which the links is pointing to |
238 | 302 | * @throws IOException */ |
239 | 303 | protected String findTargetLink(int ns, String title) throws IOException{ |
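A usage sketch for findSentance (called from inside Links, since it is protected); the 100-char radius and the sample offset are assumed values for illustration:

    char[] text = "First sentence. The [[target]] link sits here. Last one.".toCharArray();
    int pos = 25;                                    // somewhere inside the link
    int from = findSentance(text, pos, true, 100);   // scan back for '.' or a list bullet
    int to   = findSentance(text, pos, false, 100);  // scan forward
    String context = new String(text, from, to - from);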
Index: branches/lucene-search-2.1/lsearch-global.conf |
— | — | @@ -17,14 +17,14 @@ |
18 | 18 | wikidev : (single) (language,sr) |
19 | 19 | wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[]) |
20 | 20 | wikilucene : (language,en) (warmup,10) |
21 | | -wikilucene : (spell,3,1) |
| 21 | +wikilucene : (spell,3,1) (prefix) |
22 | 22 | |
23 | 23 | # Search groups |
24 | 24 | # Index parts of a split index are always taken from the node's group |
25 | 25 | # host : db1.part db2.part |
26 | 26 | # Multiple hosts can search multiple dbs (N-N mapping) 
27 | 27 | [Search-Group] |
28 | | -oblak : wikilucene wikidev |
| 28 | +oblak : wikilucene wikidev wikilucene.prefix |
29 | 29 | |
30 | 30 | # Index nodes |
31 | 31 | # host: db1.part db2.part |