r25925 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r25924 | r25925 | r25926 >
Date: 22:47, 18 September 2007
Author: rainman
Status: old
Tags:
Comment:
* Experimental implementation of an AJAX suggest engine, based on
Julien Lemoine's ideas; uses a Lucene index instead of a trie
* Add (remote) spell-check unit test cases
* More fine-tuning of the spell-check engine
Modified paths:
  • /branches/lucene-search-2.1/lsearch-global.conf (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PrefixAnalyzer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/LocalIndex.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Benchmark.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)

Diff

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -130,12 +130,12 @@
131131 Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
132132 bs = new FieldBuilder("en").getBuilder();
133133 parser = new WikiQueryParser(bs.getFields().title(),"0",analyzer,bs,NamespacePolicy.IGNORE,stopWords);
134 - assertEquals("[how, do, you, do]",parser.extractPhrases(parser.parseRaw("how do you do")).toString());
135 - assertEquals("[making, something, rest]",parser.extractPhrases(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString());
136 - assertEquals("[godel, theorem]",parser.extractPhrases(parser.parseRaw("gödel theorem")).toString());
137 - assertEquals("[some, text, and, some, phrase]",parser.extractPhrases(parser.parseRaw("some_text and \"some phrase\"")).toString());
 134+ assertEquals("[how, do, you, do]",parser.extractWords(parser.parseRaw("how do you do")).toString());
 135+ assertEquals("[making, something, rest]",parser.extractWords(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString());
 136+ assertEquals("[godel, theorem]",parser.extractWords(parser.parseRaw("gödel theorem")).toString());
 137+ assertEquals("[some, text, and, some, phrase]",parser.extractWords(parser.parseRaw("some_text and \"some phrase\"")).toString());
138138
139 - ArrayList<String> words = parser.extractPhrases(parser.parseRaw("the who band is something nobody knows about"));
 139+ ArrayList<String> words = parser.extractWords(parser.parseRaw("the who band is something nobody knows about"));
140140 assertEquals("contents:\"the who band\"~10 contents:\"band is something\"~10 contents:\"something nobody\"~10 contents:\"nobody knows\"~10 contents:\"knows about\"~10",parser.makePhraseQueries(words,"contents",10,1).toString());
141141
142142 // namespace policies
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java
@@ -0,0 +1,100 @@
 2+package org.wikimedia.lsearch.test;
 3+
 4+import java.io.BufferedReader;
 5+import java.io.IOException;
 6+import java.io.InputStreamReader;
 7+import java.net.MalformedURLException;
 8+import java.net.URL;
 9+import java.net.URLEncoder;
 10+
 11+/**
 12+ * Remotely test a spell-checker host
 13+ *
 14+ * @author rainman
 15+ *
 16+ */
 17+public class SpellCheckTest {
 18+ static String host = "localhost";
 19+ static int port = 8123;
 20+ static String db = "enwiki";
 21+
 22+ public static String getSuggestion(String query) throws IOException{
 23+ query = query.replace(" ","%20");
 24+ String urlString = "http://"+host+":"+port+"/search/"+db+"/"+query+"?case=ignore&limit=20&namespaces=0&offset=0";
 25+ URL url = new URL(urlString);
 26+ BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));
 27+ String line;
 28+ int lineNum = 0;
 29+ while ( (line = br.readLine()) != null ) {
 30+ if(lineNum == 1){
 31+ if(line.startsWith("#suggest")){
 32+ br.close();
 33+ return line.substring(9).replaceAll("<[^>]+>","");
 34+ }
 35+ }
 36+ lineNum ++ ;
 37+ }
 38+ br.close();
 39+ return "";
 40+ }
 41+
 42+ /**
 43+ * @param args
 44+ * @throws IOException
 45+ */
 46+ public static void main(String[] args) throws IOException {
 47+ int len = CHECK.length;
 48+ System.out.println("Running "+len+" tests");
 49+ int good = 0, failed = 0;
 50+ int count = 1;
 51+ for(String[] c : CHECK){
 52+ String sug = getSuggestion(c[0]);
 53+ if(!sug.equals(c[1])){
 54+ System.out.println("["+count+"/"+len+"] FAILED {"+sug+"} EXPECTED ["+c[1]+"] FOR ["+c[0]+"]");
 55+ failed++;
 56+ } else{
 57+ System.out.println("["+count+"/"+len+"] OK");
 58+ good++;
 59+ }
 60+ count ++;
 61+ }
 62+ System.out.println("Good tests: "+good+", failed tests: "+failed);
 63+ }
 64+
 65+ // wrong -> right
 66+ private static final String[][] CHECK = {
 67+ {"annul of improbably research", "annals of improbable research" },
 68+ {"los angles", "los angeles" },
 69+ {"what is the type of engineers thats deal with various depth of the eart crust", "what is the type of engineers thats deal with various depths of the earth crust"},
 70+ {"argentina cilmage", "argentina climate"},
 71+ {"Vista Compatibly", "Vista Compatible"},
 72+ {"sarah thomson", "sarah thompson"},
 73+ {"attribution (finance)", ""},
 74+ {"SOUTH PARK EPISDOE LIST", "SOUTH PARK EPISODE LIST"},
 75+ {"the grnd canyon", "the grand canyon"},
 76+ {"ron burgand","ron burgundy"},
 77+ {"fullmetal achemist ep 1","fullmetal alchemist ep 1"},
 78+ {"fullmetal alchemist ep 1",""},
 79+ {"enerst shackleton", "ernest shackleton"},
 80+ {"los angles lakers", "los angeles lakers"},
 81+ {"crab fisher","crab fishing"},
 82+ {"discovery channe;", "discovery channel"},
 83+ {"Young Cuties", ""},
 84+ {"fire australia", ""},
 85+ {"platoon film", ""},
 86+ {"basillar artery","basilar artery"},
 87+ {"franki vallie","frankie valli"},
 88+ {"cuties",""},
 89+ {"teh",""},
 90+ {"21st ammendment", "21st amendment"},
 91+ {"stargate junior",""},
 92+ {"fire australia",""},
 93+ {"ISO crack", ""},
 94+ {"The James Gang (band)",""},
 95+ {"cource", "course"},
 96+ {"carolene products",""},
 97+ {"orvileWright","overnight"},
 98+
 99+ };
 100+
 101+}
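The test drives the daemon's plain-text search endpoint and inspects only the second line of the response. A hypothetical exchange (hit count and result lines invented; the response format is the one documented in the SearchDaemon change below):

    GET /search/enwiki/annul%20of%20improbably%20research?case=ignore&limit=20&namespaces=0&offset=0

    1234
    #suggest <i>annals</i> of <i>improbable</i> research
    ... (one "<score> <ns> <title>" line per hit)

getSuggestion() strips the <i> markup, so the comparison is against the plain string "annals of improbable research".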
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -65,8 +65,20 @@
6666 HashMap query = new QueryStringMap(uri);
6767 SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query);
6868 contentType = "text/plain";
69 - if(res!=null && res.isSuccess()){
 69+ // format:
 70+ // <namespace> <title> (resNum-times)
 71+ if(what.equals("prefix")){
7072 sendHeaders(200, "OK");
 73+ for(ResultSet rs : res.getResults()){
 74+ sendResultLine(rs.namespace, rs.title);
 75+ }
 76+ }
 77+ // format:
 78+ // <num of hits>
 79+ // #suggest <query> or #no suggestion
 80+ // <score> <ns> <title> (resNum-times)
 81+ else if(res!=null && res.isSuccess()){
 82+ sendHeaders(200, "OK");
7183 sendOutputLine(Integer.toString(res.getNumHits()));
7284 if(res.getSuggest() != null)
7385 sendOutputLine("#suggest "+res.getSuggest());
@@ -122,4 +134,12 @@
123135 }
124136 }
125137
 138+ private void sendResultLine(String namespace, String title) {
 139+ try{
 140+ sendOutputLine(namespace + " " + URLEncoder.encode(title.replaceAll(" ", "_"), "UTF-8"));
 141+ } catch(Exception e){
 142+ log.error("Error sending prefix result line (" + namespace + " " + title +"): "+e.getMessage());
 143+ }
 144+ }
 145+
126146 }
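The new prefix branch streams one line per hit: the namespace, a space, and the title URL-encoded with spaces turned to underscores (see sendResultLine). A hypothetical response for a prefix query, with invented titles:

    0 Los_Angeles
    0 Los_Angeles_Lakers
    0 Los_Angeles_International_Airport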
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -51,7 +51,7 @@
5252 protected Set<String> stopWords;
5353
5454 /** Distance an metaphone metrics */
55 - static class Metric {
 55+ static public class Metric {
5656 protected DoubleMetaphone dmeta = new DoubleMetaphone();
5757 protected String meta1, meta2;
5858 protected EditDistance sd;
@@ -132,7 +132,7 @@
133133 this.type = type;
134134 }
135135 public String toString(){
136 - return "dist:"+dist+"-freq:"+freq+"-sub:"+substitutes+"-pres:"+preserves;
 136+ return "["+type+" dist:"+dist+" freq:"+freq+" sub:"+substitutes+" pres:"+preserves+"]";
137137 }
138138 }
139139
@@ -176,15 +176,23 @@
177177 ArrayList<Change> suggestionsTitle = new ArrayList<Change>();
178178
179179 // add correct words
180 - for(int i=0;i<tokens.size();i++){
 180+ /*for(int i=0;i<tokens.size();i++){
181181 Token t = tokens.get(i);
182182 if(correctWords.contains(t.termText())){
183183 Change c = new Change(0,1,Change.Type.TITLE_WORD);
184184 c.preserves.put(i,t.termText());
185185 suggestions.add(c);
186186 }
 187+ } */
 188+
 189+ // check for exact title match
 190+ if(tokens.size() == 1){
 191+ String w = tokens.get(0).termText();
 192+ if(correctWords.contains(w) && reader.docFreq(new Term("title",w)) != 0)
 193+ return null;
187194 }
188195
 196+ HashSet<String> stemmedCorrectWords = stemSet(correctWords,parser.getBuilder().getFilters());
189197 ArrayList<ArrayList<SuggestResult>> wordSug = new ArrayList<ArrayList<SuggestResult>>();
190198 HashSet<Integer> correctIndex = new HashSet<Integer>();
191199 ArrayList<SuggestResult> possibleStopWords = new ArrayList<SuggestResult>();
@@ -214,11 +222,7 @@
215223 if(w2 == null)
216224 continue;
217225
218 - String phrase = w+gap+w2;
219 - if(reader.docFreq(new Term("phrase",phrase)) != 0){
220 - correctPhrases.add(i);
221 - correctPhrases.add(i2);
222 - } else if(correctWords.contains(w) && correctWords.contains(w2)){
 226+ if(correctWords.contains(w) && correctWords.contains(w2)){
223227 for(HashSet<String> title : titles){
224228 if(title.contains(w) && title.contains(w2)){
225229 correctPhrases.add(i);
@@ -263,26 +267,18 @@
264268 }
265269 }
266270 possibleStopWords.add(maybeStopWord);
267 - // detect common misspells
268 - if(sug.size() > 1){
269 - SuggestResult r1 = sug.get(0);
270 - SuggestResult r2 = sug.get(1);
271 - if(r1.dist == 1 && r2.dist == 0 && r1.frequency > 100 * r2.frequency){
272 - Change c = new Change(r1.dist,r1.frequency,Change.Type.WORD);
273 - c.substitutes.put(i,r1.word);
274 - suggestions.add(c);
275 - }
276 - }
277271 } else{
278272 wordSug.add(null);
279273 possibleStopWords.add(null);
280274 }
281275 // suggest split
282 - SuggestResult split = suggestSplit(w,minFreq);
283 - if(split != null){
284 - Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT);
285 - sc.substitutes.put(i,split.word.replace("_"," "));
286 - suggestions.add(sc);
 276+ if(!correctWords.contains(w)){
 277+ SuggestResult split = suggestSplit(w,minFreq);
 278+ if(split != null){
 279+ Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT);
 280+ sc.substitutes.put(i,split.word.replace("_"," "));
 281+ suggestions.add(sc);
 282+ }
287283 }
288284 // suggest join
289285 if(i-1 >= 0
@@ -306,7 +302,8 @@
307303 ArrayList<SuggestResult> sug2 = null;
308304 String w2 = null;
309305 String gap = "_";
310 - boolean good1 = sug1.get(0).getDist() == 0; // w1 is spellchecked right
 306+ // if w1 is spellchecked right
 307+ boolean good1 = sug1.get(0).getDist() == 0;
311308 int i2 = i;
312309 boolean maybeStopWord = false; // the currecnt i2 might be a stop word, try to find phrases with it as stop word
313310 int distOffset = 0; // if we spellcheked to stop word, all phrases should have this initial dist
@@ -331,7 +328,8 @@
332329 }
333330 if(sug2 == null)
334331 continue;
335 - boolean good2 = sug2.get(0).getDist() == 0; // w2 is spellchecked right
 332+ // if second word is spelled right
 333+ boolean good2 = sug2.get(0).getDist() == 0;
336334 int maxdist = Math.min((w1.length() + w2.length()) / 3, 5);
337335 int mindist = -1;
338336 boolean forTitlesOnly = false;
@@ -358,21 +356,30 @@
359357 }
360358 //log.info("Checking "+phrase);
361359 if(freq > 0){
 360+ // number of characters added/substracted
 361+ int diff1 = Math.abs(s1.word.length()-w1.length());
 362+ int diff2 = Math.abs(s2.word.length()-w2.length());
362363 log.info("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle);
363364 int dist = s1.dist + s2.dist + distOffset;
364365 boolean accept = true;
365366 Change c = new Change(dist,freq,Change.Type.PHRASE);
366367 if(s1.word.equals(w1))
367368 c.preserves.put(i,w1);
368 - else if(!good1 || inTitle)
 369+ else if(!good1 || (inTitle && diff1 <= 2 && !correctWords.contains(w1)))
369370 c.substitutes.put(i,s1.word);
370 - else
 371+ else if(!good1 || (inTitle && diff1 <=2)){
 372+ forTitlesOnly = true;
 373+ c.substitutes.put(i,s1.word);
 374+ } else
371375 accept = false;
372376 if(s2.word.equals(w2))
373377 c.preserves.put(i2,w2);
374 - else if(!good2 || inTitle)
 378+ else if(!good2 || (inTitle && diff2 <= 2 && !correctWords.contains(w2)))
375379 c.substitutes.put(i2,s2.word);
376 - else
 380+ else if(!good2 || (inTitle && diff2 <= 2)){
 381+ forTitlesOnly = true;
 382+ c.substitutes.put(i2,s2.word);
 383+ } else
377384 accept = false;
378385 if(accept){
379386 if(mindist == -1)
@@ -384,10 +391,11 @@
385392 }
386393 }
387394 }
388 - } while(maybeStopWord);
 395+ } while(maybeStopWord && i2+1<tokens.size());
389396 }
390397 // try to construct a valid title by spell-checking all words
391398 if(suggestionsTitle.size() > 0){
 399+ log.info("Trying exact-title matches");
392400 Object[] ret = calculateChanges(suggestionsTitle,searchterm.length()/2);
393401 ArrayList<Entry<Integer,String>> proposedTitle = (ArrayList<Entry<Integer, String>>) ret[0];
394402 boolean madeChanges = false;
@@ -395,8 +403,10 @@
396404 String formated = searchterm;
397405 for(Entry<Integer,String> e : proposedTitle){
398406 Token t = tokens.get(e.getKey());
399 - String nt = e.getValue();
400 - if(!stemsToSame(t.termText(),nt,parser.getBuilder().getFilters())){
 407+ String nt = e.getValue();
 408+ // replace words if they don't stem to same word, of they stem to same, but the words is misspelled
 409+ boolean stemNotSame = stemNotSameOrInSet(t.termText(),nt,parser.getBuilder().getFilters(),stemmedCorrectWords);
 410+ if(stemNotSame || (!stemNotSame && reader.docFreq(new Term("word",t.termText())) == 0)){
401411 formated = markSuggestion(formated,t,nt);
402412 title = applySuggestion(title,t,nt);
403413 madeChanges = true;
@@ -412,6 +422,7 @@
413423 } else if(tokens.size() == 1 && wordSug.get(0)!=null
414424 && wordSug.get(0).size() > 0 && !correctWords.contains(tokens.get(0).termText())){
415425 // only one token, try different spell-checks for title
 426+ log.info("Trying exact-title single word match");
416427 ArrayList<SuggestResult> sg = (ArrayList<SuggestResult>) wordSug.get(0).clone();
417428 Collections.sort(sg,new SuggestResult.ComparatorNoCommonMisspell());
418429 Token t = tokens.get(0);
@@ -434,6 +445,7 @@
435446 ArrayList<Entry<Integer,String>> proposedChanges = new ArrayList<Entry<Integer,String>>();
436447 if(suggestions.size() > 0){
437448 // found some suggestions
 449+ log.info("Trying phrases ...");
438450 Object[] ret = calculateChanges(suggestions,searchterm.length()/2);
439451 proposedChanges = (ArrayList<Entry<Integer, String>>) ret[0];
440452 ArrayList<Entry<Integer,String>> preservedWords = (ArrayList<Entry<Integer, String>>) ret[1];
@@ -442,12 +454,13 @@
443455 for(Entry<Integer,String> e : proposedChanges)
444456 preserveTokens.add(e.getKey());
445457 }
446 -
 458+ log.info("Adding words, preserve tokens: "+preserveTokens+", preserve correct phrases: "+correctPhrases);
447459 // last resort: go with individual word suggestions
448460 HashMap<Integer,String> wordChanges = new HashMap<Integer,String>();
449 - for(int i=0;i<tokens.size();i++){
450 - if(preserveTokens.contains(i))
 461+ for(int i=0;i<tokens.size();i++){
 462+ if(preserveTokens.contains(i) || correctPhrases.contains(i))
451463 continue;
 464+ // TODO: maybe check for common misspells here?!
452465 ArrayList<SuggestResult> sug = wordSug.get(i);
453466 if(sug == null)
454467 continue;
@@ -457,7 +470,7 @@
458471 }
459472 if(wordChanges.size() != 0)
460473 proposedChanges.addAll(wordChanges.entrySet());
461 -
 474+
462475 // sort in reverse order from that in query, i.e. first change in the last term
463476 Collections.sort(proposedChanges,new Comparator<Entry<Integer,String>>() {
464477 public int compare(Entry<Integer,String> o1, Entry<Integer,String> o2){
@@ -471,7 +484,9 @@
472485 for(Entry<Integer,String> e : proposedChanges){
473486 Token t = tokens.get(e.getKey());
474487 String nt = e.getValue();
475 - if(!stemsToSame(t.termText(),nt,parser.getBuilder().getFilters())){
 488+ // incorrect words, or doesn't stem to same
 489+ boolean stemNotSame = stemNotSameOrInSet(t.termText(),nt,parser.getBuilder().getFilters(),stemmedCorrectWords);
 490+ if(stemNotSame || (!stemNotSame && reader.docFreq(new Term("word",t.termText())) == 0)){
476491 formated = markSuggestion(formated,t,nt);
477492 searchterm = applySuggestion(searchterm,t,nt);
478493 madeChanges = true;
@@ -484,15 +499,27 @@
485500 return null;
486501 }
487502
 503+ /** try to figure out the case of original spell-checked word, and output the new word in that case */
 504+ protected String simulateCase(String formated, Token t, String newWord) {
 505+ String old = formated.substring(t.startOffset(),t.endOffset());
 506+ if(old.equals(old.toLowerCase()))
 507+ return newWord.toLowerCase();
 508+ if(old.equals(old.toUpperCase()))
 509+ return newWord.toUpperCase();
 510+ if(old.length()>1 && old.equals(old.substring(0,1).toUpperCase()+old.substring(1)))
 511+ return newWord.substring(0,1).toUpperCase()+newWord.substring(1).toLowerCase();
 512+ return newWord;
 513+ }
 514+
488515 protected String markSuggestion(String formated, Token t, String newWord){
489516 return formated.substring(0,t.startOffset())
490 - + "<i>" + newWord + "</i>"
 517+ + "<i>" + simulateCase(formated,t,newWord) + "</i>"
491518 + formated.substring(t.endOffset());
492519 }
493520
494521 protected String applySuggestion(String searchterm, Token t, String newWord){
495522 return searchterm.substring(0,t.startOffset())
496 - + newWord
 523+ + simulateCase(searchterm,t,newWord)
497524 + searchterm.substring(t.endOffset());
498525 }
499526
@@ -575,7 +602,7 @@
576603 hr.addAll(r1); hr.addAll(r2);
577604 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
578605 res.addAll(hr);
579 - Collections.sort(res,new SuggestResult.Comparator());
 606+ Collections.sort(res,new SuggestResult.ComparatorNoCommonMisspell());
580607 return res;
581608 }
582609 return r1;
@@ -718,11 +745,46 @@
719746 if(t1 != null && t2 != null && t1.termText().equals(t2.termText()))
720747 return true;
721748 } catch (IOException e) {
722 - log.error("Cannot stemm words "+word1+", "+word2+" : "+e.getMessage());
 749+ log.error("Cannot stem words "+word1+", "+word2+" : "+e.getMessage());
723750 }
724751 return false;
725752 }
726753
 754+ /** check if stemmed newWord is 1) not same to stememed oldWord, OR 2) not in stemmed set*/
 755+ public boolean stemNotSameOrInSet(String oldWord, String newWord, FilterFactory filters, Set<String> stemmedSet){
 756+ if(!filters.hasStemmer())
 757+ return false;
 758+ ArrayList<String> in = new ArrayList<String>();
 759+ in.add(oldWord); in.add(newWord);
 760+ TokenStream ts = filters.makeStemmer(new StringsTokenStream(in));
 761+ try {
 762+ Token t1 = ts.next();
 763+ Token t2 = ts.next();
 764+ if(t1 != null && t2 != null && (t1.termText().equals(t2.termText()) && stemmedSet.contains(t2.termText())))
 765+ return false;
 766+ } catch (IOException e) {
 767+ log.error("Cannot stem words "+oldWord+", "+oldWord+" : "+e.getMessage());
 768+ }
 769+ return true;
 770+ }
 771+
 772+ /** stem all words in the set */
 773+ public HashSet<String> stemSet(HashSet<String> set, FilterFactory filters){
 774+ if(!filters.hasStemmer())
 775+ return new HashSet<String>();
 776+ HashSet<String> ret = new HashSet<String>();
 777+ TokenStream ts = filters.makeStemmer(new StringsTokenStream(set));
 778+ try {
 779+ Token t;
 780+ while((t = ts.next()) != null)
 781+ ret.add(t.termText());
 782+ return ret;
 783+ } catch (IOException e) {
 784+ log.error("Cannot stem set "+set+" : "+e.getMessage());
 785+ return new HashSet<String>();
 786+ }
 787+ }
 788+
727789 static class StringsTokenStream extends TokenStream {
728790 Iterator<String> input;
729791 int count = 0;
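The new simulateCase helper lets a suggestion inherit the casing of the word it replaces; the real method reads the old word out of the formatted query via token offsets. A standalone sketch of its three rules, with hypothetical inputs:

    // Hedged restatement of Suggest.simulateCase: preserve all-lower,
    // all-upper, or Capitalized casing of the original word.
    static String simulateCase(String old, String newWord) {
        if (old.equals(old.toLowerCase()))          // "episdoe" -> "episode"
            return newWord.toLowerCase();
        if (old.equals(old.toUpperCase()))          // "EPISDOE" -> "EPISODE"
            return newWord.toUpperCase();
        if (old.length() > 1 && old.equals(
                old.substring(0, 1).toUpperCase() + old.substring(1)))
            return newWord.substring(0, 1).toUpperCase()
                    + newWord.substring(1).toLowerCase();  // "Episdoe" -> "Episode"
        return newWord;                             // mixed case: keep as suggested
    }

This is why the "SOUTH PARK EPISDOE LIST" case in SpellCheckTest.java expects "SOUTH PARK EPISODE LIST" rather than a lowercased suggestion.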
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -62,7 +62,10 @@
6363 if(text.length()>=2){
6464 System.out.println("METAPHONES: "+dmeta.doubleMetaphone(text)+", "+dmeta.doubleMetaphone(text,true));
6565 System.out.println("SUGGEST: ");
 66+ int count = 0;
6667 for(SuggestResult r : sc.suggestWords(text,10)){
 68+ if(++count >= 10 )
 69+ break;
6770 System.out.println(r);
6871 }
6972
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java
@@ -40,23 +40,37 @@
4141 private int count = 0;
4242 private String field;
4343 private boolean first = true;
 44+ private String prefix = null;
 45+ private boolean silent = false; // no report output
4446
4547 public LuceneDictionary(IndexReader reader, String field) {
46 - try {
47 - this.field = field;
48 - termEnum = reader.terms(new Term(field, ""));
49 - } catch (IOException e) {
50 - throw new RuntimeException(e);
51 - }
 48+ this(reader,field,"");
5249 }
5350
 51+ public LuceneDictionary(IndexReader reader, String field, String prefix) {
 52+ if(!prefix.equals(""))
 53+ this.prefix = prefix;
 54+
 55+ try {
 56+ this.field = field;
 57+ termEnum = reader.terms(new Term(field, prefix));
 58+ } catch (IOException e) {
 59+ throw new RuntimeException(e);
 60+ }
 61+ }
 62+
 63+ /** Don't print progress */
 64+ public void setNoProgressReport(){
 65+ silent = true;
 66+ }
 67+
5468 public Word next() {
55 - if(++count % REPORT == 0){
 69+ if(!silent && ++count % REPORT == 0){
5670 System.out.println("Processed "+count+" terms");
5771 }
5872 try {
5973 while(true){
60 - if(first){
 74+ if(first && termEnum.term() != null){
6175 first = false;
6276 break;
6377 }
@@ -64,6 +78,8 @@
6579 return null;
6680 else if(!termEnum.term().field().equals(field))
6781 return null; // end of our field
 82+ else if(prefix != null && !termEnum.term().text().startsWith(prefix))
 83+ return null; // no longer same prefix
6884
6985 break;
7086 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -32,18 +32,18 @@
3333 import org.wikimedia.lsearch.index.WikiIndexModifier;
3434 import org.wikimedia.lsearch.search.IndexSearcherMul;
3535 import org.wikimedia.lsearch.search.WikiSearcher;
 36+import org.wikimedia.lsearch.spell.Suggest;
3637 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
3738 import org.wikimedia.lsearch.spell.dist.DoubleMetaphone;
3839 import org.wikimedia.lsearch.util.HighFreqTerms;
3940
4041 /**
41 - * Index words and phrases from article titles.
 42+ * Index words and phrases from articles.
4243 *
4344 * Fields:
4445 * * word - word from title
 46+ * * word_ngramN - word ngrams
4547 * * phrase - phrase like douglas_adams
46 - * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3)
47 - * * namespace - namespaces where the word/phrase is present
4848 *
4949 * @author rainman
5050 *
@@ -146,10 +146,9 @@
147147 addPhrase(w,freq,true);
148148 }
149149 }
150 - }
 150+ }
151151 ngramWriter.closeAndOptimize();
152 - ir.close();
153 -
 152+ ir.close();
154153 } catch (IOException e) {
155154 log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
156155 e.printStackTrace();
@@ -158,6 +157,24 @@
159158
160159 }
161160
 161+ /** Check if there are common mispellings of this phrase */
 162+ protected boolean checkCommonPhraseMisspell(String phrase, int freq, IndexReader ir, String field) {
 163+ LuceneDictionary d = new LuceneDictionary(ir,field,phrase.substring(0,1));
 164+ d.setNoProgressReport();
 165+ Suggest.Metric metric = new Suggest.Metric(phrase);
 166+ Word word;
 167+ while((word = d.next()) != null){
 168+ if(word.getFrequency() * 100 < freq && word.getWord().indexOf("_")!=-1 ){
 169+ String w = word.getWord();
 170+ if(metric.distance(w) == 1){
 171+ System.out.println("Detected common mispelling for "+w+" (correct: "+phrase+")");
 172+ return true;
 173+ }
 174+ }
 175+ }
 176+ return false;
 177+ }
 178+
162179 /**
163180 * Register a title in the index, without tokenization, just lowercase.
164181 *
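Restated, checkCommonPhraseMisspell scans all terms sharing the phrase's first character (via the prefix-bounded LuceneDictionary) and flags the phrase when some much rarer multi-word term sits at edit distance 1 from it. A compact predicate for a single candidate, under those assumptions:

    // Hedged restatement of the acceptance test inside the loop above:
    // w must itself be a phrase (contains '_'), be at least 100x rarer
    // than the checked phrase, and be at edit distance 1 from it.
    static boolean isCommonMisspell(Word w, int phraseFreq, Suggest.Metric phraseMetric) {
        return w.getWord().indexOf('_') != -1
                && w.getFrequency() * 100 < phraseFreq
                && phraseMetric.distance(w.getWord()) == 1;
    }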
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/LocalIndex.java
@@ -49,5 +49,9 @@
5050 this.timestamp = timestamp;
5151 }
5252
 53+ public String toString(){
 54+ return path+" at "+timestamp+" for "+iid;
 55+ }
5356
 57+
5458 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java
@@ -10,6 +10,14 @@
1111 public String namespace;
1212 public String title;
1313 Explanation explanation;
 14+
 15+ public ResultSet(String key) {
 16+ int colon = key.indexOf(':');
 17+ this.score = 0;
 18+ this.namespace = key.substring(0,colon);
 19+ this.title = key.substring(colon+1);
 20+ this.explanation = null;
 21+ }
1422 public ResultSet(double score, String namespace, String title) {
1523 this.score = score;
1624 this.namespace = namespace;
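A small example of the new key-based constructor (key invented): prefix-index entries store "namespace:title" keys, which are split at the first colon, with the score fixed at 0.

    ResultSet rs = new ResultSet("0:Los Angeles");
    // rs.namespace -> "0", rs.title -> "Los Angeles", rs.score -> 0.0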
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Benchmark.java
@@ -107,10 +107,16 @@
108108 @SuppressWarnings("deprecation")
109109 protected int search(){
110110 String query = "";
111 - for(int i=0;i<words;i++){
112 - if(!query.equals(""))
113 - query += " OR ";
114 - query += terms.next();
 111+ if(verb.equals("prefix")){
 112+ int num = (int)(Math.random()*8);
 113+ String t = terms.next();
 114+ query = namespaceFilter+":"+t.substring(0,Math.min(num,t.length()));
 115+ } else{
 116+ for(int i=0;i<words;i++){
 117+ if(!query.equals(""))
 118+ query += " OR ";
 119+ query += terms.next();
 120+ }
115121 }
116122 String urlString;
117123 if(namespace.equals("")){
@@ -132,11 +138,13 @@
133139 new InputStreamReader(
134140 conn.getInputStream()));
135141 String inputLine;
136 - int resCount = -1;
 142+ int resCount = verb.equals("prefix")? 0 : -1;
137143
138144 while ((inputLine = in.readLine()) != null){
139145 if(resCount == -1)
140146 resCount = Integer.parseInt(inputLine);
 147+ if(verb.equals("prefix"))
 148+ resCount ++ ;
141149 }
142150 in.close();
143151
@@ -195,7 +203,7 @@
196204 } else if (args[i].equals("-c")) {
197205 runs = Integer.parseInt(args[++i]);
198206 } else if (args[i].equals("-v")) {
199 - database = args[++i];
 207+ verb = args[++i];
200208 } else if (args[i].equals("-wf")) {
201209 wordfile = args[++i];
202210 } else if (args[i].equals("-n") || args[i].equals("-ns")) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -0,0 +1,154 @@
 2+package org.wikimedia.lsearch.prefix;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collections;
 7+import java.util.Comparator;
 8+import java.util.HashMap;
 9+import java.util.Iterator;
 10+import java.util.Map.Entry;
 11+
 12+import org.apache.log4j.Logger;
 13+import org.apache.lucene.analysis.SimpleAnalyzer;
 14+import org.apache.lucene.document.Document;
 15+import org.apache.lucene.document.Field;
 16+import org.apache.lucene.index.IndexReader;
 17+import org.apache.lucene.index.IndexWriter;
 18+import org.apache.lucene.index.Term;
 19+import org.apache.lucene.index.TermDocs;
 20+import org.wikimedia.lsearch.analyzers.LowercaseAnalyzer;
 21+import org.wikimedia.lsearch.analyzers.PrefixAnalyzer;
 22+import org.wikimedia.lsearch.config.Configuration;
 23+import org.wikimedia.lsearch.config.IndexId;
 24+import org.wikimedia.lsearch.index.IndexThread;
 25+import org.wikimedia.lsearch.ranks.StringList;
 26+import org.wikimedia.lsearch.spell.api.LuceneDictionary;
 27+import org.wikimedia.lsearch.spell.api.Dictionary.Word;
 28+import org.wikimedia.lsearch.storage.ArticleAnalytics;
 29+import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
 30+
 31+/**
 32+ * Build an index of all title prefixes
 33+ *
 34+ * @author rainman
 35+ *
 36+ */
 37+public class PrefixIndexBuilder {
 38+ static Logger log = Logger.getLogger(PrefixIndexBuilder.class);
 39+
 40+ public static void main(String[] args) throws IOException{
 41+ final int PER_PREFIX = 10;
 42+ boolean usetemp = false;
 43+ String dbname = null;
 44+
 45+ Configuration.open();
 46+ if(args.length == 0){
 47+ System.out.println("Syntax: java PrefixIndexBuilder [-t] <dbname>");
 48+ return;
 49+ }
 50+ for(int i=0;i<args.length;i++){
 51+ if(args[i].equals("-t"))
 52+ usetemp = true;
 53+ else if(args[i].startsWith("-")){
 54+ System.out.println("Unrecognized option "+args[i]);
 55+ return;
 56+ } else
 57+ dbname = args[i];
 58+ }
 59+
 60+ IndexId iid = IndexId.get(dbname);
 61+ IndexId pre = iid.getPrefix();
 62+
 63+ long start = System.currentTimeMillis();
 64+
 65+ if(!usetemp){
 66+ IndexWriter writer = new IndexWriter(pre.getTempPath(),new PrefixAnalyzer(),true);
 67+ writer.setMergeFactor(20);
 68+ writer.setMaxBufferedDocs(500);
 69+ LinkAnalysisStorage st = new LinkAnalysisStorage(iid);
 70+ log.info("Writing temp index");
 71+ int count = 0;
 72+ Iterator<ArticleAnalytics> it = st.iterator();
 73+ while(it.hasNext()){
 74+ if(++count % 1000 == 0)
 75+ System.out.println("Processed "+count);
 76+ ArticleAnalytics aa = it.next();
 77+ String key = aa.getKey();
 78+ //String title = key.substring(key.indexOf(":")+1).toLowerCase();
 79+ String redirect = aa.getRedirectTarget();
 80+ if(redirect == null)
 81+ redirect = "";
 82+ int ref = aa.getReferences();
 83+ Document d = new Document();
 84+ d.add(new Field("key",key,Field.Store.YES,Field.Index.TOKENIZED));
 85+ d.add(new Field("redirect",redirect,Field.Store.YES,Field.Index.NO));
 86+ d.add(new Field("ref",Integer.toString(ref),Field.Store.YES,Field.Index.NO));
 87+ writer.addDocument(d);
 88+ }
 89+ log.info("Optimizing temp index");
 90+ writer.optimize();
 91+ writer.close();
 92+ }
 93+ log.info("Writing prefix index");
 94+ IndexWriter writer = new IndexWriter(pre.getImportPath(), new LowercaseAnalyzer(),true);
 95+ writer.setMergeFactor(20);
 96+ writer.setMaxBufferedDocs(1000);
 97+ IndexReader ir = IndexReader.open(pre.getTempPath());
 98+ LuceneDictionary dict = new LuceneDictionary(ir,"key");
 99+ Word w;
 100+ while((w = dict.next()) != null){
 101+ String prefix = w.getWord();
 102+ Term t = new Term("key",prefix);
 103+ if(ir.docFreq(t) < 2)
 104+ continue;
 105+ TermDocs td = ir.termDocs(t);
 106+ HashMap<String,Integer> refs = new HashMap<String,Integer>();
 107+ while(td.next()){
 108+ Document d = ir.document(td.doc());
 109+ refs.put(d.get("key"),Integer.parseInt(d.get("ref")));
 110+ }
 111+ ArrayList<Entry<String,Integer>> sorted = new ArrayList<Entry<String,Integer>>();
 112+ sorted.addAll(refs.entrySet());
 113+ Collections.sort(sorted,new Comparator<Entry<String,Integer>>() {
 114+ public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2){
 115+ return o2.getValue() - o1.getValue();
 116+ }
 117+ });
 118+ ArrayList<String> selected = new ArrayList<String>();
 119+ for(int i=0;i<PER_PREFIX && i<sorted.size();i++){
 120+ selected.add(sorted.get(i).getKey());
 121+ }
 122+ Document d = new Document();
 123+ d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.UN_TOKENIZED));
 124+ d.add(new Field("articles",new StringList(selected).toString(),Field.Store.YES,Field.Index.NO));
 125+ writer.addDocument(d);
 126+ }
 127+ log.info("Adding title keys ...");
 128+ int count = 0;
 129+ for(int i=0;i<ir.maxDoc();i++){
 130+ if(++count % 1000 == 0)
 131+ System.out.println("Added "+count);
 132+ if(ir.isDeleted(i))
 133+ continue;
 134+ Document d = new Document();
 135+ d.add(new Field("key",ir.document(i).get("key"),Field.Store.YES,Field.Index.TOKENIZED));
 136+ writer.addDocument(d);
 137+ }
 138+ ir.close();
 139+ log.info("Optimizing ...");
 140+ writer.optimize();
 141+ writer.close();
 142+
 143+ IndexThread.makeIndexSnapshot(pre,pre.getImportPath());
 144+ long delta = System.currentTimeMillis() - start;
 145+ System.out.println("Finished in "+formatTime(delta));
 146+ }
 147+
 148+ private static String formatTime(long l) {
 149+ l /= 1000;
 150+ if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s";
 151+ else if(l >= 60) return (l%3600)/60+"m "+(l%60)+"s";
 152+ else return l+"s";
 153+ }
 154+
 155+}
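The builder works in two passes. First (skipped with -t) it writes a temp index in which every lowercased "ns:title" key is expanded into all of its prefixes by PrefixAnalyzer, stored alongside its redirect target and link-analysis reference count. Second, for every prefix with at least two completions it stores the top ten keys by reference count as a StringList in an "articles" field, then re-adds each full title key (lowercased by LowercaseAnalyzer) so that unique prefixes can be resolved by the fallback in SearchEngine.prefixSearch. Invocation, per the syntax message above (classpath omitted):

    java org.wikimedia.lsearch.prefix.PrefixIndexBuilder enwiki      # full build
    java org.wikimedia.lsearch.prefix.PrefixIndexBuilder -t enwiki   # reuse temp index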
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java
@@ -125,7 +125,7 @@
126126 }
127127
128128 public class LinkAnalysisIterator implements Iterator<ArticleAnalytics>{
129 - int inx = 0, next = -1;
 129+ int inx = -1, next = -1;
130130 int maxdoc;
131131
132132 public LinkAnalysisIterator() throws IOException{
@@ -137,7 +137,7 @@
138138 if(inx >= maxdoc)
139139 return false;
140140 if(next == -1){
141 - for(next=inx;next<maxdoc;next++)
 141+ for(next=inx+1;next<maxdoc;next++)
142142 if(!reader.isDeleted(next))
143143 return true;
144144 return false;
@@ -152,6 +152,8 @@
153153 inx = next;
154154 next = -1;
155155 } else{
 156+ if(inx == -1)
 157+ inx = 0;
156158 for(;inx<maxdoc;inx++){
157159 if(!reader.isDeleted(inx))
158160 break;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -58,7 +58,7 @@
5959 /** If true, this machine is an indexer for this index */
6060 protected boolean myIndex;
6161
62 - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED };
 62+ protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED, PREFIX };
6363
6464 /** Type of index, enumeration */
6565 protected IndexType type;
@@ -162,6 +162,8 @@
163163 this.type = IndexType.LINK_ANALYSIS;
164164 else if(type.equals("related"))
165165 this.type = IndexType.RELATED;
 166+ else if(type.equals("prefix"))
 167+ this.type = IndexType.PREFIX;
166168
167169 // parts
168170 String[] parts = dbrole.split("\\.");
@@ -265,6 +267,10 @@
266268 public boolean isRelated(){
267269 return type == IndexType.RELATED;
268270 }
 271+ /** If this is the index storing article list for specific prefixes */
 272+ public boolean isPrefix(){
 273+ return type == IndexType.PREFIX;
 274+ }
269275
270276 /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */
271277 public int getPartNum() {
@@ -412,7 +418,7 @@
413419
414420 /** get all hosts that search db this iid belongs to */
415421 public HashSet<String> getDBSearchHosts(){
416 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated())
 422+ if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
417423 return searchHosts;
418424 else{
419425 // add all hosts that search: dbname and all parts
@@ -463,7 +469,7 @@
464470 */
465471 public HashSet<String> getPhysicalIndexes() {
466472 HashSet<String> ret = new HashSet<String>();
467 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated())
 473+ if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
468474 ret.add(dbrole);
469475 else if(isMainsplit() || isSplit() || isNssplit()){
470476 for(String p : splitParts)
@@ -549,6 +555,11 @@
550556 return get(dbname+".related");
551557 }
552558
 559+ /** Get the prefix index iid */
 560+ public IndexId getPrefix() {
 561+ return get(dbname+".prefix");
 562+ }
553563
 564+
554565
555566 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -232,7 +232,7 @@
233233 } else if(typeid.matches("nspart[1-9][0-9]*")){
234234 type = "nssplit";
235235 dbrole = dbname + "." + typeid;
236 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related")){
 236+ } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
237237 type = typeid;
238238 dbrole = dbname + "." + typeid;
239239 } else
@@ -519,7 +519,7 @@
520520 } else if(typeid.matches("nspart[1-9][0-9]*")){
521521 type = "nssplit";
522522 dbrole = dbname + "." + typeid;
523 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related")){
 523+ } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
524524 type = typeid;
525525 dbrole = dbname + "." + typeid;
526526 } else
@@ -816,6 +816,12 @@
817817 System.out.println("Unrecognized suggest parameters in ("+role+")");
818818
819819 dbroles.put(type,params);
 820+ } else if(type.equals("prefix")){
 821+ // no params
 822+ if(tokens.length>1 && verbose)
 823+ System.out.println("Unrecognized prefix parameters in ("+role+")");
 824+
 825+ dbroles.put(type,params);
820826 } else{
821827 System.out.println("Warning: Unrecognized role \""+role+"\".Ignoring.");
822828 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -1,17 +1,22 @@
22 package org.wikimedia.lsearch.search;
33
44 import java.io.IOException;
 5+import java.io.Reader;
56 import java.net.URI;
67 import java.text.MessageFormat;
78 import java.util.ArrayList;
89 import java.util.HashMap;
910 import java.util.HashSet;
1011 import java.util.Hashtable;
 12+import java.util.Iterator;
1113
1214 import org.apache.log4j.Logger;
1315 import org.apache.lucene.analysis.Analyzer;
1416 import org.apache.lucene.document.Document;
1517 import org.apache.lucene.index.IndexReader;
 18+import org.apache.lucene.index.Term;
 19+import org.apache.lucene.index.TermDocs;
 20+import org.apache.lucene.index.TermEnum;
1621 import org.apache.lucene.queryParser.ParseException;
1722 import org.apache.lucene.search.Hits;
1823 import org.apache.lucene.search.Query;
@@ -31,6 +36,7 @@
3237 import org.wikimedia.lsearch.frontend.SearchDaemon;
3338 import org.wikimedia.lsearch.frontend.SearchServer;
3439 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 40+import org.wikimedia.lsearch.ranks.StringList;
3541 import org.wikimedia.lsearch.spell.Suggest;
3642 import org.wikimedia.lsearch.spell.SuggestQuery;
3743 import org.wikimedia.lsearch.util.QueryStringMap;
@@ -57,9 +63,7 @@
5864 /** Main search method, call this from the search frontend */
5965 public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) {
6066
61 - if (what.equals("titlematch")) {
62 - // TODO: return searchTitles(searchterm);
63 - } else if (what.equals("search") || what.equals("explain")) {
 67+ if (what.equals("search") || what.equals("explain")) {
6468 int offset = 0, limit = 100; boolean exactCase = false;
6569 if (query.containsKey("offset"))
6670 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
@@ -94,16 +98,57 @@
9599 exactCase = true;
96100 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
97101 return search(iid, searchterm, offset, limit, namespaces, what.equals("rawexplain"), exactCase, true);
 102+ } else if (what.equals("titlematch")) {
 103+ // TODO: return searchTitles(searchterm);
 104+ } else if (what.equals("prefix")){
 105+ return prefixSearch(iid, searchterm);
98106 } else {
99107 SearchResults res = new SearchResults();
100108 res.setErrorMsg("Unrecognized search type. Try one of: " +
101 - "search, explain, raw, rawexplain.");
 109+ "search, explain, raw, rawexplain, prefix.");
102110 log.warn("Unknown request type [" + what + "].");
103111 return res;
104112 }
105113 return null;
106114 }
107115
 116+ private SearchResults prefixSearch(IndexId iid, String searchterm) {
 117+ IndexId pre = iid.getPrefix();
 118+ SearcherCache cache = SearcherCache.getInstance();
 119+ SearchResults res = new SearchResults();
 120+ try {
 121+ long start = System.currentTimeMillis();
 122+ searchterm = searchterm.toLowerCase();
 123+ IndexSearcherMul searcher = cache.getLocalSearcher(pre);
 124+ IndexReader reader = searcher.getIndexReader();
 125+ TermDocs td = reader.termDocs(new Term("prefix",searchterm));
 126+ if(td.next()){
 127+ // found entry with a prefix, return
 128+ StringList sl = new StringList(reader.document(td.doc()).get("articles"));
 129+ Iterator<String> it = sl.iterator();
 130+ while(it.hasNext())
 131+ res.addResult(new ResultSet(it.next()));
 132+ //logRequest(pre,"prefix",searchterm,null,res.getNumHits(),start,searcher);
 133+ return res;
 134+ }
 135+ // check if it's an unique prefix
 136+ TermEnum te = reader.terms(new Term("key",searchterm));
 137+ String r = te.term().text();
 138+ if(r.startsWith(searchterm)){
 139+ TermDocs td1 = reader.termDocs(new Term("key",r));
 140+ if(td1.next()){
 141+ res.addResult(new ResultSet(reader.document(td1.doc()).get("key")));
 142+ //logRequest(pre,"prefix",searchterm,null,res.getNumHits(),start,searcher);
 143+ return res;
 144+ }
 145+ }
 146+ } catch (IOException e) {
 147+ // res.setErrorMsg("Internal error during prefix search: "+e.getMessage());
 148+ log.error("Internal error in SearchEngine::prefixSearch : "+e.getMessage());
 149+ }
 150+ return res;
 151+ }
 152+
108153 /** Search mainpart or restpart of the split index */
109154 public SearchResults searchPart(IndexId iid, String searchterm, Query q, NamespaceFilterWrapper filter, int offset, int limit, boolean explain){
110155 if( ! (iid.isMainsplit() || iid.isNssplit()))
@@ -390,6 +435,6 @@
391436 long delta = System.currentTimeMillis() - start;
392437 SearchServer.stats.add(true, delta, SearchDaemon.getOpenCount());
393438 log.info(MessageFormat.format("{0} {1}: query=[{2}] parsed=[{3}] hit=[{4}] in {5}ms using {6}",
394 - new Object[] {what, iid.toString(), searchterm, query.toString(), new Integer(numhits), new Long(delta), searcher.toString()}));
 439+ new Object[] {what, iid.toString(), searchterm, query==null? "" : query.toString(), new Integer(numhits), new Long(delta), searcher.toString()}));
395440 }
396441 }
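prefixSearch resolves a (lowercased) term in two steps against the prefix index built above. A hypothetical trace, with invented titles and the "namespace:prefix" query shape used by the Benchmark change:

    prefix "0:los a"             -> term found in the "prefix" field: the stored
                                    top-10 "articles" list is returned directly
    prefix "0:los angeles lake"  -> no stored entry; the TermEnum positioned at the
                                    first "key" term >= the search string yields
                                    "0:los angeles lakers", which starts with it,
                                    so that single title is returned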
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -40,7 +40,7 @@
4141 global = GlobalConfiguration.getInstance();
4242
4343 Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup");
44 - if(iid.isSpell()); // no warmup for spell-chekers
 44+ if(iid.isSpell() || iid.isPrefix()); // no warmup for spell-chekers and prefixes (for now)
4545 else if(warmup == null){
4646 makeNamespaceFilters(is,iid);
4747 simpleWarmup(is,iid);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -466,7 +466,15 @@
467467 p = makeRelated(doc,fields.related(),article,1);
468468
469469 // anchors
470 - makeKeywordField(doc,fields.anchor(),rankBoost);
 470+ // makeKeywordField(doc,fields.anchor(),rankBoost);
 471+
 472+ // add the whole title for extract boost
 473+ String wt = FastWikiTokenizerEngine.stipTitle(article.getTitle());
 474+ if(!bs.isExactCase())
 475+ wt = wt.toLowerCase();
 476+ Field wtitle = new Field(fields.wholetitle(),wt,Field.Store.NO, Field.Index.UN_TOKENIZED);
 477+ wtitle.setBoost(rankBoost);
 478+ doc.add(wtitle);
471479
472480 }
473481 // make analyzer
@@ -522,7 +530,7 @@
523531 if(ranks.get(i) == 0)
524532 break; // we don't want redirects with zero links
525533 //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i));
526 - Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.YES, Field.Index.TOKENIZED);
 534+ Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED);
527535 alttitle.setBoost(calculateArticleRank(ranks.get(i)));
528536 doc.add(alttitle);
529537 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -792,5 +792,50 @@
793793 return keywords;
794794 }
795795
796 -
 796+ /** Delete everything that is not being indexes, decompose chars */
 797+ public static String stipTitle(String title){
 798+ UnicodeDecomposer decomposer = UnicodeDecomposer.getInstance();
 799+ char[] str = title.toCharArray();
 800+ char[] buf = new char[256];
 801+ int len = 0;
 802+ for(int i=0;i<str.length;i++){
 803+ char ch = str[i];
 804+ if(ch == ':' || ch == '(' || ch == ')' || ch =='[' || ch == ']' || ch == '.' || ch == ','
 805+ || ch == ';' || ch == '"' || ch=='-' || ch=='+' || ch=='*' || ch=='!' || ch=='~' || ch=='$'
 806+ || ch == '%' || ch == '^' || ch == '&' || ch == '_' || ch=='=' || ch=='|' || ch=='\\'){
 807+ if(len > 0 && buf[len-1]!=' '){
 808+ if(len >= buf.length){ // extend buf
 809+ char[] n = new char[buf.length*2];
 810+ System.arraycopy(buf,0,n,0,buf.length);
 811+ buf = n;
 812+ }
 813+ buf[len++] = ' '; // replace the special char with space
 814+ }
 815+ } else{
 816+ char[] decomp = decomposer.decompose(ch);
 817+ if(decomp == null){
 818+ // no decomposition add char, but don't double spaces
 819+ if(ch!=' ' || (len>0 && buf[len-1]!=' ')){
 820+ if(len >= buf.length){
 821+ char[] n = new char[buf.length*2];
 822+ System.arraycopy(buf,0,n,0,buf.length);
 823+ buf = n;
 824+ }
 825+ buf[len++] = ch;
 826+ }
 827+ } else{
 828+ // add decomposed chars
 829+ for(int j = 0; j < decomp.length; j++){
 830+ if(len >= buf.length){
 831+ char[] n = new char[buf.length*2];
 832+ System.arraycopy(buf,0,n,0,buf.length);
 833+ buf = n;
 834+ }
 835+ buf[len++] = decomp[j];
 836+ }
 837+ }
 838+ }
 839+ }
 840+ return new String(buf,0,len);
 841+ }
797842 }
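A hypothetical call illustrating the new helper (the method is named stipTitle in the source): every listed punctuation character collapses to a single space, doubled spaces are avoided, and accented characters are decomposed via UnicodeDecomposer.

    String s = FastWikiTokenizerEngine.stipTitle("C++ (programming language)");
    // -> roughly "C programming language" (a trailing space can remain
    //    when the title ends in a stripped character)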
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PrefixAnalyzer.java
@@ -0,0 +1,37 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+
 7+import org.apache.lucene.analysis.Analyzer;
 8+import org.apache.lucene.analysis.Token;
 9+import org.apache.lucene.analysis.TokenStream;
 10+import org.apache.lucene.analysis.Tokenizer;
 11+
 12+public class PrefixAnalyzer extends Analyzer {
 13+ static public class PrefixTokenizer extends Tokenizer {
 14+ String in;
 15+ int count = 0;
 16+
 17+ public PrefixTokenizer(String input){
 18+ in = input;
 19+ }
 20+ @Override
 21+ public Token next() throws IOException {
 22+ count++;
 23+ if(count > in.length())
 24+ return null;
 25+ else
 26+ return new Token(in.substring(0,count),0,count);
 27+ }
 28+ }
 29+
 30+ public TokenStream tokenStream(String fieldName, String str) {
 31+ return new PrefixTokenizer(str.toLowerCase());
 32+ }
 33+
 34+ @Override
 35+ public TokenStream tokenStream(String fieldName, Reader reader) {
 36+ throw new UnsupportedOperationException("Use tokenStream(String,String)");
 37+ }
 38+}
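PrefixTokenizer emits every prefix of the lowercased input as a separate token, which is what makes each title searchable by any of its leading substrings in the temp index. A small demonstration (field name illustrative):

    TokenStream ts = new PrefixAnalyzer().tokenStream("key", "Los");
    Token t;
    while ((t = ts.next()) != null)
        System.out.println(t.termText());   // prints: l, lo, los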
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -86,15 +86,18 @@
8787 public static float ALT_TITLE_BOOST = 8;
8888 public static float ALT_TITLE_ALIAS_BOOST = 0.4f;
8989 public static float KEYWORD_BOOST = 0.02f;
 90+ public static float CONTENTS_BOOST = 0.2f;
9091
9192 public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 20;
92 - public static float ADDITIONAL_BOOST_CONTENTS = 1;
93 - public static int ADDITIONAL_PHRASE_SLOP_TITLE = 10;
94 - public static float ADDITIONAL_BOOST_TITLE = 2;
 93+ public static float ADDITIONAL_BOOST_CONTENTS = 0.5f;
 94+ public static int ADDITIONAL_PHRASE_SLOP_TITLE = 1;
 95+ public static float ADDITIONAL_BOOST_TITLE = 0.5f;
9596 public static int ADDITIONAL_PHRASE_SLOP_RELATED = 10;
96 - public static float ADDITIONAL_BOOST_RELATED = 1f;
 97+ public static float ADDITIONAL_BOOST_RELATED = 0.04f;
9798
98 - public static float ANCHOR_BOOST = 1f;
 99+ public static float WHOLE_TITLE_BOOST = 8f;
 100+ public static float EXACT_CONTENTS_BOOST = 1f;
 101+ public static float ANCHOR_BOOST = 0.02f;
99102
100103 public static boolean ADD_STEM_TITLE = true;
101104 public static boolean ADD_TITLE_PHRASES = true;
@@ -1070,7 +1073,7 @@
10711074 }
10721075
10731076 /** Extract all words from the query */
1074 - public ArrayList<String> extractPhrases(Query query){
 1077+ public ArrayList<String> extractWords(Query query){
10751078 ArrayList<String> list = new ArrayList<String>();
10761079 if(query == null)
10771080 return list;
@@ -1106,7 +1109,7 @@
11071110 else if(bcl.length == 1 && bcl[0].getOccur() != Occur.MUST_NOT)
11081111 addWords(list,bcl[0].getQuery());
11091112 else if(bcl.length == 2){
1110 - // TODO: this might brake in some complex queries! (with some parenthesis and transliterations...)
 1113+ // TODO: this might break in some complex queries! (with some parenthesis and transliterations...)
11111114 if(bcl[0].getOccur() == Occur.MUST && bcl[1].getOccur() == Occur.SHOULD)
11121115 // second is alias
11131116 addWords(list,bcl[0].getQuery());
@@ -1315,7 +1318,7 @@
13161319 defaultBoost = olfDefaultBoost;
13171320 defaultAliasBoost = ALIAS_BOOST;
13181321
1319 - ArrayList<String> words = extractPhrases(qt);
 1322+ ArrayList<String> words = extractWords(qt);
13201323
13211324 if(qt == qs) // either null, or category query
13221325 return new Object[] {qt,words};
@@ -1470,6 +1473,20 @@
14711474 return bq;
14721475 }
14731476 return null;
 1477+ }
 1478+
 1479+ /** Join a collection via a char/string */
 1480+ protected String join(Collection<String> col, String sep){
 1481+ StringBuffer sb = new StringBuffer();
 1482+ boolean first = true;
 1483+ for(String s : col){
 1484+ if(!first){
 1485+ sb.append(sep);
 1486+ } else
 1487+ first = false;
 1488+ sb.append(s);
 1489+ }
 1490+ return sb.toString();
14741491 }
14751492
14761493 /**
@@ -1485,7 +1502,7 @@
14861503 queryText = quoteCJK(queryText);
14871504 if(policy != null)
14881505 this.namespacePolicy = policy;
1489 - defaultBoost = 1;
 1506+ defaultBoost = CONTENTS_BOOST;
14901507 defaultAliasBoost = ALIAS_BOOST;
14911508 Query qc = parseRaw(queryText);
14921509 Object[] qtwords = makeTitleQuery(queryText);
@@ -1497,7 +1514,7 @@
14981515 if(qc.equals(qt))
14991516 return qc; // don't duplicate (probably a query for categories only)
15001517
1501 - BooleanQuery bq = new BooleanQuery();
 1518+ BooleanQuery bq = new BooleanQuery(true);
15021519 bq.add(qc,BooleanClause.Occur.SHOULD);
15031520 bq.add(qt,BooleanClause.Occur.SHOULD);
15041521
@@ -1522,9 +1539,14 @@
15231540 bq.add(qk,BooleanClause.Occur.SHOULD);
15241541 }
15251542
 1543+ // whole title
 1544+ Query wt = new TermQuery(new Term(fields.wholetitle(),join(words," ")));
 1545+ wt.setBoost(WHOLE_TITLE_BOOST);
 1546+ Query wc = makePhrase(words,fields.contents(),0);
 1547+ wc.setBoost(EXACT_CONTENTS_BOOST);
15261548 // add additional score queries!
1527 - Query pqc = makePhraseQueries(words,"contents",ADDITIONAL_PHRASE_SLOP_CONTENTS,ADDITIONAL_BOOST_CONTENTS);
1528 - Query pqt = makePhraseQueries(words,"stemtitle",ADDITIONAL_PHRASE_SLOP_TITLE,ADDITIONAL_BOOST_TITLE);
 1549+ Query pqc = makePhraseQueries(words,fields.contents(),ADDITIONAL_PHRASE_SLOP_CONTENTS,ADDITIONAL_BOOST_CONTENTS);
 1550+ Query pqt = makePhraseQueries(words,fields.stemtitle(),ADDITIONAL_PHRASE_SLOP_TITLE,ADDITIONAL_BOOST_TITLE);
15291551 // skip last related group
15301552 Query[] pqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1];
15311553 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
@@ -1534,16 +1556,20 @@
15351557 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
15361558 wqr[i-1] = makeWordQueries(words,"related"+i,ADDITIONAL_BOOST_RELATED / 4);
15371559 }
1538 - if(pqc == null && pqt == null && pqr[0] == null && wqr[0] == null)
 1560+ if(wt==null && pqc == null && pqt == null && pqr[0] == null && wqr[0] == null)
15391561 return bq;
15401562 // build the final query
15411563 BooleanQuery finalQuery = new BooleanQuery(true);
15421564 BooleanQuery additional = new BooleanQuery(true);
1543 -
 1565+
15441566 if(pqc != null)
15451567 additional.add(pqc,Occur.MUST);
15461568 if(pqt != null)
15471569 additional.add(pqt,Occur.SHOULD);
 1570+ if(wt != null)
 1571+ additional.add(wt,Occur.SHOULD);
 1572+ if(wc != null)
 1573+ additional.add(wc,Occur.SHOULD);
15481574 for(Query q : pqr){
15491575 if(q != null)
15501576 additional.add(q,Occur.SHOULD);
@@ -1554,12 +1580,12 @@
15551581 }
15561582
15571583 // anchors
1558 - Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST);
 1584+ //Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST);
15591585
15601586 finalQuery.add(bq,Occur.MUST);
15611587 finalQuery.add(additional,Occur.SHOULD);
1562 - if(anchors != null)
1563 - finalQuery.add(anchors,Occur.SHOULD);
 1588+ //if(anchors != null)
 1589+ // finalQuery.add(anchors,Occur.SHOULD);
15641590
15651591 return finalQuery;
15661592
@@ -1617,8 +1643,6 @@
16181644 }
16191645 public void setBuilder(FieldBuilder.BuilderSet builder) {
16201646 this.builder = builder;
1621 - }
1622 -
 1647+ }
16231648
1624 -
16251649 }
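Besides the rebalanced boosts, the parser now adds a whole-title clause: the extracted words are joined with spaces by the new join helper and matched as one untokenized term in the wholetitle field. A sketch with an assumed two-word query:

    // For words = ["douglas", "adams"] in the lowercase field set:
    Query wt = new TermQuery(new Term("wholetitle", "douglas adams"));
    wt.setBoost(WikiQueryParser.WHOLE_TITLE_BOOST);  // 8f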
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java
@@ -66,6 +66,13 @@
6767 else
6868 return "anchor";
6969 }
 70+
 71+ public String wholetitle(){
 72+ if(exactCase)
 73+ return "wholetitle_exact";
 74+ else
 75+ return "wholetitle";
 76+ }
7077
7178
7279 public boolean isExactCase() {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java
@@ -0,0 +1,44 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+
 7+import org.apache.lucene.analysis.Analyzer;
 8+import org.apache.lucene.analysis.Token;
 9+import org.apache.lucene.analysis.TokenStream;
 10+/**
 11+ * Analyzer that just lowecases the text, doesn't split up anything, etc..
 12+ *
 13+ * @author rainman
 14+ *
 15+ */
 16+public class LowercaseAnalyzer extends Analyzer {
 17+ public static class LowercaseTokenizer extends TokenStream {
 18+ String text;
 19+ boolean sent = false;
 20+ LowercaseTokenizer(String in){
 21+ text = in.toLowerCase();
 22+ }
 23+ @Override
 24+ public Token next() throws IOException {
 25+ if(sent)
 26+ return null;
 27+ else{
 28+ sent = true;
 29+ return new Token(text,0,text.length());
 30+ }
 31+ }
 32+
 33+ }
 34+
 35+ @Override
 36+ public TokenStream tokenStream(String fieldName, String text) {
 37+ return new LowercaseTokenizer(text);
 38+ }
 39+ @Override
 40+ public TokenStream tokenStream(String fieldName, Reader reader) {
 41+ throw new UnsupportedOperationException("Use tokenStream(String,String)");
 42+ }
 43+
 44+
 45+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -232,7 +232,71 @@
233233 writer.addDocument(doc,an);
234234 state = State.MODIFIED_ARTICLES;
235235 }
 236+ public static HashSet<Character> separators = new HashSet<Character>();
 237+ static{
 238+ separators.add(' ');
 239+ separators.add('\r');
 240+ separators.add('\n');
 241+ separators.add('\t');
 242+ separators.add(':');
 243+ separators.add('(');
 244+ separators.add(')');
 245+ separators.add('[');
 246+ separators.add(']');
 247+ separators.add('.');
 248+ separators.add(',');
 249+ separators.add(':');
 250+ separators.add(';');
 251+ separators.add('"');
 252+ separators.add('+');
 253+ separators.add('*');
 254+ separators.add('!');
 255+ separators.add('~');
 256+ separators.add('$');
 257+ separators.add('%');
 258+ separators.add('^');
 259+ separators.add('&');
 260+ separators.add('_');
 261+ separators.add('=');
 262+ separators.add('|');
 263+ separators.add('\\');
 264+ }
236265
 266+ /**
 267+ * Find a sentance boundaries
 268+ *
 269+ * @param text - raw text
 270+ * @param start - start index to search from
 271+ * @param reverse - if true, will lookup in reverse
 272+ * @param max - radius of search (if no boundary is found return last wordbreak)
 273+ * @return
 274+ */
 275+ protected int findSentance(char[] text, int start, boolean reverse, int max){
 276+ int inc = (reverse)? -1 : 1;
 277+ int count = 0;
 278+ int wordbreak = start;
 279+ int i = start;
 280+ for(;i>0 && i<text.length;i+=inc){
 281+ char c = text[i];
 282+ if(c == '.')
 283+ return i;
 284+ else if(c == '*' && ((i>1 && text[i-1]=='\n') || i==0))
 285+ return i;
 286+ else if(separators.contains(c))
 287+ wordbreak = i;
 288+ if(count >= max)
 289+ return wordbreak; // more than max chars away, return the latest wordbreak
 290+ count ++;
 291+ }
 292+ return i;
 293+ }
 294+
 295+ /** Find surrounding for a link - extract sentances, list items .... */
 296+ protected String findContext(char[] text, int start, int end){
 297+ // TODO: implement
 298+ return null;
 299+ }
 300+
237301 /** Find the target key to title (ns:title) to which the links is pointing to
238302 * @throws IOException */
239303 protected String findTargetLink(int ns, String title) throws IOException{
Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -17,14 +17,14 @@
1818 wikidev : (single) (language,sr)
1919 wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[])
2020 wikilucene : (language,en) (warmup,10)
21 -wikilucene : (spell,3,1)
 21+wikilucene : (spell,3,1) (prefix)
2222
2323 # Search groups
2424 # Index parts of a split index are always taken from the node's group
2525 # host : db1.part db2.part
2626 # Mulitple hosts can search multiple dbs (N-N mapping)
2727 [Search-Group]
28 -oblak : wikilucene wikidev
 28+oblak : wikilucene wikidev wikilucene.prefix
2929
3030 # Index nodes
3131 # host: db1.part db2.part
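Enabling the feature for a database therefore takes the two edits mirrored above: declare the (prefix) role on the database line, and assign db.prefix to a search host. A hypothetical setup for another wiki:

    wikidev : (single) (language,sr) (prefix)
    oblak : wikidev wikidev.prefix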
