r25925 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r25924 | r25925 | r25926 >
Date: 22:47, 18 September 2007
Author: rainman
Status: old
Tags:
Comment:
* Experimental implementation of an AJAX suggest engine, based on
Julien Lemoine's ideas; uses a Lucene index instead of a trie
* Add (remote) spell-check unit test cases
* More fine-tuning of the spell-check engine
Modified paths:
  • /branches/lucene-search-2.1/lsearch-global.conf (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PrefixAnalyzer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/LocalIndex.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Benchmark.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)

Diff

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -130,12 +130,12 @@
131131 Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
132132 bs = new FieldBuilder("en").getBuilder();
133133 parser = new WikiQueryParser(bs.getFields().title(),"0",analyzer,bs,NamespacePolicy.IGNORE,stopWords);
134 - assertEquals("[how, do, you, do]",parser.extractPhrases(parser.parseRaw("how do you do")).toString());
135 - assertEquals("[making, something, rest]",parser.extractPhrases(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString());
136 - assertEquals("[godel, theorem]",parser.extractPhrases(parser.parseRaw("gödel theorem")).toString());
137 - assertEquals("[some, text, and, some, phrase]",parser.extractPhrases(parser.parseRaw("some_text and \"some phrase\"")).toString());
 134+ assertEquals("[how, do, you, do]",parser.extractWords(parser.parseRaw("how do you do")).toString());
 135+ assertEquals("[making, something, rest]",parser.extractWords(parser.parseRaw("(help:making something incategory:blah) OR (rest incategory:crest)")).toString());
 136+ assertEquals("[godel, theorem]",parser.extractWords(parser.parseRaw("gödel theorem")).toString());
 137+ assertEquals("[some, text, and, some, phrase]",parser.extractWords(parser.parseRaw("some_text and \"some phrase\"")).toString());
138138
139 - ArrayList<String> words = parser.extractPhrases(parser.parseRaw("the who band is something nobody knows about"));
 139+ ArrayList<String> words = parser.extractWords(parser.parseRaw("the who band is something nobody knows about"));
140140 assertEquals("contents:\"the who band\"~10 contents:\"band is something\"~10 contents:\"something nobody\"~10 contents:\"nobody knows\"~10 contents:\"knows about\"~10",parser.makePhraseQueries(words,"contents",10,1).toString());
141141
142142 // namespace policies
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java
@@ -0,0 +1,100 @@
 2+package org.wikimedia.lsearch.test;
 3+
 4+import java.io.BufferedReader;
 5+import java.io.IOException;
 6+import java.io.InputStreamReader;
 7+import java.net.MalformedURLException;
 8+import java.net.URL;
 9+import java.net.URLEncoder;
 10+
 11+/**
 12+ * Remotely test a spell-checker host
 13+ *
 14+ * @author rainman
 15+ *
 16+ */
 17+public class SpellCheckTest {
 18+ static String host = "localhost";
 19+ static int port = 8123;
 20+ static String db = "enwiki";
 21+
 22+ public static String getSuggestion(String query) throws IOException{
 23+ query = query.replace(" ","%20");
 24+ String urlString = "http://"+host+":"+port+"/search/"+db+"/"+query+"?case=ignore&limit=20&namespaces=0&offset=0";
 25+ URL url = new URL(urlString);
 26+ BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));
 27+ String line;
 28+ int lineNum = 0;
 29+ while ( (line = br.readLine()) != null ) {
 30+ if(lineNum == 1){
 31+ if(line.startsWith("#suggest")){
 32+ br.close();
 33+ return line.substring(9).replaceAll("<[^>]+>","");
 34+ }
 35+ }
 36+ lineNum ++ ;
 37+ }
 38+ br.close();
 39+ return "";
 40+ }
 41+
 42+ /**
 43+ * @param args
 44+ * @throws IOException
 45+ */
 46+ public static void main(String[] args) throws IOException {
 47+ int len = CHECK.length;
 48+ System.out.println("Running "+len+" tests");
 49+ int good = 0, failed = 0;
 50+ int count = 1;
 51+ for(String[] c : CHECK){
 52+ String sug = getSuggestion(c[0]);
 53+ if(!sug.equals(c[1])){
 54+ System.out.println("["+count+"/"+len+"] FAILED {"+sug+"} EXPECTED ["+c[1]+"] FOR ["+c[0]+"]");
 55+ failed++;
 56+ } else{
 57+ System.out.println("["+count+"/"+len+"] OK");
 58+ good++;
 59+ }
 60+ count ++;
 61+ }
 62+ System.out.println("Good tests: "+good+", failed tests: "+failed);
 63+ }
 64+
 65+ // wrong -> right
 66+ private static final String[][] CHECK = {
 67+ {"annul of improbably research", "annals of improbable research" },
 68+ {"los angles", "los angeles" },
 69+ {"what is the type of engineers thats deal with various depth of the eart crust", "what is the type of engineers thats deal with various depths of the earth crust"},
 70+ {"argentina cilmage", "argentina climate"},
 71+ {"Vista Compatibly", "Vista Compatible"},
 72+ {"sarah thomson", "sarah thompson"},
 73+ {"attribution (finance)", ""},
 74+ {"SOUTH PARK EPISDOE LIST", "SOUTH PARK EPISODE LIST"},
 75+ {"the grnd canyon", "the grand canyon"},
 76+ {"ron burgand","ron burgundy"},
 77+ {"fullmetal achemist ep 1","fullmetal alchemist ep 1"},
 78+ {"fullmetal alchemist ep 1",""},
 79+ {"enerst shackleton", "ernest shackleton"},
 80+ {"los angles lakers", "los angeles lakers"},
 81+ {"crab fisher","crab fishing"},
 82+ {"discovery channe;", "discovery channel"},
 83+ {"Young Cuties", ""},
 84+ {"fire australia", ""},
 85+ {"platoon film", ""},
 86+ {"basillar artery","basilar artery"},
 87+ {"franki vallie","frankie valli"},
 88+ {"cuties",""},
 89+ {"teh",""},
 90+ {"21st ammendment", "21st amendment"},
 91+ {"stargate junior",""},
 92+ {"fire australia",""},
 93+ {"ISO crack", ""},
 94+ {"The James Gang (band)",""},
 95+ {"cource", "course"},
 96+ {"carolene products",""},
 97+ {"orvileWright","overnight"},
 98+
 99+ };
 100+
 101+}
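The test drives the daemon's plain-text search endpoint and inspects only the second line of the response. A hypothetical exchange (hit count and result lines invented; the response format is the one documented in the SearchDaemon change below):

    GET /search/enwiki/annul%20of%20improbably%20research?case=ignore&limit=20&namespaces=0&offset=0

    1234
    #suggest <i>annals</i> of <i>improbable</i> research
    ... (one "<score> <ns> <title>" line per hit)

getSuggestion() strips the <i> markup, so the comparison is against the plain string "annals of improbable research".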
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -65,8 +65,20 @@
6666 HashMap query = new QueryStringMap(uri);
6767 SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query);
6868 contentType = "text/plain";
69 - if(res!=null && res.isSuccess()){
 69+ // format:
 70+ // <namespace> <title> (resNum-times)
 71+ if(what.equals("prefix")){
7072 sendHeaders(200, "OK");
 73+ for(ResultSet rs : res.getResults()){
 74+ sendResultLine(rs.namespace, rs.title);
 75+ }
 76+ }
 77+ // format:
 78+ // <num of hits>
 79+ // #suggest <query> or #no suggestion
 80+ // <score> <ns> <title> (resNum-times)
 81+ else if(res!=null && res.isSuccess()){
 82+ sendHeaders(200, "OK");
7183 sendOutputLine(Integer.toString(res.getNumHits()));
7284 if(res.getSuggest() != null)
7385 sendOutputLine("#suggest "+res.getSuggest());
@@ -122,4 +134,12 @@
123135 }
124136 }
125137
 138+ private void sendResultLine(String namespace, String title) {
 139+ try{
 140+ sendOutputLine(namespace + " " + URLEncoder.encode(title.replaceAll(" ", "_"), "UTF-8"));
 141+ } catch(Exception e){
 142+ log.error("Error sending prefix result line (" + namespace + " " + title +"): "+e.getMessage());
 143+ }
 144+ }
 145+
126146 }
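The new prefix branch streams one line per hit: the namespace, a space, and the title URL-encoded with spaces turned to underscores (see sendResultLine). A hypothetical response for a prefix query, with invented titles:

    0 Los_Angeles
    0 Los_Angeles_Lakers
    0 Los_Angeles_International_Airport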
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -51,7 +51,7 @@
5252 protected Set<String> stopWords;
5353
5454 /** Distance an metaphone metrics */
55 - static class Metric {
 55+ static public class Metric {
5656 protected DoubleMetaphone dmeta = new DoubleMetaphone();
5757 protected String meta1, meta2;
5858 protected EditDistance sd;
@@ -132,7 +132,7 @@
133133 this.type = type;
134134 }
135135 public String toString(){
136 - return "dist:"+dist+"-freq:"+freq+"-sub:"+substitutes+"-pres:"+preserves;
 136+ return "["+type+" dist:"+dist+" freq:"+freq+" sub:"+substitutes+" pres:"+preserves+"]";
137137 }
138138 }
139139
@@ -176,15 +176,23 @@
177177 ArrayList<Change> suggestionsTitle = new ArrayList<Change>();
178178
179179 // add correct words
180 - for(int i=0;i<tokens.size();i++){
 180+ /*for(int i=0;i<tokens.size();i++){
181181 Token t = tokens.get(i);
182182 if(correctWords.contains(t.termText())){
183183 Change c = new Change(0,1,Change.Type.TITLE_WORD);
184184 c.preserves.put(i,t.termText());
185185 suggestions.add(c);
186186 }
 187+ } */
 188+
 189+ // check for exact title match
 190+ if(tokens.size() == 1){
 191+ String w = tokens.get(0).termText();
 192+ if(correctWords.contains(w) && reader.docFreq(new Term("title",w)) != 0)
 193+ return null;
187194 }
188195
 196+ HashSet<String> stemmedCorrectWords = stemSet(correctWords,parser.getBuilder().getFilters());
189197 ArrayList<ArrayList<SuggestResult>> wordSug = new ArrayList<ArrayList<SuggestResult>>();
190198 HashSet<Integer> correctIndex = new HashSet<Integer>();
191199 ArrayList<SuggestResult> possibleStopWords = new ArrayList<SuggestResult>();
@@ -214,11 +222,7 @@
215223 if(w2 == null)
216224 continue;
217225
218 - String phrase = w+gap+w2;
219 - if(reader.docFreq(new Term("phrase",phrase)) != 0){
220 - correctPhrases.add(i);
221 - correctPhrases.add(i2);
222 - } else if(correctWords.contains(w) && correctWords.contains(w2)){
 226+ if(correctWords.contains(w) && correctWords.contains(w2)){
223227 for(HashSet<String> title : titles){
224228 if(title.contains(w) && title.contains(w2)){
225229 correctPhrases.add(i);
@@ -263,26 +267,18 @@
264268 }
265269 }
266270 possibleStopWords.add(maybeStopWord);
267 - // detect common misspells
268 - if(sug.size() > 1){
269 - SuggestResult r1 = sug.get(0);
270 - SuggestResult r2 = sug.get(1);
271 - if(r1.dist == 1 && r2.dist == 0 && r1.frequency > 100 * r2.frequency){
272 - Change c = new Change(r1.dist,r1.frequency,Change.Type.WORD);
273 - c.substitutes.put(i,r1.word);
274 - suggestions.add(c);
275 - }
276 - }
277271 } else{
278272 wordSug.add(null);
279273 possibleStopWords.add(null);
280274 }
281275 // suggest split
282 - SuggestResult split = suggestSplit(w,minFreq);
283 - if(split != null){
284 - Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT);
285 - sc.substitutes.put(i,split.word.replace("_"," "));
286 - suggestions.add(sc);
 276+ if(!correctWords.contains(w)){
 277+ SuggestResult split = suggestSplit(w,minFreq);
 278+ if(split != null){
 279+ Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT);
 280+ sc.substitutes.put(i,split.word.replace("_"," "));
 281+ suggestions.add(sc);
 282+ }
287283 }
288284 // suggest join
289285 if(i-1 >= 0
@@ -306,7 +302,8 @@
307303 ArrayList<SuggestResult> sug2 = null;
308304 String w2 = null;
309305 String gap = "_";
310 - boolean good1 = sug1.get(0).getDist() == 0; // w1 is spellchecked right
 306+ // if w1 is spellchecked right
 307+ boolean good1 = sug1.get(0).getDist() == 0;
311308 int i2 = i;
312309 boolean maybeStopWord = false; // the currecnt i2 might be a stop word, try to find phrases with it as stop word
313310 int distOffset = 0; // if we spellcheked to stop word, all phrases should have this initial dist
@@ -331,7 +328,8 @@
332329 }
333330 if(sug2 == null)
334331 continue;
335 - boolean good2 = sug2.get(0).getDist() == 0; // w2 is spellchecked right
 332+ // if second word is spelled right
 333+ boolean good2 = sug2.get(0).getDist() == 0;
336334 int maxdist = Math.min((w1.length() + w2.length()) / 3, 5);
337335 int mindist = -1;
338336 boolean forTitlesOnly = false;
@@ -358,21 +356,30 @@
359357 }
360358 //log.info("Checking "+phrase);
361359 if(freq > 0){
 360+ // number of characters added/substracted
 361+ int diff1 = Math.abs(s1.word.length()-w1.length());
 362+ int diff2 = Math.abs(s2.word.length()-w2.length());
362363 log.info("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle);
363364 int dist = s1.dist + s2.dist + distOffset;
364365 boolean accept = true;
365366 Change c = new Change(dist,freq,Change.Type.PHRASE);
366367 if(s1.word.equals(w1))
367368 c.preserves.put(i,w1);
368 - else if(!good1 || inTitle)
 369+ else if(!good1 || (inTitle && diff1 <= 2 && !correctWords.contains(w1)))
369370 c.substitutes.put(i,s1.word);
370 - else
 371+ else if(!good1 || (inTitle && diff1 <=2)){
 372+ forTitlesOnly = true;
 373+ c.substitutes.put(i,s1.word);
 374+ } else
371375 accept = false;
372376 if(s2.word.equals(w2))
373377 c.preserves.put(i2,w2);
374 - else if(!good2 || inTitle)
 378+ else if(!good2 || (inTitle && diff2 <= 2 && !correctWords.contains(w2)))
375379 c.substitutes.put(i2,s2.word);
376 - else
 380+ else if(!good2 || (inTitle && diff2 <= 2)){
 381+ forTitlesOnly = true;
 382+ c.substitutes.put(i2,s2.word);
 383+ } else
377384 accept = false;
378385 if(accept){
379386 if(mindist == -1)
@@ -384,10 +391,11 @@
385392 }
386393 }
387394 }
388 - } while(maybeStopWord);
 395+ } while(maybeStopWord && i2+1<tokens.size());
389396 }
390397 // try to construct a valid title by spell-checking all words
391398 if(suggestionsTitle.size() > 0){
 399+ log.info("Trying exact-title matches");
392400 Object[] ret = calculateChanges(suggestionsTitle,searchterm.length()/2);
393401 ArrayList<Entry<Integer,String>> proposedTitle = (ArrayList<Entry<Integer, String>>) ret[0];
394402 boolean madeChanges = false;
@@ -395,8 +403,10 @@
396404 String formated = searchterm;
397405 for(Entry<Integer,String> e : proposedTitle){
398406 Token t = tokens.get(e.getKey());
399 - String nt = e.getValue();
400 - if(!stemsToSame(t.termText(),nt,parser.getBuilder().getFilters())){
 407+ String nt = e.getValue();
 408+ // replace words if they don't stem to same word, of they stem to same, but the words is misspelled
 409+ boolean stemNotSame = stemNotSameOrInSet(t.termText(),nt,parser.getBuilder().getFilters(),stemmedCorrectWords);
 410+ if(stemNotSame || (!stemNotSame && reader.docFreq(new Term("word",t.termText())) == 0)){
401411 formated = markSuggestion(formated,t,nt);
402412 title = applySuggestion(title,t,nt);
403413 madeChanges = true;
@@ -412,6 +422,7 @@
413423 } else if(tokens.size() == 1 && wordSug.get(0)!=null
414424 && wordSug.get(0).size() > 0 && !correctWords.contains(tokens.get(0).termText())){
415425 // only one token, try different spell-checks for title
 426+ log.info("Trying exact-title single word match");
416427 ArrayList<SuggestResult> sg = (ArrayList<SuggestResult>) wordSug.get(0).clone();
417428 Collections.sort(sg,new SuggestResult.ComparatorNoCommonMisspell());
418429 Token t = tokens.get(0);
@@ -434,6 +445,7 @@
435446 ArrayList<Entry<Integer,String>> proposedChanges = new ArrayList<Entry<Integer,String>>();
436447 if(suggestions.size() > 0){
437448 // found some suggestions
 449+ log.info("Trying phrases ...");
438450 Object[] ret = calculateChanges(suggestions,searchterm.length()/2);
439451 proposedChanges = (ArrayList<Entry<Integer, String>>) ret[0];
440452 ArrayList<Entry<Integer,String>> preservedWords = (ArrayList<Entry<Integer, String>>) ret[1];
@@ -442,12 +454,13 @@
443455 for(Entry<Integer,String> e : proposedChanges)
444456 preserveTokens.add(e.getKey());
445457 }
446 -
 458+ log.info("Adding words, preserve tokens: "+preserveTokens+", preserve correct phrases: "+correctPhrases);
447459 // last resort: go with individual word suggestions
448460 HashMap<Integer,String> wordChanges = new HashMap<Integer,String>();
449 - for(int i=0;i<tokens.size();i++){
450 - if(preserveTokens.contains(i))
 461+ for(int i=0;i<tokens.size();i++){
 462+ if(preserveTokens.contains(i) || correctPhrases.contains(i))
451463 continue;
 464+ // TODO: maybe check for common misspells here?!
452465 ArrayList<SuggestResult> sug = wordSug.get(i);
453466 if(sug == null)
454467 continue;
@@ -457,7 +470,7 @@
458471 }
459472 if(wordChanges.size() != 0)
460473 proposedChanges.addAll(wordChanges.entrySet());
461 -
 474+
462475 // sort in reverse order from that in query, i.e. first change in the last term
463476 Collections.sort(proposedChanges,new Comparator<Entry<Integer,String>>() {
464477 public int compare(Entry<Integer,String> o1, Entry<Integer,String> o2){
@@ -471,7 +484,9 @@
472485 for(Entry<Integer,String> e : proposedChanges){
473486 Token t = tokens.get(e.getKey());
474487 String nt = e.getValue();
475 - if(!stemsToSame(t.termText(),nt,parser.getBuilder().getFilters())){
 488+ // incorrect words, or doesn't stem to same
 489+ boolean stemNotSame = stemNotSameOrInSet(t.termText(),nt,parser.getBuilder().getFilters(),stemmedCorrectWords);
 490+ if(stemNotSame || (!stemNotSame && reader.docFreq(new Term("word",t.termText())) == 0)){
476491 formated = markSuggestion(formated,t,nt);
477492 searchterm = applySuggestion(searchterm,t,nt);
478493 madeChanges = true;
@@ -484,15 +499,27 @@
485500 return null;
486501 }
487502
 503+ /** try to figure out the case of original spell-checked word, and output the new word in that case */
 504+ protected String simulateCase(String formated, Token t, String newWord) {
 505+ String old = formated.substring(t.startOffset(),t.endOffset());
 506+ if(old.equals(old.toLowerCase()))
 507+ return newWord.toLowerCase();
 508+ if(old.equals(old.toUpperCase()))
 509+ return newWord.toUpperCase();
 510+ if(old.length()>1 && old.equals(old.substring(0,1).toUpperCase()+old.substring(1)))
 511+ return newWord.substring(0,1).toUpperCase()+newWord.substring(1).toLowerCase();
 512+ return newWord;
 513+ }
 514+
488515 protected String markSuggestion(String formated, Token t, String newWord){
489516 return formated.substring(0,t.startOffset())
490 - + "<i>" + newWord + "</i>"
 517+ + "<i>" + simulateCase(formated,t,newWord) + "</i>"
491518 + formated.substring(t.endOffset());
492519 }
493520
494521 protected String applySuggestion(String searchterm, Token t, String newWord){
495522 return searchterm.substring(0,t.startOffset())
496 - + newWord
 523+ + simulateCase(searchterm,t,newWord)
497524 + searchterm.substring(t.endOffset());
498525 }
499526
@@ -575,7 +602,7 @@
576603 hr.addAll(r1); hr.addAll(r2);
577604 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
578605 res.addAll(hr);
579 - Collections.sort(res,new SuggestResult.Comparator());
 606+ Collections.sort(res,new SuggestResult.ComparatorNoCommonMisspell());
580607 return res;
581608 }
582609 return r1;
@@ -718,11 +745,46 @@
719746 if(t1 != null && t2 != null && t1.termText().equals(t2.termText()))
720747 return true;
721748 } catch (IOException e) {
722 - log.error("Cannot stemm words "+word1+", "+word2+" : "+e.getMessage());
 749+ log.error("Cannot stem words "+word1+", "+word2+" : "+e.getMessage());
723750 }
724751 return false;
725752 }
726753
 754+ /** check if stemmed newWord is 1) not same to stememed oldWord, OR 2) not in stemmed set*/
 755+ public boolean stemNotSameOrInSet(String oldWord, String newWord, FilterFactory filters, Set<String> stemmedSet){
 756+ if(!filters.hasStemmer())
 757+ return false;
 758+ ArrayList<String> in = new ArrayList<String>();
 759+ in.add(oldWord); in.add(newWord);
 760+ TokenStream ts = filters.makeStemmer(new StringsTokenStream(in));
 761+ try {
 762+ Token t1 = ts.next();
 763+ Token t2 = ts.next();
 764+ if(t1 != null && t2 != null && (t1.termText().equals(t2.termText()) && stemmedSet.contains(t2.termText())))
 765+ return false;
 766+ } catch (IOException e) {
 767+ log.error("Cannot stem words "+oldWord+", "+oldWord+" : "+e.getMessage());
 768+ }
 769+ return true;
 770+ }
 771+
 772+ /** stem all words in the set */
 773+ public HashSet<String> stemSet(HashSet<String> set, FilterFactory filters){
 774+ if(!filters.hasStemmer())
 775+ return new HashSet<String>();
 776+ HashSet<String> ret = new HashSet<String>();
 777+ TokenStream ts = filters.makeStemmer(new StringsTokenStream(set));
 778+ try {
 779+ Token t;
 780+ while((t = ts.next()) != null)
 781+ ret.add(t.termText());
 782+ return ret;
 783+ } catch (IOException e) {
 784+ log.error("Cannot stem set "+set+" : "+e.getMessage());
 785+ return new HashSet<String>();
 786+ }
 787+ }
 788+
727789 static class StringsTokenStream extends TokenStream {
728790 Iterator<String> input;
729791 int count = 0;
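The new simulateCase helper lets a suggestion inherit the casing of the word it replaces; the real method reads the old word out of the formatted query via token offsets. A standalone sketch of its three rules, with hypothetical inputs:

    // Hedged restatement of Suggest.simulateCase: preserve all-lower,
    // all-upper, or Capitalized casing of the original word.
    static String simulateCase(String old, String newWord) {
        if (old.equals(old.toLowerCase()))          // "episdoe" -> "episode"
            return newWord.toLowerCase();
        if (old.equals(old.toUpperCase()))          // "EPISDOE" -> "EPISODE"
            return newWord.toUpperCase();
        if (old.length() > 1 && old.equals(
                old.substring(0, 1).toUpperCase() + old.substring(1)))
            return newWord.substring(0, 1).toUpperCase()
                    + newWord.substring(1).toLowerCase();  // "Episdoe" -> "Episode"
        return newWord;                             // mixed case: keep as suggested
    }

This is why the "SOUTH PARK EPISDOE LIST" case in SpellCheckTest.java expects "SOUTH PARK EPISODE LIST" rather than a lowercased suggestion.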
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -62,7 +62,10 @@
6363 if(text.length()>=2){
6464 System.out.println("METAPHONES: "+dmeta.doubleMetaphone(text)+", "+dmeta.doubleMetaphone(text,true));
6565 System.out.println("SUGGEST: ");
 66+ int count = 0;
6667 for(SuggestResult r : sc.suggestWords(text,10)){
 68+ if(++count >= 10 )
 69+ break;
6770 System.out.println(r);
6871 }
6972
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java
@@ -40,23 +40,37 @@
4141 private int count = 0;
4242 private String field;
4343 private boolean first = true;
 44+ private String prefix = null;
 45+ private boolean silent = false; // no report output
4446
4547 public LuceneDictionary(IndexReader reader, String field) {
46 - try {
47 - this.field = field;
48 - termEnum = reader.terms(new Term(field, ""));
49 - } catch (IOException e) {
50 - throw new RuntimeException(e);
51 - }
 48+ this(reader,field,"");
5249 }
5350
 51+ public LuceneDictionary(IndexReader reader, String field, String prefix) {
 52+ if(!prefix.equals(""))
 53+ this.prefix = prefix;
 54+
 55+ try {
 56+ this.field = field;
 57+ termEnum = reader.terms(new Term(field, prefix));
 58+ } catch (IOException e) {
 59+ throw new RuntimeException(e);
 60+ }
 61+ }
 62+
 63+ /** Don't print progress */
 64+ public void setNoProgressReport(){
 65+ silent = true;
 66+ }
 67+
5468 public Word next() {
55 - if(++count % REPORT == 0){
 69+ if(!silent && ++count % REPORT == 0){
5670 System.out.println("Processed "+count+" terms");
5771 }
5872 try {
5973 while(true){
60 - if(first){
 74+ if(first && termEnum.term() != null){
6175 first = false;
6276 break;
6377 }
@@ -64,6 +78,8 @@
6579 return null;
6680 else if(!termEnum.term().field().equals(field))
6781 return null; // end of our field
 82+ else if(prefix != null && !termEnum.term().text().startsWith(prefix))
 83+ return null; // no longer same prefix
6884
6985 break;
7086 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -32,18 +32,18 @@
3333 import org.wikimedia.lsearch.index.WikiIndexModifier;
3434 import org.wikimedia.lsearch.search.IndexSearcherMul;
3535 import org.wikimedia.lsearch.search.WikiSearcher;
 36+import org.wikimedia.lsearch.spell.Suggest;
3637 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
3738 import org.wikimedia.lsearch.spell.dist.DoubleMetaphone;
3839 import org.wikimedia.lsearch.util.HighFreqTerms;
3940
4041 /**
41 - * Index words and phrases from article titles.
 42+ * Index words and phrases from articles.
4243 *
4344 * Fields:
4445 * * word - word from title
 46+ * * word_ngramN - word ngrams
4547 * * phrase - phrase like douglas_adams
46 - * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3)
47 - * * namespace - namespaces where the word/phrase is present
4848 *
4949 * @author rainman
5050 *
@@ -146,10 +146,9 @@
147147 addPhrase(w,freq,true);
148148 }
149149 }
150 - }
 150+ }
151151 ngramWriter.closeAndOptimize();
152 - ir.close();
153 -
 152+ ir.close();
154153 } catch (IOException e) {
155154 log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
156155 e.printStackTrace();
@@ -158,6 +157,24 @@
159158
160159 }
161160
 161+ /** Check if there are common mispellings of this phrase */
 162+ protected boolean checkCommonPhraseMisspell(String phrase, int freq, IndexReader ir, String field) {
 163+ LuceneDictionary d = new LuceneDictionary(ir,field,phrase.substring(0,1));
 164+ d.setNoProgressReport();
 165+ Suggest.Metric metric = new Suggest.Metric(phrase);
 166+ Word word;
 167+ while((word = d.next()) != null){
 168+ if(word.getFrequency() * 100 < freq && word.getWord().indexOf("_")!=-1 ){
 169+ String w = word.getWord();
 170+ if(metric.distance(w) == 1){
 171+ System.out.println("Detected common mispelling for "+w+" (correct: "+phrase+")");
 172+ return true;
 173+ }
 174+ }
 175+ }
 176+ return false;
 177+ }
 178+
162179 /**
163180 * Register a title in the index, without tokenization, just lowercase.
164181 *
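Restated, checkCommonPhraseMisspell scans all terms sharing the phrase's first character (via the prefix-bounded LuceneDictionary) and flags the phrase when some much rarer multi-word term sits at edit distance 1 from it. A compact predicate for a single candidate, under those assumptions:

    // Hedged restatement of the acceptance test inside the loop above:
    // w must itself be a phrase (contains '_'), be at least 100x rarer
    // than the checked phrase, and be at edit distance 1 from it.
    static boolean isCommonMisspell(Word w, int phraseFreq, Suggest.Metric phraseMetric) {
        return w.getWord().indexOf('_') != -1
                && w.getFrequency() * 100 < phraseFreq
                && phraseMetric.distance(w.getWord()) == 1;
    }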
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/LocalIndex.java
@@ -49,5 +49,9 @@
5050 this.timestamp = timestamp;
5151 }
5252
 53+ public String toString(){
 54+ return path+" at "+timestamp+" for "+iid;
 55+ }
5356
 57+
5458 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/ResultSet.java
@@ -10,6 +10,14 @@
1111 public String namespace;
1212 public String title;
1313 Explanation explanation;
 14+
 15+ public ResultSet(String key) {
 16+ int colon = key.indexOf(':');
 17+ this.score = 0;
 18+ this.namespace = key.substring(0,colon);
 19+ this.title = key.substring(colon+1);
 20+ this.explanation = null;
 21+ }
1422 public ResultSet(double score, String namespace, String title) {
1523 this.score = score;
1624 this.namespace = namespace;
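A small example of the new key-based constructor (key invented): prefix-index entries store "namespace:title" keys, which are split at the first colon, with the score fixed at 0.

    ResultSet rs = new ResultSet("0:Los Angeles");
    // rs.namespace -> "0", rs.title -> "Los Angeles", rs.score -> 0.0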
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Benchmark.java
@@ -107,10 +107,16 @@
108108 @SuppressWarnings("deprecation")
109109 protected int search(){
110110 String query = "";
111 - for(int i=0;i<words;i++){
112 - if(!query.equals(""))
113 - query += " OR ";
114 - query += terms.next();
 111+ if(verb.equals("prefix")){
 112+ int num = (int)(Math.random()*8);
 113+ String t = terms.next();
 114+ query = namespaceFilter+":"+t.substring(0,Math.min(num,t.length()));
 115+ } else{
 116+ for(int i=0;i<words;i++){
 117+ if(!query.equals(""))
 118+ query += " OR ";
 119+ query += terms.next();
 120+ }
115121 }
116122 String urlString;
117123 if(namespace.equals("")){
@@ -132,11 +138,13 @@
133139 new InputStreamReader(
134140 conn.getInputStream()));
135141 String inputLine;
136 - int resCount = -1;
 142+ int resCount = verb.equals("prefix")? 0 : -1;
137143
138144 while ((inputLine = in.readLine()) != null){
139145 if(resCount == -1)
140146 resCount = Integer.parseInt(inputLine);
 147+ if(verb.equals("prefix"))
 148+ resCount ++ ;
141149 }
142150 in.close();
143151
@@ -195,7 +203,7 @@
196204 } else if (args[i].equals("-c")) {
197205 runs = Integer.parseInt(args[++i]);
198206 } else if (args[i].equals("-v")) {
199 - database = args[++i];
 207+ verb = args[++i];
200208 } else if (args[i].equals("-wf")) {
201209 wordfile = args[++i];
202210 } else if (args[i].equals("-n") || args[i].equals("-ns")) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -0,0 +1,154 @@
 2+package org.wikimedia.lsearch.prefix;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collections;
 7+import java.util.Comparator;
 8+import java.util.HashMap;
 9+import java.util.Iterator;
 10+import java.util.Map.Entry;
 11+
 12+import org.apache.log4j.Logger;
 13+import org.apache.lucene.analysis.SimpleAnalyzer;
 14+import org.apache.lucene.document.Document;
 15+import org.apache.lucene.document.Field;
 16+import org.apache.lucene.index.IndexReader;
 17+import org.apache.lucene.index.IndexWriter;
 18+import org.apache.lucene.index.Term;
 19+import org.apache.lucene.index.TermDocs;
 20+import org.wikimedia.lsearch.analyzers.LowercaseAnalyzer;
 21+import org.wikimedia.lsearch.analyzers.PrefixAnalyzer;
 22+import org.wikimedia.lsearch.config.Configuration;
 23+import org.wikimedia.lsearch.config.IndexId;
 24+import org.wikimedia.lsearch.index.IndexThread;
 25+import org.wikimedia.lsearch.ranks.StringList;
 26+import org.wikimedia.lsearch.spell.api.LuceneDictionary;
 27+import org.wikimedia.lsearch.spell.api.Dictionary.Word;
 28+import org.wikimedia.lsearch.storage.ArticleAnalytics;
 29+import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
 30+
 31+/**
 32+ * Build an index of all title prefixes
 33+ *
 34+ * @author rainman
 35+ *
 36+ */
 37+public class PrefixIndexBuilder {
 38+ static Logger log = Logger.getLogger(PrefixIndexBuilder.class);
 39+
 40+ public static void main(String[] args) throws IOException{
 41+ final int PER_PREFIX = 10;
 42+ boolean usetemp = false;
 43+ String dbname = null;
 44+
 45+ Configuration.open();
 46+ if(args.length == 0){
 47+ System.out.println("Syntax: java PrefixIndexBuilder [-t] <dbname>");
 48+ return;
 49+ }
 50+ for(int i=0;i<args.length;i++){
 51+ if(args[i].equals("-t"))
 52+ usetemp = true;
 53+ else if(args[i].startsWith("-")){
 54+ System.out.println("Unrecognized option "+args[i]);
 55+ return;
 56+ } else
 57+ dbname = args[i];
 58+ }
 59+
 60+ IndexId iid = IndexId.get(dbname);
 61+ IndexId pre = iid.getPrefix();
 62+
 63+ long start = System.currentTimeMillis();
 64+
 65+ if(!usetemp){
 66+ IndexWriter writer = new IndexWriter(pre.getTempPath(),new PrefixAnalyzer(),true);
 67+ writer.setMergeFactor(20);
 68+ writer.setMaxBufferedDocs(500);
 69+ LinkAnalysisStorage st = new LinkAnalysisStorage(iid);
 70+ log.info("Writing temp index");
 71+ int count = 0;
 72+ Iterator<ArticleAnalytics> it = st.iterator();
 73+ while(it.hasNext()){
 74+ if(++count % 1000 == 0)
 75+ System.out.println("Processed "+count);
 76+ ArticleAnalytics aa = it.next();
 77+ String key = aa.getKey();
 78+ //String title = key.substring(key.indexOf(":")+1).toLowerCase();
 79+ String redirect = aa.getRedirectTarget();
 80+ if(redirect == null)
 81+ redirect = "";
 82+ int ref = aa.getReferences();
 83+ Document d = new Document();
 84+ d.add(new Field("key",key,Field.Store.YES,Field.Index.TOKENIZED));
 85+ d.add(new Field("redirect",redirect,Field.Store.YES,Field.Index.NO));
 86+ d.add(new Field("ref",Integer.toString(ref),Field.Store.YES,Field.Index.NO));
 87+ writer.addDocument(d);
 88+ }
 89+ log.info("Optimizing temp index");
 90+ writer.optimize();
 91+ writer.close();
 92+ }
 93+ log.info("Writing prefix index");
 94+ IndexWriter writer = new IndexWriter(pre.getImportPath(), new LowercaseAnalyzer(),true);
 95+ writer.setMergeFactor(20);
 96+ writer.setMaxBufferedDocs(1000);
 97+ IndexReader ir = IndexReader.open(pre.getTempPath());
 98+ LuceneDictionary dict = new LuceneDictionary(ir,"key");
 99+ Word w;
 100+ while((w = dict.next()) != null){
 101+ String prefix = w.getWord();
 102+ Term t = new Term("key",prefix);
 103+ if(ir.docFreq(t) < 2)
 104+ continue;
 105+ TermDocs td = ir.termDocs(t);
 106+ HashMap<String,Integer> refs = new HashMap<String,Integer>();
 107+ while(td.next()){
 108+ Document d = ir.document(td.doc());
 109+ refs.put(d.get("key"),Integer.parseInt(d.get("ref")));
 110+ }
 111+ ArrayList<Entry<String,Integer>> sorted = new ArrayList<Entry<String,Integer>>();
 112+ sorted.addAll(refs.entrySet());
 113+ Collections.sort(sorted,new Comparator<Entry<String,Integer>>() {
 114+ public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2){
 115+ return o2.getValue() - o1.getValue();
 116+ }
 117+ });
 118+ ArrayList<String> selected = new ArrayList<String>();
 119+ for(int i=0;i<PER_PREFIX && i<sorted.size();i++){
 120+ selected.add(sorted.get(i).getKey());
 121+ }
 122+ Document d = new Document();
 123+ d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.UN_TOKENIZED));
 124+ d.add(new Field("articles",new StringList(selected).toString(),Field.Store.YES,Field.Index.NO));
 125+ writer.addDocument(d);
 126+ }
 127+ log.info("Adding title keys ...");
 128+ int count = 0;
 129+ for(int i=0;i<ir.maxDoc();i++){
 130+ if(++count % 1000 == 0)
 131+ System.out.println("Added "+count);
 132+ if(ir.isDeleted(i))
 133+ continue;
 134+ Document d = new Document();
 135+ d.add(new Field("key",ir.document(i).get("key"),Field.Store.YES,Field.Index.TOKENIZED));
 136+ writer.addDocument(d);
 137+ }
 138+ ir.close();
 139+ log.info("Optimizing ...");
 140+ writer.optimize();
 141+ writer.close();
 142+
 143+ IndexThread.makeIndexSnapshot(pre,pre.getImportPath());
 144+ long delta = System.currentTimeMillis() - start;
 145+ System.out.println("Finished in "+formatTime(delta));
 146+ }
 147+
 148+ private static String formatTime(long l) {
 149+ l /= 1000;
 150+ if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s";
 151+ else if(l >= 60) return (l%3600)/60+"m "+(l%60)+"s";
 152+ else return l+"s";
 153+ }
 154+
 155+}
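The builder works in two passes. First (skipped with -t) it writes a temp index in which every lowercased "ns:title" key is expanded into all of its prefixes by PrefixAnalyzer, stored alongside its redirect target and link-analysis reference count. Second, for every prefix with at least two completions it stores the top ten keys by reference count as a StringList in an "articles" field, then re-adds each full title key (lowercased by LowercaseAnalyzer) so that unique prefixes can be resolved by the fallback in SearchEngine.prefixSearch. Invocation, per the syntax message above (classpath omitted):

    java org.wikimedia.lsearch.prefix.PrefixIndexBuilder enwiki      # full build
    java org.wikimedia.lsearch.prefix.PrefixIndexBuilder -t enwiki   # reuse temp index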
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java
@@ -125,7 +125,7 @@
126126 }
127127
128128 public class LinkAnalysisIterator implements Iterator<ArticleAnalytics>{
129 - int inx = 0, next = -1;
 129+ int inx = -1, next = -1;
130130 int maxdoc;
131131
132132 public LinkAnalysisIterator() throws IOException{
@@ -137,7 +137,7 @@
138138 if(inx >= maxdoc)
139139 return false;
140140 if(next == -1){
141 - for(next=inx;next<maxdoc;next++)
 141+ for(next=inx+1;next<maxdoc;next++)
142142 if(!reader.isDeleted(next))
143143 return true;
144144 return false;
@@ -152,6 +152,8 @@
153153 inx = next;
154154 next = -1;
155155 } else{
 156+ if(inx == -1)
 157+ inx = 0;
156158 for(;inx<maxdoc;inx++){
157159 if(!reader.isDeleted(inx))
158160 break;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -58,7 +58,7 @@
5959 /** If true, this machine is an indexer for this index */
6060 protected boolean myIndex;
6161
62 - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED };
 62+ protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS, RELATED, PREFIX };
6363
6464 /** Type of index, enumeration */
6565 protected IndexType type;
@@ -162,6 +162,8 @@
163163 this.type = IndexType.LINK_ANALYSIS;
164164 else if(type.equals("related"))
165165 this.type = IndexType.RELATED;
 166+ else if(type.equals("prefix"))
 167+ this.type = IndexType.PREFIX;
166168
167169 // parts
168170 String[] parts = dbrole.split("\\.");
@@ -265,6 +267,10 @@
266268 public boolean isRelated(){
267269 return type == IndexType.RELATED;
268270 }
 271+ /** If this is the index storing article list for specific prefixes */
 272+ public boolean isPrefix(){
 273+ return type == IndexType.PREFIX;
 274+ }
269275
270276 /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */
271277 public int getPartNum() {
@@ -412,7 +418,7 @@
413419
414420 /** get all hosts that search db this iid belongs to */
415421 public HashSet<String> getDBSearchHosts(){
416 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated())
 422+ if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
417423 return searchHosts;
418424 else{
419425 // add all hosts that search: dbname and all parts
@@ -463,7 +469,7 @@
464470 */
465471 public HashSet<String> getPhysicalIndexes() {
466472 HashSet<String> ret = new HashSet<String>();
467 - if(isSingle() || isSpell() || isLinkAnalysis() || isRelated())
 473+ if(isSingle() || isSpell() || isLinkAnalysis() || isRelated() || isPrefix())
468474 ret.add(dbrole);
469475 else if(isMainsplit() || isSplit() || isNssplit()){
470476 for(String p : splitParts)
@@ -549,6 +555,11 @@
550556 return get(dbname+".related");
551557 }
552558
 559+ /** Get the prefix index iid */
 560+ public IndexId getPrefix() {
 561+ return get(dbname+".prefix");
 562+ }
553563
 564+
554565
555566 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -232,7 +232,7 @@
233233 } else if(typeid.matches("nspart[1-9][0-9]*")){
234234 type = "nssplit";
235235 dbrole = dbname + "." + typeid;
236 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related")){
 236+ } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
237237 type = typeid;
238238 dbrole = dbname + "." + typeid;
239239 } else
@@ -519,7 +519,7 @@
520520 } else if(typeid.matches("nspart[1-9][0-9]*")){
521521 type = "nssplit";
522522 dbrole = dbname + "." + typeid;
523 - } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related")){
 523+ } else if(typeid.equals("spell") || typeid.equals("link_analysis") || typeid.equals("related") || typeid.equals("prefix")){
524524 type = typeid;
525525 dbrole = dbname + "." + typeid;
526526 } else
@@ -816,6 +816,12 @@
817817 System.out.println("Unrecognized suggest parameters in ("+role+")");
818818
819819 dbroles.put(type,params);
 820+ } else if(type.equals("prefix")){
 821+ // no params
 822+ if(tokens.length>1 && verbose)
 823+ System.out.println("Unrecognized prefix parameters in ("+role+")");
 824+
 825+ dbroles.put(type,params);
820826 } else{
821827 System.out.println("Warning: Unrecognized role \""+role+"\".Ignoring.");
822828 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -1,17 +1,22 @@
22 package org.wikimedia.lsearch.search;
33
44 import java.io.IOException;
 5+import java.io.Reader;
56 import java.net.URI;
67 import java.text.MessageFormat;
78 import java.util.ArrayList;
89 import java.util.HashMap;
910 import java.util.HashSet;
1011 import java.util.Hashtable;
 12+import java.util.Iterator;
1113
1214 import org.apache.log4j.Logger;
1315 import org.apache.lucene.analysis.Analyzer;
1416 import org.apache.lucene.document.Document;
1517 import org.apache.lucene.index.IndexReader;
 18+import org.apache.lucene.index.Term;
 19+import org.apache.lucene.index.TermDocs;
 20+import org.apache.lucene.index.TermEnum;
1621 import org.apache.lucene.queryParser.ParseException;
1722 import org.apache.lucene.search.Hits;
1823 import org.apache.lucene.search.Query;
@@ -31,6 +36,7 @@
3237 import org.wikimedia.lsearch.frontend.SearchDaemon;
3338 import org.wikimedia.lsearch.frontend.SearchServer;
3439 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 40+import org.wikimedia.lsearch.ranks.StringList;
3541 import org.wikimedia.lsearch.spell.Suggest;
3642 import org.wikimedia.lsearch.spell.SuggestQuery;
3743 import org.wikimedia.lsearch.util.QueryStringMap;
@@ -57,9 +63,7 @@
5864 /** Main search method, call this from the search frontend */
5965 public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) {
6066
61 - if (what.equals("titlematch")) {
62 - // TODO: return searchTitles(searchterm);
63 - } else if (what.equals("search") || what.equals("explain")) {
 67+ if (what.equals("search") || what.equals("explain")) {
6468 int offset = 0, limit = 100; boolean exactCase = false;
6569 if (query.containsKey("offset"))
6670 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
@@ -94,16 +98,57 @@
9599 exactCase = true;
96100 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
97101 return search(iid, searchterm, offset, limit, namespaces, what.equals("rawexplain"), exactCase, true);
 102+ } else if (what.equals("titlematch")) {
 103+ // TODO: return searchTitles(searchterm);
 104+ } else if (what.equals("prefix")){
 105+ return prefixSearch(iid, searchterm);
98106 } else {
99107 SearchResults res = new SearchResults();
100108 res.setErrorMsg("Unrecognized search type. Try one of: " +
101 - "search, explain, raw, rawexplain.");
 109+ "search, explain, raw, rawexplain, prefix.");
102110 log.warn("Unknown request type [" + what + "].");
103111 return res;
104112 }
105113 return null;
106114 }
107115
 116+ private SearchResults prefixSearch(IndexId iid, String searchterm) {
 117+ IndexId pre = iid.getPrefix();
 118+ SearcherCache cache = SearcherCache.getInstance();
 119+ SearchResults res = new SearchResults();
 120+ try {
 121+ long start = System.currentTimeMillis();
 122+ searchterm = searchterm.toLowerCase();
 123+ IndexSearcherMul searcher = cache.getLocalSearcher(pre);
 124+ IndexReader reader = searcher.getIndexReader();
 125+ TermDocs td = reader.termDocs(new Term("prefix",searchterm));
 126+ if(td.next()){
 127+ // found entry with a prefix, return
 128+ StringList sl = new StringList(reader.document(td.doc()).get("articles"));
 129+ Iterator<String> it = sl.iterator();
 130+ while(it.hasNext())
 131+ res.addResult(new ResultSet(it.next()));
 132+ //logRequest(pre,"prefix",searchterm,null,res.getNumHits(),start,searcher);
 133+ return res;
 134+ }
 135+ // check if it's an unique prefix
 136+ TermEnum te = reader.terms(new Term("key",searchterm));
 137+ String r = te.term().text();
 138+ if(r.startsWith(searchterm)){
 139+ TermDocs td1 = reader.termDocs(new Term("key",r));
 140+ if(td1.next()){
 141+ res.addResult(new ResultSet(reader.document(td1.doc()).get("key")));
 142+ //logRequest(pre,"prefix",searchterm,null,res.getNumHits(),start,searcher);
 143+ return res;
 144+ }
 145+ }
 146+ } catch (IOException e) {
 147+ // res.setErrorMsg("Internal error during prefix search: "+e.getMessage());
 148+ log.error("Internal error in SearchEngine::prefixSearch : "+e.getMessage());
 149+ }
 150+ return res;
 151+ }
 152+
108153 /** Search mainpart or restpart of the split index */
109154 public SearchResults searchPart(IndexId iid, String searchterm, Query q, NamespaceFilterWrapper filter, int offset, int limit, boolean explain){
110155 if( ! (iid.isMainsplit() || iid.isNssplit()))
@@ -390,6 +435,6 @@
391436 long delta = System.currentTimeMillis() - start;
392437 SearchServer.stats.add(true, delta, SearchDaemon.getOpenCount());
393438 log.info(MessageFormat.format("{0} {1}: query=[{2}] parsed=[{3}] hit=[{4}] in {5}ms using {6}",
394 - new Object[] {what, iid.toString(), searchterm, query.toString(), new Integer(numhits), new Long(delta), searcher.toString()}));
 439+ new Object[] {what, iid.toString(), searchterm, query==null? "" : query.toString(), new Integer(numhits), new Long(delta), searcher.toString()}));
395440 }
396441 }
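prefixSearch resolves a (lowercased) term in two steps against the prefix index built above. A hypothetical trace, with invented titles and the "namespace:prefix" query shape used by the Benchmark change:

    prefix "0:los a"             -> term found in the "prefix" field: the stored
                                    top-10 "articles" list is returned directly
    prefix "0:los angeles lake"  -> no stored entry; the TermEnum positioned at the
                                    first "key" term >= the search string yields
                                    "0:los angeles lakers", which starts with it,
                                    so that single title is returned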
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -40,7 +40,7 @@
4141 global = GlobalConfiguration.getInstance();
4242
4343 Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup");
44 - if(iid.isSpell()); // no warmup for spell-chekers
 44+ if(iid.isSpell() || iid.isPrefix()); // no warmup for spell-chekers and prefixes (for now)
4545 else if(warmup == null){
4646 makeNamespaceFilters(is,iid);
4747 simpleWarmup(is,iid);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -466,7 +466,15 @@
467467 p = makeRelated(doc,fields.related(),article,1);
468468
469469 // anchors
470 - makeKeywordField(doc,fields.anchor(),rankBoost);
 470+ // makeKeywordField(doc,fields.anchor(),rankBoost);
 471+
 472+ // add the whole title for extract boost
 473+ String wt = FastWikiTokenizerEngine.stipTitle(article.getTitle());
 474+ if(!bs.isExactCase())
 475+ wt = wt.toLowerCase();
 476+ Field wtitle = new Field(fields.wholetitle(),wt,Field.Store.NO, Field.Index.UN_TOKENIZED);
 477+ wtitle.setBoost(rankBoost);
 478+ doc.add(wtitle);
471479
472480 }
473481 // make analyzer
@@ -522,7 +530,7 @@
523531 if(ranks.get(i) == 0)
524532 break; // we don't want redirects with zero links
525533 //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i));
526 - Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.YES, Field.Index.TOKENIZED);
 534+ Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED);
527535 alttitle.setBoost(calculateArticleRank(ranks.get(i)));
528536 doc.add(alttitle);
529537 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -792,5 +792,50 @@
793793 return keywords;
794794 }
795795
796 -
 796+ /** Delete everything that is not being indexes, decompose chars */
 797+ public static String stipTitle(String title){
 798+ UnicodeDecomposer decomposer = UnicodeDecomposer.getInstance();
 799+ char[] str = title.toCharArray();
 800+ char[] buf = new char[256];
 801+ int len = 0;
 802+ for(int i=0;i<str.length;i++){
 803+ char ch = str[i];
 804+ if(ch == ':' || ch == '(' || ch == ')' || ch =='[' || ch == ']' || ch == '.' || ch == ','
 805+ || ch == ';' || ch == '"' || ch=='-' || ch=='+' || ch=='*' || ch=='!' || ch=='~' || ch=='$'
 806+ || ch == '%' || ch == '^' || ch == '&' || ch == '_' || ch=='=' || ch=='|' || ch=='\\'){
 807+ if(len > 0 && buf[len-1]!=' '){
 808+ if(len >= buf.length){ // extend buf
 809+ char[] n = new char[buf.length*2];
 810+ System.arraycopy(buf,0,n,0,buf.length);
 811+ buf = n;
 812+ }
 813+ buf[len++] = ' '; // replace the special char with space
 814+ }
 815+ } else{
 816+ char[] decomp = decomposer.decompose(ch);
 817+ if(decomp == null){
 818+ // no decomposition add char, but don't double spaces
 819+ if(ch!=' ' || (len>0 && buf[len-1]!=' ')){
 820+ if(len >= buf.length){
 821+ char[] n = new char[buf.length*2];
 822+ System.arraycopy(buf,0,n,0,buf.length);
 823+ buf = n;
 824+ }
 825+ buf[len++] = ch;
 826+ }
 827+ } else{
 828+ // add decomposed chars
 829+ for(int j = 0; j < decomp.length; j++){
 830+ if(len >= buf.length){
 831+ char[] n = new char[buf.length*2];
 832+ System.arraycopy(buf,0,n,0,buf.length);
 833+ buf = n;
 834+ }
 835+ buf[len++] = decomp[j];
 836+ }
 837+ }
 838+ }
 839+ }
 840+ return new String(buf,0,len);
 841+ }
797842 }
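A hypothetical call illustrating the new helper (the method is named stipTitle in the source): every listed punctuation character collapses to a single space, doubled spaces are avoided, and accented characters are decomposed via UnicodeDecomposer.

    String s = FastWikiTokenizerEngine.stipTitle("C++ (programming language)");
    // -> roughly "C programming language" (a trailing space can remain
    //    when the title ends in a stripped character)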
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PrefixAnalyzer.java
@@ -0,0 +1,37 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+
 7+import org.apache.lucene.analysis.Analyzer;
 8+import org.apache.lucene.analysis.Token;
 9+import org.apache.lucene.analysis.TokenStream;
 10+import org.apache.lucene.analysis.Tokenizer;
 11+
 12+public class PrefixAnalyzer extends Analyzer {
 13+ static public class PrefixTokenizer extends Tokenizer {
 14+ String in;
 15+ int count = 0;
 16+
 17+ public PrefixTokenizer(String input){
 18+ in = input;
 19+ }
 20+ @Override
 21+ public Token next() throws IOException {
 22+ count++;
 23+ if(count > in.length())
 24+ return null;
 25+ else
 26+ return new Token(in.substring(0,count),0,count);
 27+ }
 28+ }
 29+
 30+ public TokenStream tokenStream(String fieldName, String str) {
 31+ return new PrefixTokenizer(str.toLowerCase());
 32+ }
 33+
 34+ @Override
 35+ public TokenStream tokenStream(String fieldName, Reader reader) {
 36+ throw new UnsupportedOperationException("Use tokenStream(String,String)");
 37+ }
 38+}
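PrefixTokenizer emits every prefix of the lowercased input as a separate token, which is what makes each title searchable by any of its leading substrings in the temp index. A small demonstration (field name illustrative):

    TokenStream ts = new PrefixAnalyzer().tokenStream("key", "Los");
    Token t;
    while ((t = ts.next()) != null)
        System.out.println(t.termText());   // prints: l, lo, los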
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -86,15 +86,18 @@
8787 public static float ALT_TITLE_BOOST = 8;
8888 public static float ALT_TITLE_ALIAS_BOOST = 0.4f;
8989 public static float KEYWORD_BOOST = 0.02f;
 90+ public static float CONTENTS_BOOST = 0.2f;
9091
9192 public static int ADDITIONAL_PHRASE_SLOP_CONTENTS = 20;
92 - public static float ADDITIONAL_BOOST_CONTENTS = 1;
93 - public static int ADDITIONAL_PHRASE_SLOP_TITLE = 10;
94 - public static float ADDITIONAL_BOOST_TITLE = 2;
 93+ public static float ADDITIONAL_BOOST_CONTENTS = 0.5f;
 94+ public static int ADDITIONAL_PHRASE_SLOP_TITLE = 1;
 95+ public static float ADDITIONAL_BOOST_TITLE = 0.5f;
9596 public static int ADDITIONAL_PHRASE_SLOP_RELATED = 10;
96 - public static float ADDITIONAL_BOOST_RELATED = 1f;
 97+ public static float ADDITIONAL_BOOST_RELATED = 0.04f;
9798
98 - public static float ANCHOR_BOOST = 1f;
 99+ public static float WHOLE_TITLE_BOOST = 8f;
 100+ public static float EXACT_CONTENTS_BOOST = 1f;
 101+ public static float ANCHOR_BOOST = 0.02f;
99102
100103 public static boolean ADD_STEM_TITLE = true;
101104 public static boolean ADD_TITLE_PHRASES = true;
@@ -1070,7 +1073,7 @@
10711074 }
10721075
10731076 /** Extract all words from the query */
1074 - public ArrayList<String> extractPhrases(Query query){
 1077+ public ArrayList<String> extractWords(Query query){
10751078 ArrayList<String> list = new ArrayList<String>();
10761079 if(query == null)
10771080 return list;
@@ -1106,7 +1109,7 @@
11071110 else if(bcl.length == 1 && bcl[0].getOccur() != Occur.MUST_NOT)
11081111 addWords(list,bcl[0].getQuery());
11091112 else if(bcl.length == 2){
1110 - // TODO: this might brake in some complex queries! (with some parenthesis and transliterations...)
 1113+ // TODO: this might break in some complex queries! (with some parenthesis and transliterations...)
11111114 if(bcl[0].getOccur() == Occur.MUST && bcl[1].getOccur() == Occur.SHOULD)
11121115 // second is alias
11131116 addWords(list,bcl[0].getQuery());
@@ -1315,7 +1318,7 @@
13161319 defaultBoost = olfDefaultBoost;
13171320 defaultAliasBoost = ALIAS_BOOST;
13181321
1319 - ArrayList<String> words = extractPhrases(qt);
 1322+ ArrayList<String> words = extractWords(qt);
13201323
13211324 if(qt == qs) // either null, or category query
13221325 return new Object[] {qt,words};
@@ -1470,6 +1473,20 @@
14711474 return bq;
14721475 }
14731476 return null;
 1477+ }
 1478+
 1479+ /** Join a collection via a char/string */
 1480+ protected String join(Collection<String> col, String sep){
 1481+ StringBuffer sb = new StringBuffer();
 1482+ boolean first = true;
 1483+ for(String s : col){
 1484+ if(!first){
 1485+ sb.append(sep);
 1486+ } else
 1487+ first = false;
 1488+ sb.append(s);
 1489+ }
 1490+ return sb.toString();
14741491 }
14751492
14761493 /**
@@ -1485,7 +1502,7 @@
14861503 queryText = quoteCJK(queryText);
14871504 if(policy != null)
14881505 this.namespacePolicy = policy;
1489 - defaultBoost = 1;
 1506+ defaultBoost = CONTENTS_BOOST;
14901507 defaultAliasBoost = ALIAS_BOOST;
14911508 Query qc = parseRaw(queryText);
14921509 Object[] qtwords = makeTitleQuery(queryText);
@@ -1497,7 +1514,7 @@
14981515 if(qc.equals(qt))
14991516 return qc; // don't duplicate (probably a query for categories only)
15001517
1501 - BooleanQuery bq = new BooleanQuery();
 1518+ BooleanQuery bq = new BooleanQuery(true);
15021519 bq.add(qc,BooleanClause.Occur.SHOULD);
15031520 bq.add(qt,BooleanClause.Occur.SHOULD);
15041521
@@ -1522,9 +1539,14 @@
15231540 bq.add(qk,BooleanClause.Occur.SHOULD);
15241541 }
15251542
 1543+ // whole title
 1544+ Query wt = new TermQuery(new Term(fields.wholetitle(),join(words," ")));
 1545+ wt.setBoost(WHOLE_TITLE_BOOST);
 1546+ Query wc = makePhrase(words,fields.contents(),0);
 1547+ wc.setBoost(EXACT_CONTENTS_BOOST);
15261548 // add additional score queries!
1527 - Query pqc = makePhraseQueries(words,"contents",ADDITIONAL_PHRASE_SLOP_CONTENTS,ADDITIONAL_BOOST_CONTENTS);
1528 - Query pqt = makePhraseQueries(words,"stemtitle",ADDITIONAL_PHRASE_SLOP_TITLE,ADDITIONAL_BOOST_TITLE);
 1549+ Query pqc = makePhraseQueries(words,fields.contents(),ADDITIONAL_PHRASE_SLOP_CONTENTS,ADDITIONAL_BOOST_CONTENTS);
 1550+ Query pqt = makePhraseQueries(words,fields.stemtitle(),ADDITIONAL_PHRASE_SLOP_TITLE,ADDITIONAL_BOOST_TITLE);
15291551 // skip last related group
15301552 Query[] pqr = new Query[RelatedAnalyzer.RELATED_GROUPS-1];
15311553 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
@@ -1534,16 +1556,20 @@
15351557 for(int i=1;i<RelatedAnalyzer.RELATED_GROUPS;i++){
15361558 wqr[i-1] = makeWordQueries(words,"related"+i,ADDITIONAL_BOOST_RELATED / 4);
15371559 }
1538 - if(pqc == null && pqt == null && pqr[0] == null && wqr[0] == null)
 1560+ if(wt==null && pqc == null && pqt == null && pqr[0] == null && wqr[0] == null)
15391561 return bq;
15401562 // build the final query
15411563 BooleanQuery finalQuery = new BooleanQuery(true);
15421564 BooleanQuery additional = new BooleanQuery(true);
1543 -
 1565+
15441566 if(pqc != null)
15451567 additional.add(pqc,Occur.MUST);
15461568 if(pqt != null)
15471569 additional.add(pqt,Occur.SHOULD);
 1570+ if(wt != null)
 1571+ additional.add(wt,Occur.SHOULD);
 1572+ if(wc != null)
 1573+ additional.add(wc,Occur.SHOULD);
15481574 for(Query q : pqr){
15491575 if(q != null)
15501576 additional.add(q,Occur.SHOULD);
@@ -1554,12 +1580,12 @@
15551581 }
15561582
15571583 // anchors
1558 - Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST);
 1584+ //Query anchors = multiplySpans(nostem,0,fields.anchor(),ANCHOR_BOOST);
15591585
15601586 finalQuery.add(bq,Occur.MUST);
15611587 finalQuery.add(additional,Occur.SHOULD);
1562 - if(anchors != null)
1563 - finalQuery.add(anchors,Occur.SHOULD);
 1588+ //if(anchors != null)
 1589+ // finalQuery.add(anchors,Occur.SHOULD);
15641590
15651591 return finalQuery;
15661592
@@ -1617,8 +1643,6 @@
16181644 }
16191645 public void setBuilder(FieldBuilder.BuilderSet builder) {
16201646 this.builder = builder;
1621 - }
1622 -
 1647+ }
16231648
1624 -
16251649 }
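Besides the rebalanced boosts, the parser now adds a whole-title clause: the extracted words are joined with spaces by the new join helper and matched as one untokenized term in the wholetitle field. A sketch with an assumed two-word query:

    // For words = ["douglas", "adams"] in the lowercase field set:
    Query wt = new TermQuery(new Term("wholetitle", "douglas adams"));
    wt.setBoost(WikiQueryParser.WHOLE_TITLE_BOOST);  // 8f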
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java
@@ -66,6 +66,13 @@
6767 else
6868 return "anchor";
6969 }
 70+
 71+ public String wholetitle(){
 72+ if(exactCase)
 73+ return "wholetitle_exact";
 74+ else
 75+ return "wholetitle";
 76+ }
7077
7178
7279 public boolean isExactCase() {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java
@@ -0,0 +1,44 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+
 7+import org.apache.lucene.analysis.Analyzer;
 8+import org.apache.lucene.analysis.Token;
 9+import org.apache.lucene.analysis.TokenStream;
 10+/**
 11+ * Analyzer that just lowecases the text, doesn't split up anything, etc..
 12+ *
 13+ * @author rainman
 14+ *
 15+ */
 16+public class LowercaseAnalyzer extends Analyzer {
 17+ public static class LowercaseTokenizer extends TokenStream {
 18+ String text;
 19+ boolean sent = false;
 20+ LowercaseTokenizer(String in){
 21+ text = in.toLowerCase();
 22+ }
 23+ @Override
 24+ public Token next() throws IOException {
 25+ if(sent)
 26+ return null;
 27+ else{
 28+ sent = true;
 29+ return new Token(text,0,text.length());
 30+ }
 31+ }
 32+
 33+ }
 34+
 35+ @Override
 36+ public TokenStream tokenStream(String fieldName, String text) {
 37+ return new LowercaseTokenizer(text);
 38+ }
 39+ @Override
 40+ public TokenStream tokenStream(String fieldName, Reader reader) {
 41+ throw new UnsupportedOperationException("Use tokenStream(String,String)");
 42+ }
 43+
 44+
 45+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -232,7 +232,71 @@
233233 writer.addDocument(doc,an);
234234 state = State.MODIFIED_ARTICLES;
235235 }
 236+ public static HashSet<Character> separators = new HashSet<Character>();
 237+ static{
 238+ separators.add(' ');
 239+ separators.add('\r');
 240+ separators.add('\n');
 241+ separators.add('\t');
 242+ separators.add(':');
 243+ separators.add('(');
 244+ separators.add(')');
 245+ separators.add('[');
 246+ separators.add(']');
 247+ separators.add('.');
 248+ separators.add(',');
 249+ separators.add(':');
 250+ separators.add(';');
 251+ separators.add('"');
 252+ separators.add('+');
 253+ separators.add('*');
 254+ separators.add('!');
 255+ separators.add('~');
 256+ separators.add('$');
 257+ separators.add('%');
 258+ separators.add('^');
 259+ separators.add('&');
 260+ separators.add('_');
 261+ separators.add('=');
 262+ separators.add('|');
 263+ separators.add('\\');
 264+ }
236265
 266+ /**
 267+ * Find a sentance boundaries
 268+ *
 269+ * @param text - raw text
 270+ * @param start - start index to search from
 271+ * @param reverse - if true, will lookup in reverse
 272+ * @param max - radius of search (if no boundary is found return last wordbreak)
 273+ * @return
 274+ */
 275+ protected int findSentance(char[] text, int start, boolean reverse, int max){
 276+ int inc = (reverse)? -1 : 1;
 277+ int count = 0;
 278+ int wordbreak = start;
 279+ int i = start;
 280+ for(;i>0 && i<text.length;i+=inc){
 281+ char c = text[i];
 282+ if(c == '.')
 283+ return i;
 284+ else if(c == '*' && ((i>1 && text[i-1]=='\n') || i==0))
 285+ return i;
 286+ else if(separators.contains(c))
 287+ wordbreak = i;
 288+ if(count >= max)
 289+ return wordbreak; // more than max chars away, return the latest wordbreak
 290+ count ++;
 291+ }
 292+ return i;
 293+ }
 294+
 295+ /** Find surrounding for a link - extract sentances, list items .... */
 296+ protected String findContext(char[] text, int start, int end){
 297+ // TODO: implement
 298+ return null;
 299+ }
 300+
237301 /** Find the target key to title (ns:title) to which the links is pointing to
238302 * @throws IOException */
239303 protected String findTargetLink(int ns, String title) throws IOException{
Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -17,14 +17,14 @@
1818 wikidev : (single) (language,sr)
1919 wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[])
2020 wikilucene : (language,en) (warmup,10)
21 -wikilucene : (spell,3,1)
 21+wikilucene : (spell,3,1) (prefix)
2222
2323 # Search groups
2424 # Index parts of a split index are always taken from the node's group
2525 # host : db1.part db2.part
2626 # Mulitple hosts can search multiple dbs (N-N mapping)
2727 [Search-Group]
28 -oblak : wikilucene wikidev
 28+oblak : wikilucene wikidev wikilucene.prefix
2929
3030 # Index nodes
3131 # host: db1.part db2.part
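Enabling the feature for a database therefore takes the two edits mirrored above: declare the (prefix) role on the database line, and assign db.prefix to a search host. A hypothetical setup for another wiki:

    wikidev : (single) (language,sr) (prefix)
    oblak : wikidev wikidev.prefix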
