r24539 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r24538 | r24539 | r24540 >
Date: 12:35, 2 August 2007
Author: rainman
Status: old
Tags:
Comment:
Refactor:
* dropped PhraseIndexer, replaced it with TitleIndexer, which handles
words and phrases that appear in titles
* refactored the API to enable incremental updates, using NgramIndexer as
the base class
Add:
* untokenized interwiki field and interwiki analyzer (needs more work)
* raw search method
* suggest keyword in global settings
Suggestions still need to be refactored to make use of the words in the title index.
Modified paths:
  • /branches/lucene-search-2.1/build.xml (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/InterwikiAnalyzer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/CleanIndexImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/SuggestBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Dictionary.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Indexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/LuceneDictionary.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NamespaceFreq.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NgramIndexer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/PhraseIndexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitleIndexer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitlesIndexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/WordsIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /branches/lucene-search-2.1/test-data/mwsearch-global.test (modified) (history)

Diff

Index: branches/lucene-search-2.1/test-data/mwsearch-global.test
@@ -9,7 +9,7 @@
1010 # aspell <language>
1111 [Database]
1212 entest : (mainsplit), (mainpart,false,2,10), (restpart,true,2)
13 -entest : (ngram), (aspell,en)
 13+entest : (ngram), (suggest,1,2,3)
1414 detest,rutest : (single,true,2,10)
1515 frtest : (split,3) (part1) (part2) (part3)
1616 srwiki : (single)
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -190,6 +190,12 @@
191191 assertEquals("http://rs.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("rswikimedia"));
192192 assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki"));
193193
 194+ // test suggest tag
 195+ Hashtable<String,String> sug = testgc.getDBParams("entest","suggest");
 196+ assertEquals("1",sug.get("wordsMinFreq"));
 197+ assertEquals("2",sug.get("titlesWordsMinFreq"));
 198+ assertEquals("3",sug.get("titlesPhrasesMinFreq"));
 199+
194200 } catch (MalformedURLException e) {
195201 e.printStackTrace();
196202 } catch (IOException e) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java
@@ -0,0 +1,635 @@
 2+package org.wikimedia.lsearch.test;
 3+
 4+import java.io.BufferedReader;
 5+import java.io.IOException;
 6+import java.io.InputStreamReader;
 7+import java.util.ArrayList;
 8+
 9+import org.apache.lucene.index.IndexReader;
 10+import org.apache.lucene.search.spell.SpellChecker;
 11+import org.apache.lucene.store.FSDirectory;
 12+import org.wikimedia.lsearch.config.Configuration;
 13+import org.wikimedia.lsearch.config.IndexId;
 14+import org.wikimedia.lsearch.config.IndexRegistry;
 15+import org.wikimedia.lsearch.suggest.Suggest;
 16+import org.wikimedia.lsearch.suggest.SuggestResult;
 17+import org.wikimedia.lsearch.suggest.Suggest.SuggestSplit;
 18+
 19+public class SuggestTest {
 20+
 21+ public static void testSpellCheck(String dbname) throws IOException{
 22+ IndexId iid = IndexId.get(dbname);
 23+ SpellChecker sc = new SpellChecker(FSDirectory.getDirectory(iid.getSpellcheckPath(),false));
 24+ IndexReader ir = IndexReader.open(iid.getSuggestCleanPath());
 25+ int good=0;
 26+ int bad=0;
 27+ long start = System.currentTimeMillis();
 28+ for(String[] m : DATA){
 29+ String[] res = sc.suggestSimilar(m[0],20,ir,"contents",true);
 30+ if(res.length > 0 && m[1].equals(res[0]))
 31+ good++;
 32+ else{
 33+ reportBad(m[0],m[1],res.length>0? res[0] : "");
 34+ bad++;
 35+ }
 36+ }
 37+ int total = good + bad;
 38+ long delta = System.currentTimeMillis() - start;
 39+ System.out.println("SpellCheck test ("+delta+"ms): good: "+good+" ("+((double)good/total*100)+"%), bad: "+bad+", total="+total);
 40+ }
 41+
 42+ public static void testSuggest(String dbname) throws IOException{
 43+ IndexId iid = IndexId.get(dbname);
 44+ Suggest sc = new Suggest(iid);
 45+ int good=0;
 46+ int bad=0;
 47+ long start = System.currentTimeMillis();
 48+ for(String[] m : DATA){
 49+ ArrayList<SuggestResult> res = sc.suggestWords(m[0],5);
 50+ if(res.size() > 0){
 51+ SuggestResult r = res.get(0);
 52+ if(r.getWord().equals(m[1]))
 53+ good++;
 54+ else if(r.getWord().equals(m[0]) && res.size()>1 && res.get(1).getFrequency()>r.getFrequency()
 55+ && res.get(1).getWord().equals(m[1]))
 56+ good++;
 57+ else if(r.getDist() > 1){
 58+ ArrayList<SuggestSplit> split = sc.suggestSplitFromTitle(m[0]);
 59+ if(split.size()>0 && m[1].equals(split.get(0).getWord()))
 60+ good++;
 61+ else{
 62+ reportBad(m[0],m[1],r.getWord());
 63+ bad++;
 64+ }
 65+
 66+ }
 67+ else{
 68+ reportBad(m[0],m[1],r.getWord());
 69+ bad++;
 70+ }
 71+ } else{
 72+ reportBad(m[0],m[1],"");
 73+ bad++;
 74+ }
 75+ }
 76+ int total = good + bad;
 77+ long delta = System.currentTimeMillis() - start;
 78+ System.out.println("Suggest test ("+delta+"ms): good: "+good+" ("+((double)good/total*100)+"%), bad: "+bad+", total="+total);
 79+ }
 80+
 81+ public static void reportBad(String bad, String expected, String got){
 82+ System.out.println("FOR ["+bad+"] EXPECTED: ["+expected+"], BUT GOT ["+got+"]");
 83+ }
 84+
 85+ public static void main(String[] args) throws IOException{
 86+ Configuration.open();
 87+ String dbname = "wikilucene";
 88+ if(args.length==1)
 89+ dbname = args[0];
 90+
 91+ testSpellCheck(dbname);
 92+ testSuggest(dbname);
 93+ }
 94+
 95+
 96+ private static final String[][] DATA = { {
 97+ "abilitey", "ability" }, {
 98+ "abouy", "about" }, {
 99+ "absorbtion", "absorption" }, {
 100+ "accidently", "accidentally" }, {
 101+ "accomodate", "accommodate" }, {
 102+ "acommadate", "accommodate" }, {
 103+ "acord", "accord" }, {
 104+ "adultry", "adultery" }, {
 105+ "aggresive", "aggressive" }, {
 106+ "alchohol", "alcohol" }, {
 107+ "alchoholic", "alcoholic" }, {
 108+ "allieve", "alive" }, {
 109+ "alot", "a lot" }, {
 110+ "alright", "all right" }, {
 111+ "amature", "amateur" }, {
 112+ "ambivilant", "ambivalent" }, {
 113+ "amification", "amplification" }, {
 114+ "amourfous", "amorphous" }, {
 115+ "annoint", "anoint" }, {
 116+ "annonsment", "announcement" }, {
 117+ "annoyting", "anting" }, {
 118+ "annuncio", "announce" }, {
 119+ "anonomy", "anatomy" }, {
 120+ "anotomy", "anatomy" }, {
 121+ "antidesestablishmentarianism", "antidisestablishmentarianism" }, {
 122+ "antidisestablishmentarism", "antidisestablishmentarianism" }, {
 123+ "anynomous", "anonymous" }, {
 124+ "appelet", "applet" }, {
 125+ "appreceiated", "appreciated" }, {
 126+ "appresteate", "appreciate" }, {
 127+ "aquantance", "acquaintance" }, {
 128+ "aratictature", "architecture" }, {
 129+ "archeype", "archetype" }, {
 130+ "aricticure", "architecture" }, {
 131+ "artic", "arctic" }, {
 132+ "asentote", "asymptote" }, {
 133+ "ast", "at" }, {
 134+ "asterick", "asterisk" }, {
 135+ "asymetric", "asymmetric" }, {
 136+ "atentively", "attentively" }, {
 137+ "autoamlly", "automatically" }, {
 138+ "bankrot", "bankrupt" }, {
 139+ "basicly", "basically" }, {
 140+ "batallion", "battalion" }, {
 141+ "bbrose", "browse" }, {
 142+ "beauro", "bureau" }, {
 143+ "beaurocracy", "bureaucracy" }, {
 144+ "beggining", "beginning" }, {
 145+ "beging", "beginning" }, {
 146+ "behaviour", "behavior" }, {
 147+ "beleive", "believe" }, {
 148+ "belive", "believe" }, {
 149+ "benidifs", "benefits" }, {
 150+ "bigginging", "beginning" }, {
 151+ "blait", "bleat" }, {
 152+ "bouyant", "buoyant" }, {
 153+ "boygot", "boycott" }, {
 154+ "brocolli", "broccoli" }, {
 155+ "buch", "bush" }, {
 156+ "buder", "butter" }, {
 157+ "budr", "butter" }, {
 158+ "budter", "butter" }, {
 159+ "buracracy", "bureaucracy" }, {
 160+ "burracracy", "bureaucracy" }, {
 161+ "buton", "button" }, {
 162+ "byby", "by by" }, {
 163+ "cauler", "caller" }, {
 164+ "ceasar", "caesar" }, {
 165+ "cemetary", "cemetery" }, {
 166+ "changeing", "changing" }, {
 167+ "cheet", "cheat" }, {
 168+ "cicle", "circle" }, {
 169+ "cimplicity", "simplicity" }, {
 170+ "circumstaces", "circumstances" }, {
 171+ "clob", "club" }, {
 172+ "coaln", "colon" }, {
 173+ "cocamena", "cockamamie" }, {
 174+ "colleaque", "colleague" }, {
 175+ "colloquilism", "colloquialism" }, {
 176+ "columne", "column" }, {
 177+ "comiler", "compiler" }, {
 178+ "comitmment", "commitment" }, {
 179+ "comitte", "committee" }, {
 180+ "comittmen", "commitment" }, {
 181+ "comittmend", "commitment" }, {
 182+ "commerciasl", "commercials" }, {
 183+ "commited", "committed" }, {
 184+ "commitee", "committee" }, {
 185+ "companys", "companies" }, {
 186+ "compicated", "complicated" }, {
 187+ "comupter", "computer" }, {
 188+ "concensus", "consensus" }, {
 189+ "confusionism", "confucianism" }, {
 190+ "congradulations", "congratulations" }, {
 191+ "conibation", "contribution" }, {
 192+ "consident", "consistent" }, {
 193+ "consident", "consonant" }, {
 194+ "contast", "constant" }, {
 195+ "contastant", "constant" }, {
 196+ "contunie", "continue" }, {
 197+ "cooly", "coolly" }, {
 198+ "copping", "coping" }, {
 199+ "cosmoplyton", "cosmopolitan" }, {
 200+ "courst", "court" }, {
 201+ "crasy", "crazy" }, {
 202+ "cravets", "caveats" }, {
 203+ "credetability", "credibility" }, {
 204+ "criqitue", "critique" }, {
 205+ "croke", "croak" }, {
 206+ "crucifiction", "crucifixion" }, {
 207+ "crusifed", "crucified" }, {
 208+ "ctitique", "critique" }, {
 209+ "cumba", "combo" }, {
 210+ "custamisation", "customization" }, {
 211+ "dag", "dog" }, {
 212+ "daly", "daily" }, {
 213+ "danguages", "dangerous" }, {
 214+ "deaft", "draft" }, {
 215+ "defence", "defense" }, {
 216+ "defenly", "defiantly" }, {
 217+ "definate", "definite" }, {
 218+ "definately", "definitely" }, {
 219+ "dependeble", "dependable" }, {
 220+ "descrption", "description" }, {
 221+ "descrptn", "description" }, {
 222+ "desparate", "desperate" }, {
 223+ "dessicate", "desiccate" }, {
 224+ "destint", "distant" }, {
 225+ "develepment", "developments" }, {
 226+ "developement", "development" }, {
 227+ "develpond", "development" }, {
 228+ "devulge", "divulge" }, {
 229+ "diagree", "disagree" }, {
 230+ "dieties", "deities" }, {
 231+ "dinasaur", "dinosaur" }, {
 232+ "dinasour", "dinosaur" }, {
 233+ "direcyly", "directly" }, {
 234+ "discuess", "discuss" }, {
 235+ "disect", "dissect" }, {
 236+ "disippate", "dissipate" }, {
 237+ "disition", "decision" }, {
 238+ "dispair", "despair" }, {
 239+ "disssicion", "discussion" }, {
 240+ "distarct", "distract" }, {
 241+ "distart", "distort" }, {
 242+ "distroy", "destroy" }, {
 243+ "documtations", "documentation" }, {
 244+ "doenload", "download" }, {
 245+ "dongle", "dangle" }, {
 246+ "doog", "dog" }, {
 247+ "dramaticly", "dramatically" }, {
 248+ "drunkeness", "drunkenness" }, {
 249+ "ductioneery", "dictionary" }, {
 250+ "dur", "due" }, {
 251+ "duren", "during" }, {
 252+ "dymatic", "dynamic" }, {
 253+ "dynaic", "dynamic" }, {
 254+ "ecstacy", "ecstasy" }, {
 255+ "efficat", "efficient" }, {
 256+ "efficity", "efficacy" }, {
 257+ "effots", "efforts" }, {
 258+ "egsistence", "existence" }, {
 259+ "eitiology", "etiology" }, {
 260+ "elagent", "elegant" }, {
 261+ "elligit", "elegant" }, {
 262+ "embarass", "embarrass" }, {
 263+ "embarassment", "embarrassment" }, {
 264+ "embaress", "embarrass" }, {
 265+ "encapsualtion", "encapsulation" }, {
 266+ "encyclapidia", "encyclopedia" }, {
 267+ "encyclopia", "encyclopedia" }, {
 268+ "engins", "engine" }, {
 269+ "enhence", "enhance" }, {
 270+ "enligtment", "Enlightenment" }, {
 271+ "ennuui", "ennui" }, {
 272+ "enought", "enough" }, {
 273+ "enventions", "inventions" }, {
 274+ "envireminakl", "environmental" }, {
 275+ "enviroment", "environment" }, {
 276+ "epitomy", "epitome" }, {
 277+ "equire", "acquire" }, {
 278+ "errara", "error" }, {
 279+ "erro", "error" }, {
 280+ "evaualtion", "evaluation" }, {
 281+ "evething", "everything" }, {
 282+ "evtually", "eventually" }, {
 283+ "excede", "exceed" }, {
 284+ "excercise", "exercise" }, {
 285+ "excpt", "except" }, {
 286+ "excution", "execution" }, {
 287+ "exhileration", "exhilaration" }, {
 288+ "existance", "existence" }, {
 289+ "expleyly", "explicitly" }, {
 290+ "explity", "explicitly" }, {
 291+ "expresso", "espresso" }, {
 292+ "exspidient", "expedient" }, {
 293+ "extions", "extensions" }, {
 294+ "factontion", "factorization" }, {
 295+ "failer", "failure" }, {
 296+ "famdasy", "fantasy" }, {
 297+ "faver", "favor" }, {
 298+ "faxe", "fax" }, {
 299+ "febuary", "february" }, {
 300+ "firey", "fiery" }, {
 301+ "fistival", "festival" }, {
 302+ "flatterring", "flattering" }, {
 303+ "fluk", "flux" }, {
 304+ "flukse", "flux" }, {
 305+ "fone", "phone" }, {
 306+ "forsee", "foresee" }, {
 307+ "frustartaion", "frustrating" }, {
 308+ "fuction", "function" }, {
 309+ "funetik", "phonetic" }, {
 310+ "futs", "guts" }, {
 311+ "gamne", "came" }, {
 312+ "gaurd", "guard" }, {
 313+ "generly", "generally" }, {
 314+ "ghandi", "gandhi" }, {
 315+ "goberment", "government" }, {
 316+ "gobernement", "government" }, {
 317+ "gobernment", "government" }, {
 318+ "gotton", "gotten" }, {
 319+ "gracefull", "graceful" }, {
 320+ "gradualy", "gradually" }, {
 321+ "grammer", "grammar" }, {
 322+ "hallo", "hello" }, {
 323+ "hapily", "happily" }, {
 324+ "harrass", "harass" }, {
 325+ "havne", "have" }, {
 326+ "heellp", "help" }, {
 327+ "heighth", "height" }, {
 328+ "hellp", "help" }, {
 329+ "helo", "hello" }, {
 330+ "herlo", "hello" }, {
 331+ "hifin", "hyphen" }, {
 332+ "hifine", "hyphen" }, {
 333+ "higer", "higher" }, {
 334+ "hiphine", "hyphen" }, {
 335+ "hippie", "hippy" }, {
 336+ "hippopotamous", "hippopotamus" }, {
 337+ "hlp", "help" }, {
 338+ "hourse", "horse" }, {
 339+ "houssing", "housing" }, {
 340+ "howaver", "however" }, {
 341+ "howver", "however" }, {
 342+ "humaniti", "humanity" }, {
 343+ "hyfin", "hyphen" }, {
 344+ "hypotathes", "hypothesis" }, {
 345+ "hypotathese", "hypothesis" }, {
 346+ "hystrical", "hysterical" }, {
 347+ "ident", "indent" }, {
 348+ "illegitament", "illegitimate" }, {
 349+ "imbed", "embed" }, {
 350+ "imediaetly", "immediately" }, {
 351+ "imfamy", "infamy" }, {
 352+ "immenant", "immanent" }, {
 353+ "implemtes", "implements" }, {
 354+ "inadvertant", "inadvertent" }, {
 355+ "incase", "in case" }, {
 356+ "incedious", "insidious" }, {
 357+ "incompleet", "incomplete" }, {
 358+ "incomplot", "incomplete" }, {
 359+ "inconvenant", "inconvenient" }, {
 360+ "inconvience", "inconvenience" }, {
 361+ "independant", "independent" }, {
 362+ "independenent", "independent" }, {
 363+ "indepnends", "independent" }, {
 364+ "indepth", "in depth" }, {
 365+ "indispensible", "indispensable" }, {
 366+ "inefficite", "inefficient" }, {
 367+ "inerface", "interface" }, {
 368+ "infact", "in fact" }, {
 369+ "influencial", "influential" }, {
 370+ "inital", "initial" }, {
 371+ "initinized", "initialized" }, {
 372+ "initized", "initialized" }, {
 373+ "innoculate", "inoculate" }, {
 374+ "insistant", "insistent" }, {
 375+ "insistenet", "insistent" }, {
 376+ "instulation", "installation" }, {
 377+ "intealignt", "intelligent" }, {
 378+ "intejilent", "intelligent" }, {
 379+ "intelegent", "intelligent" }, {
 380+ "intelegnent", "intelligent" }, {
 381+ "intelejent", "intelligent" }, {
 382+ "inteligent", "intelligent" }, {
 383+ "intelignt", "intelligent" }, {
 384+ "intellagant", "intelligent" }, {
 385+ "intellegent", "intelligent" }, {
 386+ "intellegint", "intelligent" }, {
 387+ "intellgnt", "intelligent" }, {
 388+ "intensionality", "intensionally" }, {
 389+ "interate", "iterate" }, {
 390+ "internation", "international" }, {
 391+ "interpretate", "interpret" }, {
 392+ "interpretter", "interpreter" }, {
 393+ "intertes", "interested" }, {
 394+ "intertesd", "interested" }, {
 395+ "invermeantial", "environmental" }, {
 396+ "irregardless", "regardless" }, {
 397+ "irresistable", "irresistible" }, {
 398+ "irritible", "irritable" }, {
 399+ "islams", "muslims" }, {
 400+ "isotrop", "isotope" }, {
 401+ "isreal", "israel" }, {
 402+ "johhn", "john" }, {
 403+ "judgement", "judgment" }, {
 404+ "kippur", "kipper" }, {
 405+ "knawing", "knowing" }, {
 406+ "latext", "latest" }, {
 407+ "leasve", "leave" }, {
 408+ "lesure", "leisure" }, {
 409+ "liasion", "lesion" }, {
 410+ "liason", "liaison" }, {
 411+ "libary", "library" }, {
 412+ "likly", "likely" }, {
 413+ "lilometer", "kilometer" }, {
 414+ "liquify", "liquefy" }, {
 415+ "lloyer", "layer" }, {
 416+ "lossing", "losing" }, {
 417+ "luser", "laser" }, {
 418+ "maintanence", "maintenance" }, {
 419+ "majaerly", "majority" }, {
 420+ "majoraly", "majority" }, {
 421+ "maks", "masks" }, {
 422+ "mandelbrot", "Mandelbrot" }, {
 423+ "mant", "want" }, {
 424+ "marshall", "marshal" }, {
 425+ "maxium", "maximum" }, {
 426+ "meory", "memory" }, {
 427+ "metter", "better" }, {
 428+ "mic", "mike" }, {
 429+ "midia", "media" }, {
 430+ "millenium", "millennium" }, {
 431+ "miniscule", "minuscule" }, {
 432+ "minkay", "monkey" }, {
 433+ "minum", "minimum" }, {
 434+ "mischievious", "mischievous" }, {
 435+ "misilous", "miscellaneous" }, {
 436+ "momento", "memento" }, {
 437+ "monkay", "monkey" }, {
 438+ "mosaik", "mosaic" }, {
 439+ "mostlikely", "most likely" }, {
 440+ "mousr", "mouser" }, {
 441+ "mroe", "more" }, {
 442+ "neccessary", "necessary" }, {
 443+ "necesary", "necessary" }, {
 444+ "necesser", "necessary" }, {
 445+ "neice", "niece" }, {
 446+ "neighbour", "neighbor" }, {
 447+ "nemonic", "pneumonic" }, {
 448+ "nevade", "Nevada" }, {
 449+ "nickleodeon", "nickelodeon" }, {
 450+ "nieve", "naive" }, {
 451+ "noone", "no one" }, {
 452+ "noticably", "noticeably" }, {
 453+ "notin", "not in" }, {
 454+ "nozled", "nuzzled" }, {
 455+ "objectsion", "objects" }, {
 456+ "obsfuscate", "obfuscate" }, {
 457+ "ocassion", "occasion" }, {
 458+ "occuppied", "occupied" }, {
 459+ "occurence", "occurrence" }, {
 460+ "octagenarian", "octogenarian" }, {
 461+ "olf", "old" }, {
 462+ "opposim", "opossum" }, {
 463+ "organise", "organize" }, {
 464+ "organiz", "organize" }, {
 465+ "orientate", "orient" }, {
 466+ "oscilascope", "oscilloscope" }, {
 467+ "oving", "moving" }, {
 468+ "paramers", "parameters" }, {
 469+ "parametic", "parameter" }, {
 470+ "paranets", "parameters" }, {
 471+ "partrucal", "particular" }, {
 472+ "pataphysical", "metaphysical" }, {
 473+ "patten", "pattern" }, {
 474+ "permissable", "permissible" }, {
 475+ "permition", "permission" }, {
 476+ "permmasivie", "permissive" }, {
 477+ "perogative", "prerogative" }, {
 478+ "persue", "pursue" }, {
 479+ "phantasia", "fantasia" }, {
 480+ "phenominal", "phenomenal" }, {
 481+ "picaresque", "picturesque" }, {
 482+ "playwrite", "playwright" }, {
 483+ "poeses", "poesies" }, {
 484+ "polation", "politician" }, {
 485+ "poligamy", "polygamy" }, {
 486+ "politict", "politic" }, {
 487+ "pollice", "police" }, {
 488+ "polypropalene", "polypropylene" }, {
 489+ "pompom", "pompon" }, {
 490+ "possable", "possible" }, {
 491+ "practicle", "practical" }, {
 492+ "pragmaticism", "pragmatism" }, {
 493+ "preceeding", "preceding" }, {
 494+ "precion", "precision" }, {
 495+ "precios", "precision" }, {
 496+ "preemptory", "peremptory" }, {
 497+ "prefices", "prefixes" }, {
 498+ "prefixt", "prefixed" }, {
 499+ "presbyterian", "Presbyterian" }, {
 500+ "presue", "pursue" }, {
 501+ "presued", "pursued" }, {
 502+ "privielage", "privilege" }, {
 503+ "priviledge", "privilege" }, {
 504+ "proceedures", "procedures" }, {
 505+ "pronensiation", "pronunciation" }, {
 506+ "pronisation", "pronunciation" }, {
 507+ "pronounciation", "pronunciation" }, {
 508+ "properally", "properly" }, {
 509+ "proplematic", "problematic" }, {
 510+ "protray", "portray" }, {
 511+ "pscolgst", "psychologist" }, {
 512+ "psicolagest", "psychologist" }, {
 513+ "psycolagest", "psychologist" }, {
 514+ "quoz", "quiz" }, {
 515+ "radious", "radius" }, {
 516+ "ramplily", "rampantly" }, {
 517+ "reccomend", "recommend" }, {
 518+ "reccona", "raccoon" }, {
 519+ "recieve", "receive" }, {
 520+ "reconise", "recognize" }, {
 521+ "rectangeles", "rectangle" }, {
 522+ "redign", "redesign" }, {
 523+ "reoccurring", "recurring" }, {
 524+ "repitition", "repetition" }, {
 525+ "replasments", "replacement" }, {
 526+ "reposable", "responsible" }, {
 527+ "reseblence", "resemblance" }, {
 528+ "respct", "respect" }, {
 529+ "respecally", "respectfully" }, {
 530+ "roon", "room" }, {
 531+ "rought", "roughly" }, {
 532+ "rsx", "RSX" }, {
 533+ "rudemtry", "rudimentary" }, {
 534+ "runnung", "running" }, {
 535+ "sacreligious", "sacrilegious" }, {
 536+ "saftly", "safely" }, {
 537+ "salut", "salute" }, {
 538+ "satifly", "satisfy" }, {
 539+ "scrabdle", "scrabble" }, {
 540+ "searcheable", "searchable" }, {
 541+ "secion", "section" }, {
 542+ "seferal", "several" }, {
 543+ "segements", "segments" }, {
 544+ "sence", "sense" }, {
 545+ "seperate", "separate" }, {
 546+ "sherbert", "sherbet" }, {
 547+ "sicolagest", "psychologist" }, {
 548+ "sieze", "seize" }, {
 549+ "simpfilty", "simplicity" }, {
 550+ "simplye", "simply" }, {
 551+ "singal", "signal" }, {
 552+ "sitte", "site" }, {
 553+ "situration", "situation" }, {
 554+ "slyph", "sylph" }, {
 555+ "smil", "smile" }, {
 556+ "snuck", "sneaked" }, {
 557+ "sometmes", "sometimes" }, {
 558+ "soonec", "sonic" }, {
 559+ "specificialy", "specifically" }, {
 560+ "spel", "spell" }, {
 561+ "spoak", "spoke" }, {
 562+ "sponsered", "sponsored" }, {
 563+ "stering", "steering" }, {
 564+ "straightjacket", "straitjacket" }, {
 565+ "stumach", "stomach" }, {
 566+ "stutent", "student" }, {
 567+ "styleguide", "style guide" }, {
 568+ "subisitions", "substitutions" }, {
 569+ "subjecribed", "subscribed" }, {
 570+ "subpena", "subpoena" }, {
 571+ "substations", "substitutions" }, {
 572+ "suger", "sugar" }, {
 573+ "supercede", "supersede" }, {
 574+ "superfulous", "superfluous" }, {
 575+ "susan", "Susan" }, {
 576+ "swimwear", "swim wear" }, {
 577+ "syncorization", "synchronization" }, {
 578+ "taff", "tough" }, {
 579+ "taht", "that" }, {
 580+ "tattos", "tattoos" }, {
 581+ "techniquely", "technically" }, {
 582+ "teh", "the" }, {
 583+ "tem", "team" }, {
 584+ "teo", "two" }, {
 585+ "teridical", "theoretical" }, {
 586+ "tesst", "test" }, {
 587+ "tets", "tests" }, {
 588+ "thanot", "than or" }, {
 589+ "theirselves", "themselves" }, {
 590+ "theridically", "theoretical" }, {
 591+ "thredically", "theoretically" }, {
 592+ "thruout", "throughout" }, {
 593+ "ths", "this" }, {
 594+ "titalate", "titillate" }, {
 595+ "tobagan", "tobaggon" }, {
 596+ "tommorrow", "tomorrow" }, {
 597+ "tomorow", "tomorrow" }, {
 598+ "tradegy", "tragedy" }, {
 599+ "trubbel", "trouble" }, {
 600+ "ttest", "test" }, {
 601+ "tunnellike", "tunnel like" }, {
 602+ "tured", "turned" }, {
 603+ "tyrrany", "tyranny" }, {
 604+ "unatourral", "unnatural" }, {
 605+ "unaturral", "unnatural" }, {
 606+ "unconisitional", "unconstitutional" }, {
 607+ "unconscience", "unconscious" }, {
 608+ "underladder", "under ladder" }, {
 609+ "unentelegible", "unintelligible" }, {
 610+ "unfortunently", "unfortunately" }, {
 611+ "unnaturral", "unnatural" }, {
 612+ "upcast", "up cast" }, {
 613+ "upmost", "utmost" }, {
 614+ "uranisium", "uranium" }, {
 615+ "verison", "version" }, {
 616+ "vinagarette", "vinaigrette" }, {
 617+ "volumptuous", "voluptuous" }, {
 618+ "volunteerism", "voluntarism" }, {
 619+ "volye", "volley" }, {
 620+ "wadting", "wasting" }, {
 621+ "waite", "wait" }, {
 622+ "wan't", "won't" }, {
 623+ "warloord", "warlord" }, {
 624+ "whaaat", "what" }, {
 625+ "whard", "ward" }, {
 626+ "whimp", "wimp" }, {
 627+ "wicken", "weaken" }, {
 628+ "wierd", "weird" }, {
 629+ "wrank", "rank" }, {
 630+ "writeen", "righten" }, {
 631+ "writting", "writing" }, {
 632+ "wundeews", "windows" }, {
 633+ "yeild", "yield" }, {
 634+ "youe", "your" }
 635+};
 636+}
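
SuggestTest runs both the plain Lucene SpellChecker and the new Suggest class against the same misspelling/correction pairs, printing hit rates and timings for each. The database defaults to wikilucene and can be overridden by the first argument. An illustrative invocation (classpath setup omitted; the jar name is hypothetical):

    java -cp lucene-search.jar org.wikimedia.lsearch.test.SuggestTest entest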
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -36,7 +36,8 @@
3737 WikiQueryParser.TITLE_BOOST = 2;
3838 WikiQueryParser.ALT_TITLE_BOOST = 6;
3939 WikiQueryParser.KEYWORD_BOOST = 0.05f;
40 - WikiIndexModifier.ALT_TITLES = 3;
 40+ WikiQueryParser.ADD_TITLE_PHRASES = false;
 41+ WikiIndexModifier.ALT_TITLES = 3;
4142 FieldBuilder.BuilderSet bs = new FieldBuilder("").getBuilder();
4243 FieldNameFactory ff = new FieldNameFactory();
4344 try{
@@ -316,6 +317,12 @@
317318 q = parser.parseFourPass("Israeli-Palestinian conflict",NamespacePolicy.IGNORE,true);
318319 assertEquals("(+(+(contents:israeli contents:isra^0.5) +contents:palestinian) +contents:conflict) (+(+title:israeli^2.0 +title:palestinian^2.0) +title:conflict^2.0) ((+(+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0) +alttitle1:conflict^6.0) (+(+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0) +alttitle2:conflict^6.0) (+(+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0) +alttitle3:conflict^6.0))",q.toString());
319320
 321+ // title phrases
 322+ WikiQueryParser.ADD_TITLE_PHRASES = true;
 323+ q = parser.parseFourPass("Israeli Palestinian conflict",NamespacePolicy.IGNORE,true);
 324+ assertEquals("(+(contents:israeli contents:isra^0.5) +contents:palestinian +contents:conflict (title:\"israeli palestinian\"~2^2.0 title:\"palestinian conflict\"~2^2.0)) (+title:israeli^2.0 +title:palestinian^2.0 +title:conflict^2.0) ((+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0 +alttitle1:conflict^6.0) (+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0 +alttitle2:conflict^6.0) (+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0 +alttitle3:conflict^6.0)) (spanNear([keyword1:israeli, keyword1:palestinian, keyword1:conflict], 100, false)^0.05 spanNear([keyword2:israeli, keyword2:palestinian, keyword2:conflict], 100, false)^0.025 spanNear([keyword3:israeli, keyword3:palestinian, keyword3:conflict], 100, false)^0.016666668 spanNear([keyword4:israeli, keyword4:palestinian, keyword4:conflict], 100, false)^0.0125 spanNear([keyword5:israeli, keyword5:palestinian, keyword5:conflict], 100, false)^0.01)",q.toString());
 325+ WikiQueryParser.ADD_TITLE_PHRASES = false;
 326+
320327 // alternative transliterations
321328 q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true);
322329 assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) (contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString());
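
With ADD_TITLE_PHRASES enabled, the parser adds sloppy, boosted phrase clauses over adjacent title words, visible as title:"israeli palestinian"~2^2.0 in the expected query above. A minimal sketch of one such clause built directly with the Lucene API (illustrative only, not the parser's internal code path):

    // Sketch: a sloppy (~2), boosted (^2.0) phrase over two adjacent title words.
    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("title", "israeli"));
    pq.add(new Term("title", "palestinian"));
    pq.setSlop(2);     // renders as ~2
    pq.setBoost(2.0f); // renders as ^2.0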
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/Suggest.java
@@ -24,6 +24,8 @@
2525 import org.apache.lucene.search.TopDocs;
2626 import org.wikimedia.lsearch.beans.SearchResults;
2727 import org.wikimedia.lsearch.config.IndexId;
 28+import org.wikimedia.lsearch.suggest.api.NgramIndexer;
 29+import org.wikimedia.lsearch.suggest.api.NamespaceFreq;
2830 import org.wikimedia.lsearch.suggest.api.WordsIndexer;
2931 import org.wikimedia.lsearch.suggest.dist.DoubleMetaphone;
3032 import org.wikimedia.lsearch.suggest.dist.EditDistance;
@@ -44,7 +46,7 @@
4547 public Suggest(IndexId iid) throws IOException{
4648 this.iid = iid;
4749 this.searcher = new IndexSearcher(iid.getSuggestWordsPath());
48 - this.phrases = new IndexSearcher(iid.getSuggestPhrasesPath());
 50+ this.phrases = new IndexSearcher(iid.getSuggestTitlesPath());
4951 this.dmeta = new DoubleMetaphone();
5052 }
5153
@@ -54,7 +56,7 @@
5557 BooleanQuery bq = new BooleanQuery();
5658 addQuery(bq,"metaphone1",meta1,2);
5759 addQuery(bq,"metaphone2",meta2,2);
58 - bq.add(makeWordQuery(word),BooleanClause.Occur.SHOULD);
 60+ bq.add(makeWordQuery(word,""),BooleanClause.Occur.SHOULD);
5961
6062 try {
6163 TopDocs docs = searcher.search(bq,null,POOL);
@@ -126,13 +128,14 @@
127129 return Math.log10(1+score*99)/2;
128130 }
129131
130 - public Query makeWordQuery(String word){
 132+ public Query makeWordQuery(String word, String prefix){
131133 BooleanQuery bq = new BooleanQuery(true);
132 - int min = WordsIndexer.getMinNgram(word);
133 - int max = WordsIndexer.getMaxNgram(word);
 134+ int min = NgramIndexer.getMinNgram(word);
 135+ int max = NgramIndexer.getMaxNgram(word);
 136+ String fieldBase = NgramIndexer.getNgramField(prefix);
134137 for(int i=min; i <= max; i++ ){
135 - String[] ngrams = WordsIndexer.nGrams(word,i);
136 - String field = "ngram"+i;
 138+ String[] ngrams = NgramIndexer.nGrams(word,i);
 139+ String field = fieldBase+i;
137140 for(int j=0 ; j<ngrams.length ; j++){
138141 String ngram = ngrams[j];
139142 /*if(j == 0)
@@ -244,7 +247,7 @@
245248 try {
246249 Hits hits = phrases.search(new TermQuery(new Term("word",word1+word2)));
247250 if(hits.length() > 0){
248 - int freq = Integer.parseInt(hits.doc(0).get("freq"));
 251+ int freq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(0);
249252 if(freq >= JOIN_FREQ)
250253 return new SuggestResult(word1+word2,freq);
251254 }
@@ -257,10 +260,10 @@
258261
259262 public ArrayList<SuggestResult> suggestPhrase(String word1, String word2, int num){
260263 String phrase = word1+"_"+word2;
261 - Query q = makeWordQuery(phrase);
 264+ Query q = makeWordQuery(phrase,"phrase");
262265
263266 try {
264 - TopDocs docs = phrases.search(q,null,50);
 267+ TopDocs docs = phrases.search(q,null,200);
265268 EditDistance sd = new EditDistance(phrase);
266269 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
267270 int minfreq = -1;
@@ -268,7 +271,7 @@
269272 for(ScoreDoc sc : docs.scoreDocs){
270273 Document d = phrases.doc(sc.doc);
271274 SuggestResult r = new SuggestResult(d.get("phrase"),
272 - Integer.parseInt(d.get("freq")));
 275+ new NamespaceFreq(d.get("freq")).getFrequency(0));
273276 if(phrase.equals(r.word)){
274277 minfreq = r.frequency;
275278 }
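
The freq field no longer stores a plain integer: it now holds a serialized NamespaceFreq which, per the TitleIndexer javadoc below, encodes per-namespace counts as space-separated ns:frequency pairs such as "0:234 1:12 14:3"; getFrequency(0) then yields the main-namespace count. A hedged sketch of that decoding, assuming the documented format (the actual NamespaceFreq source is not part of this diff):

    // Sketch: decode "0:234 1:12 14:3" and return the count for one namespace.
    static int frequencyFor(String serialized, int namespace){
        for(String pair : serialized.split(" ")){
            String[] kv = pair.split(":");
            if(Integer.parseInt(kv[0]) == namespace)
                return Integer.parseInt(kv[1]);
        }
        return 0; // namespace absent
    }
    // frequencyFor("0:234 1:12 14:3", 0) == 234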
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/SuggestBuilder.java
@@ -3,6 +3,9 @@
44 import java.io.IOException;
55 import java.io.InputStream;
66 import java.util.ArrayList;
 7+import java.util.HashMap;
 8+import java.util.HashSet;
 9+import java.util.Hashtable;
710 import java.util.Map.Entry;
811
912 import org.apache.log4j.Logger;
@@ -10,23 +13,30 @@
1114 import org.apache.lucene.document.Document;
1215 import org.apache.lucene.index.IndexReader;
1316 import org.apache.lucene.index.Term;
 17+import org.apache.lucene.search.CachingWrapperFilter;
 18+import org.apache.lucene.search.Filter;
1419 import org.apache.lucene.search.Hits;
1520 import org.apache.lucene.search.IndexSearcher;
 21+import org.apache.lucene.search.PhraseQuery;
 22+import org.apache.lucene.search.QueryFilter;
1623 import org.apache.lucene.search.TermQuery;
1724 import org.apache.lucene.store.FSDirectory;
1825 import org.mediawiki.dumper.ProgressFilter;
1926 import org.mediawiki.dumper.Tools;
2027 import org.mediawiki.importer.XmlDumpReader;
2128 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 29+import org.wikimedia.lsearch.analyzers.WikiQueryParser;
2230 import org.wikimedia.lsearch.config.Configuration;
2331 import org.wikimedia.lsearch.config.GlobalConfiguration;
2432 import org.wikimedia.lsearch.config.IndexId;
2533 import org.wikimedia.lsearch.config.IndexRegistry;
2634 import org.wikimedia.lsearch.importer.DumpImporter;
 35+import org.wikimedia.lsearch.search.NamespaceFilter;
2736 import org.wikimedia.lsearch.suggest.api.LuceneDictionary;
28 -import org.wikimedia.lsearch.suggest.api.PhraseIndexer;
 37+import org.wikimedia.lsearch.suggest.api.NamespaceFreq;
 38+import org.wikimedia.lsearch.suggest.api.TitleIndexer;
2939 import org.wikimedia.lsearch.suggest.api.WordsIndexer;
30 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
 40+import org.wikimedia.lsearch.suggest.api.Dictionary.Word;
3141 import org.wikimedia.lsearch.util.Localization;
3242 import org.wikimedia.lsearch.util.StringCounter;
3343 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -55,8 +65,8 @@
5666 inputfile = args.length>1? args[1] : null;
5767 dbname = args[0];
5868
59 -
60 - String langCode = GlobalConfiguration.getInstance().getLanguage(dbname);
 69+ GlobalConfiguration global = GlobalConfiguration.getInstance();
 70+ String langCode = global.getLanguage(dbname);
6171 // preload
6272 UnicodeDecomposer.getInstance();
6373 Localization.readLocalization(langCode);
@@ -89,66 +99,37 @@
90100 }
91101 }
92102 // make words index
 103+ log.info("Making words index");
93104 try {
94105 LuceneDictionary dict = new LuceneDictionary(IndexReader.open(iid.getSuggestCleanPath()),"contents");
95 - WordsIndexer writer = new WordsIndexer(iid.getSuggestWordsPath(),50);
 106+ WordsIndexer writer = new WordsIndexer(iid.getSuggestWordsPath(),(dbname.equals("wikilucene")? 3 : 50));
 107+ writer.createIndex();
96108 Word word;
97109 while((word = dict.next()) != null){
98110 writer.addWord(word);
99111 }
100 - writer.close();
 112+ writer.closeAndOptimze();
101113 } catch (IOException e) {
102114 log.fatal("Cannot open clean dictionary for "+iid+" : "+e.getMessage());
103115 e.printStackTrace();
104116 return;
105117 }
106118
 119+ log.info("Making suggest title index");
107120 // make phrase index
108 - try {
109 - LuceneDictionary dict = new LuceneDictionary(IndexReader.open(iid.getSuggestCleanPath()),"title");
110 - PhraseIndexer writer = new PhraseIndexer(iid.getSuggestPhrasesPath(),1);
111 - IndexSearcher searcher = new IndexSearcher(iid.getSuggestCleanPath());
112 - Word word;
113 - while((word = dict.next()) != null){
114 - // index word
115 - writer.addWord(word);
116 - String w = word.getWord();
117 - StringCounter counter = new StringCounter();
118 - Hits hits = searcher.search(new TermQuery(new Term("title",w)));
119 - // find all phrases beginning with word
120 - for(int i=0;i<hits.length();i++){
121 - Document doc = hits.doc(i);
122 - // get original tokens
123 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false);
124 - ArrayList<Token> tokens = parser.parse();
125 - for(int j=0;j<tokens.size()-1;j++){
126 - Token t = tokens.get(j);
127 - // ignore aliases
128 - if(t.getPositionIncrement() == 0)
129 - continue;
130 - // find phrases beginning with the target word
131 - if(w.equals(t.termText())){
132 - counter.count(t.termText()+"_"+tokens.get(j+1).termText());
133 - }
134 - }
135 - }
136 - // index phrases
137 - for(Entry<String,Count> e : counter.getSet()){
138 - writer.addPhrase(e.getKey(),e.getValue().num);
139 - }
140 -
141 - }
142 - writer.close();
143 - } catch (IOException e) {
144 - log.fatal("Cannot open clean dictionary for "+iid+" : "+e.getMessage());
145 - e.printStackTrace();
146 - return;
147 - }
 121+ Hashtable<String,String> suggest = global.getDBParams(iid.getDBname(),"suggest");
 122+ int titlesWordsMinFreq = 3;
 123+ int titlesPhrasesMinFreq = 1;
 124+ if(suggest!=null && suggest.containsKey("titlesWordsMinFreq"))
 125+ titlesWordsMinFreq = Integer.parseInt(suggest.get("titlesWordsMinFreq"));
 126+ if(suggest!=null && suggest.containsKey("titlesPhrasesMinFreq"))
 127+ titlesWordsMinFreq = Integer.parseInt(suggest.get("titlesPhrasesMinFreq"));
 128+ TitleIndexer tInx = new TitleIndexer(iid,titlesWordsMinFreq,titlesPhrasesMinFreq);
 129+ tInx.createFromExistingIndex(iid);
148130
149 -
150131 long end = System.currentTimeMillis();
151132
152 - System.out.println("Finished making spell-check index in "+formatTime(end-start));
 133+ System.out.println("Finished making suggest index in "+formatTime(end-start));
153134 }
154135
155136 private static String formatTime(long l) {
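
After the refactor, SuggestBuilder builds the words index itself and hands the title words/phrases over to TitleIndexer.createFromExistingIndex, reading the minimum frequencies from the suggest tag (defaults 3 and 1). It takes the dbname and an optional dump file as arguments; an illustrative run (classpath omitted, file name hypothetical):

    java org.wikimedia.lsearch.suggest.SuggestBuilder entest pages.xml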
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Indexer.java
@@ -1,109 +0,0 @@
2 -package org.wikimedia.lsearch.suggest.api;
3 -
4 -import java.io.IOException;
5 -
6 -import org.apache.log4j.Logger;
7 -import org.apache.lucene.analysis.Analyzer;
8 -import org.apache.lucene.document.Document;
9 -import org.apache.lucene.document.Field;
10 -import org.apache.lucene.index.IndexWriter;
11 -import org.wikimedia.lsearch.index.WikiIndexModifier;
12 -
13 -/**
14 - * Base indexer class. Open/close index.
15 - *
16 - * @author rainman
17 - *
18 - */
19 -public class Indexer {
20 - Logger log = Logger.getLogger(Indexer.class);
21 - protected String path;
22 - protected Analyzer analyzer;
23 - protected IndexWriter writer;
24 -
25 - public Indexer(String path, Analyzer analyzer) throws IOException{
26 - this.path = path;
27 - this.analyzer = analyzer;
28 - try {
29 - writer = new IndexWriter(path,analyzer,true); // always make new index
30 - } catch (IOException e) {
31 - try {
32 - log.info("Making new index at path "+path);
33 - // try to make brand new index
34 - WikiIndexModifier.makeDBPath(path); // ensure all directories are made
35 - writer = new IndexWriter(path,analyzer,true);
36 - } catch (IOException e1) {
37 - log.error("I/O error openning index for addition of documents at "+path+" : "+e.getMessage());
38 - throw e1;
39 - }
40 - }
41 - writer.setMergeFactor(20);
42 - writer.setMaxBufferedDocs(500);
43 - writer.setUseCompoundFile(true);
44 - writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
45 -
46 - }
47 -
48 - /** Optimize and close index, always call when done indexing */
49 - public void close() throws IOException {
50 - try{
51 - writer.optimize();
52 - writer.close();
53 - } catch(IOException e){
54 - log.warn("I/O error optimizing/closing index at "+path);
55 - throw e;
56 - }
57 - }
58 -
59 - /** Return ngrams of specific size for text */
60 - public static String[] nGrams(String text, int size) {
61 - int len = text.length();
62 - String[] res = new String[len - size + 1];
63 - for (int i = 0; i < len - size + 1; i++) {
64 - res[i] = text.substring(i, i + size);
65 - }
66 - return res;
67 - }
68 -
69 - /** Get minimal ngram size for word. Short words (<=3 chars) will have 1-grams, other 2-grams */
70 - public static int getMinNgram(String word){
71 - if(word.length() <= 3)
72 - return 1;
73 - else if(word.length() == 4)
74 - return 2;
75 - else
76 - return 3;
77 - }
78 - /** Get minimal ngram size for word. Long words: 4-grams, other 3-grams, 2-char word only 1-grams */
79 - public static int getMaxNgram(String word){
80 - if(word.length() > 4)
81 - return 3;
82 - if(word.length() == 2)
83 - return 1;
84 - return 2;
85 - }
86 -
87 - /**
88 - * Add ngrams of all sizes from 1 to word.length to document
89 - *
90 - * @param doc - document to add fields to
91 - * @param word - word
92 - */
93 - protected void addNgramFields(Document doc, String word) {
94 - int min = getMinNgram(word);
95 - int max = getMaxNgram(word);
96 - for(int i=min ; i <= max ; i++ ){
97 - String[] ngrams = nGrams(word,i);
98 - String field = "ngram"+i;
99 - for(int j=0 ; j<ngrams.length ; j++){
100 - String ngram = ngrams[j];
101 - if(j == 0)
102 - doc.add(new Field("start"+i, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
103 - else if(j == ngrams.length-1)
104 - doc.add(new Field("end"+i, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
105 - // finally add regular ngram
106 - doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
107 - }
108 - }
109 - }
110 -}
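
For reference, the deleted sizing logic meant: 2-letter words get only 1-grams, 3-letter words 1- and 2-grams, 4-letter words only 2-grams, and longer words only 3-grams. A worked example of the methods above:

    // getMinNgram("house") == 3, getMaxNgram("house") == 3
    // nGrams("house", 3) -> {"hou", "ous", "use"}
    // addNgramFields stores start3:"hou", end3:"use" and ngram3:"hou","ous","use"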
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/PhraseIndexer.java
@@ -1,82 +0,0 @@
2 -package org.wikimedia.lsearch.suggest.api;
3 -
4 -import java.io.IOException;
5 -
6 -import org.apache.lucene.analysis.SimpleAnalyzer;
7 -import org.apache.lucene.document.Document;
8 -import org.apache.lucene.document.Field;
9 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
10 -
11 -/**
12 - * Class to build an index of phrases. It indexes:
13 - * 1) sets of two words as douglas_adams
14 - * 2) individual words
15 - *
16 - * 1) is useful for content-dependant suggestions and
17 - * suggesting splits (splitting one word into two), while
18 - * 2) is useful for suggesting joins
19 - *
20 - * @author rainman
21 - *
22 - */
23 -public class PhraseIndexer extends Indexer {
24 - int minFreq;
25 -
26 - public PhraseIndexer(String path, int minFreq) throws IOException{
27 - super(path,new SimpleAnalyzer());
28 - this.minFreq = minFreq;
29 - }
30 -
31 - /** Add phrase, convenient for suggesting splits and context-dependend suggestions */
32 - public void addPhrase(String word1, String word2, int frequency){
33 - addPhrase(word1+"_"+word2,frequency);
34 - }
35 - /** Add phrase, join two words by underscore */
36 - public void addPhrase(String phrase, int frequency){
37 - if(frequency < minFreq)
38 - return;
39 - Document doc = new Document();
40 - addNgramFields(doc,phrase);
41 - doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
42 - doc.add(new Field("freq",Integer.toString(frequency), Field.Store.YES, Field.Index.UN_TOKENIZED));
43 -
44 - try {
45 - writer.addDocument(doc);
46 - } catch (Exception e) {
47 - log.error("Cannot add document "+doc);
48 - e.printStackTrace();
49 - }
50 - }
51 -
52 - /** Add ordinary word to the index, convenient for suggesting joins */
53 - public void addWord(Word word){
54 - Document doc = new Document();
55 - doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED));
56 - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.UN_TOKENIZED));
57 -
58 - try {
59 - writer.addDocument(doc);
60 - } catch (Exception e) {
61 - log.error("Cannot add document "+doc);
62 - e.printStackTrace();
63 - }
64 - }
65 -
66 - /** Get minimal ngram size for word. Short words (<=3 chars) will have 1-grams, other 2-grams */
67 - public static int getMinNgram(String word){
68 - if(word.length() == 2)
69 - return 1;
70 - if(word.length() <= 6)
71 - return word.length() - 2;
72 - else
73 - return 5;
74 - }
75 -
76 - /** Get minimal ngram size for word. Long words: 4-grams, other 3-grams, 2-char word only 1-grams */
77 - public static int getMaxNgram(String word){
78 - if(word.length() == 2)
79 - return 1;
80 - else
81 - return getMinNgram(word) + 4;
82 - }
83 -}
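
As the Suggest code above uses it, the word entries served join suggestions (look up "base"+"ball" as the single word "baseball" and accept it if its frequency reaches JOIN_FREQ), while the phrase entries served splits (match "alot" against the ngram fields of phrases like "a_lot"). A worked example of its phrase ngram sizing:

    // getMinNgram("douglas_adams") == 5 (length 13 > 6), getMaxNgram == 9 (min + 4)
    // so the phrase is indexed with 5-grams through 9-grams, e.g. "dougl" ... "douglas_a"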
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitlesIndexer.java
@@ -1,31 +0,0 @@
2 -package org.wikimedia.lsearch.suggest.api;
3 -
4 -import java.io.IOException;
5 -
6 -import org.apache.log4j.Logger;
7 -import org.apache.lucene.analysis.Analyzer;
8 -import org.apache.lucene.document.Document;
9 -import org.apache.lucene.document.Field;
10 -import org.apache.lucene.document.Field.Index;
11 -import org.apache.lucene.document.Field.Store;
12 -
13 -public class TitlesIndexer extends Indexer {
14 - static Logger log = Logger.getLogger(TitlesIndexer.class);
15 -
16 - public TitlesIndexer(String path, Analyzer analyzer) throws IOException{
17 - super(path,analyzer);
18 - }
19 -
20 - public void addTitle(int ns, String title){
21 - Document doc = new Document();
22 - doc.add(new Field("title",title,Store.YES,Index.TOKENIZED));
23 - doc.add(new Field("namespace",Integer.toString(ns),Store.YES,Index.UN_TOKENIZED));
24 - try {
25 - writer.addDocument(doc);
26 - } catch (IOException e) {
27 - log.error("Cannot add document "+doc);
28 - e.printStackTrace();
29 - }
30 - }
31 -
32 -}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/WordsIndexer.java
@@ -6,10 +6,7 @@
77 import org.apache.lucene.analysis.SimpleAnalyzer;
88 import org.apache.lucene.document.Document;
99 import org.apache.lucene.document.Field;
10 -import org.apache.lucene.index.IndexWriter;
11 -import org.wikimedia.lsearch.config.GlobalConfiguration;
12 -import org.wikimedia.lsearch.config.IndexId;
13 -import org.wikimedia.lsearch.index.WikiIndexModifier;
 10+import org.wikimedia.lsearch.suggest.api.Dictionary.Word;
1411 import org.wikimedia.lsearch.suggest.dist.DoubleMetaphone;
1512
1613 /**
@@ -20,59 +17,42 @@
2118 * @author rainman
2219 *
2320 */
24 -public class WordsIndexer extends Indexer {
25 - public static class Word {
26 - protected String word;
27 - protected int frequency;
28 - public Word(String word, int frequency) {
29 - super();
30 - this.word = word;
31 - this.frequency = frequency;
32 - }
33 - public int getFrequency() {
34 - return frequency;
35 - }
36 - public void setFrequency(int frequency) {
37 - this.frequency = frequency;
38 - }
39 - public String getWord() {
40 - return word;
41 - }
42 - public void setWord(String word) {
43 - this.word = word;
44 - }
45 - public String toString(){
46 - return word+" : "+frequency;
47 - }
48 -
49 - }
 21+public class WordsIndexer {
5022 static Logger log = Logger.getLogger(WordsIndexer.class);
51 - DoubleMetaphone dmeta;
 23+ protected DoubleMetaphone dmeta;
5224 /** If word occurs less that minFreq times, it will be discarded */
5325 protected int minFreq;
 26+ protected NgramIndexer indexer;
 27+ String path;
5428
5529 public WordsIndexer(String path, int minFreq) throws IOException {
56 - super(path,new SimpleAnalyzer());
 30+ this.path = path;
5731 this.minFreq = minFreq;
5832 this.dmeta = new DoubleMetaphone();
 33+ this.indexer = new NgramIndexer();
5934 }
 35+
 36+ public void createIndex() throws IOException{
 37+ indexer.createIndex(path, new SimpleAnalyzer());
 38+ }
6039
61 - /** Add word to the index */
 40+ /** Add word to the index, make sure index is open */
6241 public void addWord(Word word){
6342 if(word.frequency < minFreq)
6443 return;
 44+ if(word.getWord().length() < 2)
 45+ return;
6546 Document doc = new Document();
66 - addNgramFields(doc,word.word);
 47+ indexer.createNgramFields(doc,"",word.word);
6748 doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED));
68 - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.UN_TOKENIZED));
 49+ doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.NO));
6950 doc.add(new Field("metaphone1",dmeta.doubleMetaphone(word.word), Field.Store.NO, Field.Index.UN_TOKENIZED));
7051 doc.add(new Field("metaphone2",dmeta.doubleMetaphone(word.word,true), Field.Store.NO, Field.Index.UN_TOKENIZED));
7152
72 - try {
73 - writer.addDocument(doc);
74 - } catch (Exception e) {
75 - log.error("Cannot add document "+doc);
76 - e.printStackTrace();
77 - }
 53+ indexer.addDocument(doc);
7854 }
 55+
 56+ public void closeAndOptimze() throws IOException{
 57+ indexer.closeAndOptimize();
 58+ }
7959 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitleIndexer.java
@@ -0,0 +1,323 @@
 2+package org.wikimedia.lsearch.suggest.api;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+import java.util.HashMap;
 8+import java.util.HashSet;
 9+import java.util.Map.Entry;
 10+
 11+import org.apache.log4j.Logger;
 12+import org.apache.lucene.analysis.SimpleAnalyzer;
 13+import org.apache.lucene.analysis.Token;
 14+import org.apache.lucene.document.Document;
 15+import org.apache.lucene.document.Field;
 16+import org.apache.lucene.index.IndexReader;
 17+import org.apache.lucene.index.Term;
 18+import org.apache.lucene.search.Hits;
 19+import org.apache.lucene.search.IndexSearcher;
 20+import org.apache.lucene.search.MultiSearcher;
 21+import org.apache.lucene.search.PhraseQuery;
 22+import org.apache.lucene.search.Query;
 23+import org.apache.lucene.search.SearchableMul;
 24+import org.apache.lucene.search.Searcher;
 25+import org.apache.lucene.search.TermQuery;
 26+import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 27+import org.wikimedia.lsearch.config.GlobalConfiguration;
 28+import org.wikimedia.lsearch.config.IndexId;
 29+import org.wikimedia.lsearch.config.IndexRegistry;
 30+import org.wikimedia.lsearch.index.IndexUpdateRecord;
 31+import org.wikimedia.lsearch.search.IndexSearcherMul;
 32+import org.wikimedia.lsearch.search.WikiSearcher;
 33+import org.wikimedia.lsearch.suggest.api.Dictionary.Word;
 34+
 35+/**
 36+ * Index words and phrases from article titles.
 37+ *
 38+ * Fields:
 39+ * * word - word from title
 40+ * * phrase - phrase like douglas_adams
 41+ * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3)
 42+ * * namespace - namespaces where the word/phrase is present
 43+ *
 44+ * @author rainman
 45+ *
 46+ */
 47+public class TitleIndexer {
 48+ static Logger log = Logger.getLogger(TitleIndexer.class);
 49+ protected NgramIndexer ngramWriter;
 50+ public static final boolean NEW_INDEX = true;
 51+ protected boolean createNew;
 52+ protected int minWordFreq, minPhraseFreq;
 53+ protected IndexId iid;
 54+ protected String langCode;
 55+ protected IndexRegistry registry;
 56+ protected String path;
 57+
 58+ public TitleIndexer(IndexId iid, int minWordFreq, int minPhraseFreq){
 59+ this(iid,minWordFreq,minPhraseFreq,false);
 60+ }
 61+
 62+ public TitleIndexer(IndexId iid, int minWordFreq, int minPhraseFreq, boolean createNew){
 63+ this.iid = iid;
 64+ this.minWordFreq = minWordFreq;
 65+ this.minPhraseFreq = minPhraseFreq;
 66+ this.createNew = createNew;
 67+ this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 68+ this.ngramWriter = new NgramIndexer();
 69+ this.registry = IndexRegistry.getInstance();
 70+ this.path = iid.getSuggestTitlesPath();
 71+ }
 72+
 73+ protected Searcher makeSearcher(IndexId logical) throws IOException{
 74+ if(logical.isSingle())
 75+ return new IndexSearcherMul(registry.getLatestSnapshot(logical).path);
 76+ else{
 77+ ArrayList<IndexSearcherMul> searchers = new ArrayList<IndexSearcherMul>();
 78+ for(String part : iid.getPhysicalIndexes()){
 79+ searchers.add(new IndexSearcherMul(registry.getLatestSnapshot(IndexId.get(part)).path));
 80+ }
 81+ return new MultiSearcher(searchers.toArray(new SearchableMul[]{}));
 82+ }
 83+ }
 84+
 85+ protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, Query q) throws IOException{
 86+ Hits hits = searcher.search(q);
 87+ NamespaceFreq wnf = new NamespaceFreq();
 88+ for(int j=0;j<hits.length();j++){
 89+ wnf.incFrequency(namespaces[hits.id(j)]);
 90+ }
 91+ return wnf;
 92+ }
 93+
 94+ /** Get frequency for a single word */
 95+ protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String word) throws IOException{
 96+ return getFrequency(searcher,namespaces,new TermQuery(new Term("contents",word)));
 97+ }
 98+
 99+ /** Get frequency of phrase (individual words as array) */
 100+ protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{
 101+ PhraseQuery pq = new PhraseQuery();
 102+ for(String p : phrase){
 103+ pq.add(new Term("contents",p));
 104+ }
 105+ return getFrequency(searcher,namespaces,pq);
 106+ }
 107+
 108+ /** Get namespaces where word appears in title */
 109+ protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, Query q) throws IOException{
 110+ Hits hits = searcher.search(q);
 111+ HashSet<Integer> ns = new HashSet<Integer>();
 112+ for(int j=0;j<hits.length();j++){
 113+ ns.add(namespaces[hits.id(j)]);
 114+ }
 115+ return ns;
 116+ }
 117+
 118+ protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String word) throws IOException{
 119+ return getNamespaces(searcher,namespaces,new TermQuery(new Term("title",word)));
 120+ }
 121+
 122+ protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{
 123+ PhraseQuery pq = new PhraseQuery();
 124+ for(String p : phrase){
 125+ pq.add(new Term("title",p));
 126+ }
 127+ return getNamespaces(searcher,namespaces,pq);
 128+ }
 129+
 130+ /**
 131+ * Returns the namespace for each doc_id
 132+ * @throws IOException
 133+ * @FIXME: assumes optimized index
 134+ */
 135+ protected int[] makeNamespaceMap(Searcher searcher) throws IOException{
 136+ log.debug("Making namespace map...");
 137+ int[] namespaces = new int[searcher.maxDoc()];
 138+ for(int i=0;i<namespaces.length;i++){
 139+ namespaces[i] = -100;
 140+ Document doc = searcher.doc(i);
 141+ if(doc != null)
 142+ namespaces[i] = Integer.parseInt(doc.get("namespace"));
 143+ }
 144+ log.debug("Done making namespace map");
 145+ return namespaces;
 146+ }
 147+
 148+ /** Create new title word/phrases index from an existing index *snapshot* by reading all terms in the index */
 149+ public void createFromExistingIndex(IndexId src){
 150+ try{
 151+ log.debug("Creating new suggest index");
 152+ ngramWriter.createIndex(path,new SimpleAnalyzer());
 153+ Searcher searcher = makeSearcher(iid.getLogical());
 154+ // map doc_id -> namespace
 155+ int[] namespaces = makeNamespaceMap(searcher);
 156+
 157+ for(String dbrole : src.getPhysicalIndexes()){
 158+ log.info("Processing index "+dbrole);
 159+ if(!ngramWriter.isOpen()) // if we closed the index previously
 160+ ngramWriter.reopenIndex(path,new SimpleAnalyzer());
 161+
 162+ IndexId part = IndexId.get(dbrole);
 163+ IndexReader ir = IndexReader.open(registry.getLatestSnapshot(part).path);
 164+ LuceneDictionary dict = new LuceneDictionary(ir,"title");
 165+ IndexSearcher ngramSearcher = new IndexSearcher(path);
 166+ Word word;
 167+ // get all words, and all phrases beginning with word
 168+ while((word = dict.next()) != null){
 169+ log.debug("Processing word "+word);
 170+ String w = word.getWord();
 171+
 172+ // check if word is already in the index
 173+ if(ngramSearcher.docFreq(new Term("word",w)) != 0)
 174+ continue;
 175+
 176+ // index word
 177+ NamespaceFreq wnf = getFrequency(searcher,namespaces,w);
 178+ Collection<Integer> wns = getNamespaces(searcher,namespaces,w);
 179+ addWord(w,wnf,wns);
 180+
 181+ // index phrases
 182+ HashSet<String> phrases = new HashSet<String>();
 183+ Hits hits = searcher.search(new TermQuery(new Term("title",w)));
 184+ // find all phrases beginning with word
 185+ for(int i=0;i<hits.length();i++){
 186+ Document doc = hits.doc(i);
 187+ // tokenize to make phrases
 188+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false);
 189+ ArrayList<Token> tokens = parser.parse();
 190+ for(int j=0;j<tokens.size()-1;j++){
 191+ Token t = tokens.get(j);
 192+ // ignore aliases
 193+ if(t.getPositionIncrement() == 0)
 194+ continue;
 195+ // find phrases beginning with the target word
 196+ if(w.equals(t.termText())){
 197+ phrases.add(t.termText()+"_"+tokens.get(j+1).termText());
 198+ }
 199+ }
 200+ }
 201+ log.debug("Adding "+phrases.size()+" phrases "+phrases);
 202+ // index phrases
 203+ for(String phrase : phrases){
 204+ NamespaceFreq nf = getFrequency(searcher,namespaces,phrase.split("_"));
 205+ Collection<Integer> pns = getNamespaces(searcher,namespaces,phrase.split("_"));
 206+ addPhrase(phrase,nf,pns);
 207+ }
 208+ }
 209+ log.debug("Finished index "+dbrole+", closing/optimizing.");
 210+ ir.close();
 211+ ngramSearcher.close();
 212+ ngramWriter.closeAndOptimize();
 213+ }
 214+ searcher.close();
 215+ } catch (IOException e) {
 216+ log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
 217+ e.printStackTrace();
 218+ return;
 219+ }
 220+ }
 221+
 222+ /**
 223+ * Add phrase to index
 224+ *
 225+ * @param phrase - 2+ words joined with underscore
 226+ * @param nf - frequencies of phrase in various namespaces
 227+ * @param namespaces - namespaces where phrase appears in title
 228+ */
 229+ public void addPhrase(String phrase, NamespaceFreq nf, Collection<Integer> namespaces){
 230+ String freq = nf.serialize(minPhraseFreq);
 231+ if(freq.length() == 0)
 232+ return;
 233+ if(phrase.length() <= 2){
 234+ log.warn("Invalid phrase: "+phrase);
 235+ return;
 236+ }
 237+ Document doc = new Document();
 238+ ngramWriter.createNgramFields(doc,"phrase",phrase);
 239+ doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
 240+ doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO));
 241+ for(Integer ns : namespaces){
 242+ doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED));
 243+ }
 244+
 245+ ngramWriter.addDocument(doc);
 246+ }
 247+
 248+ /** Add ordinary word to the index, convenient for suggesting joins
 249+ *
 250+ * @param word - word to add
 251+ * @param nf - frequencies in namespaces
 252+ * @param namespaces - namespaces where word appears in title
 253+ */
 254+ public void addWord(String word, NamespaceFreq nf, Collection<Integer> namespaces){
 255+ if(word.length() < 2)
 256+ return;
 257+ String freq = nf.serialize(minWordFreq);
 258+ if(freq.length() == 0)
 259+ return;
 260+ Document doc = new Document();
 261+ ngramWriter.createNgramFields(doc,"word",word);
 262+ doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED));
 263+ doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO));
 264+ for(Integer ns : namespaces){
 265+ doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED));
 266+ }
 267+
 268+ ngramWriter.addDocument(doc);
 269+ }
 270+
 271+ /** Update the index */
 272+ public void update(Collection<IndexUpdateRecord> records){
 273+ try{
 274+ log.info("Updating suggest index for "+iid+" with "+records.size()+" records");
 275+ IndexReader ir = IndexReader.open(path);
 276+ Searcher searcher = makeSearcher(iid.getLogical());
 277+ int[] namespaces = makeNamespaceMap(searcher);
 278+ // get all words and phrases
 279+ HashSet<String> words = new HashSet<String>();
 280+ HashSet<String> phrases = new HashSet<String>();
 281+ for(IndexUpdateRecord rec : records){
 282+ String title = rec.getArticle().getTitle();
 283+ ArrayList<Token> tokens = new FastWikiTokenizerEngine(title,langCode,false).parse();
 284+ String last = null;
 285+ // register word/phrases
 286+ for(Token t : tokens){
 287+ String w = t.termText();
 288+ words.add(w);
 289+ if(last != null){
 290+ phrases.add(last+"_"+w);
 291+ }
 292+ last = w;
 293+ }
 294+ }
 295+ // note: the searcher stays open here, it is still needed below to recompute frequencies
 296+
 297+ // batch delete old values
 298+ for(String word : words){
 299+ ir.deleteDocuments(new Term("word",word));
 300+ }
 301+ for(String phrase : phrases){
 302+ ir.deleteDocuments(new Term("phrase",phrase));
 303+ }
 304+ ir.close();
 305+ ngramWriter.reopenIndex(path,new SimpleAnalyzer());
 306+
 307+ // batch add new stuff
 308+ for(String word : words){
 309+ addWord(word,getFrequency(searcher,namespaces,word),getNamespaces(searcher,namespaces,word));
 310+ }
 311+ for(String phrase : phrases){
 312+ String[] ph = phrase.split("_");
 313+ addPhrase(phrase,getFrequency(searcher,namespaces,ph),getNamespaces(searcher,namespaces,ph));
 314+ }
 315+ searcher.close();
 316+ ngramWriter.close();
 317+ } catch(IOException e){
 318+ log.error("Cannot update suggest index for "+iid+" : "+e.getMessage());
 319+ e.printStackTrace();
 320+ return;
 321+ }
 322+ }
 323+
 324+}
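To make the phrase API above concrete, a minimal illustrative call of addPhrase (the frequencies are hypothetical):

    NamespaceFreq nf = new NamespaceFreq();
    nf.setFrequency(0, 3);  // phrase appears in 3 main-namespace titles
    nf.setFrequency(4, 1);  // and in 1 title in namespace 4
    addPhrase("douglas_adams", nf, nf.getNamespaces());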
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/LuceneDictionary.java
@@ -20,10 +20,7 @@
2121 import org.apache.lucene.index.IndexReader;
2222 import org.apache.lucene.index.Term;
2323
24 -import java.util.Iterator;
25 -
2624 import org.apache.lucene.index.TermEnum;
27 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
2825
2926 import java.io.*;
3027
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NgramIndexer.java
@@ -0,0 +1,162 @@
 2+package org.wikimedia.lsearch.suggest.api;
 3+
 4+import java.io.IOException;
 5+
 6+import org.apache.log4j.Logger;
 7+import org.apache.lucene.analysis.Analyzer;
 8+import org.apache.lucene.document.Document;
 9+import org.apache.lucene.document.Field;
 10+import org.apache.lucene.index.IndexWriter;
 11+import org.wikimedia.lsearch.index.WikiIndexModifier;
 12+
 13+/**
 14+ * Base class for basic ngram index handling: opening/closing indexes, adding ngram fields, etc.
 15+ *
 16+ * @author rainman
 17+ *
 18+ */
 19+public class NgramIndexer {
 20+ Logger log = Logger.getLogger(NgramIndexer.class);
 21+ protected String path;
 22+ protected Analyzer analyzer;
 23+ protected IndexWriter writer;
 24+
 25+ public NgramIndexer(){
 26+ path = null;
 27+ analyzer = null;
 28+ writer = null;
 29+ }
 30+
 31+ /** Make a new ngram index */
 32+ public void createIndex(String path, Analyzer analyzer) throws IOException{
 33+ openIndex(path,analyzer,true);
 34+ }
 35+
 36+ /** Reopen an existing index, creating it if it doesn't exist */
 37+ public void reopenIndex(String path, Analyzer analyzer) throws IOException{
 38+ openIndex(path,analyzer,false);
 39+ }
 40+
 41+ public void openIndex(String path, Analyzer analyzer, boolean newIndex) throws IOException{
 42+ this.path = path;
 43+ this.analyzer = analyzer;
 44+ try {
 45+ writer = new IndexWriter(path,analyzer,newIndex);
 46+ } catch (IOException e) {
 47+ try {
 48+ log.info("Making new index at path "+path);
 49+ // try to make brand new index
 50+ WikiIndexModifier.makeDBPath(path); // ensure all directories are made
 51+ writer = new IndexWriter(path,analyzer,newIndex);
 52+ } catch (IOException e1) {
 53+ log.error("I/O error opening index for addition of documents at "+path+" : "+e1.getMessage());
 54+ throw e1;
 55+ }
 56+ }
 57+ writer.setMergeFactor(20);
 58+ writer.setMaxBufferedDocs(500);
 59+ writer.setUseCompoundFile(true);
 60+ writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
 61+
 62+ }
 63+
 64+ /** Check if index is open and ready for modification */
 65+ public boolean isOpen(){
 66+ return writer != null;
 67+ }
 68+
 69+ /** Close index (without optimizing), always call when done indexing */
 70+ public void close() throws IOException {
 71+ try{
 72+ writer.close();
 73+ writer = null;
 74+ } catch(IOException e){
 75+ log.warn("I/O error closing index at "+path);
 76+ throw e;
 77+ }
 78+ }
 79+
 80+ /** Optimize and close index, always call when done indexing */
 81+ public void closeAndOptimize() throws IOException {
 82+ try{
 83+ writer.optimize();
 84+ writer.close();
 85+ writer = null;
 86+ } catch(IOException e){
 87+ log.warn("I/O error optimizing/closing index at "+path);
 88+ throw e;
 89+ }
 90+ }
 91+
 92+ /** Return ngrams of specific size for text */
 93+ public static String[] nGrams(String text, int size) {
 94+ int len = text.length();
 95+ String[] res = new String[len - size + 1];
 96+ for (int i = 0; i < len - size + 1; i++) {
 97+ res[i] = text.substring(i, i + size);
 98+ }
 99+ return res;
 100+ }
 101+
 102+ /** Get minimal ngram size for word: roughly half the word length, capped at 3 (1 for words of up to 3 letters, 2 for 4-5 letters, 3 otherwise) */
 103+ public static int getMinNgram(String word){
 104+ if(word.length() <= 3)
 105+ return 1;
 106+ else if(word.length() == 4 || word.length() == 5)
 107+ return 2;
 108+ else
 109+ return 3;
 110+ }
 111+
 112+ /** Maximal ngram size for word: 2 for two-letter words, 3 otherwise */
 113+ public static int getMaxNgram(String word){
 114+ if(word.length() == 2)
 115+ return 2;
 116+ else
 117+ return 3;
 118+ }
 119+
 120+ /** Get ngram field name with no prefix */
 121+ public static String getNgramField(){
 122+ return getNgramField(null);
 123+ }
 124+
 125+ /** Get prefixed ngram field name */
 126+ public static String getNgramField(String prefix){
 127+ if(prefix == null || prefix.equals(""))
 128+ return "ngram";
 129+ else
 130+ return prefix+"_ngram";
 131+ }
 132+
 133+ /**
 134+ * Add ngrams of sizes getMinNgram(word) through getMaxNgram(word) to the document
 135+ *
 136+ * @param doc - document to add fields to
 137+ * @param prefix - prefix to ngram field name
 138+ * @param word - word
 139+ */
 140+ protected void createNgramFields(Document doc, String prefix, String word) {
 141+ int min = getMinNgram(word);
 142+ int max = getMaxNgram(word);
 143+ String fieldBase = getNgramField(prefix);
 144+ for(int i=min ; i <= max ; i++ ){
 145+ String[] ngrams = nGrams(word,i);
 146+ String field = fieldBase+i;
 147+ for(int j=0 ; j<ngrams.length ; j++){
 148+ String ngram = ngrams[j];
 149+ doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
 150+ }
 151+ }
 152+ }
 153+
 154+ public void addDocument(Document doc){
 155+ try {
 156+ log.debug("Adding document "+doc);
 157+ writer.addDocument(doc);
 158+ } catch (Exception e) {
 159+ log.error("Cannot add document "+doc+" : "+e.getMessage());
 160+ e.printStackTrace();
 161+ }
 162+ }
 163+}
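As a sketch of the ngram decomposition above (word chosen for illustration): for "adams", getMinNgram returns 2 and getMaxNgram returns 3, so from within a subclass createNgramFields adds the following untokenized fields:

    Document doc = new Document();
    // "ad","da","am","ms" go into field "word_ngram2",
    // "ada","dam","ams" go into field "word_ngram3"
    createNgramFields(doc, "word", "adams");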
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Dictionary.java
@@ -1,9 +1,31 @@
22 package org.wikimedia.lsearch.suggest.api;
33
4 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
5 -
6 -
74 public interface Dictionary {
 5+ public static class Word {
 6+ protected String word;
 7+ protected int frequency;
 8+ public Word(String word, int frequency) {
 9+ super();
 10+ this.word = word;
 11+ this.frequency = frequency;
 12+ }
 13+ public int getFrequency() {
 14+ return frequency;
 15+ }
 16+ public void setFrequency(int frequency) {
 17+ this.frequency = frequency;
 18+ }
 19+ public String getWord() {
 20+ return word;
 21+ }
 22+ public void setWord(String word) {
 23+ this.word = word;
 24+ }
 25+ public String toString(){
 26+ return word+" : "+frequency;
 27+ }
 28+
 29+ }
830 /** Get next term, or null if there are no more terms */
931 public Word next();
1032 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NamespaceFreq.java
@@ -0,0 +1,88 @@
 2+package org.wikimedia.lsearch.suggest.api;
 3+
 4+import java.util.BitSet;
 5+import java.util.HashMap;
 6+import java.util.Set;
 7+import java.util.Map.Entry;
 8+
 9+import org.wikimedia.lsearch.search.NamespaceFilter;
 10+
 11+/** Mapping from namespaces to frequencies */
 12+public class NamespaceFreq {
 13+ class IntWrap{
 14+ int val = 0;
 15+ IntWrap() {}
 16+ IntWrap(int value){ val = value; }
 17+ IntWrap(String value){ val = Integer.parseInt(value); }
 18+ public String toString(){ return ""+val; }
 19+ }
 20+ /** namespace -> frequency */
 21+ protected HashMap<Integer,IntWrap> nsmap = new HashMap<Integer,IntWrap>();
 22+
 23+ public NamespaceFreq(String field){
 24+ String[] pairs = field.split(" ");
 25+ for(String pair : pairs){
 26+ if(pair.length() == 0)
 27+ continue;
 28+ String[] nsf = pair.split(":");
 29+ if(nsf.length == 2)
 30+ nsmap.put(Integer.parseInt(nsf[0]),new IntWrap(nsf[1]));
 31+ else {
 32+ throw new RuntimeException("Bad syntax for namespace-frequency pairs : "+field);
 33+ }
 34+ }
 35+ }
 36+
 37+ public NamespaceFreq() {
 38+ }
 39+
 40+ public int getFrequency(int namespace){
 41+ if(nsmap.containsKey(namespace))
 42+ return nsmap.get(namespace).val;
 43+ else
 44+ return 0;
 45+ }
 46+
 47+ public int getFrequency(NamespaceFilter nsf){
 48+ int sum = 0;
 49+ BitSet ns = nsf.getIncluded();
 50+ for(int i=ns.nextSetBit(0); i>=0; i=ns.nextSetBit(i+1)){
 51+ sum += getFrequency(i);
 52+ }
 53+ return sum;
 54+ }
 55+
 56+ public String serialize(int minFreq){
 57+ StringBuilder sb = new StringBuilder();
 58+ int sum = 0;
 59+ for(Entry<Integer,IntWrap> e : nsmap.entrySet()){
 60+ sum += e.getValue().val;
 61+ sb.append(e.getKey());
 62+ sb.append(":");
 63+ sb.append(e.getValue());
 64+ sb.append(" ");
 65+ }
 66+ if(sum < minFreq)
 67+ return "";
 68+ return sb.toString();
 69+ }
 70+
 71+ public String serialize(){
 72+ return serialize(0);
 73+ }
 74+
 75+ public void setFrequency(int namespace, int frequency){
 76+ nsmap.put(namespace,new IntWrap(frequency));
 77+ }
 78+
 79+ public void incFrequency(int namespace){
 80+ if(nsmap.containsKey(namespace)){
 81+ nsmap.get(namespace).val++;
 82+ } else
 83+ nsmap.put(namespace,new IntWrap(1));
 84+ }
 85+
 86+ public Set<Integer> getNamespaces(){
 87+ return nsmap.keySet();
 88+ }
 89+}
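A round-trip sketch of the serialization format (space-separated namespace:frequency pairs; entry order depends on the hash map):

    NamespaceFreq nf = new NamespaceFreq();
    nf.incFrequency(0);
    nf.incFrequency(0);
    nf.incFrequency(14);
    String s = nf.serialize();           // e.g. "0:2 14:1 "
    NamespaceFreq copy = new NamespaceFreq(s);
    int mainFreq = copy.getFrequency(0); // 2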
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/CleanIndexImporter.java
@@ -25,8 +25,6 @@
2626 import org.wikimedia.lsearch.config.IndexId;
2727 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
2828 import org.wikimedia.lsearch.ranks.Links;
29 -import org.wikimedia.lsearch.suggest.api.PhraseIndexer;
30 -import org.wikimedia.lsearch.suggest.api.TitlesIndexer;
3129 import org.wikimedia.lsearch.util.Localization;
3230
3331 /**
@@ -59,8 +57,10 @@
6058 boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode) != null;
6159 // make article
6260 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects);
63 - if(page.Title.Namespace == 0)
64 - writer.addArticle(article);
 61+ if(page.Title.Namespace != 0)
 62+ article.setContents("");
 63+
 64+ writer.addArticle(article);
6565 // generate phrases
6666 /* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false);
6767 ArrayList<Token> tokens = parser.parse();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java
@@ -196,8 +196,13 @@
197197 /** Get title object corresponding to this article */
198198 public Title makeTitle(){
199199 return new Title(Integer.parseInt(namespace),title);
 200+ }
 201+
 202+ public void setContents(String contents) {
 203+ this.contents = contents;
200204 }
201205
202206
203207
 208+
204209 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -530,6 +530,11 @@
531531 return namespaceSet;
532532 }
533533
 534+ /** Get logical iid for this index, i.e. enwiki.mainpart -> enwiki */
 535+ public IndexId getLogical(){
 536+ return get(dbname);
 537+ }
534538
 539+
535540
536541 }
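For example (assuming the usual dbname.part naming), IndexId.get("enwiki.mainpart").getLogical() yields the same IndexId as IndexId.get("enwiki").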
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -769,6 +769,20 @@
770770
771771 dbroles.put(type,params);
772772
 773+ } else if(type.equals("suggest")){
 774+ // all params are optional; defaults are used when absent
 775+ if(tokens.length>1)
 776+ params.put("wordsMinFreq",tokens[1]);
 777+ if(tokens.length>2)
 778+ params.put("titlesWordsMinFreq",tokens[2]);
 779+ if(tokens.length>3)
 780+ params.put("titlesPhrasesMinFreq", tokens[3]);
 781+
 782+ if(tokens.length>4)
 783+ System.out.println("Unrecognized suggest parameters in ("+role+")");
 784+
 785+ dbroles.put(type,params);
 786+
773787 } else{
774788 System.out.println("Warning: Unrecognized role \""+role+"\". Ignoring.");
775789 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -64,7 +64,7 @@
6565 if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
6666 exactCase = true;
6767 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
68 - SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase);
 68+ SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase, false);
6969 if(res!=null && res.isRetry()){
7070 int retries = 0;
7171 if(iid.isSplit() || iid.isNssplit()){
@@ -73,19 +73,27 @@
7474 retries = 1;
7575
7676 while(retries > 0 && res.isRetry()){
77 - res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase);
 77+ res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase, false);
7878 retries--;
7979 }
8080 if(res.isRetry())
8181 res.setErrorMsg("Internal error, too many internal retries.");
8282 }
8383 return res;
84 - } else if (what.equals("raw")) {
85 - //TODO: return searchRaw(searchterm);
 84+ } else if (what.equals("raw") || what.equals("rawexplain")) {
 85+ int offset = 0, limit = 100; boolean exactCase = false;
 86+ if (query.containsKey("offset"))
 87+ offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
 88+ if (query.containsKey("limit"))
 89+ limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines);
 90+ if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
 91+ exactCase = true;
 92+ NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
 93+ return search(iid, searchterm, offset, limit, namespaces, what.equals("rawexplain"), exactCase, true);
8694 } else {
8795 SearchResults res = new SearchResults();
8896 res.setErrorMsg("Unrecognized search type. Try one of: " +
89 - "titlematch, titleprefix, search, explain, quit, raw.");
 97+ "search, explain, raw, rawexplain.");
9098 log.warn("Unknown request type [" + what + "].");
9199 return res;
92100 }
@@ -124,7 +132,7 @@
125133 * Search on iid, with query searchterm. View results from offset to offset+limit, using
126134 * the default namespaces filter
127135 */
128 - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase){
 136+ public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw){
129137 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
130138 if(nsDefault == null || nsDefault.cardinality() == 0)
131139 nsDefault = new NamespaceFilter("0"); // default to main namespace
@@ -155,13 +163,16 @@
156164 }
157165
158166 try {
159 - if(nsfw == null){
 167+ if(raw){
 168+ // do minimal parsing, make a raw query
 169+ parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE);
 170+ q = parser.parseRaw(searchterm);
 171+ } else if(nsfw == null){
160172 if(searchAll)
161173 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
162174 else
163175 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
164 - }
165 - else{
 176+ } else{
166177 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
167178 log.info("Using NamespaceFilterWrapper "+nsfw);
168179 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -404,6 +404,10 @@
405405 doc.add(new Field("category", "",
406406 Field.Store.NO, Field.Index.TOKENIZED));
407407
 408+ // interwiki associated with this page
 409+ doc.add(new Field("interwiki", "",
 410+ Field.Store.NO, Field.Index.TOKENIZED));
 411+
408412 for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
409413 FieldNameFactory fields = bs.getFields();
410414 // boost document title with it's article rank
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -77,13 +77,15 @@
7878 /** boost for title field */
7979 public static float TITLE_BOOST = 6;
8080 public static float TITLE_ALIAS_BOOST = 0.2f;
 81+ public static float TITLE_PHRASE_BOOST = 2;
8182 public static float STEM_TITLE_BOOST = 2;
8283 public static float STEM_TITLE_ALIAS_BOOST = 0.4f;
83 - public static float ALT_TITLE_BOOST = 4;
 84+ public static float ALT_TITLE_BOOST = 8;
8485 public static float ALT_TITLE_ALIAS_BOOST = 0.4f;
8586 public static float KEYWORD_BOOST = 0.02f;
8687
8788 public static boolean ADD_STEM_TITLE = true;
 89+ public static boolean ADD_TITLE_PHRASES = true;
8890
8991 /** Policies in treating field names:
9092 *
@@ -295,7 +297,7 @@
296298 if(length == 0 && ch == ' ')
297299 continue; // ignore whitespaces
298300
299 - // pluses and minuses, underscores can be within words, *,? are for wildcard queries
 301+ // pluses, minuses and underscores can appear within words (to prevent them from being misinterpreted); *,? are for wildcard queries
300302 if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){
301303 if(length<buffer.length)
302304 buffer[length++] = ch;
@@ -322,7 +324,7 @@
323325 else if(ch == ':'){
324326 // check if it's a valid field
325327 String f = new String(buffer,0,length);
326 - if(f.equals(namespaceAllKeyword) || f.equals("incategory") || namespaceFilters.containsKey(f)){
 328+ if(f.equals(namespaceAllKeyword) || f.equals("incategory") || namespaceFilters.containsKey(f) || namespacePolicy == NamespacePolicy.LEAVE){
327329 cur = lookup;
328330 return TokenType.FIELD;
329331 } else
@@ -1094,7 +1096,7 @@
10951097 }
10961098
10971099 /** Make title query in format: title:query stemtitle:stemmedquery */
1098 - protected Query makeTitleQuery(String queryText) {
 1100+ protected Query[] makeTitleQuery(String queryText) {
10991101 String contentField = defaultField;
11001102 float olfDefaultBoost = defaultBoost;
11011103 defaultField = fields.title(); // now parse the title part
@@ -1117,16 +1119,19 @@
11181120 defaultBoost = olfDefaultBoost;
11191121 defaultAliasBoost = ALIAS_BOOST;
11201122
 1123+ // make title phrases
 1124+ Query qp = ADD_TITLE_PHRASES? makeTitlePhrases(qt) : null;
 1125+
11211126 if(qt == qs) // either null, or category query
1122 - return qt;
 1127+ return new Query[] {qt,qp};
11231128 if(qt == null)
1124 - return qs;
 1129+ return new Query[] {qs,qp};
11251130 if(qs == null)
1126 - return qt;
 1131+ return new Query[] {qt,qp};
11271132 BooleanQuery bq = new BooleanQuery(true);
11281133 bq.add(qt,BooleanClause.Occur.SHOULD);
11291134 bq.add(qs,BooleanClause.Occur.SHOULD);
1130 - return bq;
 1135+ return new Query[] {bq,qp};
11311136 }
11321137
11331138 /** Quote CJK chars to avoid frequency-based analysis */
@@ -1173,6 +1178,44 @@
11741179 }
11751180 }
11761181
 1182+ /** Make two-word phrase queries for simple queries whose title terms are all required */
 1183+ protected Query makeTitlePhrases(Query q){
 1184+ if(q instanceof BooleanQuery){
 1185+ boolean allReq = true;
 1186+ BooleanQuery bq = (BooleanQuery) q;
 1187+ for(BooleanClause bc : bq.getClauses()){
 1188+ if(!bc.getOccur().equals(BooleanClause.Occur.MUST) || !(bc.getQuery() instanceof TermQuery) ||
 1189+ !(((TermQuery)bc.getQuery()).getTerm().field().equals("title"))){
 1190+ allReq = false;
 1191+ break;
 1192+ }
 1193+ }
 1194+ if(allReq){
 1195+ BooleanQuery ret = new BooleanQuery(true);
 1196+ Term last = null;
 1197+ // make phrases '+very +long +query' => "very long" "long query"
 1198+ for(BooleanClause bc : bq.getClauses()){
 1199+ Term t = ((TermQuery)bc.getQuery()).getTerm();
 1200+ if(last != null){
 1201+ PhraseQuery pq = new PhraseQuery();
 1202+ pq.add(new Term("stemtitle",last.text()));
 1203+ pq.add(new Term("stemtitle",t.text()));
 1204+ pq.setBoost(TITLE_PHRASE_BOOST);
 1205+ pq.setSlop(2);
 1206+ ret.add(pq,BooleanClause.Occur.SHOULD);
 1207+ }
 1208+ last = t;
 1209+
 1210+ }
 1211+ if(ret.getClauses() != null && ret.getClauses().length != 0)
 1212+ return ret;
 1213+ }
 1214+ }
 1215+
 1216+ return null;
 1217+
 1218+ }
 1219+
11771220 /**
11781221 * Main function for multi-pass parsing.
11791222 *
@@ -1188,12 +1231,23 @@
11891232 defaultBoost = 1;
11901233 defaultAliasBoost = ALIAS_BOOST;
11911234 Query qc = parseRaw(queryText);
1192 -
1193 - Query qt = makeTitleQuery(queryText);
 1235+ Query[] qtqp = makeTitleQuery(queryText);
 1236+ // qt = title query, qp = title phrase query
 1237+ Query qt = qtqp[0];
 1238+ Query qp = qtqp[1];
11941240 if(qc == null || qt == null)
11951241 return new BooleanQuery();
11961242 if(qc.equals(qt))
11971243 return qc; // don't duplicate (probably a query for categories only)
 1244+
 1245+ // embed phrase queries into the main contents query
 1246+ if(qp!=null && qc instanceof BooleanQuery){
 1247+ ((BooleanQuery)qc).add(qp,BooleanClause.Occur.SHOULD);
 1248+ } else if(qp != null && !(qc instanceof BooleanQuery)){
 1249+ // TODO: delete in release
 1250+ System.out.println("SHOULD NEVER HAPPEN");
 1251+ }
11981252 BooleanQuery bq = new BooleanQuery();
11991253 bq.add(qc,BooleanClause.Occur.SHOULD);
12001254 bq.add(qt,BooleanClause.Occur.SHOULD);
@@ -1263,6 +1317,14 @@
12641318 public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{
12651319 return parseMultiPass(queryText,policy,false,false);
12661320 }
 1321+
 1322+ public NamespacePolicy getNamespacePolicy() {
 1323+ return namespacePolicy;
 1324+ }
 1325+ public void setNamespacePolicy(NamespacePolicy namespacePolicy) {
 1326+ this.namespacePolicy = namespacePolicy;
 1327+ }
12671328
12681329
 1330+
12691331 }
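To spell out the rewrite makeTitlePhrases performs: for '+very +long +query' it emits two SHOULD clauses, "very long" and "long query", each built like the sketch below:

    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("stemtitle", "very"));
    pq.add(new Term("stemtitle", "long"));
    pq.setBoost(TITLE_PHRASE_BOOST); // 2
    pq.setSlop(2);                   // allow up to 2 positions of slop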
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -1,6 +1,7 @@
22 package org.wikimedia.lsearch.analyzers;
33
44 import java.util.ArrayList;
 5+import java.util.HashMap;
56
67 import org.apache.log4j.Logger;
78 import org.apache.lucene.analysis.Analyzer;
@@ -69,6 +70,7 @@
7071 WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase);
7172 tokenizer.tokenize();
7273 ArrayList<String> categories = tokenizer.getCategories();
 74+ HashMap<String,String> interwiki = tokenizer.getInterwikis();
7375
7476 ArrayList<String> allKeywords = new ArrayList<String>();
7577 if(addKeywords && tokenizer.getKeywords()!=null)
@@ -80,6 +82,8 @@
8183 new LanguageAnalyzer(filters,tokenizer));
8284 perFieldAnalyzer.addAnalyzer("category",
8385 new CategoryAnalyzer(categories,exactCase));
 86+ perFieldAnalyzer.addAnalyzer("interwiki",
 87+ new InterwikiAnalyzer(interwiki));
8488 perFieldAnalyzer.addAnalyzer(fields.title(),
8589 getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase));
8690 perFieldAnalyzer.addAnalyzer(fields.stemtitle(),
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/InterwikiAnalyzer.java
@@ -0,0 +1,58 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+import java.util.HashMap;
 7+import java.util.Iterator;
 8+import java.util.Map.Entry;
 9+
 10+import org.apache.lucene.analysis.Analyzer;
 11+import org.apache.lucene.analysis.Token;
 12+import org.apache.lucene.analysis.TokenStream;
 13+
 14+public class InterwikiAnalyzer extends Analyzer {
 15+ public class InterwikiTokenStream extends TokenStream {
 16+ protected Iterator<Entry<String,String>> tokensIt;
 17+ protected int start;
 18+ protected Token next = null;
 19+
 20+ InterwikiTokenStream(){
 21+ tokensIt = interwiki.entrySet().iterator();
 22+ start = 0;
 23+ }
 24+
 25+ @Override
 26+ public Token next() throws IOException {
 27+ if(next != null){
 28+ Token t = next;
 29+ next = null;
 30+ return t;
 31+ }
 32+ if(tokensIt.hasNext()){
 33+ Entry<String,String> map = tokensIt.next();
 34+ String iw = map.getKey()+":"; // e.g. en:
 35+ String title = map.getValue().toLowerCase(); // e.g. "douglas adams"
 36+ Token t = new Token(iw,start,start+iw.length());
 37+ start += iw.length()+1;
 38+ next = new Token(title,start,start+title.length());
 39+ start += title.length()+1;
 40+
 41+ return t;
 42+ } else
 43+ return null;
 44+ }
 45+
 46+ }
 47+
 48+ HashMap<String,String> interwiki;
 49+
 50+ public InterwikiAnalyzer(HashMap<String,String> interwiki) {
 51+ this.interwiki = interwiki;
 52+ }
 53+
 54+ @Override
 55+ public TokenStream tokenStream(String fieldName, Reader reader) {
 56+ return new InterwikiTokenStream();
 57+ }
 58+
 59+}
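A sketch of what the stream emits (map contents hypothetical; the reader argument is ignored by this analyzer):

    HashMap<String,String> iw = new HashMap<String,String>();
    iw.put("en", "Douglas Adams");
    TokenStream ts = new InterwikiAnalyzer(iw).tokenStream("interwiki", null);
    Token prefix = ts.next(); // "en:"
    Token title = ts.next();  // "douglas adams" (target title, lowercased)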
Index: branches/lucene-search-2.1/build.xml
@@ -6,9 +6,11 @@
77 <property name="jars" value="jars"/>
88 <property name="dist" location="dist"/>
99 <property name="pack.name" value="lucene-search-2.1"/>
 10+ <property name="src.name" value="lucene-search-src-2.1"/>
1011 <property name="binary.name" value="ls2-bin"/>
1112 <property name="jar.name" value="LuceneSearch.jar"/>
1213 <property name="include" value="src/** lib/** sql/** test-data/** webinterface/** *-example *.txt lsearch* build.xml scripts/*"/>
 14+ <property name="include.src" value="src/** sql/** build.xml scripts/*"/>
1315
1416 <property file="${basedir}/hostname"/>
1517
@@ -82,6 +84,19 @@
8385 <delete file="${dist}/${pack.name}.tar"/>
8486 </target>
8587
 88+ <target name="pack-src" description="Make tar.gz distribution of only core source files">
 89+ <mkdir dir="${dist}"/>
 90+ <delete file="${dist}/${src.name}.tar"/>
 91+ <delete file="${dist}/${src.name}.tar.gz"/>
 92+ <tar tarfile="${dist}/${src.name}.tar">
 93+ <tarfileset prefix="${pack.name}" dir="." includes="${include.src}"/>
 94+ </tar>
 95+
 96+ <gzip zipfile="${dist}/${src.name}.tar.gz" src="${dist}/${src.name}.tar"/>
 97+ <delete file="${dist}/${src.name}.tar"/>
 98+ </target>
 99+
 100+
86101 <target name="binary" depends="alljar" description="Make binary tar.gz distribution">
87102 <mkdir dir="${bin}"/>
88103 <delete file="${dist}/${binary.name}.tar"/>
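With this target in place, running "ant pack-src" should produce dist/lucene-search-src-2.1.tar.gz containing only src/**, sql/**, build.xml and scripts/*.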

Status & tagging log