r24539 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r24538 | r24539 | r24540 >
Date: 12:35, 2 August 2007
Author: rainman
Status: old
Tags:
Comment:
Refactor:
* dropped PhraseIndexer, replaced it with TitleIndexer, which handles
words and phrases that appear in titles
* refactored the API to enable incremental updates, using NgramIndexer as
the base class
Add:
* untokenized interwiki field and interwiki analyzer (needs more work)
* raw search method
* suggest keyword in global settings
Suggestions still need to be refactored to make use of the words in the title index.
Modified paths:
  • /branches/lucene-search-2.1/build.xml (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/InterwikiAnalyzer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/CleanIndexImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/SuggestBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Dictionary.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Indexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/LuceneDictionary.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NamespaceFreq.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NgramIndexer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/PhraseIndexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitleIndexer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitlesIndexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/WordsIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /branches/lucene-search-2.1/test-data/mwsearch-global.test (modified) (history)

Diff

Index: branches/lucene-search-2.1/test-data/mwsearch-global.test
@@ -9,7 +9,7 @@
1010 # aspell <language>
1111 [Database]
1212 entest : (mainsplit), (mainpart,false,2,10), (restpart,true,2)
13 -entest : (ngram), (aspell,en)
 13+entest : (ngram), (suggest,1,2,3)
1414 detest,rutest : (single,true,2,10)
1515 frtest : (split,3) (part1) (part2) (part3)
1616 srwiki : (single)
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -190,6 +190,12 @@
191191 assertEquals("http://rs.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("rswikimedia"));
192192 assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki"));
193193
 194+ // test suggest tag
 195+ Hashtable<String,String> sug = testgc.getDBParams("entest","suggest");
 196+ assertEquals("1",sug.get("wordsMinFreq"));
 197+ assertEquals("2",sug.get("titlesWordsMinFreq"));
 198+ assertEquals("3",sug.get("titlesPhrasesMinFreq"));
 199+
194200 } catch (MalformedURLException e) {
195201 e.printStackTrace();
196202 } catch (IOException e) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java
@@ -0,0 +1,635 @@
 2+package org.wikimedia.lsearch.test;
 3+
 4+import java.io.BufferedReader;
 5+import java.io.IOException;
 6+import java.io.InputStreamReader;
 7+import java.util.ArrayList;
 8+
 9+import org.apache.lucene.index.IndexReader;
 10+import org.apache.lucene.search.spell.SpellChecker;
 11+import org.apache.lucene.store.FSDirectory;
 12+import org.wikimedia.lsearch.config.Configuration;
 13+import org.wikimedia.lsearch.config.IndexId;
 14+import org.wikimedia.lsearch.config.IndexRegistry;
 15+import org.wikimedia.lsearch.suggest.Suggest;
 16+import org.wikimedia.lsearch.suggest.SuggestResult;
 17+import org.wikimedia.lsearch.suggest.Suggest.SuggestSplit;
 18+
 19+public class SuggestTest {
 20+
 21+ public static void testSpellCheck(String dbname) throws IOException{
 22+ IndexId iid = IndexId.get(dbname);
 23+ SpellChecker sc = new SpellChecker(FSDirectory.getDirectory(iid.getSpellcheckPath(),false));
 24+ IndexReader ir = IndexReader.open(iid.getSuggestCleanPath());
 25+ int good=0;
 26+ int bad=0;
 27+ long start = System.currentTimeMillis();
 28+ for(String[] m : DATA){
 29+ String[] res = sc.suggestSimilar(m[0],20,ir,"contents",true);
 30+ if(res.length > 0 && m[1].equals(res[0]))
 31+ good++;
 32+ else{
 33+ reportBad(m[0],m[1],res.length>0? res[0] : "");
 34+ bad++;
 35+ }
 36+ }
 37+ int total = good + bad;
 38+ long delta = System.currentTimeMillis() - start;
 39+ System.out.println("SpellCheck test ("+delta+"ms): good: "+good+" ("+((double)good/total*100)+"%), bad: "+bad+", total="+total);
 40+ }
 41+
 42+ public static void testSuggest(String dbname) throws IOException{
 43+ IndexId iid = IndexId.get(dbname);
 44+ Suggest sc = new Suggest(iid);
 45+ int good=0;
 46+ int bad=0;
 47+ long start = System.currentTimeMillis();
 48+ for(String[] m : DATA){
 49+ ArrayList<SuggestResult> res = sc.suggestWords(m[0],5);
 50+ if(res.size() > 0){
 51+ SuggestResult r = res.get(0);
 52+ if(r.getWord().equals(m[1]))
 53+ good++;
 54+ else if(r.getWord().equals(m[0]) && res.size()>1 && res.get(1).getFrequency()>r.getFrequency()
 55+ && res.get(1).getWord().equals(m[1]))
 56+ good++;
 57+ else if(r.getDist() > 1){
 58+ ArrayList<SuggestSplit> split = sc.suggestSplitFromTitle(m[0]);
 59+ if(split.size()>0 && m[1].equals(split.get(0).getWord()))
 60+ good++;
 61+ else{
 62+ reportBad(m[0],m[1],r.getWord());
 63+ bad++;
 64+ }
 65+
 66+ }
 67+ else{
 68+ reportBad(m[0],m[1],r.getWord());
 69+ bad++;
 70+ }
 71+ } else{
 72+ reportBad(m[0],m[1],"");
 73+ bad++;
 74+ }
 75+ }
 76+ int total = good + bad;
 77+ long delta = System.currentTimeMillis() - start;
 78+ System.out.println("Suggest test ("+delta+"ms): good: "+good+" ("+((double)good/total*100)+"%), bad: "+bad+", total="+total);
 79+ }
 80+
 81+ public static void reportBad(String bad, String expected, String got){
 82+ System.out.println("FOR ["+bad+"] EXPECTED: ["+expected+"], BUT GOT ["+got+"]");
 83+ }
 84+
 85+ public static void main(String[] args) throws IOException{
 86+ Configuration.open();
 87+ String dbname = "wikilucene";
 88+ if(args.length==1)
 89+ dbname = args[0];
 90+
 91+ testSpellCheck(dbname);
 92+ testSuggest(dbname);
 93+ }
 94+
 95+
 96+ private static final String[][] DATA = { {
 97+ "abilitey", "ability" }, {
 98+ "abouy", "about" }, {
 99+ "absorbtion", "absorption" }, {
 100+ "accidently", "accidentally" }, {
 101+ "accomodate", "accommodate" }, {
 102+ "acommadate", "accommodate" }, {
 103+ "acord", "accord" }, {
 104+ "adultry", "adultery" }, {
 105+ "aggresive", "aggressive" }, {
 106+ "alchohol", "alcohol" }, {
 107+ "alchoholic", "alcoholic" }, {
 108+ "allieve", "alive" }, {
 109+ "alot", "a lot" }, {
 110+ "alright", "all right" }, {
 111+ "amature", "amateur" }, {
 112+ "ambivilant", "ambivalent" }, {
 113+ "amification", "amplification" }, {
 114+ "amourfous", "amorphous" }, {
 115+ "annoint", "anoint" }, {
 116+ "annonsment", "announcement" }, {
 117+ "annoyting", "anting" }, {
 118+ "annuncio", "announce" }, {
 119+ "anonomy", "anatomy" }, {
 120+ "anotomy", "anatomy" }, {
 121+ "antidesestablishmentarianism", "antidisestablishmentarianism" }, {
 122+ "antidisestablishmentarism", "antidisestablishmentarianism" }, {
 123+ "anynomous", "anonymous" }, {
 124+ "appelet", "applet" }, {
 125+ "appreceiated", "appreciated" }, {
 126+ "appresteate", "appreciate" }, {
 127+ "aquantance", "acquaintance" }, {
 128+ "aratictature", "architecture" }, {
 129+ "archeype", "archetype" }, {
 130+ "aricticure", "architecture" }, {
 131+ "artic", "arctic" }, {
 132+ "asentote", "asymptote" }, {
 133+ "ast", "at" }, {
 134+ "asterick", "asterisk" }, {
 135+ "asymetric", "asymmetric" }, {
 136+ "atentively", "attentively" }, {
 137+ "autoamlly", "automatically" }, {
 138+ "bankrot", "bankrupt" }, {
 139+ "basicly", "basically" }, {
 140+ "batallion", "battalion" }, {
 141+ "bbrose", "browse" }, {
 142+ "beauro", "bureau" }, {
 143+ "beaurocracy", "bureaucracy" }, {
 144+ "beggining", "beginning" }, {
 145+ "beging", "beginning" }, {
 146+ "behaviour", "behavior" }, {
 147+ "beleive", "believe" }, {
 148+ "belive", "believe" }, {
 149+ "benidifs", "benefits" }, {
 150+ "bigginging", "beginning" }, {
 151+ "blait", "bleat" }, {
 152+ "bouyant", "buoyant" }, {
 153+ "boygot", "boycott" }, {
 154+ "brocolli", "broccoli" }, {
 155+ "buch", "bush" }, {
 156+ "buder", "butter" }, {
 157+ "budr", "butter" }, {
 158+ "budter", "butter" }, {
 159+ "buracracy", "bureaucracy" }, {
 160+ "burracracy", "bureaucracy" }, {
 161+ "buton", "button" }, {
 162+ "byby", "by by" }, {
 163+ "cauler", "caller" }, {
 164+ "ceasar", "caesar" }, {
 165+ "cemetary", "cemetery" }, {
 166+ "changeing", "changing" }, {
 167+ "cheet", "cheat" }, {
 168+ "cicle", "circle" }, {
 169+ "cimplicity", "simplicity" }, {
 170+ "circumstaces", "circumstances" }, {
 171+ "clob", "club" }, {
 172+ "coaln", "colon" }, {
 173+ "cocamena", "cockamamie" }, {
 174+ "colleaque", "colleague" }, {
 175+ "colloquilism", "colloquialism" }, {
 176+ "columne", "column" }, {
 177+ "comiler", "compiler" }, {
 178+ "comitmment", "commitment" }, {
 179+ "comitte", "committee" }, {
 180+ "comittmen", "commitment" }, {
 181+ "comittmend", "commitment" }, {
 182+ "commerciasl", "commercials" }, {
 183+ "commited", "committed" }, {
 184+ "commitee", "committee" }, {
 185+ "companys", "companies" }, {
 186+ "compicated", "complicated" }, {
 187+ "comupter", "computer" }, {
 188+ "concensus", "consensus" }, {
 189+ "confusionism", "confucianism" }, {
 190+ "congradulations", "congratulations" }, {
 191+ "conibation", "contribution" }, {
 192+ "consident", "consistent" }, {
 193+ "consident", "consonant" }, {
 194+ "contast", "constant" }, {
 195+ "contastant", "constant" }, {
 196+ "contunie", "continue" }, {
 197+ "cooly", "coolly" }, {
 198+ "copping", "coping" }, {
 199+ "cosmoplyton", "cosmopolitan" }, {
 200+ "courst", "court" }, {
 201+ "crasy", "crazy" }, {
 202+ "cravets", "caveats" }, {
 203+ "credetability", "credibility" }, {
 204+ "criqitue", "critique" }, {
 205+ "croke", "croak" }, {
 206+ "crucifiction", "crucifixion" }, {
 207+ "crusifed", "crucified" }, {
 208+ "ctitique", "critique" }, {
 209+ "cumba", "combo" }, {
 210+ "custamisation", "customization" }, {
 211+ "dag", "dog" }, {
 212+ "daly", "daily" }, {
 213+ "danguages", "dangerous" }, {
 214+ "deaft", "draft" }, {
 215+ "defence", "defense" }, {
 216+ "defenly", "defiantly" }, {
 217+ "definate", "definite" }, {
 218+ "definately", "definitely" }, {
 219+ "dependeble", "dependable" }, {
 220+ "descrption", "description" }, {
 221+ "descrptn", "description" }, {
 222+ "desparate", "desperate" }, {
 223+ "dessicate", "desiccate" }, {
 224+ "destint", "distant" }, {
 225+ "develepment", "developments" }, {
 226+ "developement", "development" }, {
 227+ "develpond", "development" }, {
 228+ "devulge", "divulge" }, {
 229+ "diagree", "disagree" }, {
 230+ "dieties", "deities" }, {
 231+ "dinasaur", "dinosaur" }, {
 232+ "dinasour", "dinosaur" }, {
 233+ "direcyly", "directly" }, {
 234+ "discuess", "discuss" }, {
 235+ "disect", "dissect" }, {
 236+ "disippate", "dissipate" }, {
 237+ "disition", "decision" }, {
 238+ "dispair", "despair" }, {
 239+ "disssicion", "discussion" }, {
 240+ "distarct", "distract" }, {
 241+ "distart", "distort" }, {
 242+ "distroy", "destroy" }, {
 243+ "documtations", "documentation" }, {
 244+ "doenload", "download" }, {
 245+ "dongle", "dangle" }, {
 246+ "doog", "dog" }, {
 247+ "dramaticly", "dramatically" }, {
 248+ "drunkeness", "drunkenness" }, {
 249+ "ductioneery", "dictionary" }, {
 250+ "dur", "due" }, {
 251+ "duren", "during" }, {
 252+ "dymatic", "dynamic" }, {
 253+ "dynaic", "dynamic" }, {
 254+ "ecstacy", "ecstasy" }, {
 255+ "efficat", "efficient" }, {
 256+ "efficity", "efficacy" }, {
 257+ "effots", "efforts" }, {
 258+ "egsistence", "existence" }, {
 259+ "eitiology", "etiology" }, {
 260+ "elagent", "elegant" }, {
 261+ "elligit", "elegant" }, {
 262+ "embarass", "embarrass" }, {
 263+ "embarassment", "embarrassment" }, {
 264+ "embaress", "embarrass" }, {
 265+ "encapsualtion", "encapsulation" }, {
 266+ "encyclapidia", "encyclopedia" }, {
 267+ "encyclopia", "encyclopedia" }, {
 268+ "engins", "engine" }, {
 269+ "enhence", "enhance" }, {
 270+ "enligtment", "Enlightenment" }, {
 271+ "ennuui", "ennui" }, {
 272+ "enought", "enough" }, {
 273+ "enventions", "inventions" }, {
 274+ "envireminakl", "environmental" }, {
 275+ "enviroment", "environment" }, {
 276+ "epitomy", "epitome" }, {
 277+ "equire", "acquire" }, {
 278+ "errara", "error" }, {
 279+ "erro", "error" }, {
 280+ "evaualtion", "evaluation" }, {
 281+ "evething", "everything" }, {
 282+ "evtually", "eventually" }, {
 283+ "excede", "exceed" }, {
 284+ "excercise", "exercise" }, {
 285+ "excpt", "except" }, {
 286+ "excution", "execution" }, {
 287+ "exhileration", "exhilaration" }, {
 288+ "existance", "existence" }, {
 289+ "expleyly", "explicitly" }, {
 290+ "explity", "explicitly" }, {
 291+ "expresso", "espresso" }, {
 292+ "exspidient", "expedient" }, {
 293+ "extions", "extensions" }, {
 294+ "factontion", "factorization" }, {
 295+ "failer", "failure" }, {
 296+ "famdasy", "fantasy" }, {
 297+ "faver", "favor" }, {
 298+ "faxe", "fax" }, {
 299+ "febuary", "february" }, {
 300+ "firey", "fiery" }, {
 301+ "fistival", "festival" }, {
 302+ "flatterring", "flattering" }, {
 303+ "fluk", "flux" }, {
 304+ "flukse", "flux" }, {
 305+ "fone", "phone" }, {
 306+ "forsee", "foresee" }, {
 307+ "frustartaion", "frustrating" }, {
 308+ "fuction", "function" }, {
 309+ "funetik", "phonetic" }, {
 310+ "futs", "guts" }, {
 311+ "gamne", "came" }, {
 312+ "gaurd", "guard" }, {
 313+ "generly", "generally" }, {
 314+ "ghandi", "gandhi" }, {
 315+ "goberment", "government" }, {
 316+ "gobernement", "government" }, {
 317+ "gobernment", "government" }, {
 318+ "gotton", "gotten" }, {
 319+ "gracefull", "graceful" }, {
 320+ "gradualy", "gradually" }, {
 321+ "grammer", "grammar" }, {
 322+ "hallo", "hello" }, {
 323+ "hapily", "happily" }, {
 324+ "harrass", "harass" }, {
 325+ "havne", "have" }, {
 326+ "heellp", "help" }, {
 327+ "heighth", "height" }, {
 328+ "hellp", "help" }, {
 329+ "helo", "hello" }, {
 330+ "herlo", "hello" }, {
 331+ "hifin", "hyphen" }, {
 332+ "hifine", "hyphen" }, {
 333+ "higer", "higher" }, {
 334+ "hiphine", "hyphen" }, {
 335+ "hippie", "hippy" }, {
 336+ "hippopotamous", "hippopotamus" }, {
 337+ "hlp", "help" }, {
 338+ "hourse", "horse" }, {
 339+ "houssing", "housing" }, {
 340+ "howaver", "however" }, {
 341+ "howver", "however" }, {
 342+ "humaniti", "humanity" }, {
 343+ "hyfin", "hyphen" }, {
 344+ "hypotathes", "hypothesis" }, {
 345+ "hypotathese", "hypothesis" }, {
 346+ "hystrical", "hysterical" }, {
 347+ "ident", "indent" }, {
 348+ "illegitament", "illegitimate" }, {
 349+ "imbed", "embed" }, {
 350+ "imediaetly", "immediately" }, {
 351+ "imfamy", "infamy" }, {
 352+ "immenant", "immanent" }, {
 353+ "implemtes", "implements" }, {
 354+ "inadvertant", "inadvertent" }, {
 355+ "incase", "in case" }, {
 356+ "incedious", "insidious" }, {
 357+ "incompleet", "incomplete" }, {
 358+ "incomplot", "incomplete" }, {
 359+ "inconvenant", "inconvenient" }, {
 360+ "inconvience", "inconvenience" }, {
 361+ "independant", "independent" }, {
 362+ "independenent", "independent" }, {
 363+ "indepnends", "independent" }, {
 364+ "indepth", "in depth" }, {
 365+ "indispensible", "indispensable" }, {
 366+ "inefficite", "inefficient" }, {
 367+ "inerface", "interface" }, {
 368+ "infact", "in fact" }, {
 369+ "influencial", "influential" }, {
 370+ "inital", "initial" }, {
 371+ "initinized", "initialized" }, {
 372+ "initized", "initialized" }, {
 373+ "innoculate", "inoculate" }, {
 374+ "insistant", "insistent" }, {
 375+ "insistenet", "insistent" }, {
 376+ "instulation", "installation" }, {
 377+ "intealignt", "intelligent" }, {
 378+ "intejilent", "intelligent" }, {
 379+ "intelegent", "intelligent" }, {
 380+ "intelegnent", "intelligent" }, {
 381+ "intelejent", "intelligent" }, {
 382+ "inteligent", "intelligent" }, {
 383+ "intelignt", "intelligent" }, {
 384+ "intellagant", "intelligent" }, {
 385+ "intellegent", "intelligent" }, {
 386+ "intellegint", "intelligent" }, {
 387+ "intellgnt", "intelligent" }, {
 388+ "intensionality", "intensionally" }, {
 389+ "interate", "iterate" }, {
 390+ "internation", "international" }, {
 391+ "interpretate", "interpret" }, {
 392+ "interpretter", "interpreter" }, {
 393+ "intertes", "interested" }, {
 394+ "intertesd", "interested" }, {
 395+ "invermeantial", "environmental" }, {
 396+ "irregardless", "regardless" }, {
 397+ "irresistable", "irresistible" }, {
 398+ "irritible", "irritable" }, {
 399+ "islams", "muslims" }, {
 400+ "isotrop", "isotope" }, {
 401+ "isreal", "israel" }, {
 402+ "johhn", "john" }, {
 403+ "judgement", "judgment" }, {
 404+ "kippur", "kipper" }, {
 405+ "knawing", "knowing" }, {
 406+ "latext", "latest" }, {
 407+ "leasve", "leave" }, {
 408+ "lesure", "leisure" }, {
 409+ "liasion", "lesion" }, {
 410+ "liason", "liaison" }, {
 411+ "libary", "library" }, {
 412+ "likly", "likely" }, {
 413+ "lilometer", "kilometer" }, {
 414+ "liquify", "liquefy" }, {
 415+ "lloyer", "layer" }, {
 416+ "lossing", "losing" }, {
 417+ "luser", "laser" }, {
 418+ "maintanence", "maintenance" }, {
 419+ "majaerly", "majority" }, {
 420+ "majoraly", "majority" }, {
 421+ "maks", "masks" }, {
 422+ "mandelbrot", "Mandelbrot" }, {
 423+ "mant", "want" }, {
 424+ "marshall", "marshal" }, {
 425+ "maxium", "maximum" }, {
 426+ "meory", "memory" }, {
 427+ "metter", "better" }, {
 428+ "mic", "mike" }, {
 429+ "midia", "media" }, {
 430+ "millenium", "millennium" }, {
 431+ "miniscule", "minuscule" }, {
 432+ "minkay", "monkey" }, {
 433+ "minum", "minimum" }, {
 434+ "mischievious", "mischievous" }, {
 435+ "misilous", "miscellaneous" }, {
 436+ "momento", "memento" }, {
 437+ "monkay", "monkey" }, {
 438+ "mosaik", "mosaic" }, {
 439+ "mostlikely", "most likely" }, {
 440+ "mousr", "mouser" }, {
 441+ "mroe", "more" }, {
 442+ "neccessary", "necessary" }, {
 443+ "necesary", "necessary" }, {
 444+ "necesser", "necessary" }, {
 445+ "neice", "niece" }, {
 446+ "neighbour", "neighbor" }, {
 447+ "nemonic", "pneumonic" }, {
 448+ "nevade", "Nevada" }, {
 449+ "nickleodeon", "nickelodeon" }, {
 450+ "nieve", "naive" }, {
 451+ "noone", "no one" }, {
 452+ "noticably", "noticeably" }, {
 453+ "notin", "not in" }, {
 454+ "nozled", "nuzzled" }, {
 455+ "objectsion", "objects" }, {
 456+ "obsfuscate", "obfuscate" }, {
 457+ "ocassion", "occasion" }, {
 458+ "occuppied", "occupied" }, {
 459+ "occurence", "occurrence" }, {
 460+ "octagenarian", "octogenarian" }, {
 461+ "olf", "old" }, {
 462+ "opposim", "opossum" }, {
 463+ "organise", "organize" }, {
 464+ "organiz", "organize" }, {
 465+ "orientate", "orient" }, {
 466+ "oscilascope", "oscilloscope" }, {
 467+ "oving", "moving" }, {
 468+ "paramers", "parameters" }, {
 469+ "parametic", "parameter" }, {
 470+ "paranets", "parameters" }, {
 471+ "partrucal", "particular" }, {
 472+ "pataphysical", "metaphysical" }, {
 473+ "patten", "pattern" }, {
 474+ "permissable", "permissible" }, {
 475+ "permition", "permission" }, {
 476+ "permmasivie", "permissive" }, {
 477+ "perogative", "prerogative" }, {
 478+ "persue", "pursue" }, {
 479+ "phantasia", "fantasia" }, {
 480+ "phenominal", "phenomenal" }, {
 481+ "picaresque", "picturesque" }, {
 482+ "playwrite", "playwright" }, {
 483+ "poeses", "poesies" }, {
 484+ "polation", "politician" }, {
 485+ "poligamy", "polygamy" }, {
 486+ "politict", "politic" }, {
 487+ "pollice", "police" }, {
 488+ "polypropalene", "polypropylene" }, {
 489+ "pompom", "pompon" }, {
 490+ "possable", "possible" }, {
 491+ "practicle", "practical" }, {
 492+ "pragmaticism", "pragmatism" }, {
 493+ "preceeding", "preceding" }, {
 494+ "precion", "precision" }, {
 495+ "precios", "precision" }, {
 496+ "preemptory", "peremptory" }, {
 497+ "prefices", "prefixes" }, {
 498+ "prefixt", "prefixed" }, {
 499+ "presbyterian", "Presbyterian" }, {
 500+ "presue", "pursue" }, {
 501+ "presued", "pursued" }, {
 502+ "privielage", "privilege" }, {
 503+ "priviledge", "privilege" }, {
 504+ "proceedures", "procedures" }, {
 505+ "pronensiation", "pronunciation" }, {
 506+ "pronisation", "pronunciation" }, {
 507+ "pronounciation", "pronunciation" }, {
 508+ "properally", "properly" }, {
 509+ "proplematic", "problematic" }, {
 510+ "protray", "portray" }, {
 511+ "pscolgst", "psychologist" }, {
 512+ "psicolagest", "psychologist" }, {
 513+ "psycolagest", "psychologist" }, {
 514+ "quoz", "quiz" }, {
 515+ "radious", "radius" }, {
 516+ "ramplily", "rampantly" }, {
 517+ "reccomend", "recommend" }, {
 518+ "reccona", "raccoon" }, {
 519+ "recieve", "receive" }, {
 520+ "reconise", "recognize" }, {
 521+ "rectangeles", "rectangle" }, {
 522+ "redign", "redesign" }, {
 523+ "reoccurring", "recurring" }, {
 524+ "repitition", "repetition" }, {
 525+ "replasments", "replacement" }, {
 526+ "reposable", "responsible" }, {
 527+ "reseblence", "resemblance" }, {
 528+ "respct", "respect" }, {
 529+ "respecally", "respectfully" }, {
 530+ "roon", "room" }, {
 531+ "rought", "roughly" }, {
 532+ "rsx", "RSX" }, {
 533+ "rudemtry", "rudimentary" }, {
 534+ "runnung", "running" }, {
 535+ "sacreligious", "sacrilegious" }, {
 536+ "saftly", "safely" }, {
 537+ "salut", "salute" }, {
 538+ "satifly", "satisfy" }, {
 539+ "scrabdle", "scrabble" }, {
 540+ "searcheable", "searchable" }, {
 541+ "secion", "section" }, {
 542+ "seferal", "several" }, {
 543+ "segements", "segments" }, {
 544+ "sence", "sense" }, {
 545+ "seperate", "separate" }, {
 546+ "sherbert", "sherbet" }, {
 547+ "sicolagest", "psychologist" }, {
 548+ "sieze", "seize" }, {
 549+ "simpfilty", "simplicity" }, {
 550+ "simplye", "simply" }, {
 551+ "singal", "signal" }, {
 552+ "sitte", "site" }, {
 553+ "situration", "situation" }, {
 554+ "slyph", "sylph" }, {
 555+ "smil", "smile" }, {
 556+ "snuck", "sneaked" }, {
 557+ "sometmes", "sometimes" }, {
 558+ "soonec", "sonic" }, {
 559+ "specificialy", "specifically" }, {
 560+ "spel", "spell" }, {
 561+ "spoak", "spoke" }, {
 562+ "sponsered", "sponsored" }, {
 563+ "stering", "steering" }, {
 564+ "straightjacket", "straitjacket" }, {
 565+ "stumach", "stomach" }, {
 566+ "stutent", "student" }, {
 567+ "styleguide", "style guide" }, {
 568+ "subisitions", "substitutions" }, {
 569+ "subjecribed", "subscribed" }, {
 570+ "subpena", "subpoena" }, {
 571+ "substations", "substitutions" }, {
 572+ "suger", "sugar" }, {
 573+ "supercede", "supersede" }, {
 574+ "superfulous", "superfluous" }, {
 575+ "susan", "Susan" }, {
 576+ "swimwear", "swim wear" }, {
 577+ "syncorization", "synchronization" }, {
 578+ "taff", "tough" }, {
 579+ "taht", "that" }, {
 580+ "tattos", "tattoos" }, {
 581+ "techniquely", "technically" }, {
 582+ "teh", "the" }, {
 583+ "tem", "team" }, {
 584+ "teo", "two" }, {
 585+ "teridical", "theoretical" }, {
 586+ "tesst", "test" }, {
 587+ "tets", "tests" }, {
 588+ "thanot", "than or" }, {
 589+ "theirselves", "themselves" }, {
 590+ "theridically", "theoretical" }, {
 591+ "thredically", "theoretically" }, {
 592+ "thruout", "throughout" }, {
 593+ "ths", "this" }, {
 594+ "titalate", "titillate" }, {
 595+ "tobagan", "tobaggon" }, {
 596+ "tommorrow", "tomorrow" }, {
 597+ "tomorow", "tomorrow" }, {
 598+ "tradegy", "tragedy" }, {
 599+ "trubbel", "trouble" }, {
 600+ "ttest", "test" }, {
 601+ "tunnellike", "tunnel like" }, {
 602+ "tured", "turned" }, {
 603+ "tyrrany", "tyranny" }, {
 604+ "unatourral", "unnatural" }, {
 605+ "unaturral", "unnatural" }, {
 606+ "unconisitional", "unconstitutional" }, {
 607+ "unconscience", "unconscious" }, {
 608+ "underladder", "under ladder" }, {
 609+ "unentelegible", "unintelligible" }, {
 610+ "unfortunently", "unfortunately" }, {
 611+ "unnaturral", "unnatural" }, {
 612+ "upcast", "up cast" }, {
 613+ "upmost", "utmost" }, {
 614+ "uranisium", "uranium" }, {
 615+ "verison", "version" }, {
 616+ "vinagarette", "vinaigrette" }, {
 617+ "volumptuous", "voluptuous" }, {
 618+ "volunteerism", "voluntarism" }, {
 619+ "volye", "volley" }, {
 620+ "wadting", "wasting" }, {
 621+ "waite", "wait" }, {
 622+ "wan't", "won't" }, {
 623+ "warloord", "warlord" }, {
 624+ "whaaat", "what" }, {
 625+ "whard", "ward" }, {
 626+ "whimp", "wimp" }, {
 627+ "wicken", "weaken" }, {
 628+ "wierd", "weird" }, {
 629+ "wrank", "rank" }, {
 630+ "writeen", "righten" }, {
 631+ "writting", "writing" }, {
 632+ "wundeews", "windows" }, {
 633+ "yeild", "yield" }, {
 634+ "youe", "your" }
 635+};
 636+}
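
SuggestTest runs both the plain Lucene SpellChecker and the new Suggest class against the same misspelling/correction pairs, printing hit rates and timings for each. The database defaults to wikilucene and can be overridden by the first argument. An illustrative invocation (classpath setup omitted; the jar name is hypothetical):

    java -cp lucene-search.jar org.wikimedia.lsearch.test.SuggestTest entest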
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -36,7 +36,8 @@
3737 WikiQueryParser.TITLE_BOOST = 2;
3838 WikiQueryParser.ALT_TITLE_BOOST = 6;
3939 WikiQueryParser.KEYWORD_BOOST = 0.05f;
40 - WikiIndexModifier.ALT_TITLES = 3;
 40+ WikiQueryParser.ADD_TITLE_PHRASES = false;
 41+ WikiIndexModifier.ALT_TITLES = 3;
4142 FieldBuilder.BuilderSet bs = new FieldBuilder("").getBuilder();
4243 FieldNameFactory ff = new FieldNameFactory();
4344 try{
@@ -316,6 +317,12 @@
317318 q = parser.parseFourPass("Israeli-Palestinian conflict",NamespacePolicy.IGNORE,true);
318319 assertEquals("(+(+(contents:israeli contents:isra^0.5) +contents:palestinian) +contents:conflict) (+(+title:israeli^2.0 +title:palestinian^2.0) +title:conflict^2.0) ((+(+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0) +alttitle1:conflict^6.0) (+(+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0) +alttitle2:conflict^6.0) (+(+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0) +alttitle3:conflict^6.0))",q.toString());
319320
 321+ // title phrases
 322+ WikiQueryParser.ADD_TITLE_PHRASES = true;
 323+ q = parser.parseFourPass("Israeli Palestinian conflict",NamespacePolicy.IGNORE,true);
 324+ assertEquals("(+(contents:israeli contents:isra^0.5) +contents:palestinian +contents:conflict (title:\"israeli palestinian\"~2^2.0 title:\"palestinian conflict\"~2^2.0)) (+title:israeli^2.0 +title:palestinian^2.0 +title:conflict^2.0) ((+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0 +alttitle1:conflict^6.0) (+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0 +alttitle2:conflict^6.0) (+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0 +alttitle3:conflict^6.0)) (spanNear([keyword1:israeli, keyword1:palestinian, keyword1:conflict], 100, false)^0.05 spanNear([keyword2:israeli, keyword2:palestinian, keyword2:conflict], 100, false)^0.025 spanNear([keyword3:israeli, keyword3:palestinian, keyword3:conflict], 100, false)^0.016666668 spanNear([keyword4:israeli, keyword4:palestinian, keyword4:conflict], 100, false)^0.0125 spanNear([keyword5:israeli, keyword5:palestinian, keyword5:conflict], 100, false)^0.01)",q.toString());
 325+ WikiQueryParser.ADD_TITLE_PHRASES = false;
 326+
320327 // alternative transliterations
321328 q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true);
322329 assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) (contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString());
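
With ADD_TITLE_PHRASES enabled, the parser adds sloppy, boosted phrase clauses over adjacent title words, visible as title:"israeli palestinian"~2^2.0 in the expected query above. A minimal sketch of one such clause built directly with the Lucene API (illustrative only, not the parser's internal code path):

    // Sketch: a sloppy (~2), boosted (^2.0) phrase over two adjacent title words.
    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("title", "israeli"));
    pq.add(new Term("title", "palestinian"));
    pq.setSlop(2);     // renders as ~2
    pq.setBoost(2.0f); // renders as ^2.0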
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/Suggest.java
@@ -24,6 +24,8 @@
2525 import org.apache.lucene.search.TopDocs;
2626 import org.wikimedia.lsearch.beans.SearchResults;
2727 import org.wikimedia.lsearch.config.IndexId;
 28+import org.wikimedia.lsearch.suggest.api.NgramIndexer;
 29+import org.wikimedia.lsearch.suggest.api.NamespaceFreq;
2830 import org.wikimedia.lsearch.suggest.api.WordsIndexer;
2931 import org.wikimedia.lsearch.suggest.dist.DoubleMetaphone;
3032 import org.wikimedia.lsearch.suggest.dist.EditDistance;
@@ -44,7 +46,7 @@
4547 public Suggest(IndexId iid) throws IOException{
4648 this.iid = iid;
4749 this.searcher = new IndexSearcher(iid.getSuggestWordsPath());
48 - this.phrases = new IndexSearcher(iid.getSuggestPhrasesPath());
 50+ this.phrases = new IndexSearcher(iid.getSuggestTitlesPath());
4951 this.dmeta = new DoubleMetaphone();
5052 }
5153
@@ -54,7 +56,7 @@
5557 BooleanQuery bq = new BooleanQuery();
5658 addQuery(bq,"metaphone1",meta1,2);
5759 addQuery(bq,"metaphone2",meta2,2);
58 - bq.add(makeWordQuery(word),BooleanClause.Occur.SHOULD);
 60+ bq.add(makeWordQuery(word,""),BooleanClause.Occur.SHOULD);
5961
6062 try {
6163 TopDocs docs = searcher.search(bq,null,POOL);
@@ -126,13 +128,14 @@
127129 return Math.log10(1+score*99)/2;
128130 }
129131
130 - public Query makeWordQuery(String word){
 132+ public Query makeWordQuery(String word, String prefix){
131133 BooleanQuery bq = new BooleanQuery(true);
132 - int min = WordsIndexer.getMinNgram(word);
133 - int max = WordsIndexer.getMaxNgram(word);
 134+ int min = NgramIndexer.getMinNgram(word);
 135+ int max = NgramIndexer.getMaxNgram(word);
 136+ String fieldBase = NgramIndexer.getNgramField(prefix);
134137 for(int i=min; i <= max; i++ ){
135 - String[] ngrams = WordsIndexer.nGrams(word,i);
136 - String field = "ngram"+i;
 138+ String[] ngrams = NgramIndexer.nGrams(word,i);
 139+ String field = fieldBase+i;
137140 for(int j=0 ; j<ngrams.length ; j++){
138141 String ngram = ngrams[j];
139142 /*if(j == 0)
@@ -244,7 +247,7 @@
245248 try {
246249 Hits hits = phrases.search(new TermQuery(new Term("word",word1+word2)));
247250 if(hits.length() > 0){
248 - int freq = Integer.parseInt(hits.doc(0).get("freq"));
 251+ int freq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(0);
249252 if(freq >= JOIN_FREQ)
250253 return new SuggestResult(word1+word2,freq);
251254 }
@@ -257,10 +260,10 @@
258261
259262 public ArrayList<SuggestResult> suggestPhrase(String word1, String word2, int num){
260263 String phrase = word1+"_"+word2;
261 - Query q = makeWordQuery(phrase);
 264+ Query q = makeWordQuery(phrase,"phrase");
262265
263266 try {
264 - TopDocs docs = phrases.search(q,null,50);
 267+ TopDocs docs = phrases.search(q,null,200);
265268 EditDistance sd = new EditDistance(phrase);
266269 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
267270 int minfreq = -1;
@@ -268,7 +271,7 @@
269272 for(ScoreDoc sc : docs.scoreDocs){
270273 Document d = phrases.doc(sc.doc);
271274 SuggestResult r = new SuggestResult(d.get("phrase"),
272 - Integer.parseInt(d.get("freq")));
 275+ new NamespaceFreq(d.get("freq")).getFrequency(0));
273276 if(phrase.equals(r.word)){
274277 minfreq = r.frequency;
275278 }
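
The freq field no longer stores a plain integer: it now holds a serialized NamespaceFreq which, per the TitleIndexer javadoc below, encodes per-namespace counts as space-separated ns:frequency pairs such as "0:234 1:12 14:3"; getFrequency(0) then yields the main-namespace count. A hedged sketch of that decoding, assuming the documented format (the actual NamespaceFreq source is not part of this diff):

    // Sketch: decode "0:234 1:12 14:3" and return the count for one namespace.
    static int frequencyFor(String serialized, int namespace){
        for(String pair : serialized.split(" ")){
            String[] kv = pair.split(":");
            if(Integer.parseInt(kv[0]) == namespace)
                return Integer.parseInt(kv[1]);
        }
        return 0; // namespace absent
    }
    // frequencyFor("0:234 1:12 14:3", 0) == 234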
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/SuggestBuilder.java
@@ -3,6 +3,9 @@
44 import java.io.IOException;
55 import java.io.InputStream;
66 import java.util.ArrayList;
 7+import java.util.HashMap;
 8+import java.util.HashSet;
 9+import java.util.Hashtable;
710 import java.util.Map.Entry;
811
912 import org.apache.log4j.Logger;
@@ -10,23 +13,30 @@
1114 import org.apache.lucene.document.Document;
1215 import org.apache.lucene.index.IndexReader;
1316 import org.apache.lucene.index.Term;
 17+import org.apache.lucene.search.CachingWrapperFilter;
 18+import org.apache.lucene.search.Filter;
1419 import org.apache.lucene.search.Hits;
1520 import org.apache.lucene.search.IndexSearcher;
 21+import org.apache.lucene.search.PhraseQuery;
 22+import org.apache.lucene.search.QueryFilter;
1623 import org.apache.lucene.search.TermQuery;
1724 import org.apache.lucene.store.FSDirectory;
1825 import org.mediawiki.dumper.ProgressFilter;
1926 import org.mediawiki.dumper.Tools;
2027 import org.mediawiki.importer.XmlDumpReader;
2128 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 29+import org.wikimedia.lsearch.analyzers.WikiQueryParser;
2230 import org.wikimedia.lsearch.config.Configuration;
2331 import org.wikimedia.lsearch.config.GlobalConfiguration;
2432 import org.wikimedia.lsearch.config.IndexId;
2533 import org.wikimedia.lsearch.config.IndexRegistry;
2634 import org.wikimedia.lsearch.importer.DumpImporter;
 35+import org.wikimedia.lsearch.search.NamespaceFilter;
2736 import org.wikimedia.lsearch.suggest.api.LuceneDictionary;
28 -import org.wikimedia.lsearch.suggest.api.PhraseIndexer;
 37+import org.wikimedia.lsearch.suggest.api.NamespaceFreq;
 38+import org.wikimedia.lsearch.suggest.api.TitleIndexer;
2939 import org.wikimedia.lsearch.suggest.api.WordsIndexer;
30 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
 40+import org.wikimedia.lsearch.suggest.api.Dictionary.Word;
3141 import org.wikimedia.lsearch.util.Localization;
3242 import org.wikimedia.lsearch.util.StringCounter;
3343 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -55,8 +65,8 @@
5666 inputfile = args.length>1? args[1] : null;
5767 dbname = args[0];
5868
59 -
60 - String langCode = GlobalConfiguration.getInstance().getLanguage(dbname);
 69+ GlobalConfiguration global = GlobalConfiguration.getInstance();
 70+ String langCode = global.getLanguage(dbname);
6171 // preload
6272 UnicodeDecomposer.getInstance();
6373 Localization.readLocalization(langCode);
@@ -89,66 +99,37 @@
90100 }
91101 }
92102 // make words index
 103+ log.info("Making words index");
93104 try {
94105 LuceneDictionary dict = new LuceneDictionary(IndexReader.open(iid.getSuggestCleanPath()),"contents");
95 - WordsIndexer writer = new WordsIndexer(iid.getSuggestWordsPath(),50);
 106+ WordsIndexer writer = new WordsIndexer(iid.getSuggestWordsPath(),(dbname.equals("wikilucene")? 3 : 50));
 107+ writer.createIndex();
96108 Word word;
97109 while((word = dict.next()) != null){
98110 writer.addWord(word);
99111 }
100 - writer.close();
 112+ writer.closeAndOptimze();
101113 } catch (IOException e) {
102114 log.fatal("Cannot open clean dictionary for "+iid+" : "+e.getMessage());
103115 e.printStackTrace();
104116 return;
105117 }
106118
 119+ log.info("Making suggest title index");
107120 // make phrase index
108 - try {
109 - LuceneDictionary dict = new LuceneDictionary(IndexReader.open(iid.getSuggestCleanPath()),"title");
110 - PhraseIndexer writer = new PhraseIndexer(iid.getSuggestPhrasesPath(),1);
111 - IndexSearcher searcher = new IndexSearcher(iid.getSuggestCleanPath());
112 - Word word;
113 - while((word = dict.next()) != null){
114 - // index word
115 - writer.addWord(word);
116 - String w = word.getWord();
117 - StringCounter counter = new StringCounter();
118 - Hits hits = searcher.search(new TermQuery(new Term("title",w)));
119 - // find all phrases beginning with word
120 - for(int i=0;i<hits.length();i++){
121 - Document doc = hits.doc(i);
122 - // get original tokens
123 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false);
124 - ArrayList<Token> tokens = parser.parse();
125 - for(int j=0;j<tokens.size()-1;j++){
126 - Token t = tokens.get(j);
127 - // ignore aliases
128 - if(t.getPositionIncrement() == 0)
129 - continue;
130 - // find phrases beginning with the target word
131 - if(w.equals(t.termText())){
132 - counter.count(t.termText()+"_"+tokens.get(j+1).termText());
133 - }
134 - }
135 - }
136 - // index phrases
137 - for(Entry<String,Count> e : counter.getSet()){
138 - writer.addPhrase(e.getKey(),e.getValue().num);
139 - }
140 -
141 - }
142 - writer.close();
143 - } catch (IOException e) {
144 - log.fatal("Cannot open clean dictionary for "+iid+" : "+e.getMessage());
145 - e.printStackTrace();
146 - return;
147 - }
 121+ Hashtable<String,String> suggest = global.getDBParams(iid.getDBname(),"suggest");
 122+ int titlesWordsMinFreq = 3;
 123+ int titlesPhrasesMinFreq = 1;
 124+ if(suggest!=null && suggest.containsKey("titlesWordsMinFreq"))
 125+ titlesWordsMinFreq = Integer.parseInt(suggest.get("titlesWordsMinFreq"));
 126+ if(suggest!=null && suggest.containsKey("titlesPhrasesMinFreq"))
 127+ titlesWordsMinFreq = Integer.parseInt(suggest.get("titlesPhrasesMinFreq"));
 128+ TitleIndexer tInx = new TitleIndexer(iid,titlesWordsMinFreq,titlesPhrasesMinFreq);
 129+ tInx.createFromExistingIndex(iid);
148130
149 -
150131 long end = System.currentTimeMillis();
151132
152 - System.out.println("Finished making spell-check index in "+formatTime(end-start));
 133+ System.out.println("Finished making suggest index in "+formatTime(end-start));
153134 }
154135
155136 private static String formatTime(long l) {
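
After the refactor, SuggestBuilder builds the words index itself and hands the title words/phrases over to TitleIndexer.createFromExistingIndex, reading the minimum frequencies from the suggest tag (defaults 3 and 1). It takes the dbname and an optional dump file as arguments; an illustrative run (classpath omitted, file name hypothetical):

    java org.wikimedia.lsearch.suggest.SuggestBuilder entest pages.xml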
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Indexer.java
@@ -1,109 +0,0 @@
2 -package org.wikimedia.lsearch.suggest.api;
3 -
4 -import java.io.IOException;
5 -
6 -import org.apache.log4j.Logger;
7 -import org.apache.lucene.analysis.Analyzer;
8 -import org.apache.lucene.document.Document;
9 -import org.apache.lucene.document.Field;
10 -import org.apache.lucene.index.IndexWriter;
11 -import org.wikimedia.lsearch.index.WikiIndexModifier;
12 -
13 -/**
14 - * Base indexer class. Open/close index.
15 - *
16 - * @author rainman
17 - *
18 - */
19 -public class Indexer {
20 - Logger log = Logger.getLogger(Indexer.class);
21 - protected String path;
22 - protected Analyzer analyzer;
23 - protected IndexWriter writer;
24 -
25 - public Indexer(String path, Analyzer analyzer) throws IOException{
26 - this.path = path;
27 - this.analyzer = analyzer;
28 - try {
29 - writer = new IndexWriter(path,analyzer,true); // always make new index
30 - } catch (IOException e) {
31 - try {
32 - log.info("Making new index at path "+path);
33 - // try to make brand new index
34 - WikiIndexModifier.makeDBPath(path); // ensure all directories are made
35 - writer = new IndexWriter(path,analyzer,true);
36 - } catch (IOException e1) {
37 - log.error("I/O error openning index for addition of documents at "+path+" : "+e.getMessage());
38 - throw e1;
39 - }
40 - }
41 - writer.setMergeFactor(20);
42 - writer.setMaxBufferedDocs(500);
43 - writer.setUseCompoundFile(true);
44 - writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
45 -
46 - }
47 -
48 - /** Optimize and close index, always call when done indexing */
49 - public void close() throws IOException {
50 - try{
51 - writer.optimize();
52 - writer.close();
53 - } catch(IOException e){
54 - log.warn("I/O error optimizing/closing index at "+path);
55 - throw e;
56 - }
57 - }
58 -
59 - /** Return ngrams of specific size for text */
60 - public static String[] nGrams(String text, int size) {
61 - int len = text.length();
62 - String[] res = new String[len - size + 1];
63 - for (int i = 0; i < len - size + 1; i++) {
64 - res[i] = text.substring(i, i + size);
65 - }
66 - return res;
67 - }
68 -
69 - /** Get minimal ngram size for word. Short words (<=3 chars) will have 1-grams, other 2-grams */
70 - public static int getMinNgram(String word){
71 - if(word.length() <= 3)
72 - return 1;
73 - else if(word.length() == 4)
74 - return 2;
75 - else
76 - return 3;
77 - }
78 - /** Get minimal ngram size for word. Long words: 4-grams, other 3-grams, 2-char word only 1-grams */
79 - public static int getMaxNgram(String word){
80 - if(word.length() > 4)
81 - return 3;
82 - if(word.length() == 2)
83 - return 1;
84 - return 2;
85 - }
86 -
87 - /**
88 - * Add ngrams of all sizes from 1 to word.length to document
89 - *
90 - * @param doc - document to add fields to
91 - * @param word - word
92 - */
93 - protected void addNgramFields(Document doc, String word) {
94 - int min = getMinNgram(word);
95 - int max = getMaxNgram(word);
96 - for(int i=min ; i <= max ; i++ ){
97 - String[] ngrams = nGrams(word,i);
98 - String field = "ngram"+i;
99 - for(int j=0 ; j<ngrams.length ; j++){
100 - String ngram = ngrams[j];
101 - if(j == 0)
102 - doc.add(new Field("start"+i, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
103 - else if(j == ngrams.length-1)
104 - doc.add(new Field("end"+i, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
105 - // finally add regular ngram
106 - doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
107 - }
108 - }
109 - }
110 -}
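
For reference, the deleted sizing logic meant: 2-letter words get only 1-grams, 3-letter words 1- and 2-grams, 4-letter words only 2-grams, and longer words only 3-grams. A worked example of the methods above:

    // getMinNgram("house") == 3, getMaxNgram("house") == 3
    // nGrams("house", 3) -> {"hou", "ous", "use"}
    // addNgramFields stores start3:"hou", end3:"use" and ngram3:"hou","ous","use"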
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/PhraseIndexer.java
@@ -1,82 +0,0 @@
2 -package org.wikimedia.lsearch.suggest.api;
3 -
4 -import java.io.IOException;
5 -
6 -import org.apache.lucene.analysis.SimpleAnalyzer;
7 -import org.apache.lucene.document.Document;
8 -import org.apache.lucene.document.Field;
9 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
10 -
11 -/**
12 - * Class to build an index of phrases. It indexes:
13 - * 1) sets of two words as douglas_adams
14 - * 2) individual words
15 - *
16 - * 1) is useful for content-dependant suggestions and
17 - * suggesting splits (splitting one word into two), while
18 - * 2) is useful for suggesting joins
19 - *
20 - * @author rainman
21 - *
22 - */
23 -public class PhraseIndexer extends Indexer {
24 - int minFreq;
25 -
26 - public PhraseIndexer(String path, int minFreq) throws IOException{
27 - super(path,new SimpleAnalyzer());
28 - this.minFreq = minFreq;
29 - }
30 -
31 - /** Add phrase, convenient for suggesting splits and context-dependend suggestions */
32 - public void addPhrase(String word1, String word2, int frequency){
33 - addPhrase(word1+"_"+word2,frequency);
34 - }
35 - /** Add phrase, join two words by underscore */
36 - public void addPhrase(String phrase, int frequency){
37 - if(frequency < minFreq)
38 - return;
39 - Document doc = new Document();
40 - addNgramFields(doc,phrase);
41 - doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
42 - doc.add(new Field("freq",Integer.toString(frequency), Field.Store.YES, Field.Index.UN_TOKENIZED));
43 -
44 - try {
45 - writer.addDocument(doc);
46 - } catch (Exception e) {
47 - log.error("Cannot add document "+doc);
48 - e.printStackTrace();
49 - }
50 - }
51 -
52 - /** Add ordinary word to the index, convenient for suggesting joins */
53 - public void addWord(Word word){
54 - Document doc = new Document();
55 - doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED));
56 - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.UN_TOKENIZED));
57 -
58 - try {
59 - writer.addDocument(doc);
60 - } catch (Exception e) {
61 - log.error("Cannot add document "+doc);
62 - e.printStackTrace();
63 - }
64 - }
65 -
66 - /** Get minimal ngram size for word. Short words (<=3 chars) will have 1-grams, other 2-grams */
67 - public static int getMinNgram(String word){
68 - if(word.length() == 2)
69 - return 1;
70 - if(word.length() <= 6)
71 - return word.length() - 2;
72 - else
73 - return 5;
74 - }
75 -
76 - /** Get minimal ngram size for word. Long words: 4-grams, other 3-grams, 2-char word only 1-grams */
77 - public static int getMaxNgram(String word){
78 - if(word.length() == 2)
79 - return 1;
80 - else
81 - return getMinNgram(word) + 4;
82 - }
83 -}
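
As the Suggest code above uses it, the word entries served join suggestions (look up "base"+"ball" as the single word "baseball" and accept it if its frequency reaches JOIN_FREQ), while the phrase entries served splits (match "alot" against the ngram fields of phrases like "a_lot"). A worked example of its phrase ngram sizing:

    // getMinNgram("douglas_adams") == 5 (length 13 > 6), getMaxNgram == 9 (min + 4)
    // so the phrase is indexed with 5-grams through 9-grams, e.g. "dougl" ... "douglas_a"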
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitlesIndexer.java
@@ -1,31 +0,0 @@
2 -package org.wikimedia.lsearch.suggest.api;
3 -
4 -import java.io.IOException;
5 -
6 -import org.apache.log4j.Logger;
7 -import org.apache.lucene.analysis.Analyzer;
8 -import org.apache.lucene.document.Document;
9 -import org.apache.lucene.document.Field;
10 -import org.apache.lucene.document.Field.Index;
11 -import org.apache.lucene.document.Field.Store;
12 -
13 -public class TitlesIndexer extends Indexer {
14 - static Logger log = Logger.getLogger(TitlesIndexer.class);
15 -
16 - public TitlesIndexer(String path, Analyzer analyzer) throws IOException{
17 - super(path,analyzer);
18 - }
19 -
20 - public void addTitle(int ns, String title){
21 - Document doc = new Document();
22 - doc.add(new Field("title",title,Store.YES,Index.TOKENIZED));
23 - doc.add(new Field("namespace",Integer.toString(ns),Store.YES,Index.UN_TOKENIZED));
24 - try {
25 - writer.addDocument(doc);
26 - } catch (IOException e) {
27 - log.error("Cannot add document "+doc);
28 - e.printStackTrace();
29 - }
30 - }
31 -
32 -}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/WordsIndexer.java
@@ -6,10 +6,7 @@
77 import org.apache.lucene.analysis.SimpleAnalyzer;
88 import org.apache.lucene.document.Document;
99 import org.apache.lucene.document.Field;
10 -import org.apache.lucene.index.IndexWriter;
11 -import org.wikimedia.lsearch.config.GlobalConfiguration;
12 -import org.wikimedia.lsearch.config.IndexId;
13 -import org.wikimedia.lsearch.index.WikiIndexModifier;
 10+import org.wikimedia.lsearch.suggest.api.Dictionary.Word;
1411 import org.wikimedia.lsearch.suggest.dist.DoubleMetaphone;
1512
1613 /**
@@ -20,59 +17,42 @@
2118 * @author rainman
2219 *
2320 */
24 -public class WordsIndexer extends Indexer {
25 - public static class Word {
26 - protected String word;
27 - protected int frequency;
28 - public Word(String word, int frequency) {
29 - super();
30 - this.word = word;
31 - this.frequency = frequency;
32 - }
33 - public int getFrequency() {
34 - return frequency;
35 - }
36 - public void setFrequency(int frequency) {
37 - this.frequency = frequency;
38 - }
39 - public String getWord() {
40 - return word;
41 - }
42 - public void setWord(String word) {
43 - this.word = word;
44 - }
45 - public String toString(){
46 - return word+" : "+frequency;
47 - }
48 -
49 - }
 21+public class WordsIndexer {
5022 static Logger log = Logger.getLogger(WordsIndexer.class);
51 - DoubleMetaphone dmeta;
 23+ protected DoubleMetaphone dmeta;
5224 /** If word occurs less that minFreq times, it will be discarded */
5325 protected int minFreq;
 26+ protected NgramIndexer indexer;
 27+ String path;
5428
5529 public WordsIndexer(String path, int minFreq) throws IOException {
56 - super(path,new SimpleAnalyzer());
 30+ this.path = path;
5731 this.minFreq = minFreq;
5832 this.dmeta = new DoubleMetaphone();
 33+ this.indexer = new NgramIndexer();
5934 }
 35+
 36+ public void createIndex() throws IOException{
 37+ indexer.createIndex(path, new SimpleAnalyzer());
 38+ }
6039
61 - /** Add word to the index */
 40+ /** Add word to the index, make sure index is open */
6241 public void addWord(Word word){
6342 if(word.frequency < minFreq)
6443 return;
 44+ if(word.getWord().length() < 2)
 45+ return;
6546 Document doc = new Document();
66 - addNgramFields(doc,word.word);
 47+ indexer.createNgramFields(doc,"",word.word);
6748 doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED));
68 - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.UN_TOKENIZED));
 49+ doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.NO));
6950 doc.add(new Field("metaphone1",dmeta.doubleMetaphone(word.word), Field.Store.NO, Field.Index.UN_TOKENIZED));
7051 doc.add(new Field("metaphone2",dmeta.doubleMetaphone(word.word,true), Field.Store.NO, Field.Index.UN_TOKENIZED));
7152
72 - try {
73 - writer.addDocument(doc);
74 - } catch (Exception e) {
75 - log.error("Cannot add document "+doc);
76 - e.printStackTrace();
77 - }
 53+ indexer.addDocument(doc);
7854 }
 55+
 56+ public void closeAndOptimze() throws IOException{
 57+ indexer.closeAndOptimize();
 58+ }
7959 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitleIndexer.java
@@ -0,0 +1,323 @@
 2+package org.wikimedia.lsearch.suggest.api;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+import java.util.HashMap;
 8+import java.util.HashSet;
 9+import java.util.Map.Entry;
 10+
 11+import org.apache.log4j.Logger;
 12+import org.apache.lucene.analysis.SimpleAnalyzer;
 13+import org.apache.lucene.analysis.Token;
 14+import org.apache.lucene.document.Document;
 15+import org.apache.lucene.document.Field;
 16+import org.apache.lucene.index.IndexReader;
 17+import org.apache.lucene.index.Term;
 18+import org.apache.lucene.search.Hits;
 19+import org.apache.lucene.search.IndexSearcher;
 20+import org.apache.lucene.search.MultiSearcher;
 21+import org.apache.lucene.search.PhraseQuery;
 22+import org.apache.lucene.search.Query;
 23+import org.apache.lucene.search.SearchableMul;
 24+import org.apache.lucene.search.Searcher;
 25+import org.apache.lucene.search.TermQuery;
 26+import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 27+import org.wikimedia.lsearch.config.GlobalConfiguration;
 28+import org.wikimedia.lsearch.config.IndexId;
 29+import org.wikimedia.lsearch.config.IndexRegistry;
 30+import org.wikimedia.lsearch.index.IndexUpdateRecord;
 31+import org.wikimedia.lsearch.search.IndexSearcherMul;
 32+import org.wikimedia.lsearch.search.WikiSearcher;
 33+import org.wikimedia.lsearch.suggest.api.Dictionary.Word;
 34+
 35+/**
 36+ * Index words and phrases from article titles.
 37+ *
 38+ * Fields:
 39+ * * word - word from title
 40+ * * phrase - phrase like douglas_adams
 41+ * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3)
 42+ * * namespace - namespaces where the word/phrase is present
 43+ *
 44+ * @author rainman
 45+ *
 46+ */
 47+public class TitleIndexer {
 48+ static Logger log = Logger.getLogger(TitleIndexer.class);
 49+ protected NgramIndexer ngramWriter;
 50+ public static final boolean NEW_INDEX = true;
 51+ protected boolean createNew;
 52+ protected int minWordFreq, minPhraseFreq;
 53+ protected IndexId iid;
 54+ protected String langCode;
 55+ protected IndexRegistry registry;
 56+ protected String path;
 57+
 58+ public TitleIndexer(IndexId iid, int minWordFreq, int minPhraseFreq){
 59+ this(iid,minWordFreq,minPhraseFreq,false);
 60+ }
 61+
 62+ public TitleIndexer(IndexId iid, int minWordFreq, int minPhraseFreq, boolean createNew){
 63+ this.iid = iid;
 64+ this.minWordFreq = minWordFreq;
 65+ this.minPhraseFreq = minPhraseFreq;
 66+ this.createNew = createNew;
 67+ this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 68+ this.ngramWriter = new NgramIndexer();
 69+ this.registry = IndexRegistry.getInstance();
 70+ this.path = iid.getSuggestTitlesPath();
 71+ }
 72+
 73+ protected Searcher makeSearcher(IndexId logical) throws IOException{
 74+ if(logical.isSingle())
 75+ return new IndexSearcherMul(registry.getLatestSnapshot(logical).path);
 76+ else{
 77+ ArrayList<IndexSearcherMul> searchers = new ArrayList<IndexSearcherMul>();
 78+ for(String part : iid.getPhysicalIndexes()){
 79+ searchers.add(new IndexSearcherMul(registry.getLatestSnapshot(IndexId.get(part)).path));
 80+ }
 81+ return new MultiSearcher(searchers.toArray(new SearchableMul[]{}));
 82+ }
 83+ }
 84+
 85+ protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, Query q) throws IOException{
 86+ Hits hits = searcher.search(q);
 87+ NamespaceFreq wnf = new NamespaceFreq();
 88+ for(int j=0;j<hits.length();j++){
 89+ wnf.incFrequency(namespaces[hits.id(j)]);
 90+ }
 91+ return wnf;
 92+ }
 93+
 94+ /** Get frequency for a single word */
 95+ protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String word) throws IOException{
 96+ return getFrequency(searcher,namespaces,new TermQuery(new Term("contents",word)));
 97+ }
 98+
 99+ /** Get frequency of phrase (individual words as array) */
 100+ protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{
 101+ PhraseQuery pq = new PhraseQuery();
 102+ for(String p : phrase){
 103+ pq.add(new Term("contents",p));
 104+ }
 105+ return getFrequency(searcher,namespaces,pq);
 106+ }
 107+
 108+ /** Get namespaces where word appears in title */
 109+ protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, Query q) throws IOException{
 110+ Hits hits = searcher.search(q);
 111+ HashSet<Integer> ns = new HashSet<Integer>();
 112+ for(int j=0;j<hits.length();j++){
 113+ ns.add(namespaces[hits.id(j)]);
 114+ }
 115+ return ns;
 116+ }
 117+
 118+ protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String word) throws IOException{
 119+ return getNamespaces(searcher,namespaces,new TermQuery(new Term("title",word)));
 120+ }
 121+
 122+ protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{
 123+ PhraseQuery pq = new PhraseQuery();
 124+ for(String p : phrase){
 125+ pq.add(new Term("title",p));
 126+ }
 127+ return getNamespaces(searcher,namespaces,pq);
 128+ }
 129+
 130+ /**
 131+ * Returns the namespace for each doc_id
 132+ * @throws IOException
 133+ * @FIXME: assumes optimized index
 134+ */
 135+ protected int[] makeNamespaceMap(Searcher searcher) throws IOException{
 136+ log.debug("Making namespace map...");
 137+ int[] namespaces = new int[searcher.maxDoc()];
 138+ for(int i=0;i<namespaces.length;i++){
 139+ namespaces[i] = -100;
 140+ Document doc = searcher.doc(i);
 141+ if(doc != null)
 142+ namespaces[i] = Integer.parseInt(doc.get("namespace"));
 143+ }
 144+ log.debug("Done making namespace map");
 145+ return namespaces;
 146+ }
 147+
 148+ /** Create new title word/phrases index from an existing index *snapshot* by reading all terms in the index */
 149+ public void createFromExistingIndex(IndexId src){
 150+ try{
 151+ log.debug("Creating new suggest index");
 152+ ngramWriter.createIndex(path,new SimpleAnalyzer());
 153+ Searcher searcher = makeSearcher(iid.getLogical());
 154+ // map doc_id -> namespace
 155+ int[] namespaces = makeNamespaceMap(searcher);
 156+
 157+ for(String dbrole : src.getPhysicalIndexes()){
 158+ log.info("Processing index "+dbrole);
 159+ if(!ngramWriter.isOpen()) // if we closed the index previously
 160+ ngramWriter.reopenIndex(path,new SimpleAnalyzer());
 161+
 162+ IndexId part = IndexId.get(dbrole);
 163+ IndexReader ir = IndexReader.open(registry.getLatestSnapshot(part).path);
 164+ LuceneDictionary dict = new LuceneDictionary(ir,"title");
 165+ IndexSearcher ngramSearcher = new IndexSearcher(path);
 166+ Word word;
 167+ // get all words, and all phrases beginning with word
 168+ while((word = dict.next()) != null){
 169+ log.debug("Processing word "+word);
 170+ String w = word.getWord();
 171+
 172+ // check if word is already in the index
 173+ if(ngramSearcher.docFreq(new Term("word",w)) != 0)
 174+ continue;
 175+
 176+ // index word
 177+ NamespaceFreq wnf = getFrequency(searcher,namespaces,w);
 178+ Collection<Integer> wns = getNamespaces(searcher,namespaces,w);
 179+ addWord(w,wnf,wns);
 180+
 181+ // index phrases
 182+ HashSet<String> phrases = new HashSet<String>();
 183+ Hits hits = searcher.search(new TermQuery(new Term("title",w)));
 184+ // find all phrases beginning with word
 185+ for(int i=0;i<hits.length();i++){
 186+ Document doc = hits.doc(i);
 187+ // tokenize to make phrases
 188+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false);
 189+ ArrayList<Token> tokens = parser.parse();
 190+ for(int j=0;j<tokens.size()-1;j++){
 191+ Token t = tokens.get(j);
 192+ // ignore aliases
 193+ if(t.getPositionIncrement() == 0)
 194+ continue;
 195+ // find phrases beginning with the target word
 196+ if(w.equals(t.termText())){
 197+ phrases.add(t.termText()+"_"+tokens.get(j+1).termText());
 198+ }
 199+ }
 200+ }
 201+ log.debug("Adding "+phrases.size()+" phrases "+phrases);
 202+ // index phrases
 203+ for(String phrase : phrases){
 204+ NamespaceFreq nf = getFrequency(searcher,namespaces,phrase.split("_"));
 205+ Collection<Integer> pns = getNamespaces(searcher,namespaces,phrase.split("_"));
 206+ addPhrase(phrase,nf,pns);
 207+ }
 208+ }
 209+ log.debug("Finished index "+dbrole+", closing/optimizing.");
 210+ ir.close();
 211+ ngramSearcher.close();
 212+ ngramWriter.closeAndOptimize();
 213+ }
 214+ searcher.close();
 215+ } catch (IOException e) {
 216+ log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
 217+ e.printStackTrace();
 218+ return;
 219+ }
 220+ }
 221+
 222+ /**
 223+ * Add phrase to index
 224+ *
 225+ * @param phrase - 2+ words joined with underscore
 226+ * @param nf - frequencies of phrase in various namespaces
 227+ * @param namespaces - namespaces where phrase appears in title
 228+ */
 229+ public void addPhrase(String phrase, NamespaceFreq nf, Collection<Integer> namespaces){
 230+ String freq = nf.serialize(minPhraseFreq);
 231+ if(freq.length() == 0)
 232+ return;
 233+ if(phrase.length() <= 2){
 234+ log.warn("Invalid phrase: "+phrase);
 235+ return;
 236+ }
 237+ Document doc = new Document();
 238+ ngramWriter.createNgramFields(doc,"phrase",phrase);
 239+ doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
 240+ doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO));
 241+ for(Integer ns : namespaces){
 242+ doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED));
 243+ }
 244+
 245+ ngramWriter.addDocument(doc);
 246+ }
 247+
 248+ /** Add ordinary word to the index, convenient for suggesting joins
 249+ *
 250+ * @param word - word to add
 251+ * @param nf - frequencies in namespaces
 252+ * @param namespaces - namespaces where word appears in title
 253+ */
 254+ public void addWord(String word, NamespaceFreq nf, Collection<Integer> namespaces){
 255+ if(word.length() < 2)
 256+ return;
 257+ String freq = nf.serialize(minWordFreq);
 258+ if(freq.length() == 0)
 259+ return;
 260+ Document doc = new Document();
 261+ ngramWriter.createNgramFields(doc,"word",word);
 262+ doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED));
 263+ doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO));
 264+ for(Integer ns : namespaces){
 265+ doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED));
 266+ }
 267+
 268+ ngramWriter.addDocument(doc);
 269+ }
 270+
 271+ /** Update the index */
 272+ public void update(Collection<IndexUpdateRecord> records){
 273+ try{
 274+ log.info("Updating suggest index for "+iid+" with "+records.size()+" records");
 275+ IndexReader ir = IndexReader.open(path);
 276+ Searcher searcher = makeSearcher(iid.getLogical());
 277+ int[] namespaces = makeNamespaceMap(searcher);
 278+ // get all words and phrases
 279+ HashSet<String> words = new HashSet<String>();
 280+ HashSet<String> phrases = new HashSet<String>();
 281+ for(IndexUpdateRecord rec : records){
 282+ String title = rec.getArticle().getTitle();
 283+ ArrayList<Token> tokens = new FastWikiTokenizerEngine(title,langCode,false).parse();
 284+ String last = null;
 285+ // register word/phrases
 286+ for(Token t : tokens){
 287+ String w = t.termText();
 288+ words.add(w);
 289+ if(last != null){
 290+ phrases.add(last+"_"+w);
 291+ }
 292+ last = w;
 293+ }
 294+ }
 295+ // note: the searcher stays open here, it is still needed below to recompute frequencies
 296+
 297+ // batch delete old values
 298+ for(String word : words){
 299+ ir.deleteDocuments(new Term("word",word));
 300+ }
 301+ for(String phrase : phrases){
 302+ ir.deleteDocuments(new Term("phrase",phrase));
 303+ }
 304+ ir.close();
 305+ ngramWriter.reopenIndex(path,new SimpleAnalyzer());
 306+
 307+ // batch add new stuff
 308+ for(String word : words){
 309+ addWord(word,getFrequency(searcher,namespaces,word),getNamespaces(searcher,namespaces,word));
 310+ }
 311+ for(String phrase : phrases){
 312+ String[] ph = phrase.split("_");
 313+ addPhrase(phrase,getFrequency(searcher,namespaces,ph),getNamespaces(searcher,namespaces,ph));
 314+ }
 315+ searcher.close();
 316+ ngramWriter.close();
 317+ } catch(IOException e){
 318+ log.error("Cannot update suggest index for "+iid+" : "+e.getMessage());
 319+ e.printStackTrace();
 320+ return;
 321+ }
 322+ }
 323+
 324+}
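To make the phrase API above concrete, a minimal illustrative call of addPhrase (the frequencies are hypothetical):

    NamespaceFreq nf = new NamespaceFreq();
    nf.setFrequency(0, 3);  // phrase appears in 3 main-namespace titles
    nf.setFrequency(4, 1);  // and in 1 title in namespace 4
    addPhrase("douglas_adams", nf, nf.getNamespaces());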
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/LuceneDictionary.java
@@ -20,10 +20,7 @@
2121 import org.apache.lucene.index.IndexReader;
2222 import org.apache.lucene.index.Term;
2323
24 -import java.util.Iterator;
25 -
2624 import org.apache.lucene.index.TermEnum;
27 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
2825
2926 import java.io.*;
3027
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NgramIndexer.java
@@ -0,0 +1,162 @@
 2+package org.wikimedia.lsearch.suggest.api;
 3+
 4+import java.io.IOException;
 5+
 6+import org.apache.log4j.Logger;
 7+import org.apache.lucene.analysis.Analyzer;
 8+import org.apache.lucene.document.Document;
 9+import org.apache.lucene.document.Field;
 10+import org.apache.lucene.index.IndexWriter;
 11+import org.wikimedia.lsearch.index.WikiIndexModifier;
 12+
 13+/**
 14+ * Base class for basic ngram index handling: opening/closing indexes, adding ngram fields, etc.
 15+ *
 16+ * @author rainman
 17+ *
 18+ */
 19+public class NgramIndexer {
 20+ Logger log = Logger.getLogger(NgramIndexer.class);
 21+ protected String path;
 22+ protected Analyzer analyzer;
 23+ protected IndexWriter writer;
 24+
 25+ public NgramIndexer(){
 26+ path = null;
 27+ analyzer = null;
 28+ writer = null;
 29+ }
 30+
 31+ /** Make a new ngram index */
 32+ public void createIndex(String path, Analyzer analyzer) throws IOException{
 33+ openIndex(path,analyzer,true);
 34+ }
 35+
 36+ /** Reopen an existing index, creating it if it doesn't exist */
 37+ public void reopenIndex(String path, Analyzer analyzer) throws IOException{
 38+ openIndex(path,analyzer,false);
 39+ }
 40+
 41+ public void openIndex(String path, Analyzer analyzer, boolean newIndex) throws IOException{
 42+ this.path = path;
 43+ this.analyzer = analyzer;
 44+ try {
 45+ writer = new IndexWriter(path,analyzer,newIndex);
 46+ } catch (IOException e) {
 47+ try {
 48+ log.info("Making new index at path "+path);
 49+ // try to make brand new index
 50+ WikiIndexModifier.makeDBPath(path); // ensure all directories are made
 51+ writer = new IndexWriter(path,analyzer,newIndex);
 52+ } catch (IOException e1) {
 53+ log.error("I/O error opening index for addition of documents at "+path+" : "+e1.getMessage());
 54+ throw e1;
 55+ }
 56+ }
 57+ writer.setMergeFactor(20);
 58+ writer.setMaxBufferedDocs(500);
 59+ writer.setUseCompoundFile(true);
 60+ writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
 61+
 62+ }
 63+
 64+ /** Check if index is open and ready for modification */
 65+ public boolean isOpen(){
 66+ return writer != null;
 67+ }
 68+
 69+ /** Close index (without optimizing), always call when done indexing */
 70+ public void close() throws IOException {
 71+ try{
 72+ writer.close();
 73+ writer = null;
 74+ } catch(IOException e){
 75+ log.warn("I/O error closing index at "+path);
 76+ throw e;
 77+ }
 78+ }
 79+
 80+ /** Optimize and close index, always call when done indexing */
 81+ public void closeAndOptimize() throws IOException {
 82+ try{
 83+ writer.optimize();
 84+ writer.close();
 85+ writer = null;
 86+ } catch(IOException e){
 87+ log.warn("I/O error optimizing/closing index at "+path);
 88+ throw e;
 89+ }
 90+ }
 91+
 92+ /** Return ngrams of specific size for text */
 93+ public static String[] nGrams(String text, int size) {
 94+ int len = text.length();
 95+ String[] res = new String[len - size + 1];
 96+ for (int i = 0; i < len - size + 1; i++) {
 97+ res[i] = text.substring(i, i + size);
 98+ }
 99+ return res;
 100+ }
 101+
 102+ /** Get minimal ngram size for word: roughly half the word length, capped at 3 (1 for words of up to 3 letters, 2 for 4-5 letters, 3 otherwise) */
 103+ public static int getMinNgram(String word){
 104+ if(word.length() <= 3)
 105+ return 1;
 106+ else if(word.length() == 4 || word.length() == 5)
 107+ return 2;
 108+ else
 109+ return 3;
 110+ }
 111+
 112+ /** Maximal ngram size for word: 2 for two-letter words, 3 otherwise */
 113+ public static int getMaxNgram(String word){
 114+ if(word.length() == 2)
 115+ return 2;
 116+ else
 117+ return 3;
 118+ }
 119+
 120+ /** Get ngram field name with no prefix */
 121+ public static String getNgramField(){
 122+ return getNgramField(null);
 123+ }
 124+
 125+ /** Get prefixed ngram field name */
 126+ public static String getNgramField(String prefix){
 127+ if(prefix == null || prefix.equals(""))
 128+ return "ngram";
 129+ else
 130+ return prefix+"_ngram";
 131+ }
 132+
 133+ /**
 134+ * Add ngrams of sizes getMinNgram(word) through getMaxNgram(word) to the document
 135+ *
 136+ * @param doc - document to add fields to
 137+ * @param prefix - prefix to ngram field name
 138+ * @param word - word
 139+ */
 140+ protected void createNgramFields(Document doc, String prefix, String word) {
 141+ int min = getMinNgram(word);
 142+ int max = getMaxNgram(word);
 143+ String fieldBase = getNgramField(prefix);
 144+ for(int i=min ; i <= max ; i++ ){
 145+ String[] ngrams = nGrams(word,i);
 146+ String field = fieldBase+i;
 147+ for(int j=0 ; j<ngrams.length ; j++){
 148+ String ngram = ngrams[j];
 149+ doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
 150+ }
 151+ }
 152+ }
 153+
 154+ public void addDocument(Document doc){
 155+ try {
 156+ log.debug("Adding document "+doc);
 157+ writer.addDocument(doc);
 158+ } catch (Exception e) {
 159+ log.error("Cannot add document "+doc+" : "+e.getMessage());
 160+ e.printStackTrace();
 161+ }
 162+ }
 163+}
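As a sketch of the ngram decomposition above (word chosen for illustration): for "adams", getMinNgram returns 2 and getMaxNgram returns 3, so from within a subclass createNgramFields adds the following untokenized fields:

    Document doc = new Document();
    // "ad","da","am","ms" go into field "word_ngram2",
    // "ada","dam","ams" go into field "word_ngram3"
    createNgramFields(doc, "word", "adams");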
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Dictionary.java
@@ -1,9 +1,31 @@
22 package org.wikimedia.lsearch.suggest.api;
33
4 -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word;
5 -
6 -
74 public interface Dictionary {
 5+ public static class Word {
 6+ protected String word;
 7+ protected int frequency;
 8+ public Word(String word, int frequency) {
 9+ super();
 10+ this.word = word;
 11+ this.frequency = frequency;
 12+ }
 13+ public int getFrequency() {
 14+ return frequency;
 15+ }
 16+ public void setFrequency(int frequency) {
 17+ this.frequency = frequency;
 18+ }
 19+ public String getWord() {
 20+ return word;
 21+ }
 22+ public void setWord(String word) {
 23+ this.word = word;
 24+ }
 25+ public String toString(){
 26+ return word+" : "+frequency;
 27+ }
 28+
 29+ }
830 /** Get next term, or null if there are no more terms */
931 public Word next();
1032 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NamespaceFreq.java
@@ -0,0 +1,88 @@
 2+package org.wikimedia.lsearch.suggest.api;
 3+
 4+import java.util.BitSet;
 5+import java.util.HashMap;
 6+import java.util.Set;
 7+import java.util.Map.Entry;
 8+
 9+import org.wikimedia.lsearch.search.NamespaceFilter;
 10+
 11+/** Mapping from namespaces to frequencies */
 12+public class NamespaceFreq {
 13+ class IntWrap{
 14+ int val = 0;
 15+ IntWrap() {}
 16+ IntWrap(int value){ val = value; }
 17+ IntWrap(String value){ val = Integer.parseInt(value); }
 18+ public String toString(){ return ""+val; }
 19+ }
 20+ /** namespace -> frequency */
 21+ protected HashMap<Integer,IntWrap> nsmap = new HashMap<Integer,IntWrap>();
 22+
 23+ public NamespaceFreq(String field){
 24+ String[] pairs = field.split(" ");
 25+ for(String pair : pairs){
 26+ if(pair.length() == 0)
 27+ continue;
 28+ String[] nsf = pair.split(":");
 29+ if(nsf.length == 2)
 30+ nsmap.put(Integer.parseInt(nsf[0]),new IntWrap(nsf[1]));
 31+ else {
 32+ throw new RuntimeException("Bad syntax for namespace-frequency pairs : "+field);
 33+ }
 34+ }
 35+ }
 36+
 37+ public NamespaceFreq() {
 38+ }
 39+
 40+ public int getFrequency(int namespace){
 41+ if(nsmap.containsKey(namespace))
 42+ return nsmap.get(namespace).val;
 43+ else
 44+ return 0;
 45+ }
 46+
 47+ public int getFrequency(NamespaceFilter nsf){
 48+ int sum = 0;
 49+ BitSet ns = nsf.getIncluded();
 50+ for(int i=ns.nextSetBit(0); i>=0; i=ns.nextSetBit(i+1)){
 51+ sum += getFrequency(i);
 52+ }
 53+ return sum;
 54+ }
 55+
 56+ public String serialize(int minFreq){
 57+ StringBuilder sb = new StringBuilder();
 58+ int sum = 0;
 59+ for(Entry<Integer,IntWrap> e : nsmap.entrySet()){
 60+ sum += e.getValue().val;
 61+ sb.append(e.getKey());
 62+ sb.append(":");
 63+ sb.append(e.getValue());
 64+ sb.append(" ");
 65+ }
 66+ if(sum < minFreq)
 67+ return "";
 68+ return sb.toString();
 69+ }
 70+
 71+ public String serialize(){
 72+ return serialize(0);
 73+ }
 74+
 75+ public void setFrequency(int namespace, int frequency){
 76+ nsmap.put(namespace,new IntWrap(frequency));
 77+ }
 78+
 79+ public void incFrequency(int namespace){
 80+ if(nsmap.containsKey(namespace)){
 81+ nsmap.get(namespace).val++;
 82+ } else
 83+ nsmap.put(namespace,new IntWrap(1));
 84+ }
 85+
 86+ public Set<Integer> getNamespaces(){
 87+ return nsmap.keySet();
 88+ }
 89+}
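A round-trip sketch of the serialization format (space-separated namespace:frequency pairs; entry order depends on the hash map):

    NamespaceFreq nf = new NamespaceFreq();
    nf.incFrequency(0);
    nf.incFrequency(0);
    nf.incFrequency(14);
    String s = nf.serialize();           // e.g. "0:2 14:1 "
    NamespaceFreq copy = new NamespaceFreq(s);
    int mainFreq = copy.getFrequency(0); // 2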
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/CleanIndexImporter.java
@@ -25,8 +25,6 @@
2626 import org.wikimedia.lsearch.config.IndexId;
2727 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
2828 import org.wikimedia.lsearch.ranks.Links;
29 -import org.wikimedia.lsearch.suggest.api.PhraseIndexer;
30 -import org.wikimedia.lsearch.suggest.api.TitlesIndexer;
3129 import org.wikimedia.lsearch.util.Localization;
3230
3331 /**
@@ -59,8 +57,10 @@
6058 boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode) != null;
6159 // make article
6260 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects);
63 - if(page.Title.Namespace == 0)
64 - writer.addArticle(article);
 61+ if(page.Title.Namespace != 0)
 62+ article.setContents("");
 63+
 64+ writer.addArticle(article);
6565 // generate phrases
6666 /* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false);
6767 ArrayList<Token> tokens = parser.parse();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java
@@ -196,8 +196,13 @@
197197 /** Get title object corresponding to this article */
198198 public Title makeTitle(){
199199 return new Title(Integer.parseInt(namespace),title);
 200+ }
 201+
 202+ public void setContents(String contents) {
 203+ this.contents = contents;
200204 }
201205
202206
203207
 208+
204209 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -530,6 +530,11 @@
531531 return namespaceSet;
532532 }
533533
 534+ /** Get logical iid for this index, i.e. enwiki.mainpart -> enwiki */
 535+ public IndexId getLogical(){
 536+ return get(dbname);
 537+ }
534538
 539+
535540
536541 }
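For example (assuming the usual dbname.part naming), IndexId.get("enwiki.mainpart").getLogical() yields the same IndexId as IndexId.get("enwiki").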
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -769,6 +769,20 @@
770770
771771 dbroles.put(type,params);
772772
 773+ } else if(type.equals("suggest")){
 774+ // all params are optional; defaults are used when absent
 775+ if(tokens.length>1)
 776+ params.put("wordsMinFreq",tokens[1]);
 777+ if(tokens.length>2)
 778+ params.put("titlesWordsMinFreq",tokens[2]);
 779+ if(tokens.length>3)
 780+ params.put("titlesPhrasesMinFreq", tokens[3]);
 781+
 782+ if(tokens.length>4)
 783+ System.out.println("Unrecognized suggest parameters in ("+role+")");
 784+
 785+ dbroles.put(type,params);
 786+
773787 } else{
774788 System.out.println("Warning: Unrecognized role \""+role+"\". Ignoring.");
775789 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -64,7 +64,7 @@
6565 if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
6666 exactCase = true;
6767 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
68 - SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase);
 68+ SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase, false);
6969 if(res!=null && res.isRetry()){
7070 int retries = 0;
7171 if(iid.isSplit() || iid.isNssplit()){
@@ -73,19 +73,27 @@
7474 retries = 1;
7575
7676 while(retries > 0 && res.isRetry()){
77 - res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase);
 77+ res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase, false);
7878 retries--;
7979 }
8080 if(res.isRetry())
8181 res.setErrorMsg("Internal error, too many internal retries.");
8282 }
8383 return res;
84 - } else if (what.equals("raw")) {
85 - //TODO: return searchRaw(searchterm);
 84+ } else if (what.equals("raw") || what.equals("rawexplain")) {
 85+ int offset = 0, limit = 100; boolean exactCase = false;
 86+ if (query.containsKey("offset"))
 87+ offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
 88+ if (query.containsKey("limit"))
 89+ limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines);
 90+ if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
 91+ exactCase = true;
 92+ NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
 93+ return search(iid, searchterm, offset, limit, namespaces, what.equals("rawexplain"), exactCase, true);
8694 } else {
8795 SearchResults res = new SearchResults();
8896 res.setErrorMsg("Unrecognized search type. Try one of: " +
89 - "titlematch, titleprefix, search, explain, quit, raw.");
 97+ "search, explain, raw, rawexplain.");
9098 log.warn("Unknown request type [" + what + "].");
9199 return res;
92100 }
@@ -124,7 +132,7 @@
125133 * Search on iid, with query searchterm. View results from offset to offset+limit, using
126134 * the default namespaces filter
127135 */
128 - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase){
 136+ public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw){
129137 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
130138 if(nsDefault == null || nsDefault.cardinality() == 0)
131139 nsDefault = new NamespaceFilter("0"); // default to main namespace
@@ -155,13 +163,16 @@
156164 }
157165
158166 try {
159 - if(nsfw == null){
 167+ if(raw){
 168+ // do minimal parsing, make a raw query
 169+ parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE);
 170+ q = parser.parseRaw(searchterm);
 171+ } else if(nsfw == null){
160172 if(searchAll)
161173 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
162174 else
163175 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
164 - }
165 - else{
 176+ } else{
166177 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
167178 log.info("Using NamespaceFilterWrapper "+nsfw);
168179 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -404,6 +404,10 @@
405405 doc.add(new Field("category", "",
406406 Field.Store.NO, Field.Index.TOKENIZED));
407407
 408+ // interwiki associated with this page
 409+ doc.add(new Field("interwiki", "",
 410+ Field.Store.NO, Field.Index.TOKENIZED));
 411+
408412 for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
409413 FieldNameFactory fields = bs.getFields();
410414 // boost document title with it's article rank
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -77,13 +77,15 @@
7878 /** boost for title field */
7979 public static float TITLE_BOOST = 6;
8080 public static float TITLE_ALIAS_BOOST = 0.2f;
 81+ public static float TITLE_PHRASE_BOOST = 2;
8182 public static float STEM_TITLE_BOOST = 2;
8283 public static float STEM_TITLE_ALIAS_BOOST = 0.4f;
83 - public static float ALT_TITLE_BOOST = 4;
 84+ public static float ALT_TITLE_BOOST = 8;
8485 public static float ALT_TITLE_ALIAS_BOOST = 0.4f;
8586 public static float KEYWORD_BOOST = 0.02f;
8687
8788 public static boolean ADD_STEM_TITLE = true;
 89+ public static boolean ADD_TITLE_PHRASES = true;
8890
8991 /** Policies in treating field names:
9092 *
@@ -295,7 +297,7 @@
296298 if(length == 0 && ch == ' ')
297299 continue; // ignore whitespaces
298300
299 - // pluses and minuses, underscores can be within words, *,? are for wildcard queries
 301+ // pluses, minuses and underscores can appear within words (to prevent them from being misinterpreted); *,? are for wildcard queries
300302 if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){
301303 if(length<buffer.length)
302304 buffer[length++] = ch;
@@ -322,7 +324,7 @@
323325 else if(ch == ':'){
324326 // check if it's a valid field
325327 String f = new String(buffer,0,length);
326 - if(f.equals(namespaceAllKeyword) || f.equals("incategory") || namespaceFilters.containsKey(f)){
 328+ if(f.equals(namespaceAllKeyword) || f.equals("incategory") || namespaceFilters.containsKey(f) || namespacePolicy == NamespacePolicy.LEAVE){
327329 cur = lookup;
328330 return TokenType.FIELD;
329331 } else
@@ -1094,7 +1096,7 @@
10951097 }
10961098
10971099 /** Make title query in format: title:query stemtitle:stemmedquery */
1098 - protected Query makeTitleQuery(String queryText) {
 1100+ protected Query[] makeTitleQuery(String queryText) {
10991101 String contentField = defaultField;
11001102 float olfDefaultBoost = defaultBoost;
11011103 defaultField = fields.title(); // now parse the title part
@@ -1117,16 +1119,19 @@
11181120 defaultBoost = olfDefaultBoost;
11191121 defaultAliasBoost = ALIAS_BOOST;
11201122
 1123+ // make title phrases
 1124+ Query qp = ADD_TITLE_PHRASES? makeTitlePhrases(qt) : null;
 1125+
11211126 if(qt == qs) // either null, or category query
1122 - return qt;
 1127+ return new Query[] {qt,qp};
11231128 if(qt == null)
1124 - return qs;
 1129+ return new Query[] {qs,qp};
11251130 if(qs == null)
1126 - return qt;
 1131+ return new Query[] {qt,qp};
11271132 BooleanQuery bq = new BooleanQuery(true);
11281133 bq.add(qt,BooleanClause.Occur.SHOULD);
11291134 bq.add(qs,BooleanClause.Occur.SHOULD);
1130 - return bq;
 1135+ return new Query[] {bq,qp};
11311136 }
11321137
11331138 /** Quote CJK chars to avoid frequency-based analysis */
@@ -1173,6 +1178,44 @@
11741179 }
11751180 }
11761181
 1182+ /** Make two-word phrase queries for simple queries whose title terms are all required */
 1183+ protected Query makeTitlePhrases(Query q){
 1184+ if(q instanceof BooleanQuery){
 1185+ boolean allReq = true;
 1186+ BooleanQuery bq = (BooleanQuery) q;
 1187+ for(BooleanClause bc : bq.getClauses()){
 1188+ if(!bc.getOccur().equals(BooleanClause.Occur.MUST) || !(bc.getQuery() instanceof TermQuery) ||
 1189+ !(((TermQuery)bc.getQuery()).getTerm().field().equals("title"))){
 1190+ allReq = false;
 1191+ break;
 1192+ }
 1193+ }
 1194+ if(allReq){
 1195+ BooleanQuery ret = new BooleanQuery(true);
 1196+ Term last = null;
 1197+ // make phrases '+very +long +query' => "very long" "long query"
 1198+ for(BooleanClause bc : bq.getClauses()){
 1199+ Term t = ((TermQuery)bc.getQuery()).getTerm();
 1200+ if(last != null){
 1201+ PhraseQuery pq = new PhraseQuery();
 1202+ pq.add(new Term("stemtitle",last.text()));
 1203+ pq.add(new Term("stemtitle",t.text()));
 1204+ pq.setBoost(TITLE_PHRASE_BOOST);
 1205+ pq.setSlop(2);
 1206+ ret.add(pq,BooleanClause.Occur.SHOULD);
 1207+ }
 1208+ last = t;
 1209+
 1210+ }
 1211+ if(ret.getClauses() != null && ret.getClauses().length != 0)
 1212+ return ret;
 1213+ }
 1214+ }
 1215+
 1216+ return null;
 1217+
 1218+ }
 1219+
11771220 /**
11781221 * Main function for multi-pass parsing.
11791222 *
@@ -1188,12 +1231,23 @@
11891232 defaultBoost = 1;
11901233 defaultAliasBoost = ALIAS_BOOST;
11911234 Query qc = parseRaw(queryText);
1192 -
1193 - Query qt = makeTitleQuery(queryText);
 1235+ Query[] qtqp = makeTitleQuery(queryText);
 1236+ // qt = title query, qp = title phrase query
 1237+ Query qt = qtqp[0];
 1238+ Query qp = qtqp[1];
11941240 if(qc == null || qt == null)
11951241 return new BooleanQuery();
11961242 if(qc.equals(qt))
11971243 return qc; // don't duplicate (probably a query for categories only)
 1244+
 1245+ // embed phrase queries into the main contents query
 1246+ if(qp!=null && qc instanceof BooleanQuery){
 1247+ ((BooleanQuery)qc).add(qp,BooleanClause.Occur.SHOULD);
 1248+ } else if(qp != null && !(qc instanceof BooleanQuery)){
 1249+ // TODO: delete in release
 1250+ System.out.println("SHOULD NEVER HAPPEN");
 1251+ }
11981252 BooleanQuery bq = new BooleanQuery();
11991253 bq.add(qc,BooleanClause.Occur.SHOULD);
12001254 bq.add(qt,BooleanClause.Occur.SHOULD);
@@ -1263,6 +1317,14 @@
12641318 public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{
12651319 return parseMultiPass(queryText,policy,false,false);
12661320 }
 1321+
 1322+ public NamespacePolicy getNamespacePolicy() {
 1323+ return namespacePolicy;
 1324+ }
 1325+ public void setNamespacePolicy(NamespacePolicy namespacePolicy) {
 1326+ this.namespacePolicy = namespacePolicy;
 1327+ }
12671328
12681329
 1330+
12691331 }
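To spell out the rewrite makeTitlePhrases performs: for '+very +long +query' it emits two SHOULD clauses, "very long" and "long query", each built like the sketch below:

    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("stemtitle", "very"));
    pq.add(new Term("stemtitle", "long"));
    pq.setBoost(TITLE_PHRASE_BOOST); // 2
    pq.setSlop(2);                   // allow up to 2 positions of slop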
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -1,6 +1,7 @@
22 package org.wikimedia.lsearch.analyzers;
33
44 import java.util.ArrayList;
 5+import java.util.HashMap;
56
67 import org.apache.log4j.Logger;
78 import org.apache.lucene.analysis.Analyzer;
@@ -69,6 +70,7 @@
7071 WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase);
7172 tokenizer.tokenize();
7273 ArrayList<String> categories = tokenizer.getCategories();
 74+ HashMap<String,String> interwiki = tokenizer.getInterwikis();
7375
7476 ArrayList<String> allKeywords = new ArrayList<String>();
7577 if(addKeywords && tokenizer.getKeywords()!=null)
@@ -80,6 +82,8 @@
8183 new LanguageAnalyzer(filters,tokenizer));
8284 perFieldAnalyzer.addAnalyzer("category",
8385 new CategoryAnalyzer(categories,exactCase));
 86+ perFieldAnalyzer.addAnalyzer("interwiki",
 87+ new InterwikiAnalyzer(interwiki));
8488 perFieldAnalyzer.addAnalyzer(fields.title(),
8589 getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase));
8690 perFieldAnalyzer.addAnalyzer(fields.stemtitle(),
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/InterwikiAnalyzer.java
@@ -0,0 +1,58 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+import java.util.HashMap;
 7+import java.util.Iterator;
 8+import java.util.Map.Entry;
 9+
 10+import org.apache.lucene.analysis.Analyzer;
 11+import org.apache.lucene.analysis.Token;
 12+import org.apache.lucene.analysis.TokenStream;
 13+
 14+public class InterwikiAnalyzer extends Analyzer {
 15+ public class InterwikiTokenStream extends TokenStream {
 16+ protected Iterator<Entry<String,String>> tokensIt;
 17+ protected int start;
 18+ protected Token next = null;
 19+
 20+ InterwikiTokenStream(){
 21+ tokensIt = interwiki.entrySet().iterator();
 22+ start = 0;
 23+ }
 24+
 25+ @Override
 26+ public Token next() throws IOException {
 27+ if(next != null){
 28+ Token t = next;
 29+ next = null;
 30+ return t;
 31+ }
 32+ if(tokensIt.hasNext()){
 33+ Entry<String,String> map = tokensIt.next();
 34+ String iw = map.getKey()+":"; // e.g. en:
 35+ String title = map.getValue().toLowerCase(); // e.g. "douglas adams"
 36+ Token t = new Token(iw,start,start+iw.length());
 37+ start += iw.length()+1;
 38+ next = new Token(title,start,start+title.length());
 39+ start += title.length()+1;
 40+
 41+ return t;
 42+ } else
 43+ return null;
 44+ }
 45+
 46+ }
 47+
 48+ HashMap<String,String> interwiki;
 49+
 50+ public InterwikiAnalyzer(HashMap<String,String> interwiki) {
 51+ this.interwiki = interwiki;
 52+ }
 53+
 54+ @Override
 55+ public TokenStream tokenStream(String fieldName, Reader reader) {
 56+ return new InterwikiTokenStream();
 57+ }
 58+
 59+}
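A sketch of what the stream emits (map contents hypothetical; the reader argument is ignored by this analyzer):

    HashMap<String,String> iw = new HashMap<String,String>();
    iw.put("en", "Douglas Adams");
    TokenStream ts = new InterwikiAnalyzer(iw).tokenStream("interwiki", null);
    Token prefix = ts.next(); // "en:"
    Token title = ts.next();  // "douglas adams" (target title, lowercased)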
Index: branches/lucene-search-2.1/build.xml
@@ -6,9 +6,11 @@
77 <property name="jars" value="jars"/>
88 <property name="dist" location="dist"/>
99 <property name="pack.name" value="lucene-search-2.1"/>
 10+ <property name="src.name" value="lucene-search-src-2.1"/>
1011 <property name="binary.name" value="ls2-bin"/>
1112 <property name="jar.name" value="LuceneSearch.jar"/>
1213 <property name="include" value="src/** lib/** sql/** test-data/** webinterface/** *-example *.txt lsearch* build.xml scripts/*"/>
 14+ <property name="include.src" value="src/** sql/** build.xml scripts/*"/>
1315
1416 <property file="${basedir}/hostname"/>
1517
@@ -82,6 +84,19 @@
8385 <delete file="${dist}/${pack.name}.tar"/>
8486 </target>
8587
 88+ <target name="pack-src" description="Make tar.gz distribution of only core source files">
 89+ <mkdir dir="${dist}"/>
 90+ <delete file="${dist}/${src.name}.tar"/>
 91+ <delete file="${dist}/${src.name}.tar.gz"/>
 92+ <tar tarfile="${dist}/${src.name}.tar">
 93+ <tarfileset prefix="${pack.name}" dir="." includes="${include.src}"/>
 94+ </tar>
 95+
 96+ <gzip zipfile="${dist}/${src.name}.tar.gz" src="${dist}/${src.name}.tar"/>
 97+ <delete file="${dist}/${src.name}.tar"/>
 98+ </target>
 99+
 100+
86101 <target name="binary" depends="alljar" description="Make binary tar.gz distribution">
87102 <mkdir dir="${bin}"/>
88103 <delete file="${dist}/${binary.name}.tar"/>
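With this target in place, running "ant pack-src" should produce dist/lucene-search-src-2.1.tar.gz containing only src/**, sql/**, build.xml and scripts/*.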

Status & tagging log