Index: branches/lucene-search-2.1/test-data/mwsearch-global.test |
— | — | @@ -9,7 +9,7 @@ |
10 | 10 | # aspell <language> |
11 | 11 | [Database] |
12 | 12 | entest : (mainsplit), (mainpart,false,2,10), (restpart,true,2) |
13 | | -entest : (ngram), (aspell,en) |
| 13 | +entest : (ngram), (suggest,1,2,3) |
14 | 14 | detest,rutest : (single,true,2,10) |
15 | 15 | frtest : (split,3) (part1) (part2) (part3) |
16 | 16 | srwiki : (single) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -190,6 +190,12 @@ |
191 | 191 | assertEquals("http://rs.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("rswikimedia")); |
192 | 192 | assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki")); |
193 | 193 | |
| 194 | + // test suggest tag |
| 195 | + Hashtable<String,String> sug = testgc.getDBParams("entest","suggest"); |
| 196 | + assertEquals("1",sug.get("wordsMinFreq")); |
| 197 | + assertEquals("2",sug.get("titlesWordsMinFreq")); |
| 198 | + assertEquals("3",sug.get("titlesPhrasesMinFreq")); |
| 199 | + |
194 | 200 | } catch (MalformedURLException e) { |
195 | 201 | e.printStackTrace(); |
196 | 202 | } catch (IOException e) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java |
— | — | @@ -0,0 +1,635 @@ |
| 2 | +package org.wikimedia.lsearch.test; |
| 3 | + |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.IOException; |
| 6 | +import java.io.InputStreamReader; |
| 7 | +import java.util.ArrayList; |
| 8 | + |
| 9 | +import org.apache.lucene.index.IndexReader; |
| 10 | +import org.apache.lucene.search.spell.SpellChecker; |
| 11 | +import org.apache.lucene.store.FSDirectory; |
| 12 | +import org.wikimedia.lsearch.config.Configuration; |
| 13 | +import org.wikimedia.lsearch.config.IndexId; |
| 14 | +import org.wikimedia.lsearch.config.IndexRegistry; |
| 15 | +import org.wikimedia.lsearch.suggest.Suggest; |
| 16 | +import org.wikimedia.lsearch.suggest.SuggestResult; |
| 17 | +import org.wikimedia.lsearch.suggest.Suggest.SuggestSplit; |
| 18 | + |
| 19 | +public class SuggestTest { |
| 20 | + |
| 21 | + public static void testSpellCheck(String dbname) throws IOException{ |
| 22 | + IndexId iid = IndexId.get(dbname); |
| 23 | + SpellChecker sc = new SpellChecker(FSDirectory.getDirectory(iid.getSpellcheckPath(),false)); |
| 24 | + IndexReader ir = IndexReader.open(iid.getSuggestCleanPath()); |
| 25 | + int good=0; |
| 26 | + int bad=0; |
| 27 | + long start = System.currentTimeMillis(); |
| 28 | + for(String[] m : DATA){ |
| 29 | + String[] res = sc.suggestSimilar(m[0],20,ir,"contents",true); |
| 30 | + if(res.length > 0 && m[1].equals(res[0])) |
| 31 | + good++; |
| 32 | + else{ |
| 33 | + reportBad(m[0],m[1],res.length>0? res[0] : ""); |
| 34 | + bad++; |
| 35 | + } |
| 36 | + } |
| 37 | + int total = good + bad; |
| 38 | + long delta = System.currentTimeMillis() - start; |
| 39 | + System.out.println("SpellCheck test ("+delta+"ms): good: "+good+" ("+((double)good/total*100)+"%), bad: "+bad+", total="+total); |
| 40 | + } |
| 41 | + |
| 42 | + public static void testSuggest(String dbname) throws IOException{ |
| 43 | + IndexId iid = IndexId.get(dbname); |
| 44 | + Suggest sc = new Suggest(iid); |
| 45 | + int good=0; |
| 46 | + int bad=0; |
| 47 | + long start = System.currentTimeMillis(); |
| 48 | + for(String[] m : DATA){ |
| 49 | + ArrayList<SuggestResult> res = sc.suggestWords(m[0],5); |
| 50 | + if(res.size() > 0){ |
| 51 | + SuggestResult r = res.get(0); |
| 52 | + if(r.getWord().equals(m[1])) |
| 53 | + good++; |
| 54 | + else if(r.getWord().equals(m[0]) && res.size()>1 && res.get(1).getFrequency()>r.getFrequency() |
| 55 | + && res.get(1).getWord().equals(m[1])) |
| 56 | + good++; |
| 57 | + else if(r.getDist() > 1){ |
| 58 | + ArrayList<SuggestSplit> split = sc.suggestSplitFromTitle(m[0]); |
| 59 | + if(split.size()>0 && m[1].equals(split.get(0).getWord())) |
| 60 | + good++; |
| 61 | + else{ |
| 62 | + reportBad(m[0],m[1],r.getWord()); |
| 63 | + bad++; |
| 64 | + } |
| 65 | + |
| 66 | + } |
| 67 | + else{ |
| 68 | + reportBad(m[0],m[1],r.getWord()); |
| 69 | + bad++; |
| 70 | + } |
| 71 | + } else{ |
| 72 | + reportBad(m[0],m[1],""); |
| 73 | + bad++; |
| 74 | + } |
| 75 | + } |
| 76 | + int total = good + bad; |
| 77 | + long delta = System.currentTimeMillis() - start; |
| 78 | + System.out.println("Suggest test ("+delta+"ms): good: "+good+" ("+((double)good/total*100)+"%), bad: "+bad+", total="+total); |
| 79 | + } |
| 80 | + |
| 81 | + public static void reportBad(String bad, String expected, String got){ |
| 82 | + System.out.println("FOR ["+bad+"] EXPECTED: ["+expected+"], BUT GOT ["+got+"]"); |
| 83 | + } |
| 84 | + |
| 85 | + public static void main(String[] args) throws IOException{ |
| 86 | + Configuration.open(); |
| 87 | + String dbname = "wikilucene"; |
| 88 | + if(args.length==1) |
| 89 | + dbname = args[0]; |
| 90 | + |
| 91 | + testSpellCheck(dbname); |
| 92 | + testSuggest(dbname); |
| 93 | + } |
| 94 | + |
| 95 | + |
| 96 | + private static final String[][] DATA = { { |
| 97 | + "abilitey", "ability" }, { |
| 98 | + "abouy", "about" }, { |
| 99 | + "absorbtion", "absorption" }, { |
| 100 | + "accidently", "accidentally" }, { |
| 101 | + "accomodate", "accommodate" }, { |
| 102 | + "acommadate", "accommodate" }, { |
| 103 | + "acord", "accord" }, { |
| 104 | + "adultry", "adultery" }, { |
| 105 | + "aggresive", "aggressive" }, { |
| 106 | + "alchohol", "alcohol" }, { |
| 107 | + "alchoholic", "alcoholic" }, { |
| 108 | + "allieve", "alive" }, { |
| 109 | + "alot", "a lot" }, { |
| 110 | + "alright", "all right" }, { |
| 111 | + "amature", "amateur" }, { |
| 112 | + "ambivilant", "ambivalent" }, { |
| 113 | + "amification", "amplification" }, { |
| 114 | + "amourfous", "amorphous" }, { |
| 115 | + "annoint", "anoint" }, { |
| 116 | + "annonsment", "announcement" }, { |
| 117 | + "annoyting", "anting" }, { |
| 118 | + "annuncio", "announce" }, { |
| 119 | + "anonomy", "anatomy" }, { |
| 120 | + "anotomy", "anatomy" }, { |
| 121 | + "antidesestablishmentarianism", "antidisestablishmentarianism" }, { |
| 122 | + "antidisestablishmentarism", "antidisestablishmentarianism" }, { |
| 123 | + "anynomous", "anonymous" }, { |
| 124 | + "appelet", "applet" }, { |
| 125 | + "appreceiated", "appreciated" }, { |
| 126 | + "appresteate", "appreciate" }, { |
| 127 | + "aquantance", "acquaintance" }, { |
| 128 | + "aratictature", "architecture" }, { |
| 129 | + "archeype", "archetype" }, { |
| 130 | + "aricticure", "architecture" }, { |
| 131 | + "artic", "arctic" }, { |
| 132 | + "asentote", "asymptote" }, { |
| 133 | + "ast", "at" }, { |
| 134 | + "asterick", "asterisk" }, { |
| 135 | + "asymetric", "asymmetric" }, { |
| 136 | + "atentively", "attentively" }, { |
| 137 | + "autoamlly", "automatically" }, { |
| 138 | + "bankrot", "bankrupt" }, { |
| 139 | + "basicly", "basically" }, { |
| 140 | + "batallion", "battalion" }, { |
| 141 | + "bbrose", "browse" }, { |
| 142 | + "beauro", "bureau" }, { |
| 143 | + "beaurocracy", "bureaucracy" }, { |
| 144 | + "beggining", "beginning" }, { |
| 145 | + "beging", "beginning" }, { |
| 146 | + "behaviour", "behavior" }, { |
| 147 | + "beleive", "believe" }, { |
| 148 | + "belive", "believe" }, { |
| 149 | + "benidifs", "benefits" }, { |
| 150 | + "bigginging", "beginning" }, { |
| 151 | + "blait", "bleat" }, { |
| 152 | + "bouyant", "buoyant" }, { |
| 153 | + "boygot", "boycott" }, { |
| 154 | + "brocolli", "broccoli" }, { |
| 155 | + "buch", "bush" }, { |
| 156 | + "buder", "butter" }, { |
| 157 | + "budr", "butter" }, { |
| 158 | + "budter", "butter" }, { |
| 159 | + "buracracy", "bureaucracy" }, { |
| 160 | + "burracracy", "bureaucracy" }, { |
| 161 | + "buton", "button" }, { |
| 162 | + "byby", "by by" }, { |
| 163 | + "cauler", "caller" }, { |
| 164 | + "ceasar", "caesar" }, { |
| 165 | + "cemetary", "cemetery" }, { |
| 166 | + "changeing", "changing" }, { |
| 167 | + "cheet", "cheat" }, { |
| 168 | + "cicle", "circle" }, { |
| 169 | + "cimplicity", "simplicity" }, { |
| 170 | + "circumstaces", "circumstances" }, { |
| 171 | + "clob", "club" }, { |
| 172 | + "coaln", "colon" }, { |
| 173 | + "cocamena", "cockamamie" }, { |
| 174 | + "colleaque", "colleague" }, { |
| 175 | + "colloquilism", "colloquialism" }, { |
| 176 | + "columne", "column" }, { |
| 177 | + "comiler", "compiler" }, { |
| 178 | + "comitmment", "commitment" }, { |
| 179 | + "comitte", "committee" }, { |
| 180 | + "comittmen", "commitment" }, { |
| 181 | + "comittmend", "commitment" }, { |
| 182 | + "commerciasl", "commercials" }, { |
| 183 | + "commited", "committed" }, { |
| 184 | + "commitee", "committee" }, { |
| 185 | + "companys", "companies" }, { |
| 186 | + "compicated", "complicated" }, { |
| 187 | + "comupter", "computer" }, { |
| 188 | + "concensus", "consensus" }, { |
| 189 | + "confusionism", "confucianism" }, { |
| 190 | + "congradulations", "congratulations" }, { |
| 191 | + "conibation", "contribution" }, { |
| 192 | + "consident", "consistent" }, { |
| 193 | + "consident", "consonant" }, { |
| 194 | + "contast", "constant" }, { |
| 195 | + "contastant", "constant" }, { |
| 196 | + "contunie", "continue" }, { |
| 197 | + "cooly", "coolly" }, { |
| 198 | + "copping", "coping" }, { |
| 199 | + "cosmoplyton", "cosmopolitan" }, { |
| 200 | + "courst", "court" }, { |
| 201 | + "crasy", "crazy" }, { |
| 202 | + "cravets", "caveats" }, { |
| 203 | + "credetability", "credibility" }, { |
| 204 | + "criqitue", "critique" }, { |
| 205 | + "croke", "croak" }, { |
| 206 | + "crucifiction", "crucifixion" }, { |
| 207 | + "crusifed", "crucified" }, { |
| 208 | + "ctitique", "critique" }, { |
| 209 | + "cumba", "combo" }, { |
| 210 | + "custamisation", "customization" }, { |
| 211 | + "dag", "dog" }, { |
| 212 | + "daly", "daily" }, { |
| 213 | + "danguages", "dangerous" }, { |
| 214 | + "deaft", "draft" }, { |
| 215 | + "defence", "defense" }, { |
| 216 | + "defenly", "defiantly" }, { |
| 217 | + "definate", "definite" }, { |
| 218 | + "definately", "definitely" }, { |
| 219 | + "dependeble", "dependable" }, { |
| 220 | + "descrption", "description" }, { |
| 221 | + "descrptn", "description" }, { |
| 222 | + "desparate", "desperate" }, { |
| 223 | + "dessicate", "desiccate" }, { |
| 224 | + "destint", "distant" }, { |
| 225 | + "develepment", "developments" }, { |
| 226 | + "developement", "development" }, { |
| 227 | + "develpond", "development" }, { |
| 228 | + "devulge", "divulge" }, { |
| 229 | + "diagree", "disagree" }, { |
| 230 | + "dieties", "deities" }, { |
| 231 | + "dinasaur", "dinosaur" }, { |
| 232 | + "dinasour", "dinosaur" }, { |
| 233 | + "direcyly", "directly" }, { |
| 234 | + "discuess", "discuss" }, { |
| 235 | + "disect", "dissect" }, { |
| 236 | + "disippate", "dissipate" }, { |
| 237 | + "disition", "decision" }, { |
| 238 | + "dispair", "despair" }, { |
| 239 | + "disssicion", "discussion" }, { |
| 240 | + "distarct", "distract" }, { |
| 241 | + "distart", "distort" }, { |
| 242 | + "distroy", "destroy" }, { |
| 243 | + "documtations", "documentation" }, { |
| 244 | + "doenload", "download" }, { |
| 245 | + "dongle", "dangle" }, { |
| 246 | + "doog", "dog" }, { |
| 247 | + "dramaticly", "dramatically" }, { |
| 248 | + "drunkeness", "drunkenness" }, { |
| 249 | + "ductioneery", "dictionary" }, { |
| 250 | + "dur", "due" }, { |
| 251 | + "duren", "during" }, { |
| 252 | + "dymatic", "dynamic" }, { |
| 253 | + "dynaic", "dynamic" }, { |
| 254 | + "ecstacy", "ecstasy" }, { |
| 255 | + "efficat", "efficient" }, { |
| 256 | + "efficity", "efficacy" }, { |
| 257 | + "effots", "efforts" }, { |
| 258 | + "egsistence", "existence" }, { |
| 259 | + "eitiology", "etiology" }, { |
| 260 | + "elagent", "elegant" }, { |
| 261 | + "elligit", "elegant" }, { |
| 262 | + "embarass", "embarrass" }, { |
| 263 | + "embarassment", "embarrassment" }, { |
| 264 | + "embaress", "embarrass" }, { |
| 265 | + "encapsualtion", "encapsulation" }, { |
| 266 | + "encyclapidia", "encyclopedia" }, { |
| 267 | + "encyclopia", "encyclopedia" }, { |
| 268 | + "engins", "engine" }, { |
| 269 | + "enhence", "enhance" }, { |
| 270 | + "enligtment", "Enlightenment" }, { |
| 271 | + "ennuui", "ennui" }, { |
| 272 | + "enought", "enough" }, { |
| 273 | + "enventions", "inventions" }, { |
| 274 | + "envireminakl", "environmental" }, { |
| 275 | + "enviroment", "environment" }, { |
| 276 | + "epitomy", "epitome" }, { |
| 277 | + "equire", "acquire" }, { |
| 278 | + "errara", "error" }, { |
| 279 | + "erro", "error" }, { |
| 280 | + "evaualtion", "evaluation" }, { |
| 281 | + "evething", "everything" }, { |
| 282 | + "evtually", "eventually" }, { |
| 283 | + "excede", "exceed" }, { |
| 284 | + "excercise", "exercise" }, { |
| 285 | + "excpt", "except" }, { |
| 286 | + "excution", "execution" }, { |
| 287 | + "exhileration", "exhilaration" }, { |
| 288 | + "existance", "existence" }, { |
| 289 | + "expleyly", "explicitly" }, { |
| 290 | + "explity", "explicitly" }, { |
| 291 | + "expresso", "espresso" }, { |
| 292 | + "exspidient", "expedient" }, { |
| 293 | + "extions", "extensions" }, { |
| 294 | + "factontion", "factorization" }, { |
| 295 | + "failer", "failure" }, { |
| 296 | + "famdasy", "fantasy" }, { |
| 297 | + "faver", "favor" }, { |
| 298 | + "faxe", "fax" }, { |
| 299 | + "febuary", "february" }, { |
| 300 | + "firey", "fiery" }, { |
| 301 | + "fistival", "festival" }, { |
| 302 | + "flatterring", "flattering" }, { |
| 303 | + "fluk", "flux" }, { |
| 304 | + "flukse", "flux" }, { |
| 305 | + "fone", "phone" }, { |
| 306 | + "forsee", "foresee" }, { |
| 307 | + "frustartaion", "frustrating" }, { |
| 308 | + "fuction", "function" }, { |
| 309 | + "funetik", "phonetic" }, { |
| 310 | + "futs", "guts" }, { |
| 311 | + "gamne", "came" }, { |
| 312 | + "gaurd", "guard" }, { |
| 313 | + "generly", "generally" }, { |
| 314 | + "ghandi", "gandhi" }, { |
| 315 | + "goberment", "government" }, { |
| 316 | + "gobernement", "government" }, { |
| 317 | + "gobernment", "government" }, { |
| 318 | + "gotton", "gotten" }, { |
| 319 | + "gracefull", "graceful" }, { |
| 320 | + "gradualy", "gradually" }, { |
| 321 | + "grammer", "grammar" }, { |
| 322 | + "hallo", "hello" }, { |
| 323 | + "hapily", "happily" }, { |
| 324 | + "harrass", "harass" }, { |
| 325 | + "havne", "have" }, { |
| 326 | + "heellp", "help" }, { |
| 327 | + "heighth", "height" }, { |
| 328 | + "hellp", "help" }, { |
| 329 | + "helo", "hello" }, { |
| 330 | + "herlo", "hello" }, { |
| 331 | + "hifin", "hyphen" }, { |
| 332 | + "hifine", "hyphen" }, { |
| 333 | + "higer", "higher" }, { |
| 334 | + "hiphine", "hyphen" }, { |
| 335 | + "hippie", "hippy" }, { |
| 336 | + "hippopotamous", "hippopotamus" }, { |
| 337 | + "hlp", "help" }, { |
| 338 | + "hourse", "horse" }, { |
| 339 | + "houssing", "housing" }, { |
| 340 | + "howaver", "however" }, { |
| 341 | + "howver", "however" }, { |
| 342 | + "humaniti", "humanity" }, { |
| 343 | + "hyfin", "hyphen" }, { |
| 344 | + "hypotathes", "hypothesis" }, { |
| 345 | + "hypotathese", "hypothesis" }, { |
| 346 | + "hystrical", "hysterical" }, { |
| 347 | + "ident", "indent" }, { |
| 348 | + "illegitament", "illegitimate" }, { |
| 349 | + "imbed", "embed" }, { |
| 350 | + "imediaetly", "immediately" }, { |
| 351 | + "imfamy", "infamy" }, { |
| 352 | + "immenant", "immanent" }, { |
| 353 | + "implemtes", "implements" }, { |
| 354 | + "inadvertant", "inadvertent" }, { |
| 355 | + "incase", "in case" }, { |
| 356 | + "incedious", "insidious" }, { |
| 357 | + "incompleet", "incomplete" }, { |
| 358 | + "incomplot", "incomplete" }, { |
| 359 | + "inconvenant", "inconvenient" }, { |
| 360 | + "inconvience", "inconvenience" }, { |
| 361 | + "independant", "independent" }, { |
| 362 | + "independenent", "independent" }, { |
| 363 | + "indepnends", "independent" }, { |
| 364 | + "indepth", "in depth" }, { |
| 365 | + "indispensible", "indispensable" }, { |
| 366 | + "inefficite", "inefficient" }, { |
| 367 | + "inerface", "interface" }, { |
| 368 | + "infact", "in fact" }, { |
| 369 | + "influencial", "influential" }, { |
| 370 | + "inital", "initial" }, { |
| 371 | + "initinized", "initialized" }, { |
| 372 | + "initized", "initialized" }, { |
| 373 | + "innoculate", "inoculate" }, { |
| 374 | + "insistant", "insistent" }, { |
| 375 | + "insistenet", "insistent" }, { |
| 376 | + "instulation", "installation" }, { |
| 377 | + "intealignt", "intelligent" }, { |
| 378 | + "intejilent", "intelligent" }, { |
| 379 | + "intelegent", "intelligent" }, { |
| 380 | + "intelegnent", "intelligent" }, { |
| 381 | + "intelejent", "intelligent" }, { |
| 382 | + "inteligent", "intelligent" }, { |
| 383 | + "intelignt", "intelligent" }, { |
| 384 | + "intellagant", "intelligent" }, { |
| 385 | + "intellegent", "intelligent" }, { |
| 386 | + "intellegint", "intelligent" }, { |
| 387 | + "intellgnt", "intelligent" }, { |
| 388 | + "intensionality", "intensionally" }, { |
| 389 | + "interate", "iterate" }, { |
| 390 | + "internation", "international" }, { |
| 391 | + "interpretate", "interpret" }, { |
| 392 | + "interpretter", "interpreter" }, { |
| 393 | + "intertes", "interested" }, { |
| 394 | + "intertesd", "interested" }, { |
| 395 | + "invermeantial", "environmental" }, { |
| 396 | + "irregardless", "regardless" }, { |
| 397 | + "irresistable", "irresistible" }, { |
| 398 | + "irritible", "irritable" }, { |
| 399 | + "islams", "muslims" }, { |
| 400 | + "isotrop", "isotope" }, { |
| 401 | + "isreal", "israel" }, { |
| 402 | + "johhn", "john" }, { |
| 403 | + "judgement", "judgment" }, { |
| 404 | + "kippur", "kipper" }, { |
| 405 | + "knawing", "knowing" }, { |
| 406 | + "latext", "latest" }, { |
| 407 | + "leasve", "leave" }, { |
| 408 | + "lesure", "leisure" }, { |
| 409 | + "liasion", "lesion" }, { |
| 410 | + "liason", "liaison" }, { |
| 411 | + "libary", "library" }, { |
| 412 | + "likly", "likely" }, { |
| 413 | + "lilometer", "kilometer" }, { |
| 414 | + "liquify", "liquefy" }, { |
| 415 | + "lloyer", "layer" }, { |
| 416 | + "lossing", "losing" }, { |
| 417 | + "luser", "laser" }, { |
| 418 | + "maintanence", "maintenance" }, { |
| 419 | + "majaerly", "majority" }, { |
| 420 | + "majoraly", "majority" }, { |
| 421 | + "maks", "masks" }, { |
| 422 | + "mandelbrot", "Mandelbrot" }, { |
| 423 | + "mant", "want" }, { |
| 424 | + "marshall", "marshal" }, { |
| 425 | + "maxium", "maximum" }, { |
| 426 | + "meory", "memory" }, { |
| 427 | + "metter", "better" }, { |
| 428 | + "mic", "mike" }, { |
| 429 | + "midia", "media" }, { |
| 430 | + "millenium", "millennium" }, { |
| 431 | + "miniscule", "minuscule" }, { |
| 432 | + "minkay", "monkey" }, { |
| 433 | + "minum", "minimum" }, { |
| 434 | + "mischievious", "mischievous" }, { |
| 435 | + "misilous", "miscellaneous" }, { |
| 436 | + "momento", "memento" }, { |
| 437 | + "monkay", "monkey" }, { |
| 438 | + "mosaik", "mosaic" }, { |
| 439 | + "mostlikely", "most likely" }, { |
| 440 | + "mousr", "mouser" }, { |
| 441 | + "mroe", "more" }, { |
| 442 | + "neccessary", "necessary" }, { |
| 443 | + "necesary", "necessary" }, { |
| 444 | + "necesser", "necessary" }, { |
| 445 | + "neice", "niece" }, { |
| 446 | + "neighbour", "neighbor" }, { |
| 447 | + "nemonic", "pneumonic" }, { |
| 448 | + "nevade", "Nevada" }, { |
| 449 | + "nickleodeon", "nickelodeon" }, { |
| 450 | + "nieve", "naive" }, { |
| 451 | + "noone", "no one" }, { |
| 452 | + "noticably", "noticeably" }, { |
| 453 | + "notin", "not in" }, { |
| 454 | + "nozled", "nuzzled" }, { |
| 455 | + "objectsion", "objects" }, { |
| 456 | + "obsfuscate", "obfuscate" }, { |
| 457 | + "ocassion", "occasion" }, { |
| 458 | + "occuppied", "occupied" }, { |
| 459 | + "occurence", "occurrence" }, { |
| 460 | + "octagenarian", "octogenarian" }, { |
| 461 | + "olf", "old" }, { |
| 462 | + "opposim", "opossum" }, { |
| 463 | + "organise", "organize" }, { |
| 464 | + "organiz", "organize" }, { |
| 465 | + "orientate", "orient" }, { |
| 466 | + "oscilascope", "oscilloscope" }, { |
| 467 | + "oving", "moving" }, { |
| 468 | + "paramers", "parameters" }, { |
| 469 | + "parametic", "parameter" }, { |
| 470 | + "paranets", "parameters" }, { |
| 471 | + "partrucal", "particular" }, { |
| 472 | + "pataphysical", "metaphysical" }, { |
| 473 | + "patten", "pattern" }, { |
| 474 | + "permissable", "permissible" }, { |
| 475 | + "permition", "permission" }, { |
| 476 | + "permmasivie", "permissive" }, { |
| 477 | + "perogative", "prerogative" }, { |
| 478 | + "persue", "pursue" }, { |
| 479 | + "phantasia", "fantasia" }, { |
| 480 | + "phenominal", "phenomenal" }, { |
| 481 | + "picaresque", "picturesque" }, { |
| 482 | + "playwrite", "playwright" }, { |
| 483 | + "poeses", "poesies" }, { |
| 484 | + "polation", "politician" }, { |
| 485 | + "poligamy", "polygamy" }, { |
| 486 | + "politict", "politic" }, { |
| 487 | + "pollice", "police" }, { |
| 488 | + "polypropalene", "polypropylene" }, { |
| 489 | + "pompom", "pompon" }, { |
| 490 | + "possable", "possible" }, { |
| 491 | + "practicle", "practical" }, { |
| 492 | + "pragmaticism", "pragmatism" }, { |
| 493 | + "preceeding", "preceding" }, { |
| 494 | + "precion", "precision" }, { |
| 495 | + "precios", "precision" }, { |
| 496 | + "preemptory", "peremptory" }, { |
| 497 | + "prefices", "prefixes" }, { |
| 498 | + "prefixt", "prefixed" }, { |
| 499 | + "presbyterian", "Presbyterian" }, { |
| 500 | + "presue", "pursue" }, { |
| 501 | + "presued", "pursued" }, { |
| 502 | + "privielage", "privilege" }, { |
| 503 | + "priviledge", "privilege" }, { |
| 504 | + "proceedures", "procedures" }, { |
| 505 | + "pronensiation", "pronunciation" }, { |
| 506 | + "pronisation", "pronunciation" }, { |
| 507 | + "pronounciation", "pronunciation" }, { |
| 508 | + "properally", "properly" }, { |
| 509 | + "proplematic", "problematic" }, { |
| 510 | + "protray", "portray" }, { |
| 511 | + "pscolgst", "psychologist" }, { |
| 512 | + "psicolagest", "psychologist" }, { |
| 513 | + "psycolagest", "psychologist" }, { |
| 514 | + "quoz", "quiz" }, { |
| 515 | + "radious", "radius" }, { |
| 516 | + "ramplily", "rampantly" }, { |
| 517 | + "reccomend", "recommend" }, { |
| 518 | + "reccona", "raccoon" }, { |
| 519 | + "recieve", "receive" }, { |
| 520 | + "reconise", "recognize" }, { |
| 521 | + "rectangeles", "rectangle" }, { |
| 522 | + "redign", "redesign" }, { |
| 523 | + "reoccurring", "recurring" }, { |
| 524 | + "repitition", "repetition" }, { |
| 525 | + "replasments", "replacement" }, { |
| 526 | + "reposable", "responsible" }, { |
| 527 | + "reseblence", "resemblance" }, { |
| 528 | + "respct", "respect" }, { |
| 529 | + "respecally", "respectfully" }, { |
| 530 | + "roon", "room" }, { |
| 531 | + "rought", "roughly" }, { |
| 532 | + "rsx", "RSX" }, { |
| 533 | + "rudemtry", "rudimentary" }, { |
| 534 | + "runnung", "running" }, { |
| 535 | + "sacreligious", "sacrilegious" }, { |
| 536 | + "saftly", "safely" }, { |
| 537 | + "salut", "salute" }, { |
| 538 | + "satifly", "satisfy" }, { |
| 539 | + "scrabdle", "scrabble" }, { |
| 540 | + "searcheable", "searchable" }, { |
| 541 | + "secion", "section" }, { |
| 542 | + "seferal", "several" }, { |
| 543 | + "segements", "segments" }, { |
| 544 | + "sence", "sense" }, { |
| 545 | + "seperate", "separate" }, { |
| 546 | + "sherbert", "sherbet" }, { |
| 547 | + "sicolagest", "psychologist" }, { |
| 548 | + "sieze", "seize" }, { |
| 549 | + "simpfilty", "simplicity" }, { |
| 550 | + "simplye", "simply" }, { |
| 551 | + "singal", "signal" }, { |
| 552 | + "sitte", "site" }, { |
| 553 | + "situration", "situation" }, { |
| 554 | + "slyph", "sylph" }, { |
| 555 | + "smil", "smile" }, { |
| 556 | + "snuck", "sneaked" }, { |
| 557 | + "sometmes", "sometimes" }, { |
| 558 | + "soonec", "sonic" }, { |
| 559 | + "specificialy", "specifically" }, { |
| 560 | + "spel", "spell" }, { |
| 561 | + "spoak", "spoke" }, { |
| 562 | + "sponsered", "sponsored" }, { |
| 563 | + "stering", "steering" }, { |
| 564 | + "straightjacket", "straitjacket" }, { |
| 565 | + "stumach", "stomach" }, { |
| 566 | + "stutent", "student" }, { |
| 567 | + "styleguide", "style guide" }, { |
| 568 | + "subisitions", "substitutions" }, { |
| 569 | + "subjecribed", "subscribed" }, { |
| 570 | + "subpena", "subpoena" }, { |
| 571 | + "substations", "substitutions" }, { |
| 572 | + "suger", "sugar" }, { |
| 573 | + "supercede", "supersede" }, { |
| 574 | + "superfulous", "superfluous" }, { |
| 575 | + "susan", "Susan" }, { |
| 576 | + "swimwear", "swim wear" }, { |
| 577 | + "syncorization", "synchronization" }, { |
| 578 | + "taff", "tough" }, { |
| 579 | + "taht", "that" }, { |
| 580 | + "tattos", "tattoos" }, { |
| 581 | + "techniquely", "technically" }, { |
| 582 | + "teh", "the" }, { |
| 583 | + "tem", "team" }, { |
| 584 | + "teo", "two" }, { |
| 585 | + "teridical", "theoretical" }, { |
| 586 | + "tesst", "test" }, { |
| 587 | + "tets", "tests" }, { |
| 588 | + "thanot", "than or" }, { |
| 589 | + "theirselves", "themselves" }, { |
| 590 | + "theridically", "theoretical" }, { |
| 591 | + "thredically", "theoretically" }, { |
| 592 | + "thruout", "throughout" }, { |
| 593 | + "ths", "this" }, { |
| 594 | + "titalate", "titillate" }, { |
| 595 | + "tobagan", "tobaggon" }, { |
| 596 | + "tommorrow", "tomorrow" }, { |
| 597 | + "tomorow", "tomorrow" }, { |
| 598 | + "tradegy", "tragedy" }, { |
| 599 | + "trubbel", "trouble" }, { |
| 600 | + "ttest", "test" }, { |
| 601 | + "tunnellike", "tunnel like" }, { |
| 602 | + "tured", "turned" }, { |
| 603 | + "tyrrany", "tyranny" }, { |
| 604 | + "unatourral", "unnatural" }, { |
| 605 | + "unaturral", "unnatural" }, { |
| 606 | + "unconisitional", "unconstitutional" }, { |
| 607 | + "unconscience", "unconscious" }, { |
| 608 | + "underladder", "under ladder" }, { |
| 609 | + "unentelegible", "unintelligible" }, { |
| 610 | + "unfortunently", "unfortunately" }, { |
| 611 | + "unnaturral", "unnatural" }, { |
| 612 | + "upcast", "up cast" }, { |
| 613 | + "upmost", "utmost" }, { |
| 614 | + "uranisium", "uranium" }, { |
| 615 | + "verison", "version" }, { |
| 616 | + "vinagarette", "vinaigrette" }, { |
| 617 | + "volumptuous", "voluptuous" }, { |
| 618 | + "volunteerism", "voluntarism" }, { |
| 619 | + "volye", "volley" }, { |
| 620 | + "wadting", "wasting" }, { |
| 621 | + "waite", "wait" }, { |
| 622 | + "wan't", "won't" }, { |
| 623 | + "warloord", "warlord" }, { |
| 624 | + "whaaat", "what" }, { |
| 625 | + "whard", "ward" }, { |
| 626 | + "whimp", "wimp" }, { |
| 627 | + "wicken", "weaken" }, { |
| 628 | + "wierd", "weird" }, { |
| 629 | + "wrank", "rank" }, { |
| 630 | + "writeen", "righten" }, { |
| 631 | + "writting", "writing" }, { |
| 632 | + "wundeews", "windows" }, { |
| 633 | + "yeild", "yield" }, { |
| 634 | + "youe", "your" } |
| 635 | +}; |
| 636 | +} |
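The DATA table doubles as a regression corpus. For ad-hoc checks outside the test harness, a minimal driver can call the suggest API directly, using only classes and methods that appear in the test above (Configuration.open, IndexId.get, Suggest.suggestWords, and the SuggestResult accessors):

import java.util.ArrayList;
import org.wikimedia.lsearch.config.Configuration;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.suggest.Suggest;
import org.wikimedia.lsearch.suggest.SuggestResult;

public class SuggestDemo {
	public static void main(String[] args) throws Exception {
		Configuration.open();
		Suggest sc = new Suggest(IndexId.get("wikilucene"));
		// ask for the top 5 candidates for a misspelling from DATA
		ArrayList<SuggestResult> res = sc.suggestWords("belive", 5);
		for(SuggestResult r : res)
			System.out.println(r.getWord()+" (freq="+r.getFrequency()+", dist="+r.getDist()+")");
	}
}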
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -36,7 +36,8 @@ |
37 | 37 | WikiQueryParser.TITLE_BOOST = 2; |
38 | 38 | WikiQueryParser.ALT_TITLE_BOOST = 6; |
39 | 39 | WikiQueryParser.KEYWORD_BOOST = 0.05f; |
40 | | - WikiIndexModifier.ALT_TITLES = 3; |
| 40 | + WikiQueryParser.ADD_TITLE_PHRASES = false; |
| 41 | + WikiIndexModifier.ALT_TITLES = 3; |
41 | 42 | FieldBuilder.BuilderSet bs = new FieldBuilder("").getBuilder(); |
42 | 43 | FieldNameFactory ff = new FieldNameFactory(); |
43 | 44 | try{ |
— | — | @@ -316,6 +317,12 @@ |
317 | 318 | q = parser.parseFourPass("Israeli-Palestinian conflict",NamespacePolicy.IGNORE,true); |
318 | 319 | assertEquals("(+(+(contents:israeli contents:isra^0.5) +contents:palestinian) +contents:conflict) (+(+title:israeli^2.0 +title:palestinian^2.0) +title:conflict^2.0) ((+(+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0) +alttitle1:conflict^6.0) (+(+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0) +alttitle2:conflict^6.0) (+(+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0) +alttitle3:conflict^6.0))",q.toString()); |
319 | 320 | |
| 321 | + // title phrases |
| 322 | + WikiQueryParser.ADD_TITLE_PHRASES = true; |
| 323 | + q = parser.parseFourPass("Israeli Palestinian conflict",NamespacePolicy.IGNORE,true); |
| 324 | + assertEquals("(+(contents:israeli contents:isra^0.5) +contents:palestinian +contents:conflict (title:\"israeli palestinian\"~2^2.0 title:\"palestinian conflict\"~2^2.0)) (+title:israeli^2.0 +title:palestinian^2.0 +title:conflict^2.0) ((+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0 +alttitle1:conflict^6.0) (+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0 +alttitle2:conflict^6.0) (+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0 +alttitle3:conflict^6.0)) (spanNear([keyword1:israeli, keyword1:palestinian, keyword1:conflict], 100, false)^0.05 spanNear([keyword2:israeli, keyword2:palestinian, keyword2:conflict], 100, false)^0.025 spanNear([keyword3:israeli, keyword3:palestinian, keyword3:conflict], 100, false)^0.016666668 spanNear([keyword4:israeli, keyword4:palestinian, keyword4:conflict], 100, false)^0.0125 spanNear([keyword5:israeli, keyword5:palestinian, keyword5:conflict], 100, false)^0.01)",q.toString()); |
| 325 | + WikiQueryParser.ADD_TITLE_PHRASES = false; |
| 326 | + |
320 | 327 | // alternative transliterations |
321 | 328 | q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true); |
322 | 329 | assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) (contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString()); |
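The new ADD_TITLE_PHRASES clauses in the expected string take the form title:"israeli palestinian"~2^2.0, i.e. one sloppy phrase query per adjacent word pair. The parser's code path is not part of this diff, so the following is only an illustration of that output shape built with stock Lucene 2.x APIs, not the actual implementation:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;

public class TitlePhraseExample {
	public static BooleanQuery titlePhrases(String[] words, float boost){
		BooleanQuery bq = new BooleanQuery();
		// one sloppy phrase per adjacent word pair, e.g. "israeli palestinian"
		for(int i=0; i<words.length-1; i++){
			PhraseQuery pq = new PhraseQuery();
			pq.add(new Term("title", words[i]));
			pq.add(new Term("title", words[i+1]));
			pq.setSlop(2);      // the "~2" in the expected output
			pq.setBoost(boost); // the "^2.0" (TITLE_BOOST)
			bq.add(pq, BooleanClause.Occur.SHOULD);
		}
		return bq;
	}
}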
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/Suggest.java |
— | — | @@ -24,6 +24,8 @@ |
25 | 25 | import org.apache.lucene.search.TopDocs; |
26 | 26 | import org.wikimedia.lsearch.beans.SearchResults; |
27 | 27 | import org.wikimedia.lsearch.config.IndexId; |
| 28 | +import org.wikimedia.lsearch.suggest.api.NgramIndexer; |
| 29 | +import org.wikimedia.lsearch.suggest.api.NamespaceFreq; |
28 | 30 | import org.wikimedia.lsearch.suggest.api.WordsIndexer; |
29 | 31 | import org.wikimedia.lsearch.suggest.dist.DoubleMetaphone; |
30 | 32 | import org.wikimedia.lsearch.suggest.dist.EditDistance; |
— | — | @@ -44,7 +46,7 @@ |
45 | 47 | public Suggest(IndexId iid) throws IOException{ |
46 | 48 | this.iid = iid; |
47 | 49 | this.searcher = new IndexSearcher(iid.getSuggestWordsPath()); |
48 | | - this.phrases = new IndexSearcher(iid.getSuggestPhrasesPath()); |
| 50 | + this.phrases = new IndexSearcher(iid.getSuggestTitlesPath()); |
49 | 51 | this.dmeta = new DoubleMetaphone(); |
50 | 52 | } |
51 | 53 | |
— | — | @@ -54,7 +56,7 @@ |
55 | 57 | BooleanQuery bq = new BooleanQuery(); |
56 | 58 | addQuery(bq,"metaphone1",meta1,2); |
57 | 59 | addQuery(bq,"metaphone2",meta2,2); |
58 | | - bq.add(makeWordQuery(word),BooleanClause.Occur.SHOULD); |
| 60 | + bq.add(makeWordQuery(word,""),BooleanClause.Occur.SHOULD); |
59 | 61 | |
60 | 62 | try { |
61 | 63 | TopDocs docs = searcher.search(bq,null,POOL); |
— | — | @@ -126,13 +128,14 @@ |
127 | 129 | return Math.log10(1+score*99)/2; |
128 | 130 | } |
129 | 131 | |
130 | | - public Query makeWordQuery(String word){ |
| 132 | + public Query makeWordQuery(String word, String prefix){ |
131 | 133 | BooleanQuery bq = new BooleanQuery(true); |
132 | | - int min = WordsIndexer.getMinNgram(word); |
133 | | - int max = WordsIndexer.getMaxNgram(word); |
| 134 | + int min = NgramIndexer.getMinNgram(word); |
| 135 | + int max = NgramIndexer.getMaxNgram(word); |
| 136 | + String fieldBase = NgramIndexer.getNgramField(prefix); |
134 | 137 | for(int i=min; i <= max; i++ ){ |
135 | | - String[] ngrams = WordsIndexer.nGrams(word,i); |
136 | | - String field = "ngram"+i; |
| 138 | + String[] ngrams = NgramIndexer.nGrams(word,i); |
| 139 | + String field = fieldBase+i; |
137 | 140 | for(int j=0 ; j<ngrams.length ; j++){ |
138 | 141 | String ngram = ngrams[j]; |
139 | 142 | /*if(j == 0) |
— | — | @@ -244,7 +247,7 @@ |
245 | 248 | try { |
246 | 249 | Hits hits = phrases.search(new TermQuery(new Term("word",word1+word2))); |
247 | 250 | if(hits.length() > 0){ |
248 | | - int freq = Integer.parseInt(hits.doc(0).get("freq")); |
| 251 | + int freq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(0); |
249 | 252 | if(freq >= JOIN_FREQ) |
250 | 253 | return new SuggestResult(word1+word2,freq); |
251 | 254 | } |
— | — | @@ -257,10 +260,10 @@ |
258 | 261 | |
259 | 262 | public ArrayList<SuggestResult> suggestPhrase(String word1, String word2, int num){ |
260 | 263 | String phrase = word1+"_"+word2; |
261 | | - Query q = makeWordQuery(phrase); |
| 264 | + Query q = makeWordQuery(phrase,"phrase"); |
262 | 265 | |
263 | 266 | try { |
264 | | - TopDocs docs = phrases.search(q,null,50); |
| 267 | + TopDocs docs = phrases.search(q,null,200); |
265 | 268 | EditDistance sd = new EditDistance(phrase); |
266 | 269 | ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
267 | 270 | int minfreq = -1; |
— | — | @@ -268,7 +271,7 @@ |
269 | 272 | for(ScoreDoc sc : docs.scoreDocs){ |
270 | 273 | Document d = phrases.doc(sc.doc); |
271 | 274 | SuggestResult r = new SuggestResult(d.get("phrase"), |
272 | | - Integer.parseInt(d.get("freq"))); |
| 275 | + new NamespaceFreq(d.get("freq")).getFrequency(0)); |
273 | 276 | if(phrase.equals(r.word)){ |
274 | 277 | minfreq = r.frequency; |
275 | 278 | } |
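Both hunks above replace a raw integer freq field with a serialized NamespaceFreq read back via getFrequency(0), the main-namespace count. The class itself is not in this diff; per the TitleIndexer javadoc later in this commit, the stored format is space-separated ns:frequency pairs such as "0:234 1:12 14:3". A hypothetical minimal version for illustration only:

import java.util.HashMap;

public class NamespaceFreqSketch {
	protected HashMap<Integer,Integer> freq = new HashMap<Integer,Integer>();

	public NamespaceFreqSketch(){}
	/** Parse the stored "ns:count ns:count ..." representation */
	public NamespaceFreqSketch(String serialized){
		for(String pair : serialized.split(" ")){
			String[] p = pair.split(":");
			freq.put(Integer.parseInt(p[0]), Integer.parseInt(p[1]));
		}
	}
	/** Frequency in one namespace, 0 if never seen */
	public int getFrequency(int ns){
		Integer f = freq.get(ns);
		return f == null? 0 : f;
	}
	public void incFrequency(int ns){
		freq.put(ns, getFrequency(ns)+1);
	}
}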
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/SuggestBuilder.java |
— | — | @@ -3,6 +3,9 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.io.InputStream; |
6 | 6 | import java.util.ArrayList; |
| 7 | +import java.util.HashMap; |
| 8 | +import java.util.HashSet; |
| 9 | +import java.util.Hashtable; |
7 | 10 | import java.util.Map.Entry; |
8 | 11 | |
9 | 12 | import org.apache.log4j.Logger; |
— | — | @@ -10,23 +13,30 @@ |
11 | 14 | import org.apache.lucene.document.Document; |
12 | 15 | import org.apache.lucene.index.IndexReader; |
13 | 16 | import org.apache.lucene.index.Term; |
| 17 | +import org.apache.lucene.search.CachingWrapperFilter; |
| 18 | +import org.apache.lucene.search.Filter; |
14 | 19 | import org.apache.lucene.search.Hits; |
15 | 20 | import org.apache.lucene.search.IndexSearcher; |
| 21 | +import org.apache.lucene.search.PhraseQuery; |
| 22 | +import org.apache.lucene.search.QueryFilter; |
16 | 23 | import org.apache.lucene.search.TermQuery; |
17 | 24 | import org.apache.lucene.store.FSDirectory; |
18 | 25 | import org.mediawiki.dumper.ProgressFilter; |
19 | 26 | import org.mediawiki.dumper.Tools; |
20 | 27 | import org.mediawiki.importer.XmlDumpReader; |
21 | 28 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 29 | +import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
22 | 30 | import org.wikimedia.lsearch.config.Configuration; |
23 | 31 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
24 | 32 | import org.wikimedia.lsearch.config.IndexId; |
25 | 33 | import org.wikimedia.lsearch.config.IndexRegistry; |
26 | 34 | import org.wikimedia.lsearch.importer.DumpImporter; |
| 35 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
27 | 36 | import org.wikimedia.lsearch.suggest.api.LuceneDictionary; |
28 | | -import org.wikimedia.lsearch.suggest.api.PhraseIndexer; |
| 37 | +import org.wikimedia.lsearch.suggest.api.NamespaceFreq; |
| 38 | +import org.wikimedia.lsearch.suggest.api.TitleIndexer; |
29 | 39 | import org.wikimedia.lsearch.suggest.api.WordsIndexer; |
30 | | -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word; |
| 40 | +import org.wikimedia.lsearch.suggest.api.Dictionary.Word; |
31 | 41 | import org.wikimedia.lsearch.util.Localization; |
32 | 42 | import org.wikimedia.lsearch.util.StringCounter; |
33 | 43 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
— | — | @@ -55,8 +65,8 @@ |
56 | 66 | inputfile = args.length>1? args[1] : null; |
57 | 67 | dbname = args[0]; |
58 | 68 | |
59 | | - |
60 | | - String langCode = GlobalConfiguration.getInstance().getLanguage(dbname); |
| 69 | + GlobalConfiguration global = GlobalConfiguration.getInstance(); |
| 70 | + String langCode = global.getLanguage(dbname); |
61 | 71 | // preload |
62 | 72 | UnicodeDecomposer.getInstance(); |
63 | 73 | Localization.readLocalization(langCode); |
— | — | @@ -89,66 +99,37 @@ |
90 | 100 | } |
91 | 101 | } |
92 | 102 | // make words index |
| 103 | + log.info("Making words index"); |
93 | 104 | try { |
94 | 105 | LuceneDictionary dict = new LuceneDictionary(IndexReader.open(iid.getSuggestCleanPath()),"contents"); |
95 | | - WordsIndexer writer = new WordsIndexer(iid.getSuggestWordsPath(),50); |
| 106 | + WordsIndexer writer = new WordsIndexer(iid.getSuggestWordsPath(),(dbname.equals("wikilucene")? 3 : 50)); |
| 107 | + writer.createIndex(); |
96 | 108 | Word word; |
97 | 109 | while((word = dict.next()) != null){ |
98 | 110 | writer.addWord(word); |
99 | 111 | } |
100 | | - writer.close(); |
| 112 | + 			writer.closeAndOptimize(); |
101 | 113 | } catch (IOException e) { |
102 | 114 | log.fatal("Cannot open clean dictionary for "+iid+" : "+e.getMessage()); |
103 | 115 | e.printStackTrace(); |
104 | 116 | return; |
105 | 117 | } |
106 | 118 | |
| 119 | + log.info("Making suggest title index"); |
107 | 120 | // make phrase index |
108 | | - try { |
109 | | - LuceneDictionary dict = new LuceneDictionary(IndexReader.open(iid.getSuggestCleanPath()),"title"); |
110 | | - PhraseIndexer writer = new PhraseIndexer(iid.getSuggestPhrasesPath(),1); |
111 | | - IndexSearcher searcher = new IndexSearcher(iid.getSuggestCleanPath()); |
112 | | - Word word; |
113 | | - while((word = dict.next()) != null){ |
114 | | - // index word |
115 | | - writer.addWord(word); |
116 | | - String w = word.getWord(); |
117 | | - StringCounter counter = new StringCounter(); |
118 | | - Hits hits = searcher.search(new TermQuery(new Term("title",w))); |
119 | | - // find all phrases beginning with word |
120 | | - for(int i=0;i<hits.length();i++){ |
121 | | - Document doc = hits.doc(i); |
122 | | - // get original tokens |
123 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false); |
124 | | - ArrayList<Token> tokens = parser.parse(); |
125 | | - for(int j=0;j<tokens.size()-1;j++){ |
126 | | - Token t = tokens.get(j); |
127 | | - // ignore aliases |
128 | | - if(t.getPositionIncrement() == 0) |
129 | | - continue; |
130 | | - // find phrases beginning with the target word |
131 | | - if(w.equals(t.termText())){ |
132 | | - counter.count(t.termText()+"_"+tokens.get(j+1).termText()); |
133 | | - } |
134 | | - } |
135 | | - } |
136 | | - // index phrases |
137 | | - for(Entry<String,Count> e : counter.getSet()){ |
138 | | - writer.addPhrase(e.getKey(),e.getValue().num); |
139 | | - } |
140 | | - |
141 | | - } |
142 | | - writer.close(); |
143 | | - } catch (IOException e) { |
144 | | - log.fatal("Cannot open clean dictionary for "+iid+" : "+e.getMessage()); |
145 | | - e.printStackTrace(); |
146 | | - return; |
147 | | - } |
| 121 | + Hashtable<String,String> suggest = global.getDBParams(iid.getDBname(),"suggest"); |
| 122 | + int titlesWordsMinFreq = 3; |
| 123 | + int titlesPhrasesMinFreq = 1; |
| 124 | + if(suggest!=null && suggest.containsKey("titlesWordsMinFreq")) |
| 125 | + titlesWordsMinFreq = Integer.parseInt(suggest.get("titlesWordsMinFreq")); |
| 126 | + if(suggest!=null && suggest.containsKey("titlesPhrasesMinFreq")) |
| 127 | + 			titlesPhrasesMinFreq = Integer.parseInt(suggest.get("titlesPhrasesMinFreq")); |
| 128 | + TitleIndexer tInx = new TitleIndexer(iid,titlesWordsMinFreq,titlesPhrasesMinFreq); |
| 129 | + tInx.createFromExistingIndex(iid); |
148 | 130 | |
149 | | - |
150 | 131 | long end = System.currentTimeMillis(); |
151 | 132 | |
152 | | - System.out.println("Finished making spell-check index in "+formatTime(end-start)); |
| 133 | + System.out.println("Finished making suggest index in "+formatTime(end-start)); |
153 | 134 | } |
154 | 135 | |
155 | 136 | private static String formatTime(long l) { |

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Indexer.java |
— | — | @@ -1,109 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.suggest.api; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | - |
6 | | -import org.apache.log4j.Logger; |
7 | | -import org.apache.lucene.analysis.Analyzer; |
8 | | -import org.apache.lucene.document.Document; |
9 | | -import org.apache.lucene.document.Field; |
10 | | -import org.apache.lucene.index.IndexWriter; |
11 | | -import org.wikimedia.lsearch.index.WikiIndexModifier; |
12 | | - |
13 | | -/** |
14 | | - * Base indexer class. Open/close index. |
15 | | - * |
16 | | - * @author rainman |
17 | | - * |
18 | | - */ |
19 | | -public class Indexer { |
20 | | - Logger log = Logger.getLogger(Indexer.class); |
21 | | - protected String path; |
22 | | - protected Analyzer analyzer; |
23 | | - protected IndexWriter writer; |
24 | | - |
25 | | - public Indexer(String path, Analyzer analyzer) throws IOException{ |
26 | | - this.path = path; |
27 | | - this.analyzer = analyzer; |
28 | | - try { |
29 | | - writer = new IndexWriter(path,analyzer,true); // always make new index |
30 | | - } catch (IOException e) { |
31 | | - try { |
32 | | - log.info("Making new index at path "+path); |
33 | | - // try to make brand new index |
34 | | - WikiIndexModifier.makeDBPath(path); // ensure all directories are made |
35 | | - writer = new IndexWriter(path,analyzer,true); |
36 | | - } catch (IOException e1) { |
37 | | - log.error("I/O error openning index for addition of documents at "+path+" : "+e.getMessage()); |
38 | | - throw e1; |
39 | | - } |
40 | | - } |
41 | | - writer.setMergeFactor(20); |
42 | | - writer.setMaxBufferedDocs(500); |
43 | | - writer.setUseCompoundFile(true); |
44 | | - writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH); |
45 | | - |
46 | | - } |
47 | | - |
48 | | - /** Optimize and close index, always call when done indexing */ |
49 | | - public void close() throws IOException { |
50 | | - try{ |
51 | | - writer.optimize(); |
52 | | - writer.close(); |
53 | | - } catch(IOException e){ |
54 | | - log.warn("I/O error optimizing/closing index at "+path); |
55 | | - throw e; |
56 | | - } |
57 | | - } |
58 | | - |
59 | | - /** Return ngrams of specific size for text */ |
60 | | - public static String[] nGrams(String text, int size) { |
61 | | - int len = text.length(); |
62 | | - String[] res = new String[len - size + 1]; |
63 | | - for (int i = 0; i < len - size + 1; i++) { |
64 | | - res[i] = text.substring(i, i + size); |
65 | | - } |
66 | | - return res; |
67 | | - } |
68 | | - |
69 | | - /** Get minimal ngram size for word. Short words (<=3 chars) will have 1-grams, other 2-grams */ |
70 | | - public static int getMinNgram(String word){ |
71 | | - if(word.length() <= 3) |
72 | | - return 1; |
73 | | - else if(word.length() == 4) |
74 | | - return 2; |
75 | | - else |
76 | | - return 3; |
77 | | - } |
78 | | - /** Get minimal ngram size for word. Long words: 4-grams, other 3-grams, 2-char word only 1-grams */ |
79 | | - public static int getMaxNgram(String word){ |
80 | | - if(word.length() > 4) |
81 | | - return 3; |
82 | | - if(word.length() == 2) |
83 | | - return 1; |
84 | | - return 2; |
85 | | - } |
86 | | - |
87 | | - /** |
88 | | - * Add ngrams of all sizes from 1 to word.length to document |
89 | | - * |
90 | | - * @param doc - document to add fields to |
91 | | - * @param word - word |
92 | | - */ |
93 | | - protected void addNgramFields(Document doc, String word) { |
94 | | - int min = getMinNgram(word); |
95 | | - int max = getMaxNgram(word); |
96 | | - for(int i=min ; i <= max ; i++ ){ |
97 | | - String[] ngrams = nGrams(word,i); |
98 | | - String field = "ngram"+i; |
99 | | - for(int j=0 ; j<ngrams.length ; j++){ |
100 | | - String ngram = ngrams[j]; |
101 | | - if(j == 0) |
102 | | - doc.add(new Field("start"+i, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED)); |
103 | | - else if(j == ngrams.length-1) |
104 | | - doc.add(new Field("end"+i, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED)); |
105 | | - // finally add regular ngram |
106 | | - doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED)); |
107 | | - } |
108 | | - } |
109 | | - } |
110 | | -} |
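With Indexer gone, its ngram helpers live on in NgramIndexer (see the makeWordQuery hunk in Suggest.java above, which now calls NgramIndexer.getMinNgram/getMaxNgram/nGrams plus a prefix-aware getNgramField). The decomposition itself is unchanged; this standalone example reuses the deleted nGrams logic verbatim to show what actually gets indexed:

public class NgramExample {
	/** Same logic as the removed Indexer.nGrams above */
	public static String[] nGrams(String text, int size){
		int len = text.length();
		String[] res = new String[len - size + 1];
		for(int i = 0; i < len - size + 1; i++)
			res[i] = text.substring(i, i + size);
		return res;
	}
	public static void main(String[] args){
		// "belive" with size 3 yields: bel, eli, liv, ive;
		// the first and last grams also go into the start3/end3 fields
		for(String g : nGrams("belive", 3))
			System.out.println(g);
	}
}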
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/PhraseIndexer.java |
— | — | @@ -1,82 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.suggest.api; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | - |
6 | | -import org.apache.lucene.analysis.SimpleAnalyzer; |
7 | | -import org.apache.lucene.document.Document; |
8 | | -import org.apache.lucene.document.Field; |
9 | | -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word; |
10 | | - |
11 | | -/** |
12 | | - * Class to build an index of phrases. It indexes: |
13 | | - * 1) sets of two words as douglas_adams |
14 | | - * 2) individual words |
15 | | - * |
16 | | - * 1) is useful for content-dependant suggestions and |
17 | | - * suggesting splits (splitting one word into two), while |
18 | | - * 2) is useful for suggesting joins |
19 | | - * |
20 | | - * @author rainman |
21 | | - * |
22 | | - */ |
23 | | -public class PhraseIndexer extends Indexer { |
24 | | - int minFreq; |
25 | | - |
26 | | - public PhraseIndexer(String path, int minFreq) throws IOException{ |
27 | | - super(path,new SimpleAnalyzer()); |
28 | | - this.minFreq = minFreq; |
29 | | - } |
30 | | - |
31 | | - /** Add phrase, convenient for suggesting splits and context-dependend suggestions */ |
32 | | - public void addPhrase(String word1, String word2, int frequency){ |
33 | | - addPhrase(word1+"_"+word2,frequency); |
34 | | - } |
35 | | - /** Add phrase, join two words by underscore */ |
36 | | - public void addPhrase(String phrase, int frequency){ |
37 | | - if(frequency < minFreq) |
38 | | - return; |
39 | | - Document doc = new Document(); |
40 | | - addNgramFields(doc,phrase); |
41 | | - doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
42 | | - doc.add(new Field("freq",Integer.toString(frequency), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
43 | | - |
44 | | - try { |
45 | | - writer.addDocument(doc); |
46 | | - } catch (Exception e) { |
47 | | - log.error("Cannot add document "+doc); |
48 | | - e.printStackTrace(); |
49 | | - } |
50 | | - } |
51 | | - |
52 | | - /** Add ordinary word to the index, convenient for suggesting joins */ |
53 | | - public void addWord(Word word){ |
54 | | - Document doc = new Document(); |
55 | | - doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
56 | | - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
57 | | - |
58 | | - try { |
59 | | - writer.addDocument(doc); |
60 | | - } catch (Exception e) { |
61 | | - log.error("Cannot add document "+doc); |
62 | | - e.printStackTrace(); |
63 | | - } |
64 | | - } |
65 | | - |
66 | | - /** Get minimal ngram size for word. Short words (<=3 chars) will have 1-grams, other 2-grams */ |
67 | | - public static int getMinNgram(String word){ |
68 | | - if(word.length() == 2) |
69 | | - return 1; |
70 | | - if(word.length() <= 6) |
71 | | - return word.length() - 2; |
72 | | - else |
73 | | - return 5; |
74 | | - } |
75 | | - |
76 | | - /** Get minimal ngram size for word. Long words: 4-grams, other 3-grams, 2-char word only 1-grams */ |
77 | | - public static int getMaxNgram(String word){ |
78 | | - if(word.length() == 2) |
79 | | - return 1; |
80 | | - else |
81 | | - return getMinNgram(word) + 4; |
82 | | - } |
83 | | -} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitlesIndexer.java |
— | — | @@ -1,31 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.suggest.api; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | - |
6 | | -import org.apache.log4j.Logger; |
7 | | -import org.apache.lucene.analysis.Analyzer; |
8 | | -import org.apache.lucene.document.Document; |
9 | | -import org.apache.lucene.document.Field; |
10 | | -import org.apache.lucene.document.Field.Index; |
11 | | -import org.apache.lucene.document.Field.Store; |
12 | | - |
13 | | -public class TitlesIndexer extends Indexer { |
14 | | - static Logger log = Logger.getLogger(TitlesIndexer.class); |
15 | | - |
16 | | - public TitlesIndexer(String path, Analyzer analyzer) throws IOException{ |
17 | | - super(path,analyzer); |
18 | | - } |
19 | | - |
20 | | - public void addTitle(int ns, String title){ |
21 | | - Document doc = new Document(); |
22 | | - doc.add(new Field("title",title,Store.YES,Index.TOKENIZED)); |
23 | | - doc.add(new Field("namespace",Integer.toString(ns),Store.YES,Index.UN_TOKENIZED)); |
24 | | - try { |
25 | | - writer.addDocument(doc); |
26 | | - } catch (IOException e) { |
27 | | - log.error("Cannot add document "+doc); |
28 | | - e.printStackTrace(); |
29 | | - } |
30 | | - } |
31 | | - |
32 | | -} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/WordsIndexer.java |
— | — | @@ -6,10 +6,7 @@ |
7 | 7 | import org.apache.lucene.analysis.SimpleAnalyzer; |
8 | 8 | import org.apache.lucene.document.Document; |
9 | 9 | import org.apache.lucene.document.Field; |
10 | | -import org.apache.lucene.index.IndexWriter; |
11 | | -import org.wikimedia.lsearch.config.GlobalConfiguration; |
12 | | -import org.wikimedia.lsearch.config.IndexId; |
13 | | -import org.wikimedia.lsearch.index.WikiIndexModifier; |
| 10 | +import org.wikimedia.lsearch.suggest.api.Dictionary.Word; |
14 | 11 | import org.wikimedia.lsearch.suggest.dist.DoubleMetaphone; |
15 | 12 | |
16 | 13 | /** |
— | — | @@ -20,59 +17,42 @@ |
21 | 18 | * @author rainman |
22 | 19 | * |
23 | 20 | */ |
24 | | -public class WordsIndexer extends Indexer { |
25 | | - public static class Word { |
26 | | - protected String word; |
27 | | - protected int frequency; |
28 | | - public Word(String word, int frequency) { |
29 | | - super(); |
30 | | - this.word = word; |
31 | | - this.frequency = frequency; |
32 | | - } |
33 | | - public int getFrequency() { |
34 | | - return frequency; |
35 | | - } |
36 | | - public void setFrequency(int frequency) { |
37 | | - this.frequency = frequency; |
38 | | - } |
39 | | - public String getWord() { |
40 | | - return word; |
41 | | - } |
42 | | - public void setWord(String word) { |
43 | | - this.word = word; |
44 | | - } |
45 | | - public String toString(){ |
46 | | - return word+" : "+frequency; |
47 | | - } |
48 | | - |
49 | | - } |
| 21 | +public class WordsIndexer { |
50 | 22 | static Logger log = Logger.getLogger(WordsIndexer.class); |
51 | | - DoubleMetaphone dmeta; |
| 23 | + protected DoubleMetaphone dmeta; |
52 | 24 | /** If word occurs less that minFreq times, it will be discarded */ |
53 | 25 | protected int minFreq; |
| 26 | + protected NgramIndexer indexer; |
| 27 | + String path; |
54 | 28 | |
55 | 29 | public WordsIndexer(String path, int minFreq) throws IOException { |
56 | | - super(path,new SimpleAnalyzer()); |
| 30 | + this.path = path; |
57 | 31 | this.minFreq = minFreq; |
58 | 32 | this.dmeta = new DoubleMetaphone(); |
| 33 | + this.indexer = new NgramIndexer(); |
59 | 34 | } |
| 35 | + |
| 36 | + public void createIndex() throws IOException{ |
| 37 | + indexer.createIndex(path, new SimpleAnalyzer()); |
| 38 | + } |
60 | 39 | |
61 | | - /** Add word to the index */ |
| 40 | + 	/** Add word to the index; the index must already be open */ |
62 | 41 | public void addWord(Word word){ |
63 | 42 | if(word.frequency < minFreq) |
64 | 43 | return; |
| 44 | + if(word.getWord().length() < 2) |
| 45 | + return; |
65 | 46 | Document doc = new Document(); |
66 | | - addNgramFields(doc,word.word); |
| 47 | + indexer.createNgramFields(doc,"",word.word); |
67 | 48 | doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
68 | | - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 49 | + doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.NO)); |
69 | 50 | doc.add(new Field("metaphone1",dmeta.doubleMetaphone(word.word), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
70 | 51 | doc.add(new Field("metaphone2",dmeta.doubleMetaphone(word.word,true), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
71 | 52 | |
72 | | - try { |
73 | | - writer.addDocument(doc); |
74 | | - } catch (Exception e) { |
75 | | - log.error("Cannot add document "+doc); |
76 | | - e.printStackTrace(); |
77 | | - } |
| 53 | + indexer.addDocument(doc); |
78 | 54 | } |
| 55 | + |
| 56 | + public void closeAndOptimze() throws IOException{ |
| 57 | + indexer.closeAndOptimize(); |
| 58 | + } |
79 | 59 | } |
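Each word document now carries metaphone1/metaphone2 keys alongside the ngram fields, which is what lets Suggest.suggestWords (see the hunk above that adds metaphone1 and metaphone2 clauses) match phonetically similar words. A sketch of that lookup pattern, assuming the project's DoubleMetaphone with the two-argument alternate encoding used in this file; the real addQuery helper in Suggest.java is not shown in this diff:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.wikimedia.lsearch.suggest.dist.DoubleMetaphone;

public class MetaphoneQueryExample {
	public static BooleanQuery metaphoneQuery(String word){
		DoubleMetaphone dmeta = new DoubleMetaphone();
		BooleanQuery bq = new BooleanQuery();
		// primary and alternate encodings, as stored by WordsIndexer.addWord
		bq.add(new TermQuery(new Term("metaphone1", dmeta.doubleMetaphone(word))),
				BooleanClause.Occur.SHOULD);
		bq.add(new TermQuery(new Term("metaphone2", dmeta.doubleMetaphone(word, true))),
				BooleanClause.Occur.SHOULD);
		return bq;
	}
}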
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/TitleIndexer.java |
— | — | @@ -0,0 +1,323 @@ |
| 2 | +package org.wikimedia.lsearch.suggest.api; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collection; |
| 7 | +import java.util.HashMap; |
| 8 | +import java.util.HashSet; |
| 9 | +import java.util.Map.Entry; |
| 10 | + |
| 11 | +import org.apache.log4j.Logger; |
| 12 | +import org.apache.lucene.analysis.SimpleAnalyzer; |
| 13 | +import org.apache.lucene.analysis.Token; |
| 14 | +import org.apache.lucene.document.Document; |
| 15 | +import org.apache.lucene.document.Field; |
| 16 | +import org.apache.lucene.index.IndexReader; |
| 17 | +import org.apache.lucene.index.Term; |
| 18 | +import org.apache.lucene.search.Hits; |
| 19 | +import org.apache.lucene.search.IndexSearcher; |
| 20 | +import org.apache.lucene.search.MultiSearcher; |
| 21 | +import org.apache.lucene.search.PhraseQuery; |
| 22 | +import org.apache.lucene.search.Query; |
| 23 | +import org.apache.lucene.search.SearchableMul; |
| 24 | +import org.apache.lucene.search.Searcher; |
| 25 | +import org.apache.lucene.search.TermQuery; |
| 26 | +import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 27 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 28 | +import org.wikimedia.lsearch.config.IndexId; |
| 29 | +import org.wikimedia.lsearch.config.IndexRegistry; |
| 30 | +import org.wikimedia.lsearch.index.IndexUpdateRecord; |
| 31 | +import org.wikimedia.lsearch.search.IndexSearcherMul; |
| 32 | +import org.wikimedia.lsearch.search.WikiSearcher; |
| 33 | +import org.wikimedia.lsearch.suggest.api.Dictionary.Word; |
| 34 | + |
| 35 | +/** |
| 36 | + * Index words and phrases from article titles. |
| 37 | + * |
| 38 | + * Fields: |
| 39 | + * * word - word from title |
| 40 | + * * phrase - phrase like douglas_adams |
| 41 | + * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3) |
| 42 | + * * namespace - namespaces where the word/phrase is present |
| 43 | + * |
| 44 | + * @author rainman |
| 45 | + * |
| 46 | + */ |
| 47 | +public class TitleIndexer { |
| 48 | + static Logger log = Logger.getLogger(TitleIndexer.class); |
| 49 | + protected NgramIndexer ngramWriter; |
| 50 | + public static final boolean NEW_INDEX = true; |
| 51 | + protected boolean createNew; |
| 52 | + protected int minWordFreq, minPhraseFreq; |
| 53 | + protected IndexId iid; |
| 54 | + protected String langCode; |
| 55 | + protected IndexRegistry registry; |
| 56 | + protected String path; |
| 57 | + |
| 58 | + public TitleIndexer(IndexId iid, int minWordFreq, int minPhraseFreq){ |
| 59 | + this(iid,minWordFreq,minPhraseFreq,false); |
| 60 | + } |
| 61 | + |
| 62 | + public TitleIndexer(IndexId iid, int minWordFreq, int minPhraseFreq, boolean createNew){ |
| 63 | + this.iid = iid; |
| 64 | + this.minWordFreq = minWordFreq; |
| 65 | + this.minPhraseFreq = minPhraseFreq; |
| 66 | + this.createNew = createNew; |
| 67 | + this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
| 68 | + this.ngramWriter = new NgramIndexer(); |
| 69 | + this.registry = IndexRegistry.getInstance(); |
| 70 | + this.path = iid.getSuggestTitlesPath(); |
| 71 | + } |
| 72 | + |
| 73 | + protected Searcher makeSearcher(IndexId logical) throws IOException{ |
| 74 | + if(logical.isSingle()) |
| 75 | + return new IndexSearcherMul(registry.getLatestSnapshot(logical).path); |
| 76 | + else{ |
| 77 | + ArrayList<IndexSearcherMul> searchers = new ArrayList<IndexSearcherMul>(); |
| 78 | + 			for(String part : logical.getPhysicalIndexes()){ |
| 79 | + searchers.add(new IndexSearcherMul(registry.getLatestSnapshot(IndexId.get(part)).path)); |
| 80 | + } |
| 81 | + return new MultiSearcher(searchers.toArray(new SearchableMul[]{})); |
| 82 | + } |
| 83 | + } |
| 84 | + |
| 85 | + protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, Query q) throws IOException{ |
| 86 | + Hits hits = searcher.search(q); |
| 87 | + NamespaceFreq wnf = new NamespaceFreq(); |
| 88 | + for(int j=0;j<hits.length();j++){ |
| 89 | + wnf.incFrequency(namespaces[hits.id(j)]); |
| 90 | + } |
| 91 | + return wnf; |
| 92 | + } |
| 93 | + |
| 94 | + /** Get frequency for a single word */ |
| 95 | + protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String word) throws IOException{ |
| 96 | + return getFrequency(searcher,namespaces,new TermQuery(new Term("contents",word))); |
| 97 | + } |
| 98 | + |
| 99 | + /** Get frequency of phrase (individual words as array) */ |
| 100 | + protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{ |
| 101 | + PhraseQuery pq = new PhraseQuery(); |
| 102 | + for(String p : phrase){ |
| 103 | + pq.add(new Term("contents",p)); |
| 104 | + } |
| 105 | + return getFrequency(searcher,namespaces,pq); |
| 106 | + } |
| 107 | + |
| 108 | + /** Get namespaces where word appears in title */ |
| 109 | + protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, Query q) throws IOException{ |
| 110 | + Hits hits = searcher.search(q); |
| 111 | + HashSet<Integer> ns = new HashSet<Integer>(); |
| 112 | + for(int j=0;j<hits.length();j++){ |
| 113 | + ns.add(namespaces[hits.id(j)]); |
| 114 | + } |
| 115 | + return ns; |
| 116 | + } |
| 117 | + |
| 118 | + protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String word) throws IOException{ |
| 119 | + return getNamespaces(searcher,namespaces,new TermQuery(new Term("title",word))); |
| 120 | + } |
| 121 | + |
| 122 | + protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{ |
| 123 | + PhraseQuery pq = new PhraseQuery(); |
| 124 | + for(String p : phrase){ |
| 125 | + pq.add(new Term("title",p)); |
| 126 | + } |
| 127 | + return getNamespaces(searcher,namespaces,pq); |
| 128 | + } |
| 129 | + |
| 130 | + /** |
| 131 | + * Returns the namespace for each doc_id |
| 132 | + * @throws IOException |
| 133 | + * @FIXME: assumes optimized index |
| 134 | + */ |
| 135 | + protected int[] makeNamespaceMap(Searcher searcher) throws IOException{ |
| 136 | + log.debug("Making namespace map..."); |
| 137 | + int[] namespaces = new int[searcher.maxDoc()]; |
| 138 | + for(int i=0;i<namespaces.length;i++){ |
| 139 | + namespaces[i] = -100; |
| 140 | + Document doc = searcher.doc(i); |
| 141 | + if(doc != null) |
| 142 | + namespaces[i] = Integer.parseInt(doc.get("namespace")); |
| 143 | + } |
| 144 | + log.debug("Done making namespace map"); |
| 145 | + return namespaces; |
| 146 | + } |
| 147 | + |
| 148 | + /** Create new title word/phrases index from an existing index *snapshot* by reading all terms in the index */ |
| 149 | + public void createFromExistingIndex(IndexId src){ |
| 150 | + try{ |
| 151 | + log.debug("Creating new suggest index"); |
| 152 | + ngramWriter.createIndex(path,new SimpleAnalyzer()); |
| 153 | + Searcher searcher = makeSearcher(iid.getLogical()); |
| 154 | + // map doc_id -> namespace |
| 155 | + int[] namespaces = makeNamespaceMap(searcher); |
| 156 | + |
| 157 | + for(String dbrole : src.getPhysicalIndexes()){ |
| 158 | + log.info("Processing index "+dbrole); |
| 159 | + if(!ngramWriter.isOpen()) // if we closed the index previously |
| 160 | + ngramWriter.reopenIndex(path,new SimpleAnalyzer()); |
| 161 | + |
| 162 | + IndexId part = IndexId.get(dbrole); |
| 163 | + IndexReader ir = IndexReader.open(registry.getLatestSnapshot(part).path); |
| 164 | + LuceneDictionary dict = new LuceneDictionary(ir,"title"); |
| 165 | + IndexSearcher ngramSearcher = new IndexSearcher(path); |
| 166 | + Word word; |
| 167 | + // get all words, and all phrases beginning with word |
| 168 | + while((word = dict.next()) != null){ |
| 169 | + log.debug("Processing word "+word); |
| 170 | + String w = word.getWord(); |
| 171 | + |
| 172 | + // check if word is already in the index |
| 173 | + if(ngramSearcher.docFreq(new Term("word",w)) != 0) |
| 174 | + continue; |
| 175 | + |
| 176 | + // index word |
| 177 | + NamespaceFreq wnf = getFrequency(searcher,namespaces,w); |
| 178 | + Collection<Integer> wns = getNamespaces(searcher,namespaces,w); |
| 179 | + addWord(w,wnf,wns); |
| 180 | + |
| 181 | + // index phrases |
| 182 | + HashSet<String> phrases = new HashSet<String>(); |
| 183 | + Hits hits = searcher.search(new TermQuery(new Term("title",w))); |
| 184 | + // find all phrases beginning with word |
| 185 | + for(int i=0;i<hits.length();i++){ |
| 186 | + Document doc = hits.doc(i); |
| 187 | + // tokenize to make phrases |
| 188 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false); |
| 189 | + ArrayList<Token> tokens = parser.parse(); |
| 190 | + for(int j=0;j<tokens.size()-1;j++){ |
| 191 | + Token t = tokens.get(j); |
| 192 | + // ignore aliases |
| 193 | + if(t.getPositionIncrement() == 0) |
| 194 | + continue; |
| 195 | + // find phrases beginning with the target word |
| 196 | + if(w.equals(t.termText())){ |
| 197 | + phrases.add(t.termText()+"_"+tokens.get(j+1).termText()); |
| 198 | + } |
| 199 | + } |
| 200 | + } |
| 201 | + log.debug("Adding "+phrases.size()+" phrases "+phrases); |
| 202 | + // index phrases |
| 203 | + for(String phrase : phrases){ |
| 204 | + NamespaceFreq nf = getFrequency(searcher,namespaces,phrase.split("_")); |
| 205 | + Collection<Integer> pns = getNamespaces(searcher,namespaces,phrase.split("_")); |
| 206 | + addPhrase(phrase,nf,pns); |
| 207 | + } |
| 208 | + } |
| 209 | + log.debug("Finished index "+dbrole+", closing/optimizing."); |
| 210 | + ir.close(); |
| 211 | + ngramSearcher.close(); |
| 212 | + ngramWriter.closeAndOptimize(); |
| 213 | + } |
| 214 | + searcher.close(); |
| 215 | + } catch (IOException e) { |
| 216 | + log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage()); |
| 217 | + e.printStackTrace(); |
| 218 | + return; |
| 219 | + } |
| 220 | + } |
| 221 | + |
| 222 | + /** |
| 223 | + * Add phrase to index |
| 224 | + * |
| 225 | + * @param phrase - 2+ words joined with underscore |
| 226 | + * @param nf - frequencies of phrase in various namespaces |
| 227 | + * @param namespaces - namespaces where phrase appears in title |
| 228 | + */ |
| 229 | + public void addPhrase(String phrase, NamespaceFreq nf, Collection<Integer> namespaces){ |
| 230 | + String freq = nf.serialize(minPhraseFreq); |
| 231 | + if(freq.length() == 0) |
| 232 | + return; |
| 233 | + if(phrase.length() <= 2){ |
| 234 | + log.warn("Invalid phrase: "+phrase); |
| 235 | + return; |
| 236 | + } |
| 237 | + Document doc = new Document(); |
| 238 | + ngramWriter.createNgramFields(doc,"phrase",phrase); |
| 239 | + doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 240 | + doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO)); |
| 241 | + for(Integer ns : namespaces){ |
| 242 | + doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED)); |
| 243 | + } |
| 244 | + |
| 245 | + ngramWriter.addDocument(doc); |
| 246 | + } |
| 247 | + |
| 248 | + /** Add an ordinary word to the index; convenient for suggesting word joins |
| 249 | + * |
| 250 | + * @param word - word to add |
| 251 | + * @param nf - frequencies in namespaces |
| 252 | + * @param namespaces - namespaces where word appears in title |
| 253 | + */ |
| 254 | + public void addWord(String word, NamespaceFreq nf, Collection<Integer> namespaces){ |
| 255 | + if(word.length() < 2) |
| 256 | + return; |
| 257 | + String freq = nf.serialize(minWordFreq); |
| 258 | + if(freq.length() == 0) |
| 259 | + return; |
| 260 | + Document doc = new Document(); |
| 261 | + ngramWriter.createNgramFields(doc,"word",word); |
| 262 | + doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 263 | + doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO)); |
| 264 | + for(Integer ns : namespaces){ |
| 265 | + doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED)); |
| 266 | + } |
| 267 | + |
| 268 | + ngramWriter.addDocument(doc); |
| 269 | + } |
| 270 | + |
| 271 | + /** Update the index */ |
| 272 | + public void update(Collection<IndexUpdateRecord> records){ |
| 273 | + try{ |
| 274 | + log.info("Updating suggest index for "+iid+" with "+records.size()+" records"); |
| 275 | + IndexReader ir = IndexReader.open(path); |
| 276 | + Searcher searcher = makeSearcher(iid.getLogical()); |
| 277 | + int[] namespaces = makeNamespaceMap(searcher); |
| 278 | + // get all words and phrases |
| 279 | + HashSet<String> words = new HashSet<String>(); |
| 280 | + HashSet<String> phrases = new HashSet<String>(); |
| 281 | + for(IndexUpdateRecord rec : records){ |
| 282 | + String title = rec.getArticle().getTitle(); |
| 283 | + ArrayList<Token> tokens = new FastWikiTokenizerEngine(title,langCode,false).parse(); |
| 284 | + String last = null; |
| 285 | + // register word/phrases |
| 286 | + for(Token t : tokens){ |
| 287 | + String w = t.termText(); |
| 288 | + words.add(w); |
| 289 | + if(last != null){ |
| 290 | + phrases.add(last+"_"+w); |
| 291 | + } |
| 292 | + last = w; |
| 293 | + } |
| 294 | + } |
| 295 | + // note: keep the searcher open, it is needed below to recompute frequencies |
| 296 | + |
| 297 | + // batch delete old values |
| 298 | + for(String word : words){ |
| 299 | + ir.deleteDocuments(new Term("word",word)); |
| 300 | + } |
| 301 | + for(String phrase : phrases){ |
| 302 | + ir.deleteDocuments(new Term("phrase",phrase)); |
| 303 | + } |
| 304 | + ir.close(); |
| 305 | + ngramWriter.reopenIndex(path,new SimpleAnalyzer()); |
| 306 | + |
| 307 | + // batch add new stuff |
| 308 | + for(String word : words){ |
| 309 | + addWord(word,getFrequency(searcher,namespaces,word),getNamespaces(searcher,namespaces,word)); |
| 310 | + } |
| 311 | + for(String phrase : phrases){ |
| 312 | + String[] ph = phrase.split("_"); |
| 313 | + addPhrase(phrase,getFrequency(searcher,namespaces,ph),getNamespaces(searcher,namespaces,ph)); |
| 314 | + } |
| 315 | + |
| 316 | + searcher.close(); |
| 317 | + ngramWriter.close(); |
| 317 | + } catch(IOException e){ |
| 318 | + log.error("Cannot update suggest index for "+iid+" : "+e.getMessage()); |
| 319 | + e.printStackTrace(); |
| 320 | + return; |
| 321 | + } |
| 322 | + } |
| 323 | + |
| 324 | +} |
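For context, a minimal sketch of how this indexer would be driven (the dbname and frequency thresholds are illustrative; it is also assumed that TitleIndexer lives in org.wikimedia.lsearch.suggest.api alongside NgramIndexer, that the global configuration has been loaded, e.g. via Configuration.open(), and that a snapshot of the source index exists):

    IndexId iid = IndexId.get("entest");   // assumed dbname
    // min word freq 2, min phrase freq 3, build a brand new index
    TitleIndexer indexer = new TitleIndexer(iid, 2, 3, TitleIndexer.NEW_INDEX);
    indexer.createFromExistingIndex(iid.getLogical());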
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/LuceneDictionary.java |
— | — | @@ -20,10 +20,7 @@ |
21 | 21 | import org.apache.lucene.index.IndexReader; |
22 | 22 | import org.apache.lucene.index.Term; |
23 | 23 | |
24 | | -import java.util.Iterator; |
25 | | - |
26 | 24 | import org.apache.lucene.index.TermEnum; |
27 | | -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word; |
28 | 25 | |
29 | 26 | import java.io.*; |
30 | 27 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NgramIndexer.java |
— | — | @@ -0,0 +1,162 @@ |
| 2 | +package org.wikimedia.lsearch.suggest.api; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | + |
| 6 | +import org.apache.log4j.Logger; |
| 7 | +import org.apache.lucene.analysis.Analyzer; |
| 8 | +import org.apache.lucene.document.Document; |
| 9 | +import org.apache.lucene.document.Field; |
| 10 | +import org.apache.lucene.index.IndexWriter; |
| 11 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
| 12 | + |
| 13 | +/** |
| 14 | + * Basic handling of ngram indexes: open/close indexes, add ngram fields, etc. |
| 15 | + * |
| 16 | + * @author rainman |
| 17 | + * |
| 18 | + */ |
| 19 | +public class NgramIndexer { |
| 20 | + Logger log = Logger.getLogger(NgramIndexer.class); |
| 21 | + protected String path; |
| 22 | + protected Analyzer analyzer; |
| 23 | + protected IndexWriter writer; |
| 24 | + |
| 25 | + public NgramIndexer(){ |
| 26 | + path = null; |
| 27 | + analyzer = null; |
| 28 | + writer = null; |
| 29 | + } |
| 30 | + |
| 31 | + /** Make a new ngram index */ |
| 32 | + public void createIndex(String path, Analyzer analyzer) throws IOException{ |
| 33 | + openIndex(path,analyzer,true); |
| 34 | + } |
| 35 | + |
| 36 | + /** Reopen an old index, or create it if it doesn't exist */ |
| 37 | + public void reopenIndex(String path, Analyzer analyzer) throws IOException{ |
| 38 | + openIndex(path,analyzer,false); |
| 39 | + } |
| 40 | + |
| 41 | + public void openIndex(String path, Analyzer analyzer, boolean newIndex) throws IOException{ |
| 42 | + this.path = path; |
| 43 | + this.analyzer = analyzer; |
| 44 | + try { |
| 45 | + writer = new IndexWriter(path,analyzer,newIndex); |
| 46 | + } catch (IOException e) { |
| 47 | + try { |
| 48 | + log.info("Making new index at path "+path); |
| 49 | + // try to make brand new index |
| 50 | + WikiIndexModifier.makeDBPath(path); // ensure all directories are made |
| 51 | + writer = new IndexWriter(path,analyzer,newIndex); |
| 52 | + } catch (IOException e1) { |
| 53 | + log.error("I/O error opening index for addition of documents at "+path+" : "+e1.getMessage()); |
| 54 | + throw e1; |
| 55 | + } |
| 56 | + } |
| 57 | + writer.setMergeFactor(20); |
| 58 | + writer.setMaxBufferedDocs(500); |
| 59 | + writer.setUseCompoundFile(true); |
| 60 | + writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH); |
| 61 | + |
| 62 | + } |
| 63 | + |
| 64 | + /** Check if index is open and ready for modification */ |
| 65 | + public boolean isOpen(){ |
| 66 | + return writer != null; |
| 67 | + } |
| 68 | + |
| 69 | + /** Close index, always call when done indexing */ |
| 70 | + public void close() throws IOException { |
| 71 | + try{ |
| 72 | + writer.close(); |
| 73 | + writer = null; |
| 74 | + } catch(IOException e){ |
| 75 | + log.warn("I/O error closing index at "+path); |
| 76 | + throw e; |
| 77 | + } |
| 78 | + } |
| 79 | + |
| 80 | + /** Optimize and close index, always call when done indexing */ |
| 81 | + public void closeAndOptimize() throws IOException { |
| 82 | + try{ |
| 83 | + writer.optimize(); |
| 84 | + writer.close(); |
| 85 | + writer = null; |
| 86 | + } catch(IOException e){ |
| 87 | + log.warn("I/O error optimizing/closing index at "+path); |
| 88 | + throw e; |
| 89 | + } |
| 90 | + } |
| 91 | + |
| 92 | + /** Return ngrams of specific size for text */ |
| 93 | + public static String[] nGrams(String text, int size) { |
| 94 | + int len = text.length(); |
| 95 | + String[] res = new String[len - size + 1]; |
| 96 | + for (int i = 0; i < len - size + 1; i++) { |
| 97 | + res[i] = text.substring(i, i + size); |
| 98 | + } |
| 99 | + return res; |
| 100 | + } |
| 101 | + |
| 102 | + /** Get minimal ngram size for a word: 1 for words of up to 3 chars, 2 for 4-5 chars, 3 otherwise */ |
| 103 | + public static int getMinNgram(String word){ |
| 104 | + if(word.length() <= 3) |
| 105 | + return 1; |
| 106 | + else if(word.length() == 4 || word.length() == 5) |
| 107 | + return 2; |
| 108 | + else |
| 109 | + return 3; |
| 110 | + } |
| 111 | + |
| 112 | + /** Get maximal ngram size for a word: 2 for two-char words, 3 otherwise (never longer than the word) */ |
| 113 | + public static int getMaxNgram(String word){ |
| 114 | + if(word.length() == 2) |
| 115 | + return 2; |
| 116 | + else |
| 117 | + return 3; |
| 118 | + } |
| 119 | + |
| 120 | + /** Get ngram field name with no prefix */ |
| 121 | + public static String getNgramField(){ |
| 122 | + return getNgramField(null); |
| 123 | + } |
| 124 | + |
| 125 | + /** Get prefixed ngram field name */ |
| 126 | + public static String getNgramField(String prefix){ |
| 127 | + if(prefix == null || prefix.equals("")) |
| 128 | + return "ngram"; |
| 129 | + else |
| 130 | + return prefix+"_ngram"; |
| 131 | + } |
| 132 | + |
| 133 | + /** |
| 134 | + * Add ngrams of all sizes from getMinNgram(word) to getMaxNgram(word) to the document |
| 135 | + * |
| 136 | + * @param doc - document to add fields to |
| 137 | + * @param prefix - prefix to ngram field name |
| 138 | + * @param word - word |
| 139 | + */ |
| 140 | + protected void createNgramFields(Document doc, String prefix, String word) { |
| 141 | + int min = getMinNgram(word); |
| 142 | + int max = getMaxNgram(word); |
| 143 | + String fieldBase = getNgramField(prefix); |
| 144 | + for(int i=min ; i <= max ; i++ ){ |
| 145 | + String[] ngrams = nGrams(word,i); |
| 146 | + String field = fieldBase+i; |
| 147 | + for(int j=0 ; j<ngrams.length ; j++){ |
| 148 | + String ngram = ngrams[j]; |
| 149 | + doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED)); |
| 150 | + } |
| 151 | + } |
| 152 | + } |
| 153 | + |
| 154 | + public void addDocument(Document doc){ |
| 155 | + try { |
| 156 | + log.debug("Adding document "+doc); |
| 157 | + writer.addDocument(doc); |
| 158 | + } catch (Exception e) { |
| 159 | + log.error("Cannot add document "+doc+" : "+e.getMessage()); |
| 160 | + e.printStackTrace(); |
| 161 | + } |
| 162 | + } |
| 163 | +} |
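To make the ngram decomposition concrete, a small sketch using only the static helpers above (same package assumed; the expected output is worked out by hand from nGrams/getMinNgram/getMaxNgram):

    String word = "douglas";   // 7 chars, so min = max = 3
    for (int size = NgramIndexer.getMinNgram(word); size <= NgramIndexer.getMaxNgram(word); size++) {
        String field = NgramIndexer.getNgramField("word") + size;   // "word_ngram3"
        for (String ngram : NgramIndexer.nGrams(word, size))
            System.out.println(field + " : " + ngram);   // dou, oug, ugl, gla, las
    }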
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/Dictionary.java |
— | — | @@ -1,9 +1,31 @@ |
2 | 2 | package org.wikimedia.lsearch.suggest.api; |
3 | 3 | |
4 | | -import org.wikimedia.lsearch.suggest.api.WordsIndexer.Word; |
5 | | - |
6 | | - |
7 | 4 | public interface Dictionary { |
| 5 | + public static class Word { |
| 6 | + protected String word; |
| 7 | + protected int frequency; |
| 8 | + public Word(String word, int frequency) { |
| 9 | + super(); |
| 10 | + this.word = word; |
| 11 | + this.frequency = frequency; |
| 12 | + } |
| 13 | + public int getFrequency() { |
| 14 | + return frequency; |
| 15 | + } |
| 16 | + public void setFrequency(int frequency) { |
| 17 | + this.frequency = frequency; |
| 18 | + } |
| 19 | + public String getWord() { |
| 20 | + return word; |
| 21 | + } |
| 22 | + public void setWord(String word) { |
| 23 | + this.word = word; |
| 24 | + } |
| 25 | + public String toString(){ |
| 26 | + return word+" : "+frequency; |
| 27 | + } |
| 28 | + |
| 29 | + } |
8 | 30 | /** Get next term or null if there is no more terms */ |
9 | 31 | public Word next(); |
10 | 32 | } |
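Typical use of the Word bean together with LuceneDictionary (the index path is an invented placeholder; TitleIndexer.createFromExistingIndex above is the real call site):

    IndexReader reader = IndexReader.open("/path/to/index");   // assumed path
    LuceneDictionary dict = new LuceneDictionary(reader, "title");
    Dictionary.Word w;
    while ((w = dict.next()) != null)
        System.out.println(w);   // Word.toString() prints "word : frequency"
    reader.close();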
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/api/NamespaceFreq.java |
— | — | @@ -0,0 +1,88 @@ |
| 2 | +package org.wikimedia.lsearch.suggest.api; |
| 3 | + |
| 4 | +import java.util.BitSet; |
| 5 | +import java.util.HashMap; |
| 6 | +import java.util.Set; |
| 7 | +import java.util.Map.Entry; |
| 8 | + |
| 9 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
| 10 | + |
| 11 | +/** Mapping from namespaces to frequencies */ |
| 12 | +public class NamespaceFreq { |
| 13 | + class IntWrap{ |
| 14 | + int val = 0; |
| 15 | + IntWrap() {} |
| 16 | + IntWrap(int value){ val = value; } |
| 17 | + IntWrap(String value){ val = Integer.parseInt(value); } |
| 18 | + public String toString(){ return ""+val; } |
| 19 | + } |
| 20 | + /** namespace -> frequency */ |
| 21 | + protected HashMap<Integer,IntWrap> nsmap = new HashMap<Integer,IntWrap>(); |
| 22 | + |
| 23 | + public NamespaceFreq(String field){ |
| 24 | + String[] pairs = field.split(" "); |
| 25 | + for(String pair : pairs){ |
| 26 | + if(pair.length() == 0) |
| 27 | + continue; |
| 28 | + String[] nsf = pair.split(":"); |
| 29 | + if(nsf.length == 2) |
| 30 | + nsmap.put(Integer.parseInt(nsf[0]),new IntWrap(nsf[1])); |
| 31 | + else { |
| 32 | + throw new RuntimeException("Bad syntax for namespace-frequency pairs : "+field); |
| 33 | + } |
| 34 | + } |
| 35 | + } |
| 36 | + |
| 37 | + public NamespaceFreq() { |
| 38 | + } |
| 39 | + |
| 40 | + public int getFrequency(int namespace){ |
| 41 | + if(nsmap.containsKey(namespace)) |
| 42 | + return nsmap.get(namespace).val; |
| 43 | + else |
| 44 | + return 0; |
| 45 | + } |
| 46 | + |
| 47 | + public int getFrequency(NamespaceFilter nsf){ |
| 48 | + int sum = 0; |
| 49 | + BitSet ns = nsf.getIncluded(); |
| 50 | + for(int i=ns.nextSetBit(0); i>=0; i=ns.nextSetBit(i+1)){ |
| 51 | + sum += getFrequency(i); |
| 52 | + } |
| 53 | + return sum; |
| 54 | + } |
| 55 | + |
| 56 | + public String serialize(int minFreq){ |
| 57 | + StringBuilder sb = new StringBuilder(); |
| 58 | + int sum = 0; |
| 59 | + for(Entry<Integer,IntWrap> e : nsmap.entrySet()){ |
| 60 | + sum += e.getValue().val; |
| 61 | + sb.append(e.getKey()); |
| 62 | + sb.append(":"); |
| 63 | + sb.append(e.getValue()); |
| 64 | + sb.append(" "); |
| 65 | + } |
| 66 | + if(sum < minFreq) |
| 67 | + return ""; |
| 68 | + return sb.toString(); |
| 69 | + } |
| 70 | + |
| 71 | + public String serialize(){ |
| 72 | + return serialize(0); |
| 73 | + } |
| 74 | + |
| 75 | + public void setFrequency(int namespace, int frequency){ |
| 76 | + nsmap.put(namespace,new IntWrap(frequency)); |
| 77 | + } |
| 78 | + |
| 79 | + public void incFrequency(int namespace){ |
| 80 | + if(nsmap.containsKey(namespace)){ |
| 81 | + nsmap.get(namespace).val++; |
| 82 | + } else |
| 83 | + nsmap.put(namespace,new IntWrap(1)); |
| 84 | + } |
| 85 | + |
| 86 | + public Set<Integer> getNamespaces(){ |
| 87 | + return nsmap.keySet(); |
| 88 | + } |
| 89 | +} |
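A round-trip sketch of the ns:frequency serialization (values invented; since the pairs live in a HashMap, their order in the serialized string is not guaranteed):

    NamespaceFreq nf = new NamespaceFreq();
    nf.setFrequency(0, 234);
    nf.setFrequency(1, 12);
    nf.incFrequency(14);                         // 14 -> 1
    String s = nf.serialize();                   // e.g. "0:234 1:12 14:1 "
    NamespaceFreq copy = new NamespaceFreq(s);   // parses it back
    copy.getFrequency(0);                        // 234
    nf.serialize(500);                           // "" -- total 247 is below minFreq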
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest/CleanIndexImporter.java |
— | — | @@ -25,8 +25,6 @@ |
26 | 26 | import org.wikimedia.lsearch.config.IndexId; |
27 | 27 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
28 | 28 | import org.wikimedia.lsearch.ranks.Links; |
29 | | -import org.wikimedia.lsearch.suggest.api.PhraseIndexer; |
30 | | -import org.wikimedia.lsearch.suggest.api.TitlesIndexer; |
31 | 29 | import org.wikimedia.lsearch.util.Localization; |
32 | 30 | |
33 | 31 | /** |
— | — | @@ -59,8 +57,10 @@ |
60 | 58 | boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode) != null; |
61 | 59 | // make article |
62 | 60 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects); |
63 | | - if(page.Title.Namespace == 0) |
64 | | - writer.addArticle(article); |
| 61 | + if(page.Title.Namespace != 0) |
| 62 | + article.setContents(""); |
| 63 | + |
| 64 | + writer.addArticle(article); |
65 | 65 | // generate phrases |
66 | 66 | /* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false); |
67 | 67 | ArrayList<Token> tokens = parser.parse(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -196,8 +196,13 @@ |
197 | 197 | /** Get title object corresponding to this article */ |
198 | 198 | public Title makeTitle(){ |
199 | 199 | return new Title(Integer.parseInt(namespace),title); |
| 200 | + } |
| 201 | + |
| 202 | + public void setContents(String contents) { |
| 203 | + this.contents = contents; |
200 | 204 | } |
201 | 205 | |
202 | 206 | |
203 | 207 | |
| 208 | + |
204 | 209 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -530,6 +530,11 @@ |
531 | 531 | return namespaceSet; |
532 | 532 | } |
533 | 533 | |
| 534 | + /** Get logical iid for this index, e.g. enwiki.mainpart -> enwiki */ |
| 535 | + public IndexId getLogical(){ |
| 536 | + return get(dbname); |
| 537 | + } |
534 | 538 | |
| 539 | + |
535 | 540 | |
536 | 541 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -769,6 +769,20 @@ |
770 | 770 | |
771 | 771 | dbroles.put(type,params); |
772 | 772 | |
| 773 | + } else if(type.equals("suggest")){ |
| 774 | + // all params are optional; defaults are used for any that are absent |
| 775 | + if(tokens.length>1) |
| 776 | + params.put("wordsMinFreq",tokens[1]); |
| 777 | + if(tokens.length>2) |
| 778 | + params.put("titlesWordsMinFreq",tokens[2]); |
| 779 | + if(tokens.length>3) |
| 780 | + params.put("titlesPhrasesMinFreq", tokens[3]); |
| 781 | + |
| 782 | + if(tokens.length>4) |
| 783 | + System.out.println("Unrecognized suggest parameters in ("+role+")"); |
| 784 | + |
| 785 | + dbroles.put(type,params); |
| 786 | + |
773 | 787 | } else{ |
774 | 788 | System.out.println("Warning: Unrecognized role \""+role+"\".Ignoring."); |
775 | 789 | } |
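For illustration, a hypothetical database line using the new role (dbname and values invented here):

    mydb : (suggest,2,3,5)

would be parsed into a params table retrievable as:

    Hashtable<String,String> p = GlobalConfiguration.getInstance().getDBParams("mydb","suggest");
    p.get("wordsMinFreq");           // "2"
    p.get("titlesWordsMinFreq");     // "3"
    p.get("titlesPhrasesMinFreq");   // "5"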
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -64,7 +64,7 @@ |
65 | 65 | if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact")) |
66 | 66 | exactCase = true; |
67 | 67 | NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces")); |
68 | | - SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase); |
| 68 | + SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase, false); |
69 | 69 | if(res!=null && res.isRetry()){ |
70 | 70 | int retries = 0; |
71 | 71 | if(iid.isSplit() || iid.isNssplit()){ |
— | — | @@ -73,19 +73,27 @@ |
74 | 74 | retries = 1; |
75 | 75 | |
76 | 76 | while(retries > 0 && res.isRetry()){ |
77 | | - res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase); |
| 77 | + res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase, false); |
78 | 78 | retries--; |
79 | 79 | } |
80 | 80 | if(res.isRetry()) |
81 | 81 | res.setErrorMsg("Internal error, too many internal retries."); |
82 | 82 | } |
83 | 83 | return res; |
84 | | - } else if (what.equals("raw")) { |
85 | | - //TODO: return searchRaw(searchterm); |
| 84 | + } else if (what.equals("raw") || what.equals("rawexplain")) { |
| 85 | + int offset = 0, limit = 100; boolean exactCase = false; |
| 86 | + if (query.containsKey("offset")) |
| 87 | + offset = Math.max(Integer.parseInt((String)query.get("offset")), 0); |
| 88 | + if (query.containsKey("limit")) |
| 89 | + limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines); |
| 90 | + if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact")) |
| 91 | + exactCase = true; |
| 92 | + NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces")); |
| 93 | + return search(iid, searchterm, offset, limit, namespaces, what.equals("rawexplain"), exactCase, true); |
86 | 94 | } else { |
87 | 95 | SearchResults res = new SearchResults(); |
88 | 96 | res.setErrorMsg("Unrecognized search type. Try one of: " + |
89 | | - "titlematch, titleprefix, search, explain, quit, raw."); |
| 97 | + "search, explain, raw, rawexplain."); |
90 | 98 | log.warn("Unknown request type [" + what + "]."); |
91 | 99 | return res; |
92 | 100 | } |
— | — | @@ -124,7 +132,7 @@ |
125 | 133 | * Search on iid, with query searchterm. View results from offset to offset+limit, using |
126 | 134 | * the default namespaces filter |
127 | 135 | */ |
128 | | - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase){ |
| 136 | + public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw){ |
129 | 137 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase); |
130 | 138 | if(nsDefault == null || nsDefault.cardinality() == 0) |
131 | 139 | nsDefault = new NamespaceFilter("0"); // default to main namespace |
— | — | @@ -155,13 +163,16 @@ |
156 | 164 | } |
157 | 165 | |
158 | 166 | try { |
159 | | - if(nsfw == null){ |
| 167 | + if(raw){ |
| 168 | + // do minimal parsing, make a raw query |
| 169 | + parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE); |
| 170 | + q = parser.parseRaw(searchterm); |
| 171 | + } else if(nsfw == null){ |
160 | 172 | if(searchAll) |
161 | 173 | q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
162 | 174 | else |
163 | 175 | q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname()); |
164 | | - } |
165 | | - else{ |
| 176 | + } else{ |
166 | 177 | q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
167 | 178 | log.info("Using NamespaceFilterWrapper "+nsfw); |
168 | 179 | } |
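A sketch of calling the extended search() entry point in raw mode (the default SearchEngine constructor and the dbname are assumptions; raw mode switches the parser to NamespacePolicy.LEAVE and uses parseRaw, as above):

    SearchEngine engine = new SearchEngine();   // assumed default constructor
    SearchResults res = engine.search(IndexId.get("entest"), "title:main_page",
            0, 20, new NamespaceFilter("0"),
            false /* explain */, false /* exactCase */, true /* raw */);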
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -404,6 +404,10 @@ |
405 | 405 | doc.add(new Field("category", "", |
406 | 406 | Field.Store.NO, Field.Index.TOKENIZED)); |
407 | 407 | |
| 408 | + // interwiki associated with this page |
| 409 | + doc.add(new Field("interwiki", "", |
| 410 | + Field.Store.NO, Field.Index.TOKENIZED)); |
| 411 | + |
408 | 412 | for(FieldBuilder.BuilderSet bs : builder.getBuilders()){ |
409 | 413 | FieldNameFactory fields = bs.getFields(); |
410 | 414 | // boost document title with it's article rank |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -77,13 +77,15 @@ |
78 | 78 | /** boost for title field */ |
79 | 79 | public static float TITLE_BOOST = 6; |
80 | 80 | public static float TITLE_ALIAS_BOOST = 0.2f; |
| 81 | + public static float TITLE_PHRASE_BOOST = 2; |
81 | 82 | public static float STEM_TITLE_BOOST = 2; |
82 | 83 | public static float STEM_TITLE_ALIAS_BOOST = 0.4f; |
83 | | - public static float ALT_TITLE_BOOST = 4; |
| 84 | + public static float ALT_TITLE_BOOST = 8; |
84 | 85 | public static float ALT_TITLE_ALIAS_BOOST = 0.4f; |
85 | 86 | public static float KEYWORD_BOOST = 0.02f; |
86 | 87 | |
87 | 88 | public static boolean ADD_STEM_TITLE = true; |
| 89 | + public static boolean ADD_TITLE_PHRASES = true; |
88 | 90 | |
89 | 91 | /** Policies in treating field names: |
90 | 92 | * |
— | — | @@ -295,7 +297,7 @@ |
296 | 298 | if(length == 0 && ch == ' ') |
297 | 299 | continue; // ignore whitespaces |
298 | 300 | |
299 | | - // pluses and minuses, underscores can be within words, *,? are for wildcard queries |
| 301 | + // pluses and minuses, underscores can be within words (to prevent them from being misinterpreted), *,? are for wildcard queries |
300 | 302 | if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){ |
301 | 303 | if(length<buffer.length) |
302 | 304 | buffer[length++] = ch; |
— | — | @@ -322,7 +324,7 @@ |
323 | 325 | else if(ch == ':'){ |
324 | 326 | // check if it's a valid field |
325 | 327 | String f = new String(buffer,0,length); |
326 | | - if(f.equals(namespaceAllKeyword) || f.equals("incategory") || namespaceFilters.containsKey(f)){ |
| 328 | + if(f.equals(namespaceAllKeyword) || f.equals("incategory") || namespaceFilters.containsKey(f) || namespacePolicy == NamespacePolicy.LEAVE){ |
327 | 329 | cur = lookup; |
328 | 330 | return TokenType.FIELD; |
329 | 331 | } else |
— | — | @@ -1094,7 +1096,7 @@ |
1095 | 1097 | } |
1096 | 1098 | |
1097 | 1099 | /** Make title query in format: title:query stemtitle:stemmedquery */ |
1098 | | - protected Query makeTitleQuery(String queryText) { |
| 1100 | + protected Query[] makeTitleQuery(String queryText) { |
1099 | 1101 | String contentField = defaultField; |
1100 | 1102 | float olfDefaultBoost = defaultBoost; |
1101 | 1103 | defaultField = fields.title(); // now parse the title part |
— | — | @@ -1117,16 +1119,19 @@ |
1118 | 1120 | defaultBoost = olfDefaultBoost; |
1119 | 1121 | defaultAliasBoost = ALIAS_BOOST; |
1120 | 1122 | |
| 1123 | + // make title phrases |
| 1124 | + Query qp = ADD_TITLE_PHRASES? makeTitlePhrases(qt) : null; |
| 1125 | + |
1121 | 1126 | if(qt == qs) // either null, or category query |
1122 | | - return qt; |
| 1127 | + return new Query[] {qt,qp}; |
1123 | 1128 | if(qt == null) |
1124 | | - return qs; |
| 1129 | + return new Query[] {qs,qp}; |
1125 | 1130 | if(qs == null) |
1126 | | - return qt; |
| 1131 | + return new Query[] {qt,qp}; |
1127 | 1132 | BooleanQuery bq = new BooleanQuery(true); |
1128 | 1133 | bq.add(qt,BooleanClause.Occur.SHOULD); |
1129 | 1134 | bq.add(qs,BooleanClause.Occur.SHOULD); |
1130 | | - return bq; |
| 1135 | + return new Query[] {bq,qp}; |
1131 | 1136 | } |
1132 | 1137 | |
1133 | 1138 | /** Quote CJK chars to avoid frequency-based analysis */ |
— | — | @@ -1173,6 +1178,44 @@ |
1174 | 1179 | } |
1175 | 1180 | } |
1176 | 1181 | |
| 1182 | + /** Make two-word phrase queries when the title query consists only of required single-word terms */ |
| 1183 | + protected Query makeTitlePhrases(Query q){ |
| 1184 | + if(q instanceof BooleanQuery){ |
| 1185 | + boolean allReq = true; |
| 1186 | + BooleanQuery bq = (BooleanQuery) q; |
| 1187 | + for(BooleanClause bc : bq.getClauses()){ |
| 1188 | + if(!bc.getOccur().equals(BooleanClause.Occur.MUST) || !(bc.getQuery() instanceof TermQuery) || |
| 1189 | + !(((TermQuery)bc.getQuery()).getTerm().field().equals("title"))){ |
| 1190 | + allReq = false; |
| 1191 | + break; |
| 1192 | + } |
| 1193 | + } |
| 1194 | + if(allReq){ |
| 1195 | + BooleanQuery ret = new BooleanQuery(true); |
| 1196 | + Term last = null; |
| 1197 | + // make phrases '+very +long +query' => "very long" "long query" |
| 1198 | + for(BooleanClause bc : bq.getClauses()){ |
| 1199 | + Term t = ((TermQuery)bc.getQuery()).getTerm(); |
| 1200 | + if(last != null){ |
| 1201 | + PhraseQuery pq = new PhraseQuery(); |
| 1202 | + pq.add(new Term("stemtitle",last.text())); |
| 1203 | + pq.add(new Term("stemtitle",t.text())); |
| 1204 | + pq.setBoost(TITLE_PHRASE_BOOST); |
| 1205 | + pq.setSlop(2); |
| 1206 | + ret.add(pq,BooleanClause.Occur.SHOULD); |
| 1207 | + } |
| 1208 | + last = t; |
| 1209 | + |
| 1210 | + } |
| 1211 | + if(ret.getClauses() != null && ret.getClauses().length != 0) |
| 1212 | + return ret; |
| 1213 | + } |
| 1214 | + } |
| 1215 | + |
| 1216 | + return null; |
| 1217 | + |
| 1218 | + } |
| 1219 | + |
1177 | 1220 | /** |
1178 | 1221 | * Main function for multi-pass parsing. |
1179 | 1222 | * |
— | — | @@ -1188,12 +1231,23 @@ |
1189 | 1232 | defaultBoost = 1; |
1190 | 1233 | defaultAliasBoost = ALIAS_BOOST; |
1191 | 1234 | Query qc = parseRaw(queryText); |
1192 | | - |
1193 | | - Query qt = makeTitleQuery(queryText); |
| 1235 | + Query[] qtqp = makeTitleQuery(queryText); |
| 1236 | + // qt = title query, qp = title phrase query |
| 1237 | + Query qt = qtqp[0]; |
| 1238 | + Query qp = qtqp[1]; |
1194 | 1240 | if(qc == null || qt == null) |
1195 | 1241 | return new BooleanQuery(); |
1196 | 1242 | if(qc.equals(qt)) |
1197 | 1243 | return qc; // don't duplicate (probably a query for categories only) |
| 1244 | + |
| 1245 | + // embed phrase queries into the main contents query |
| 1246 | + if(qp!=null && qc instanceof BooleanQuery){ |
| 1247 | + ((BooleanQuery)qc).add(qp,BooleanClause.Occur.SHOULD); |
| 1248 | + } else if(qp !=null && !(qc instanceof BooleanQuery)){ |
| 1249 | + // TODO: delete in release |
| 1250 | + System.out.println("SHOULD NEVER HAPPEN"); |
| 1251 | + } |
1198 | 1252 | BooleanQuery bq = new BooleanQuery(); |
1199 | 1253 | bq.add(qc,BooleanClause.Occur.SHOULD); |
1200 | 1254 | bq.add(qt,BooleanClause.Occur.SHOULD); |
— | — | @@ -1263,6 +1317,14 @@ |
1264 | 1318 | public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{ |
1265 | 1319 | return parseMultiPass(queryText,policy,false,false); |
1266 | 1320 | } |
| 1321 | + |
| 1322 | + public NamespacePolicy getNamespacePolicy() { |
| 1323 | + return namespacePolicy; |
| 1324 | + } |
| 1325 | + public void setNamespacePolicy(NamespacePolicy namespacePolicy) { |
| 1326 | + this.namespacePolicy = namespacePolicy; |
| 1327 | + } |
1267 | 1328 | |
1268 | 1329 | |
| 1330 | + |
1269 | 1331 | } |
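To see what makeTitlePhrases builds, a standalone sketch mirroring its construction for the query '+very +long +query' (Lucene 2.x API; the printed form is approximate):

    String[] words = { "very", "long", "query" };
    BooleanQuery ret = new BooleanQuery(true);
    for (int i = 1; i < words.length; i++) {
        PhraseQuery pq = new PhraseQuery();              // "very long", then "long query"
        pq.add(new Term("stemtitle", words[i - 1]));
        pq.add(new Term("stemtitle", words[i]));
        pq.setBoost(WikiQueryParser.TITLE_PHRASE_BOOST); // 2
        pq.setSlop(2);
        ret.add(pq, BooleanClause.Occur.SHOULD);
    }
    System.out.println(ret);   // stemtitle:"very long"~2^2.0 stemtitle:"long query"~2^2.0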
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -1,6 +1,7 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
4 | 4 | import java.util.ArrayList; |
| 5 | +import java.util.HashMap; |
5 | 6 | |
6 | 7 | import org.apache.log4j.Logger; |
7 | 8 | import org.apache.lucene.analysis.Analyzer; |
— | — | @@ -69,6 +70,7 @@ |
70 | 71 | WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase); |
71 | 72 | tokenizer.tokenize(); |
72 | 73 | ArrayList<String> categories = tokenizer.getCategories(); |
| 74 | + HashMap<String,String> interwiki = tokenizer.getInterwikis(); |
73 | 75 | |
74 | 76 | ArrayList<String> allKeywords = new ArrayList<String>(); |
75 | 77 | if(addKeywords && tokenizer.getKeywords()!=null) |
— | — | @@ -80,6 +82,8 @@ |
81 | 83 | new LanguageAnalyzer(filters,tokenizer)); |
82 | 84 | perFieldAnalyzer.addAnalyzer("category", |
83 | 85 | new CategoryAnalyzer(categories,exactCase)); |
| 86 | + perFieldAnalyzer.addAnalyzer("interwiki", |
| 87 | + new InterwikiAnalyzer(interwiki)); |
84 | 88 | perFieldAnalyzer.addAnalyzer(fields.title(), |
85 | 89 | getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
86 | 90 | perFieldAnalyzer.addAnalyzer(fields.stemtitle(), |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/InterwikiAnalyzer.java |
— | — | @@ -0,0 +1,58 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.Reader; |
| 6 | +import java.util.HashMap; |
| 7 | +import java.util.Iterator; |
| 8 | +import java.util.Map.Entry; |
| 9 | + |
| 10 | +import org.apache.lucene.analysis.Analyzer; |
| 11 | +import org.apache.lucene.analysis.Token; |
| 12 | +import org.apache.lucene.analysis.TokenStream; |
| 13 | + |
| 14 | +public class InterwikiAnalyzer extends Analyzer { |
| 15 | + public class InterwikiTokenStream extends TokenStream { |
| 16 | + protected Iterator<Entry<String,String>> tokensIt; |
| 17 | + protected int start; |
| 18 | + protected Token next = null; |
| 19 | + |
| 20 | + InterwikiTokenStream(){ |
| 21 | + tokensIt = interwiki.entrySet().iterator(); |
| 22 | + start = 0; |
| 23 | + } |
| 24 | + |
| 25 | + @Override |
| 26 | + public Token next() throws IOException { |
| 27 | + if(next != null){ |
| 28 | + Token t = next; |
| 29 | + next = null; |
| 30 | + return t; |
| 31 | + } |
| 32 | + if(tokensIt.hasNext()){ |
| 33 | + Entry<String,String> map = tokensIt.next(); |
| 34 | + String iw = map.getKey()+":"; // e.g. en: |
| 35 | + String title = map.getValue().toLowerCase(); // e.g. "douglas adams" |
| 36 | + Token t = new Token(iw,start,start+iw.length()); |
| 37 | + start += iw.length()+1; |
| 38 | + next = new Token(title,start,start+title.length()); |
| 39 | + start += title.length()+1; |
| 40 | + |
| 41 | + return t; |
| 42 | + } else |
| 43 | + return null; |
| 44 | + } |
| 45 | + |
| 46 | + } |
| 47 | + |
| 48 | + HashMap<String,String> interwiki; |
| 49 | + |
| 50 | + public InterwikiAnalyzer(HashMap<String,String> interwiki) { |
| 51 | + this.interwiki = interwiki; |
| 52 | + } |
| 53 | + |
| 54 | + @Override |
| 55 | + public TokenStream tokenStream(String fieldName, Reader reader) { |
| 56 | + return new InterwikiTokenStream(); |
| 57 | + } |
| 58 | + |
| 59 | +} |
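A sketch of the token stream this analyzer emits for a single interwiki entry (map contents invented; Lucene 2.x Token API, where termText() holds the token text):

    HashMap<String,String> iw = new HashMap<String,String>();
    iw.put("en", "Douglas Adams");
    TokenStream ts = new InterwikiAnalyzer(iw).tokenStream("interwiki", null);
    for (Token t = ts.next(); t != null; t = ts.next())
        System.out.println(t.termText());   // "en:" then "douglas adams"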
Index: branches/lucene-search-2.1/build.xml |
— | — | @@ -6,9 +6,11 @@ |
7 | 7 | <property name="jars" value="jars"/> |
8 | 8 | <property name="dist" location="dist"/> |
9 | 9 | <property name="pack.name" value="lucene-search-2.1"/> |
| 10 | + <property name="src.name" value="lucene-search-src-2.1"/> |
10 | 11 | <property name="binary.name" value="ls2-bin"/> |
11 | 12 | <property name="jar.name" value="LuceneSearch.jar"/> |
12 | 13 | <property name="include" value="src/** lib/** sql/** test-data/** webinterface/** *-example *.txt lsearch* build.xml scripts/*"/> |
| 14 | + <property name="include.src" value="src/** sql/** build.xml scripts/*"/> |
13 | 15 | |
14 | 16 | <property file="${basedir}/hostname"/> |
15 | 17 | |
— | — | @@ -82,6 +84,19 @@ |
83 | 85 | <delete file="${dist}/${pack.name}.tar"/> |
84 | 86 | </target> |
85 | 87 | |
| 88 | + <target name="pack-src" description="Make tar.gz distribution of only core source files"> |
| 89 | + <mkdir dir="${dist}"/> |
| 90 | + <delete file="${dist}/${src.name}.tar"/> |
| 91 | + <delete file="${dist}/${src.name}.tar.gz"/> |
| 92 | + <tar tarfile="${dist}/${src.name}.tar"> |
| 93 | + <tarfileset prefix="${pack.name}" dir="." includes="${include.src}"/> |
| 94 | + </tar> |
| 95 | + |
| 96 | + <gzip zipfile="${dist}/${src.name}.tar.gz" src="${dist}/${src.name}.tar"/> |
| 97 | + <delete file="${dist}/${src.name}.tar"/> |
| 98 | + </target> |
| 99 | + |
| 100 | + |
86 | 101 | <target name="binary" depends="alljar" description="Make binary tar.gz distribution"> |
87 | 102 | <mkdir dir="${bin}"/> |
88 | 103 | <delete file="${dist}/${binary.name}.tar"/> |