Index: branches/lucene-search-2.1/build.xml |
— | — | @@ -10,7 +10,7 @@ |
11 | 11 | <property name="binary.name" value="ls2-bin"/> |
12 | 12 | <property name="jar.name" value="LuceneSearch.jar"/> |
13 | 13 | <property name="include" value="src/** lib/** sql/** test-data/** webinterface/** *-example *.txt lsearch* build.xml scripts/*"/> |
14 | | - <property name="include.src" value="src/** sql/** build.xml scripts/*"/> |
| 14 | + <property name="include.src" value="src/** sql/** build.xml scripts/* webinterface/*"/> |
15 | 15 | |
16 | 16 | <property file="${basedir}/hostname"/> |
17 | 17 | |
Index: branches/lucene-search-2.1/test-data/mathfunc.test |
— | — | @@ -0,0 +1,541 @@ |
| 2 | +39.20291483424475 |
| 3 | +13.450437958707814 |
| 4 | +12.678569683286979 |
| 5 | +10.25526171963254 |
| 6 | +8.849258859845378 |
| 7 | +8.666273372729856 |
| 8 | +8.31447886618143 |
| 9 | +8.197323614179476 |
| 10 | +8.065610978164585 |
| 11 | +7.997460285543743 |
| 12 | +6.390661056615026 |
| 13 | +6.195251633448001 |
| 14 | +6.003035468744286 |
| 15 | +5.922401025326313 |
| 16 | +5.858780569202485 |
| 17 | +5.7545766633852065 |
| 18 | +5.663348431691695 |
| 19 | +5.513502014868557 |
| 20 | +5.397556486517471 |
| 21 | +5.320454269824394 |
| 22 | +5.118790939776451 |
| 23 | +4.957945854762612 |
| 24 | +4.677393185927231 |
| 25 | +4.417044635347138 |
| 26 | +4.286872568087402 |
| 27 | +4.111160627955847 |
| 28 | +4.018425808374067 |
| 29 | +3.875594177139727 |
| 30 | +3.855756478461852 |
| 31 | +3.583812703943094 |
| 32 | +3.5817677966683417 |
| 33 | +3.393666733951346 |
| 34 | +3.3825997856599685 |
| 35 | +3.311922305543995 |
| 36 | +3.284748780252251 |
| 37 | +3.2803658354545093 |
| 38 | +3.208116699014506 |
| 39 | +3.197975985106876 |
| 40 | +3.1501966327182416 |
| 41 | +3.126299857812352 |
| 42 | +3.063553780327862 |
| 43 | +3.0467910121568567 |
| 44 | +2.947827241203268 |
| 45 | +2.8966477779781448 |
| 46 | +2.8116941812748055 |
| 47 | +2.730939712302121 |
| 48 | +2.7194772730786663 |
| 49 | +2.6503967112741775 |
| 50 | +2.499346732261985 |
| 51 | +2.4531229860503347 |
| 52 | +2.4245965196546595 |
| 53 | +2.4048828498563353 |
| 54 | +2.398834838013775 |
| 55 | +2.3983530801899637 |
| 56 | +2.301883995710939 |
| 57 | +2.2515294877640977 |
| 58 | +2.2418150405156405 |
| 59 | +2.186851347893209 |
| 60 | +2.1421959631446454 |
| 61 | +2.112862553279673 |
| 62 | +2.111304636049398 |
| 63 | +2.0884066313608103 |
| 64 | +2.080225946703776 |
| 65 | +2.0652098242981136 |
| 66 | +2.045337183390787 |
| 67 | +2.002148363977345 |
| 68 | +1.9262536235110566 |
| 69 | +1.9254642966646955 |
| 70 | +1.8815690362186883 |
| 71 | +1.8796391915958572 |
| 72 | +1.8583157394056122 |
| 73 | +1.842890610174396 |
| 74 | +1.832874813193032 |
| 75 | +1.7857804147392449 |
| 76 | +1.778042188142664 |
| 77 | +1.7751602358737153 |
| 78 | +1.770588743574571 |
| 79 | +1.7658860788389297 |
| 80 | +1.7330514584559034 |
| 81 | +1.676248860250089 |
| 82 | +1.6632396794496727 |
| 83 | +1.6529964196348452 |
| 84 | +1.587700607893723 |
| 85 | +1.554507271828346 |
| 86 | +1.5462922600019373 |
| 87 | +1.536011998632598 |
| 88 | +1.499536875414111 |
| 89 | +1.4827109040124988 |
| 90 | +1.4618995756130717 |
| 91 | +1.4561634933713434 |
| 92 | +1.4442030498548373 |
| 93 | +1.4232400481329681 |
| 94 | +1.4148239967756024 |
| 95 | +1.4125065245521127 |
| 96 | +1.390309778182693 |
| 97 | +1.374617285871078 |
| 98 | +1.3591113373855113 |
| 99 | +1.2959623367365105 |
| 100 | +1.2916936614995556 |
| 101 | +1.2788722876401044 |
| 102 | +1.2710067417073394 |
| 103 | +1.2579879901973903 |
| 104 | +1.234760921054205 |
| 105 | +1.232572710846885 |
| 106 | +1.2239982328262375 |
| 107 | +1.1960562381380218 |
| 108 | +1.1909451721904833 |
| 109 | +1.1883725046626885 |
| 110 | +1.1870577049511857 |
| 111 | +1.179821425669525 |
| 112 | +1.170960754894606 |
| 113 | +1.1627022555545126 |
| 114 | +1.155955236116103 |
| 115 | +1.151314330373941 |
| 116 | +1.138548931392361 |
| 117 | +1.136753034029899 |
| 118 | +1.1252591078172476 |
| 119 | +1.1153859351428042 |
| 120 | +1.0995712079396343 |
| 121 | +1.093167701863354 |
| 122 | +1.0926266284365183 |
| 123 | +1.0730378946931582 |
| 124 | +1.0577347862606676 |
| 125 | +1.0517570343151739 |
| 126 | +1.043571403629543 |
| 127 | +1.0363716250813024 |
| 128 | +1.0339183869730422 |
| 129 | +1.0273092324678479 |
| 130 | +1.0272902345083126 |
| 131 | +1.0255021017312675 |
| 132 | +1.0095597490734871 |
| 133 | +1.0045834871416266 |
| 134 | +1.0032766725818 |
| 135 | +1.0008620689655172 |
| 136 | +0.9986067546725256 |
| 137 | +0.9654483583749244 |
| 138 | +0.9369909811501816 |
| 139 | +0.935063552081607 |
| 140 | +0.9083748494156194 |
| 141 | +0.9040905868635284 |
| 142 | +0.8957921591066108 |
| 143 | +0.8954248366013071 |
| 144 | +0.884183371098627 |
| 145 | +0.884105579905624 |
| 146 | +0.8835226747659162 |
| 147 | +0.8119355402722593 |
| 148 | +0.789096316833624 |
| 149 | +0.7691353626783575 |
| 150 | +0.7547464640913147 |
| 151 | +0.7517606818891444 |
| 152 | +0.7304977445593964 |
| 153 | +0.7281849565279721 |
| 154 | +0.7062913079465717 |
| 155 | +0.7020404538039372 |
| 156 | +0.701127126043806 |
| 157 | +0.699206200140122 |
| 158 | +0.685040305819171 |
| 159 | +0.6655547498187092 |
| 160 | +0.6532654101207102 |
| 161 | +0.6310138330650028 |
| 162 | +0.6246948793782561 |
| 163 | +0.6069942918933324 |
| 164 | +0.6067663016931207 |
| 165 | +0.5853223819529878 |
| 166 | +0.5670848391524492 |
| 167 | +0.5555930180090919 |
| 168 | +0.5545160077694642 |
| 169 | +0.5519308889007148 |
| 170 | +0.5317575800317539 |
| 171 | +0.5251936936993156 |
| 172 | +0.5187831936649074 |
| 173 | +0.5156023165475659 |
| 174 | +0.515510718520791 |
| 175 | +0.5103972455063522 |
| 176 | +0.5084803339569334 |
| 177 | +0.5063598230037413 |
| 178 | +0.505667612185183 |
| 179 | +0.501937984496124 |
| 180 | +0.4851394374544459 |
| 181 | +0.48245486501300455 |
| 182 | +0.47980203736009863 |
| 183 | +0.4785642026738086 |
| 184 | +0.47490132931301576 |
| 185 | +0.47440087145969495 |
| 186 | +0.470478765184812 |
| 187 | +0.4653662166610065 |
| 188 | +0.4584488352331244 |
| 189 | +0.4483601686329818 |
| 190 | +0.44831619992910315 |
| 191 | +0.43756844494575 |
| 192 | +0.41978485314864694 |
| 193 | +0.417847075405215 |
| 194 | +0.4142618474815748 |
| 195 | +0.40664111545267345 |
| 196 | +0.40430555555555553 |
| 197 | +0.3993738819320215 |
| 198 | +0.39618140428122023 |
| 199 | +0.3954248366013072 |
| 200 | +0.3866070684337751 |
| 201 | +0.38264501470195134 |
| 202 | +0.374294355322793 |
| 203 | +0.3739015797363756 |
| 204 | +0.3690967047524425 |
| 205 | +0.369047619047619 |
| 206 | +0.36553730323393024 |
| 207 | +0.36265695286094 |
| 208 | +0.36172088042753525 |
| 209 | +0.3564633249084819 |
| 210 | +0.3520672565581789 |
| 211 | +0.3513354291972556 |
| 212 | +0.3487797377639518 |
| 213 | +0.3477145520949912 |
| 214 | +0.3471940694816792 |
| 215 | +0.34373979946349387 |
| 216 | +0.3433980148049866 |
| 217 | +0.3418468571494427 |
| 218 | +0.33541054251262054 |
| 219 | +0.33527131782945735 |
| 220 | +0.33527131782945735 |
| 221 | +0.3347176079734219 |
| 222 | +0.33416587471291676 |
| 223 | +0.3297251214860608 |
| 224 | +0.3289868732118273 |
| 225 | +0.32559026238374766 |
| 226 | +0.32399276565866203 |
| 227 | +0.3236174686422881 |
| 228 | +0.32093503885813623 |
| 229 | +0.32079037346689465 |
| 230 | +0.3154645117323287 |
| 231 | +0.3139680596840939 |
| 232 | +0.3104207122130782 |
| 233 | +0.3003896715822824 |
| 234 | +0.2985506761279957 |
| 235 | +0.29658825807372824 |
| 236 | +0.2934782608695652 |
| 237 | +0.2931321661265045 |
| 238 | +0.28864928130789674 |
| 239 | +0.2861340335083771 |
| 240 | +0.28459669208546495 |
| 241 | +0.28198129842265857 |
| 242 | +0.2782558499908782 |
| 243 | +0.2763939378708801 |
| 244 | +0.2680798207081973 |
| 245 | +0.24866758585566026 |
| 246 | +0.24720389265501513 |
| 247 | +0.24634287408169175 |
| 248 | +0.24545454545454548 |
| 249 | +0.23687033450579364 |
| 250 | +0.23378644225863587 |
| 251 | +0.22875816993464054 |
| 252 | +0.22869471413160733 |
| 253 | +0.22675400538418133 |
| 254 | +0.22472044129321897 |
| 255 | +0.22466775576989487 |
| 256 | +0.2112033035327623 |
| 257 | +0.20752025499370294 |
| 258 | +0.20467549544966926 |
| 259 | +0.2041005387843561 |
| 260 | +0.2034160991520012 |
| 261 | +0.20222535082268692 |
| 262 | +0.20194931773879143 |
| 263 | +0.19780433157196253 |
| 264 | +0.1927963696680974 |
| 265 | +0.18920010810462992 |
| 266 | +0.18787808738286582 |
| 267 | +0.17671003692730708 |
| 268 | +0.17565274073006032 |
| 269 | +0.17341231124442874 |
| 270 | +0.1723925299506695 |
| 271 | +0.17197609946770082 |
| 272 | +0.1719272995046191 |
| 273 | +0.16993464052287582 |
| 274 | +0.16993464052287582 |
| 275 | +0.16865044895491207 |
| 276 | +0.16808637799360715 |
| 277 | +0.16773504273504275 |
| 278 | +0.16669077556470457 |
| 279 | +0.16270324501083455 |
| 280 | +0.1589700996677741 |
| 281 | +0.15778214034027988 |
| 282 | +0.15444497571279267 |
| 283 | +0.15334213590027546 |
| 284 | +0.15281709216048345 |
| 285 | +0.15067748638122527 |
| 286 | +0.1495469563842297 |
| 287 | +0.14720196876886033 |
| 288 | +0.1457772370310287 |
| 289 | +0.14479512735326688 |
| 290 | +0.14432989690721648 |
| 291 | +0.14082592613564054 |
| 292 | +0.14067196735578466 |
| 293 | +0.13776405094181346 |
| 294 | +0.1377070905789004 |
| 295 | +0.13638598896589604 |
| 296 | +0.13563103388092834 |
| 297 | +0.13304514191907094 |
| 298 | +0.13219008533646015 |
| 299 | +0.13095759505521926 |
| 300 | +0.13076649778510113 |
| 301 | +0.13045367197450033 |
| 302 | +0.12914131169709264 |
| 303 | +0.12482312219746579 |
| 304 | +0.12458333333333334 |
| 305 | +0.12327018675025805 |
| 306 | +0.11980187188808131 |
| 307 | +0.11831331097192645 |
| 308 | +0.11802553892761806 |
| 309 | +0.11652894048018908 |
| 310 | +0.11602674308579007 |
| 311 | +0.11368723374537328 |
| 312 | +0.11233370092594643 |
| 313 | +0.11094249945262813 |
| 314 | +0.10983825886267112 |
| 315 | +0.10720114239086087 |
| 316 | +0.1070457931678245 |
| 317 | +0.10679419583948348 |
| 318 | +0.10601680933489718 |
| 319 | +0.10441831916759216 |
| 320 | +0.10031529111575649 |
| 321 | +0.1 |
| 322 | +0.09892913376451788 |
| 323 | +0.09613003410578937 |
| 324 | +0.09558177667787654 |
| 325 | +0.09353741496598639 |
| 326 | +0.09335130310817207 |
| 327 | +0.09299489506522973 |
| 328 | +0.09284707540521495 |
| 329 | +0.09156836165742682 |
| 330 | +0.09143959488787076 |
| 331 | +0.09117350127397722 |
| 332 | +0.09090909090909091 |
| 333 | +0.08957592659331161 |
| 334 | +0.08851076659860356 |
| 335 | +0.08778550096326349 |
| 336 | +0.08730787885969052 |
| 337 | +0.08595141700404858 |
| 338 | +0.08527131782945736 |
| 339 | +0.08380728650268886 |
| 340 | +0.08270375576255015 |
| 341 | +0.08187134502923976 |
| 342 | +0.08038147306700115 |
| 343 | +0.07926470588235295 |
| 344 | +0.07667821777071915 |
| 345 | +0.07456348210639226 |
| 346 | +0.0740920438489128 |
| 347 | +0.0723925299506695 |
| 348 | +0.0722556667435676 |
| 349 | +0.07058355935137631 |
| 350 | +0.07057745902570312 |
| 351 | +0.06917211328976035 |
| 352 | +0.06897873003531063 |
| 353 | +0.06809526321135173 |
| 354 | +0.06791100973310865 |
| 355 | +0.06743256743256744 |
| 356 | +0.06730159311397596 |
| 357 | +0.06722996202214411 |
| 358 | +0.06674924924924926 |
| 359 | +0.06666666666666667 |
| 360 | +0.06666666666666667 |
| 361 | +0.06666666666666667 |
| 362 | +0.06666666666666667 |
| 363 | +0.06666666666666667 |
| 364 | +0.06624754009147153 |
| 365 | +0.06520375576255016 |
| 366 | +0.06469420951891736 |
| 367 | +0.06443798449612403 |
| 368 | +0.06431866694209185 |
| 369 | +0.06376903553299493 |
| 370 | +0.06349206349206349 |
| 371 | +0.06333495599357147 |
| 372 | +0.06263242909540888 |
| 373 | +0.0625 |
| 374 | +0.06076151390788874 |
| 375 | +0.060480192024804376 |
| 376 | +0.060452567221700414 |
| 377 | +0.05991285403050109 |
| 378 | +0.058823529411764705 |
| 379 | +0.058823529411764705 |
| 380 | +0.058823529411764705 |
| 381 | +0.058823529411764705 |
| 382 | +0.058823529411764705 |
| 383 | +0.05823070839310234 |
| 384 | +0.05816748407653175 |
| 385 | +0.0579618045578126 |
| 386 | +0.05745341614906832 |
| 387 | +0.05564512855962682 |
| 388 | +0.054808368686332126 |
| 389 | +0.05419516301503879 |
| 390 | +0.05334281650071124 |
| 391 | +0.05263157894736842 |
| 392 | +0.05259856630824373 |
| 393 | +0.051665702718334296 |
| 394 | +0.05145489270868442 |
| 395 | +0.05101136441542828 |
| 396 | +0.04973743435858965 |
| 397 | +0.04951565046909012 |
| 398 | +0.048980995248812206 |
| 399 | +0.04867645430616502 |
| 400 | +0.04768432483791873 |
| 401 | +0.04765694031555579 |
| 402 | +0.04739252995066949 |
| 403 | +0.046511627906976744 |
| 404 | +0.04640702183427576 |
| 405 | +0.04598599183197713 |
| 406 | +0.045454545454545456 |
| 407 | +0.04513888888888889 |
| 408 | +0.04398685228151585 |
| 409 | +0.04362262093762942 |
| 410 | +0.043478260869565216 |
| 411 | +0.04278197208876688 |
| 412 | +0.042360667607827314 |
| 413 | +0.041666666666666664 |
| 414 | +0.041591268270502295 |
| 415 | +0.04068627450980392 |
| 416 | +0.03972809052794399 |
| 417 | +0.03972291040988196 |
| 418 | +0.03959025470653378 |
| 419 | +0.03897502153316107 |
| 420 | +0.03886844013161564 |
| 421 | +0.03818036293723188 |
| 422 | +0.037037037037037035 |
| 423 | +0.037037037037037035 |
| 424 | +0.03654897909577054 |
| 425 | +0.03651960692790449 |
| 426 | +0.03641147396373545 |
| 427 | +0.0363407371388875 |
| 428 | +0.03600713012477718 |
| 429 | +0.0359586316471341 |
| 430 | +0.0357838573513611 |
| 431 | +0.03574975173783515 |
| 432 | +0.03573225548912443 |
| 433 | +0.03571428571428571 |
| 434 | +0.03571428571428571 |
| 435 | +0.03571428571428571 |
| 436 | +0.03571428571428571 |
| 437 | +0.03564082975847682 |
| 438 | +0.03544985471391053 |
| 439 | +0.03504654092288226 |
| 440 | +0.03486279802069275 |
| 441 | +0.03333333333333333 |
| 442 | +0.03333333333333333 |
| 443 | +0.03273854270680871 |
| 444 | +0.03257478689045375 |
| 445 | +0.030855931055416896 |
| 446 | +0.03064721823688825 |
| 447 | +0.030596683035300658 |
| 448 | +0.03019175846593227 |
| 449 | +0.029311223675659005 |
| 450 | +0.029247991816905444 |
| 451 | +0.02869474125288079 |
| 452 | +0.0286046511627907 |
| 453 | +0.02849550837462078 |
| 454 | +0.02825377396980824 |
| 455 | +0.027777777777777776 |
| 456 | +0.027777777777777776 |
| 457 | +0.02702702702702703 |
| 458 | +0.02631578947368421 |
| 459 | +0.02631578947368421 |
| 460 | +0.02601263470828688 |
| 461 | +0.02574750830564784 |
| 462 | +0.025550100018185126 |
| 463 | +0.025372327930467467 |
| 464 | +0.025211768224451123 |
| 465 | +0.025193798449612403 |
| 466 | +0.024160206718346254 |
| 467 | +0.023008165967777532 |
| 468 | +0.022904441700569815 |
| 469 | +0.02269369606133612 |
| 470 | +0.022269634413562448 |
| 471 | +0.021753233319546015 |
| 472 | +0.021739130434782608 |
| 473 | +0.021739130434782608 |
| 474 | +0.021496108285197813 |
| 475 | +0.02127659574468085 |
| 476 | +0.02127659574468085 |
| 477 | +0.02127659574468085 |
| 478 | +0.02103108133258887 |
| 479 | +0.020702295544261874 |
| 480 | +0.019552721786808246 |
| 481 | +0.019542404334929694 |
| 482 | +0.018518518518518517 |
| 483 | +0.018173482676494504 |
| 484 | +0.01791290238206013 |
| 485 | +0.017196509385738565 |
| 486 | +0.017080685702920213 |
| 487 | +0.016975703819570268 |
| 488 | +0.01640469265710271 |
| 489 | +0.016129032258064516 |
| 490 | +0.016129032258064516 |
| 491 | +0.015713963183612317 |
| 492 | +0.015625 |
| 493 | +0.014924997483137019 |
| 494 | +0.013333333333333334 |
| 495 | +0.013004982761851708 |
| 496 | +0.012763288406558142 |
| 497 | +0.011832633972748726 |
| 498 | +0.010638297872340425 |
| 499 | +0.010412560767310472 |
| 500 | +0.009900990099009901 |
| 501 | +0.00972670763891639 |
| 502 | +0.009073661840429657 |
| 503 | +0.00744185191163226 |
| 504 | +0.007414552803038665 |
| 505 | +0.007314328582145537 |
| 506 | +0.007135677344305626 |
| 507 | +0.006407710219770128 |
| 508 | +0.006211488769628305 |
| 509 | +0.006211488769628305 |
| 510 | +0.006211488769628305 |
| 511 | +0.006211488769628305 |
| 512 | +0.005638820977848822 |
| 513 | +0.005620914150601959 |
| 514 | +0.005374410612962519 |
| 515 | +0.004532574995380161 |
| 516 | +0.003942832324438774 |
| 517 | +0.0036032634303455133 |
| 518 | +0.0033840379683466597 |
| 519 | +0.003220559531554977 |
| 520 | +0.0028000534616412723 |
| 521 | +0.00274799145526168 |
| 522 | +0.0026964243582868258 |
| 523 | +0.0025624990369252356 |
| 524 | +0.002512562814070352 |
| 525 | +0.002288062199929619 |
| 526 | +0.0022123893805309734 |
| 527 | +0.0022061535822785463 |
| 528 | +0.0022023948610103345 |
| 529 | +0.002188183807439825 |
| 530 | +0.0019559542709627524 |
| 531 | +0.0019559542709627524 |
| 532 | +0.001949317738791423 |
| 533 | +0.001937984496124031 |
| 534 | +0.001937984496124031 |
| 535 | +0.001937984496124031 |
| 536 | +0.001937984496124031 |
| 537 | +0.001937984496124031 |
| 538 | +0.001937984496124031 |
| 539 | +0.001937984496124031 |
| 540 | +0.0016129032258064516 |
| 541 | +0.0014824708759661475 |
| 542 | +0.0014727540500736377 |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -30,6 +30,7 @@ |
31 | 31 | import org.wikimedia.lsearch.frontend.SearchServer; |
32 | 32 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
33 | 33 | import org.wikimedia.lsearch.spell.Suggest; |
| 34 | +import org.wikimedia.lsearch.spell.SuggestQuery; |
34 | 35 | import org.wikimedia.lsearch.util.QueryStringMap; |
35 | 36 | |
36 | 37 | /** |
— | — | @@ -148,10 +149,12 @@ |
149 | 150 | Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes(); |
150 | 151 | boolean searchAll = false; |
151 | 152 | Suggest sug = null; |
152 | | - try { |
153 | | - sug = new Suggest(iid); |
154 | | - } catch (IOException e1) { |
155 | | - log.warn("Cannot open spell-suggestion indexes for "+iid+" : "+e1); |
| 153 | + if(offset == 0){ |
| 154 | + try { |
| 155 | + sug = new Suggest(iid); |
| 156 | + } catch (IOException e1) { |
| 157 | + log.warn("Cannot open spell-suggestion indexes for "+iid+" : "+e1); |
| 158 | + } |
156 | 159 | } |
157 | 160 | |
158 | 161 | // if search is over one field, try to use filters |
— | — | @@ -170,20 +173,8 @@ |
171 | 174 | } |
172 | 175 | |
173 | 176 | try { |
174 | | - if(raw){ |
175 | | - // do minimal parsing, make a raw query |
176 | | - parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE); |
177 | | - q = parser.parseRaw(searchterm); |
178 | | - } else if(nsfw == null){ |
179 | | - if(searchAll) |
180 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
181 | | - else |
182 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname()); |
183 | | - } else{ |
184 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
185 | | - log.info("Using NamespaceFilterWrapper "+nsfw); |
186 | | - } |
187 | | - |
| 177 | + q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll); |
| 178 | + |
188 | 179 | TopDocs hits=null; |
189 | 180 | // see if we can search only part of the index |
190 | 181 | if(nsfw!=null && (iid.isMainsplit() || iid.isNssplit())){ |
— | — | @@ -216,8 +207,27 @@ |
217 | 208 | } |
218 | 209 | RMIMessengerClient messenger = new RMIMessengerClient(); |
219 | 210 | res = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host); |
220 | | - if(sug != null) |
221 | | - res.setSuggest(sug.suggest(searchterm,parser,nsfw.getFilter(),res.getNumHits())); |
| 211 | + if(sug != null){ |
| 212 | + SuggestQuery sq = sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res); |
| 213 | + if(sq == null) |
| 214 | + res.setSuggest(null); |
| 215 | + else{ |
| 216 | + if(res.getNumHits() == 0){ |
| 217 | + // no hits: show the spell-checked results |
| 218 | + SearchResults sugres = messenger.searchPart(piid,sq.getSearchterm(),q,nsfw,offset,limit,explain,host); |
| 219 | + if(sugres.getNumHits() > 0){ |
| 220 | + res = sugres; |
| 221 | + res.setSuggest(sq.getSearchterm()); |
| 222 | + } |
| 223 | + } else if(sq.needsCheck()){ |
| 224 | + q = parseQuery(sq.getSearchterm(),parser,iid,raw,nsfw,searchAll); |
| 225 | + SearchResults sugres = messenger.searchPart(piid,sq.getSearchterm(),q,nsfw,0,1,explain,host); |
| 226 | + if(sugres.getNumHits() > 0){ |
| 227 | + res.setSuggest(sq.getSearchterm()); |
| 228 | + } |
| 229 | + } |
| 230 | + } |
| 231 | + } |
222 | 232 | return res; |
223 | 233 | } |
224 | 234 | } |
— | — | @@ -226,8 +236,27 @@ |
227 | 237 | try{ |
228 | 238 | hits = searcher.search(q,nsfw,offset+limit); |
229 | 239 | res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
230 | | - if(sug != null) |
231 | | - res.setSuggest(sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res.getNumHits())); |
| 240 | + if(sug != null){ |
| 241 | + SuggestQuery sq = sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res); |
| 242 | + if(sq == null) |
| 243 | + res.setSuggest(null); |
| 244 | + else{ |
| 245 | + if(res.getNumHits() == 0){ |
| 246 | + // no hits: show the spell-checked results |
| 247 | + hits = searcher.search(q,nsfw,offset+limit); |
| 248 | + if(hits.totalHits != 0){ |
| 249 | + res = makeSearchResults(searcher,hits,offset,limit,iid,sq.getSearchterm(),q,searchStart,explain); |
| 250 | + res.setSuggest(sq.getSearchterm()); |
| 251 | + } |
| 252 | + } else if(sq.needsCheck()){ |
| 253 | + q = parseQuery(sq.getSearchterm(),parser,iid,raw,nsfw,searchAll); |
| 254 | + hits = searcher.search(q,nsfw,1); // fetch only one result |
| 255 | + if(hits.totalHits != 0){ |
| 256 | + res.setSuggest(sq.getSearchterm()); |
| 257 | + } |
| 258 | + } |
| 259 | + } |
| 260 | + } |
232 | 261 | return res; |
233 | 262 | } catch(Exception e){ |
234 | 263 | e.printStackTrace(); |
— | — | @@ -250,6 +279,24 @@ |
251 | 280 | } |
252 | 281 | } |
253 | 282 | |
| 283 | + /** Parses searchterm into a query: a raw parse when raw is set, otherwise a four-pass parse whose namespace policy depends on nsfw and searchAll. */ |
| 284 | + protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll) throws ParseException { |
| 285 | + if(raw){ |
| 286 | + // raw mode: switch the parser to the LEAVE namespace policy and do minimal parsing |
| 287 | + parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE); |
| 288 | + return parser.parseRaw(searchterm); |
| 289 | + } |
| 290 | + if(nsfw != null){ |
| 291 | + // a namespace filter wrapper was supplied: parse with the IGNORE policy and log the wrapper |
| 292 | + Query filtered = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
| 293 | + log.info("Using NamespaceFilterWrapper "+nsfw); |
| 294 | + return filtered; |
| 295 | + } |
| 296 | + // no filter wrapper: IGNORE policy when searchAll is set, REWRITE otherwise |
| 297 | + WikiQueryParser.NamespacePolicy policy = searchAll ? WikiQueryParser.NamespacePolicy.IGNORE : WikiQueryParser.NamespacePolicy.REWRITE; |
| 298 | + return parser.parseFourPass(searchterm,policy,iid.getDBname()); |
| 299 | + } |
| 300 | + |
254 | 301 | /** Our scores can span several orders of magnitude, transform them to be more relevant to the user */ |
255 | 302 | public float transformScore(double score){ |
256 | 303 | return (float) (Math.log10(1+score*99)/2); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java |
— | — | @@ -94,7 +94,7 @@ |
95 | 95 | return searcher; |
96 | 96 | } |
97 | 97 | |
98 | | - IndexSearcherMul get(){ |
| 98 | + synchronized IndexSearcherMul get(){ |
99 | 99 | if(index >= searchers.length) |
100 | 100 | index = 0; |
101 | 101 | log.debug("Using "+iid+" searcher "+index); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -368,7 +368,7 @@ |
369 | 369 | continue; // ignore single quotes (it's -> its) |
370 | 370 | |
371 | 371 | // pluses, minuses and underscores can be within words (so that they are not misinterpreted); * and ? are for wildcard queries |
372 | | - if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){ |
| 372 | + if(!Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != '.' && ch != ',' && ch != ';' && ch != '"'){ |
373 | 373 | if(length<buffer.length) |
374 | 374 | buffer[length++] = ch; |
375 | 375 | } else{ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -122,6 +122,10 @@ |
123 | 123 | return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase)); |
124 | 124 | } |
125 | 125 | |
| 126 | + public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(String langCode){ |
| 127 | + return getSearcherAnalyzer(new FilterFactory(langCode,FilterFactory.Type.SPELL_CHECK),new FieldNameFactory()); |
| 128 | + } |
| 129 | + |
126 | 130 | /** |
127 | 131 | * Analyzer for search queries. Can be reused to parse many queries. |
128 | 132 | * |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -1,5 +1,10 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
| 4 | +import java.lang.reflect.InvocationTargetException; |
| 5 | +import java.lang.reflect.Method; |
| 6 | +import java.util.ArrayList; |
| 7 | +import java.util.Set; |
| 8 | + |
4 | 9 | import org.apache.lucene.analysis.PorterStemFilter; |
5 | 10 | import org.apache.lucene.analysis.TokenFilter; |
6 | 11 | import org.apache.lucene.analysis.TokenStream; |
— | — | @@ -16,26 +21,36 @@ |
17 | 22 | public class FilterFactory { |
18 | 23 | protected String lang; |
19 | 24 | protected String snowballName = null; |
20 | | - protected boolean useStemmer,useCustomFilter; |
| 25 | + protected boolean useStemmer,useLangFilter; |
21 | 26 | protected Class stemmer = null; |
22 | | - protected Class customFilter = null; |
| 27 | + protected Class langFilter = null; |
23 | 28 | protected boolean usingCJK = false; |
| 29 | + protected ArrayList<Class> additionalFilters = null; |
24 | 30 | |
25 | 31 | protected FilterFactory noStemmerFilterFactory=null; |
26 | 32 | |
| 33 | + public enum Type { FULL, NO_STEM, SPELL_CHECK }; |
| 34 | + protected Type type = null; |
| 35 | + |
27 | 36 | public FilterFactory(String lang){ |
| 37 | + this(lang,Type.FULL); |
| 38 | + } |
| 39 | + |
| 40 | + public FilterFactory(String lang, Type type){ |
28 | 41 | this.lang = lang; |
| 42 | + this.type = type; |
29 | 43 | init(); |
30 | | - noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useCustomFilter,null,customFilter); |
| 44 | + noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters); |
31 | 45 | } |
32 | 46 | |
33 | | - public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useCustomFilter, Class stemmer, Class customFilter) { |
| 47 | + public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) { |
34 | 48 | this.lang = lang; |
35 | 49 | this.snowballName = snowballName; |
36 | 50 | this.useStemmer = useStemmer; |
37 | | - this.useCustomFilter = useCustomFilter; |
| 51 | + this.useLangFilter = useLangFilter; |
38 | 52 | this.stemmer = stemmer; |
39 | | - this.customFilter = customFilter; |
| 53 | + this.langFilter = langFilter; |
| 54 | + this.additionalFilters = additionalFilters; |
40 | 55 | } |
41 | 56 | |
42 | 57 | public FilterFactory getNoStemmerFilterFactory() { |
— | — | @@ -49,51 +64,60 @@ |
50 | 65 | if(lang == null) |
51 | 66 | lang = "en"; |
52 | 67 | |
53 | | - // figure out stemmer |
54 | | - useStemmer = true; |
55 | | - if(lang.equals("en")) |
56 | | - snowballName = "English"; |
| 68 | + if(type == Type.FULL){ |
| 69 | + useStemmer = true; |
| 70 | + // figure out stemmer |
| 71 | + if(lang.equals("en")) |
| 72 | + snowballName = "English"; |
57 | 73 | //stemmer = PorterStemFilter.class; // 2x faster but less accurate |
58 | | - else if(lang.equals("da")) |
59 | | - snowballName = "Danish"; |
60 | | - else if(lang.equals("nl")) |
61 | | - snowballName = "Dutch"; |
62 | | - else if(lang.equals("fi")) |
63 | | - snowballName = "Finnish"; |
64 | | - else if(lang.equals("de")) |
65 | | - snowballName = "German"; |
66 | | - else if(lang.equals("it")) |
67 | | - snowballName = "Italian"; |
68 | | - else if(lang.equals("no")) |
69 | | - snowballName = "Norwegian"; |
70 | | - else if(lang.equals("pt")) |
71 | | - snowballName = "Portuguese"; |
72 | | - else if(lang.equals("ru")) |
73 | | - snowballName = "Russian"; |
74 | | - else if(lang.equals("es")) |
75 | | - snowballName = "Spanish"; |
76 | | - else if(lang.equals("sv")) |
77 | | - snowballName = "Swedish"; |
78 | | - else if(lang.equals("eo")) |
79 | | - stemmer = EsperantoStemFilter.class; |
80 | | - else |
| 74 | + else if(lang.equals("da")) |
| 75 | + snowballName = "Danish"; |
| 76 | + else if(lang.equals("nl")) |
| 77 | + snowballName = "Dutch"; |
| 78 | + else if(lang.equals("fi")) |
| 79 | + snowballName = "Finnish"; |
| 80 | + else if(lang.equals("de")) |
| 81 | + snowballName = "German"; |
| 82 | + else if(lang.equals("it")) |
| 83 | + snowballName = "Italian"; |
| 84 | + else if(lang.equals("no")) |
| 85 | + snowballName = "Norwegian"; |
| 86 | + else if(lang.equals("pt")) |
| 87 | + snowballName = "Portuguese"; |
| 88 | + else if(lang.equals("ru")) |
| 89 | + snowballName = "Russian"; |
| 90 | + else if(lang.equals("es")) |
| 91 | + snowballName = "Spanish"; |
| 92 | + else if(lang.equals("sv")) |
| 93 | + snowballName = "Swedish"; |
| 94 | + else if(lang.equals("eo")) |
| 95 | + stemmer = EsperantoStemFilter.class; |
| 96 | + else |
| 97 | + useStemmer = false; |
| 98 | + } else |
81 | 99 | useStemmer = false; |
82 | 100 | |
83 | | - // figure out custom filter |
84 | | - useCustomFilter = true; |
| 101 | + // figure out language-dependent filters |
| 102 | + useLangFilter = true; |
85 | 103 | if(lang.equals("th")) |
86 | | - customFilter = ThaiWordFilter.class; |
| 104 | + langFilter = ThaiWordFilter.class; |
87 | 105 | else if(lang.equals("sr")) |
88 | | - customFilter = SerbianFilter.class; |
| 106 | + langFilter = SerbianFilter.class; |
89 | 107 | else if(lang.equals("vi")) |
90 | | - customFilter = VietnameseFilter.class; |
| 108 | + langFilter = VietnameseFilter.class; |
91 | 109 | else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") || |
92 | 110 | lang.equals("zh-classical") || lang.equals("zh-yue")){ |
93 | | - customFilter = CJKFilter.class; |
| 111 | + langFilter = CJKFilter.class; |
94 | 112 | usingCJK = true; |
95 | 113 | } else |
96 | | - useCustomFilter = false; |
| 114 | + useLangFilter = false; |
97 | 115 | |
| 116 | + // additional filters |
| 117 | + if(type == Type.SPELL_CHECK){ |
| 118 | + additionalFilters = new ArrayList<Class>(); |
| 119 | + additionalFilters.add(PhraseFilter.class); |
| 120 | + } |
| 121 | + |
98 | 122 | } |
99 | 123 | |
100 | 124 | public TokenFilter makeStemmer(TokenStream in){ |
— | — | @@ -113,11 +137,11 @@ |
114 | 138 | } |
115 | 139 | |
116 | 140 | public TokenFilter makeCustomFilter(TokenStream in){ |
117 | | - if(!useCustomFilter) |
| 141 | + if(!useLangFilter) |
118 | 142 | return null; |
119 | | - else if(customFilter != null){ |
| 143 | + else if(langFilter != null){ |
120 | 144 | try { |
121 | | - return (TokenFilter) customFilter.getConstructor(TokenStream.class).newInstance(in); |
| 145 | + return (TokenFilter) langFilter.getConstructor(TokenStream.class).newInstance(in); |
122 | 146 | } catch (Exception e) { |
123 | 147 | e.printStackTrace(); |
124 | 148 | } |
— | — | @@ -126,6 +150,26 @@ |
127 | 151 | return null; |
128 | 152 | } |
129 | 153 | |
| 154 | + /** Wraps in with each additional filter, in list order (the first filter ends up innermost); returns in unchanged when there is no list, or null if a filter cannot be constructed. */ |
| 155 | + public TokenStream makeAdditionalFilterChain(TokenStream in){ |
| 156 | + TokenStream wrapped = in; |
| 157 | + if(additionalFilters != null){ |
| 158 | + try { |
| 159 | + for(Class filterClass : additionalFilters){ |
| 160 | + wrapped = (TokenStream) filterClass.getConstructor(TokenStream.class).newInstance(wrapped); |
| 161 | + } |
| 162 | + } catch (Exception e) { |
| 163 | + e.printStackTrace(); |
| 164 | + wrapped = null; |
| 165 | + } |
| 166 | + } |
| 167 | + return wrapped; |
| 168 | + } |
| 169 | + |
| 170 | + public boolean hasAdditionalFilters(){ |
| 171 | + return additionalFilters != null; |
| 172 | + } |
| 173 | + |
130 | 174 | public boolean hasStemmer(){ |
131 | 175 | return useStemmer; |
132 | 176 | } |
— | — | @@ -135,12 +179,31 @@ |
136 | 180 | } |
137 | 181 | |
138 | 182 | public boolean hasCustomFilter(){ |
139 | | - return useCustomFilter; |
| 183 | + return useLangFilter; |
140 | 184 | } |
141 | 185 | |
142 | 186 | public String getLanguage(){ |
143 | 187 | return lang; |
144 | 188 | } |
145 | 189 | |
| 190 | + public void setStopWords(Set<String> stopWords){ |
| 191 | + for(Class filter : additionalFilters){ |
| 192 | + for(Method m : filter.getMethods()){ |
| 193 | + if(m.getName().equals("setStopWords")){ |
| 194 | + try { |
| 195 | + m.invoke(filter,new Object[] {stopWords}); |
| 196 | + } catch (IllegalArgumentException e) { |
| 197 | + e.printStackTrace(); |
| 198 | + } catch (IllegalAccessException e) { |
| 199 | + e.printStackTrace(); |
| 200 | + } catch (InvocationTargetException e) { |
| 201 | + e.printStackTrace(); |
| 202 | + } |
| 203 | + } |
| 204 | + } |
| 205 | + |
| 206 | + } |
| 207 | + } |
146 | 208 | |
| 209 | + |
147 | 210 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java |
— | — | @@ -0,0 +1,76 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.HashSet; |
| 6 | +import java.util.Set; |
| 7 | + |
| 8 | +import org.apache.lucene.analysis.Token; |
| 9 | +import org.apache.lucene.analysis.TokenFilter; |
| 10 | +import org.apache.lucene.analysis.TokenStream; |
| 11 | +import org.apache.lucene.index.IndexReader; |
| 12 | +import org.apache.lucene.index.Term; |
| 13 | +import org.wikimedia.lsearch.config.IndexId; |
| 14 | +import org.wikimedia.lsearch.config.IndexRegistry; |
| 15 | + |
| 16 | +/** |
| 17 | + * Filter that outputs phrases and words mixed, e.g. |
| 18 | + * novi sad is a city -> novi, sad, novi_sad, is, sad_is, a, is_a, city, a_city |
| 19 | + * |
| 20 | + * @author rainman |
| 21 | + * |
| 22 | + */ |
| 23 | +public class PhraseFilter extends TokenFilter { |
| 24 | + protected Set<String> stopWords = null; |
| 25 | + |
| 26 | + public PhraseFilter(TokenStream input) { |
| 27 | + super(input); |
| 28 | + } |
| 29 | + |
| 30 | + protected Token phrase1 = null, phrase2 = null; |
| 31 | + protected boolean phraseReady = false; |
| 32 | + |
| 33 | + protected boolean forPhrase(Token t){ |
| 34 | + if(stopWords!=null && stopWords.contains(t.termText())) |
| 35 | + return false; |
| 36 | + else |
| 37 | + return true; |
| 38 | + } |
| 39 | + |
| 40 | + @Override |
| 41 | + public Token next() throws IOException { |
| 42 | + if(phraseReady){ |
| 43 | + phraseReady = false; |
| 44 | + return new Token(phrase1.termText()+"_"+phrase2.termText(),phrase1.startOffset(),phrase2.endOffset()); |
| 45 | + } |
| 46 | + Token t = input.next(); |
| 47 | + if(t == null) |
| 48 | + return null; // EOS |
| 49 | + if(!forPhrase(t)) |
| 50 | + return t; // stop word, return as word only |
| 51 | + |
| 52 | + if(phrase1 == null){ |
| 53 | + phrase1 = t; |
| 54 | + return t; |
| 55 | + } |
| 56 | + if(phrase2 == null){ |
| 57 | + phrase2 = t; |
| 58 | + phraseReady = true; |
| 59 | + return t; |
| 60 | + } |
| 61 | + |
| 62 | + phrase1 = phrase2; |
| 63 | + phrase2 = t; |
| 64 | + phraseReady = true; |
| 65 | + |
| 66 | + return t; // prepared phrase, return word, phrase in next call |
| 67 | + } |
| 68 | + |
| 69 | + public Set<String> getStopWords() { |
| 70 | + return stopWords; |
| 71 | + } |
| 72 | + |
| 73 | + public void setStopWords(Set<String> stopWords) { |
| 74 | + this.stopWords = stopWords; |
| 75 | + } |
| 76 | + |
| 77 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java |
— | — | @@ -61,8 +61,12 @@ |
62 | 62 | if(filters.hasCustomFilter()) |
63 | 63 | tokens = applyCustomFilter(tokens); |
64 | 64 | |
65 | | - return new AliasFilter(filters, |
66 | | - new ArrayTokens(tokens), new ArrayTokens(tokens)); |
| 65 | + TokenStream out = new AliasFilter(filters, |
| 66 | + new ArrayTokens(tokens), new ArrayTokens(tokens)); |
| 67 | + if(filters.hasAdditionalFilters()) |
| 68 | + return filters.makeAdditionalFilterChain(out); |
| 69 | + else |
| 70 | + return out; |
67 | 71 | } |
68 | 72 | |
69 | 73 | /** Filter the tokens via the custom filter. For instance, to delete |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java |
— | — | @@ -42,23 +42,28 @@ |
43 | 43 | /** default is ignore case (upper/lower), use exact_case for wiktionaries, etc */ |
44 | 44 | public static enum Case { IGNORE_CASE, EXACT_CASE }; |
45 | 45 | /** use stemmer if available, or force no stemming */ |
46 | | - public static enum Stemmer { USE_STEMMER, NO_STEMMER }; |
| 46 | + public static enum Stemmer { USE_STEMMER, NO_STEMMER }; |
| 47 | + /** additional options */ |
| 48 | + public static enum Options { NONE, SPELL_CHECK }; |
47 | 49 | |
48 | 50 | /** Construct case-insensitive field builder with stemming */ |
49 | 51 | public FieldBuilder(String lang){ |
50 | | - this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER); |
| 52 | + this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE); |
51 | 53 | } |
52 | 54 | |
53 | 55 | public FieldBuilder(String lang, Case useCase){ |
54 | | - this(lang,useCase,Stemmer.USE_STEMMER); |
| 56 | + this(lang,useCase,Stemmer.USE_STEMMER,Options.NONE); |
55 | 57 | } |
56 | 58 | |
57 | | - public FieldBuilder(String lang, Case useCase, Stemmer useStemmer){ |
| 59 | + public FieldBuilder(String lang, Case useCase, Stemmer useStemmer, Options options){ |
| 60 | + FilterFactory.Type type = FilterFactory.Type.FULL; |
| 61 | + if(options == Options.SPELL_CHECK) |
| 62 | + type = FilterFactory.Type.SPELL_CHECK; |
58 | 63 | // additional exact case factory |
59 | 64 | if(useCase == Case.EXACT_CASE){ |
60 | | - builders = new BuilderSet[2]; |
| 65 | + builders = new BuilderSet[2]; |
61 | 66 | builders[1] = new BuilderSet( |
62 | | - new FilterFactory(lang).getNoStemmerFilterFactory(), |
| 67 | + new FilterFactory(lang,type).getNoStemmerFilterFactory(), |
63 | 68 | new FieldNameFactory(FieldNameFactory.EXACT_CASE)); |
64 | 69 | } else |
65 | 70 | builders = new BuilderSet[1]; |
— | — | @@ -66,11 +71,11 @@ |
67 | 72 | // default factory, lowercase all data |
68 | 73 | if(useStemmer == Stemmer.USE_STEMMER){ |
69 | 74 | builders[0] = new BuilderSet( |
70 | | - new FilterFactory(lang), |
| 75 | + new FilterFactory(lang,type), |
71 | 76 | new FieldNameFactory()); |
72 | 77 | } else{ |
73 | 78 | builders[0] = new BuilderSet( |
74 | | - new FilterFactory(lang).getNoStemmerFilterFactory(), |
| 79 | + new FilterFactory(lang,type).getNoStemmerFilterFactory(), |
75 | 80 | new FieldNameFactory()); |
76 | 81 | } |
77 | 82 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java |
— | — | @@ -1,5 +1,7 @@ |
2 | 2 | package org.wikimedia.lsearch.util; |
3 | 3 | |
| 4 | +import org.wikimedia.lsearch.test.MathFuncTest; |
| 5 | + |
4 | 6 | public class MathFunc { |
5 | 7 | |
6 | 8 | /** Calculate average value starting from start to end (end excluded) */ |
— | — | @@ -26,27 +28,34 @@ |
27 | 29 | // av[i] = avg(val,part[i],part[i+1]); |
28 | 30 | // error |
29 | 31 | double err = calcErr(part,val,num); |
| 32 | + double err2 = calcErr2(part,val,num); |
30 | 33 | // values at next iteration |
31 | 34 | int[] newpart = new int[num+1]; |
32 | 35 | //double[] newav = new double[num]; |
33 | | - double newerr = 0; |
| 36 | + double newerr = 0, newerr2 = 0; |
34 | 37 | |
35 | 38 | while(true){ |
36 | 39 | for(int i=0;i<num-1;i++){ |
37 | 40 | merge(i,part,newpart,val,num); |
38 | 41 | newerr = calcErr(newpart,val,num); |
39 | | - if(newerr < err){ |
| 42 | + newerr2 = calcErr2(newpart,val,num); |
| 43 | + if(newerr < err || (newerr == err && newerr2 < err2)){ |
40 | 44 | copy(newpart,part); |
41 | 45 | err = newerr; |
| 46 | + err2 = newerr2; |
| 47 | + //MathFuncTest.print(newpart,val); |
42 | 48 | continue; |
43 | 49 | } |
44 | 50 | } |
45 | 51 | // try extending last |
46 | 52 | extend(part,newpart,val,num); |
47 | 53 | newerr = calcErr(newpart,val,num); |
48 | | - if(newerr < err){ |
| 54 | + newerr2 = calcErr2(newpart,val,num); |
| 55 | + if(newerr < err || (newerr == err && newerr2 < err2)){ |
49 | 56 | copy(newpart,part); |
50 | 57 | err = newerr; |
| 58 | + err2 = newerr2; |
| 59 | + //MathFuncTest.print(newpart,val); |
51 | 60 | continue; |
52 | 61 | } |
53 | 62 | break; |
— | — | @@ -94,10 +103,24 @@ |
95 | 104 | double err = 0; |
96 | 105 | for(int i=0;i<num;i++){ |
97 | 106 | // max - min value |
98 | | - double e = val[part[i]]-val[part[i+1]-1]; |
| 107 | + double v2 = val[part[i]]; |
| 108 | + double v1 = val[part[i+1]-1]; |
| 109 | + double e = v2 - v1; |
99 | 110 | if( e > err ) |
100 | 111 | err = e; |
101 | 112 | } |
102 | 113 | return err; |
103 | 114 | } |
| 115 | + |
| 116 | + private static double calcErr2(int[] part, double[] val, int num) { |
| 117 | + double err = 0; |
| 118 | + for(int i=0;i<num;i++){ |
| 119 | + // max - min value |
| 120 | + double v2 = val[part[i]]; |
| 121 | + double v1 = val[part[i+1]-1]; |
| 122 | + double e = v2 - v1; |
| 123 | + err += e*(part[i+1]-1-part[i]); |
| 124 | + } |
| 125 | + return err; |
| 126 | + } |
104 | 127 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java |
— | — | @@ -140,6 +140,10 @@ |
141 | 141 | if(table[ich]==null) |
142 | 142 | continue; |
143 | 143 | Buffer buffer = new Buffer(buf,0); |
| 144 | + if(ich == 0xD4A){ |
| 145 | + int b = 0; |
| 146 | + b++; |
| 147 | + } |
144 | 148 | recursiveDecompose(buffer,table,letters,(char)ich); |
145 | 149 | if(buffer.len != 0){ |
146 | 150 | decomposition[ich]= new char[buffer.len]; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/HighFreqTerms.java |
— | — | @@ -0,0 +1,83 @@ |
| 2 | +package org.wikimedia.lsearch.util; |
| 3 | + |
| 4 | +/** |
| 5 | + * Copyright 2004 The Apache Software Foundation |
| 6 | + * |
| 7 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 8 | + * you may not use this file except in compliance with the License. |
| 9 | + * You may obtain a copy of the License at |
| 10 | + * |
| 11 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 12 | + * |
| 13 | + * Unless required by applicable law or agreed to in writing, software |
| 14 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 15 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 16 | + * See the License for the specific language governing permissions and |
| 17 | + * limitations under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +import java.io.IOException; |
| 21 | +import java.util.ArrayList; |
| 22 | +import java.util.Collection; |
| 23 | +import java.util.LinkedList; |
| 24 | + |
| 25 | +import org.apache.lucene.index.IndexReader; |
| 26 | +import org.apache.lucene.index.Term; |
| 27 | +import org.apache.lucene.index.TermEnum; |
| 28 | +import org.apache.lucene.util.PriorityQueue; |
| 29 | + |
| 30 | +/** |
| 31 | + * <code>HighFreqTerms</code> class extracts terms and their frequencies out |
| 32 | + * of an existing Lucene index. |
| 33 | + * |
| 34 | + * @version $Id: HighFreqTerms.java 376393 2006-02-09 19:17:14Z otis $ |
| 35 | + */ |
| 36 | +public class HighFreqTerms { |
| 37 | + |
| 38 | + public static Collection<String> getHighFreqTerms(IndexReader reader, String field, int numTerms) throws IOException { |
| 39 | + TermInfoQueue tiq = new TermInfoQueue(numTerms); |
| 40 | + TermEnum terms = reader.terms(); |
| 41 | + LinkedList<String> ret = new LinkedList<String>(); |
| 42 | + |
| 43 | + if (field != null) { |
| 44 | + // collect terms from field into priority queue |
| 45 | + while (terms.next()) { |
| 46 | + if (terms.term().field().equals(field)) { |
| 47 | + tiq.insert(new TermInfo(terms.term(), terms.docFreq())); |
| 48 | + } |
| 49 | + } |
| 50 | + } else { |
| 51 | + // collect all terms |
| 52 | + while (terms.next()) { |
| 53 | + tiq.insert(new TermInfo(terms.term(), terms.docFreq())); |
| 54 | + } |
| 55 | + } |
| 56 | + |
 | 57 | + // get highest ranked |
| 58 | + while (tiq.size() != 0) { |
| 59 | + ret.addFirst(((TermInfo) tiq.pop()).term.text()); |
| 60 | + } |
| 61 | + |
| 62 | + return ret; |
| 63 | + } |
| 64 | +} |
| 65 | + |
| 66 | +final class TermInfo { |
| 67 | + TermInfo(Term t, int df) { |
| 68 | + term = t; |
| 69 | + docFreq = df; |
| 70 | + } |
| 71 | + int docFreq; |
| 72 | + Term term; |
| 73 | +} |
| 74 | + |
| 75 | +final class TermInfoQueue extends PriorityQueue { |
| 76 | + TermInfoQueue(int size) { |
| 77 | + initialize(size); |
| 78 | + } |
| 79 | + protected final boolean lessThan(Object a, Object b) { |
| 80 | + TermInfo termInfoA = (TermInfo) a; |
| 81 | + TermInfo termInfoB = (TermInfo) b; |
| 82 | + return termInfoA.docFreq < termInfoB.docFreq; |
| 83 | + } |
| 84 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java |
— | — | @@ -39,7 +39,7 @@ |
40 | 40 | int bad=0; |
41 | 41 | long start = System.currentTimeMillis(); |
42 | 42 | for(String[] m : DATA){ |
43 | | - ArrayList<SuggestResult> res = sc.suggestWords(m[0],5); |
| 43 | + ArrayList<SuggestResult> res = sc.suggestWordsFromTitle(m[0],new NamespaceFilter(0),5); |
44 | 44 | if(res.size() > 0){ |
45 | 45 | SuggestResult r = res.get(0); |
46 | 46 | if(r.getWord().equals(m[1])) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -385,6 +385,15 @@ |
386 | 386 | q = parser.parseFourPass("\"うろパン\"",NamespacePolicy.IGNORE,false); |
387 | 387 | assertEquals("contents:\"うろ ろハ ハン\" title:\"うろ ろハ ハン\"^2.0 (alttitle1:\"うろ ろハ ハン\"^6.0 alttitle2:\"うろ ろハ ハン\"^6.0 alttitle3:\"うろ ろハ ハン\"^6.0)",q.toString()); |
388 | 388 | |
| 389 | + |
| 390 | + // Malayalam |
| 391 | + analyzer = Analyzers.getSearcherAnalyzer("ml"); |
| 392 | + bs = new FieldBuilder("ml").getBuilder(); |
| 393 | + parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE); |
| 394 | + q = parser.parseFourPass("കൊറിയ ",NamespacePolicy.IGNORE,false); |
| 395 | + assertEquals("contents:കറയ title:കറയ^2.0 (alttitle1:കറയ^6.0 alttitle2:കറയ^6.0 alttitle3:കറയ^6.0)",q.toString()); |
| 396 | + |
| 397 | + |
389 | 398 | // Test field extraction |
390 | 399 | HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja"); |
391 | 400 | assertEquals(3,fs.size()); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -5,13 +5,19 @@ |
6 | 6 | import java.util.Collections; |
7 | 7 | import java.util.Comparator; |
8 | 8 | import java.util.HashMap; |
| 9 | +import java.util.HashSet; |
9 | 10 | import java.util.LinkedList; |
| 11 | +import java.util.Set; |
| 12 | +import java.util.WeakHashMap; |
10 | 13 | import java.util.Map.Entry; |
11 | 14 | |
12 | 15 | import org.apache.log4j.Logger; |
| 16 | +import org.apache.lucene.analysis.Analyzer; |
13 | 17 | import org.apache.lucene.analysis.Token; |
| 18 | +import org.apache.lucene.analysis.TokenStream; |
14 | 19 | import org.apache.lucene.document.Document; |
15 | 20 | import org.apache.lucene.index.Term; |
| 21 | +import org.apache.lucene.index.TermDocs; |
16 | 22 | import org.apache.lucene.search.BooleanClause; |
17 | 23 | import org.apache.lucene.search.BooleanQuery; |
18 | 24 | import org.apache.lucene.search.Hits; |
— | — | @@ -20,7 +26,10 @@ |
21 | 27 | import org.apache.lucene.search.ScoreDoc; |
22 | 28 | import org.apache.lucene.search.TermQuery; |
23 | 29 | import org.apache.lucene.search.TopDocs; |
| 30 | +import org.wikimedia.lsearch.analyzers.Analyzers; |
24 | 31 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
| 32 | +import org.wikimedia.lsearch.beans.ResultSet; |
| 33 | +import org.wikimedia.lsearch.beans.SearchResults; |
25 | 34 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
26 | 35 | import org.wikimedia.lsearch.config.IndexId; |
27 | 36 | import org.wikimedia.lsearch.search.NamespaceFilter; |
— | — | @@ -38,6 +47,8 @@ |
39 | 48 | protected IndexSearcher titles; |
40 | 49 | protected int minHitsWords; |
41 | 50 | protected int minHitsTitles; |
| 51 | + protected static WeakHashMap<IndexSearcher,Set<String>> stopWordsIndexes = new WeakHashMap<IndexSearcher,Set<String>>(); |
| 52 | + protected Set<String> stopWords; |
42 | 53 | |
43 | 54 | /** Distance and metaphone metrics */ |
44 | 55 | static class Metric { |
— | — | @@ -70,7 +81,7 @@ |
71 | 82 | } |
72 | 83 | |
73 | 84 | /** Number of results to fetch */ |
74 | | - public static final int POOL = 300; |
| 85 | + public static final int POOL = 150; |
75 | 86 | |
76 | 87 | /** Lower limit to hit rate for joining */ |
77 | 88 | public static final int JOIN_FREQ = 1; |
— | — | @@ -83,6 +94,21 @@ |
84 | 95 | this.titles = cache.getLocalSearcher(iid.getSpellTitles()); |
85 | 96 | this.minHitsWords = global.getIntDBParam(iid.getDBname(),"spell_words","minHits",20); |
86 | 97 | this.minHitsTitles = global.getIntDBParam(iid.getDBname(),"spell_titles","minHits",20); |
| 98 | + |
| 99 | + synchronized(stopWordsIndexes){ |
| 100 | + if(!stopWordsIndexes.containsKey(titles)){ |
| 101 | + Set<String> s = Collections.synchronizedSet(new HashSet<String>()); |
| 102 | + stopWordsIndexes.put(titles,s); |
| 103 | + TermDocs d = titles.getIndexReader().termDocs(new Term("metadata_key","stopWords")); |
| 104 | + if(d.next()){ |
| 105 | + String val = titles.doc(d.doc()).get("metadata_value"); |
| 106 | + for(String sw : val.split(" ")){ |
| 107 | + s.add(sw); |
| 108 | + } |
| 109 | + } |
| 110 | + } |
| 111 | + this.stopWords = stopWordsIndexes.get(titles); |
| 112 | + } |
87 | 113 | } |
88 | 114 | |
89 | 115 | static class Change { |
— | — | @@ -111,15 +137,46 @@ |
112 | 138 | * |
113 | 139 | * @return suggested query, or null if no suggestions |
114 | 140 | */ |
115 | | - public String suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, int numHits){ |
| 141 | + @SuppressWarnings("unchecked") |
| 142 | + public SuggestQuery suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, SearchResults res){ |
116 | 143 | ArrayList<Token> tokens = parser.tokenizeBareText(searchterm); |
| 144 | + int numHits = res.getNumHits(); |
| 145 | + |
| 146 | + if(numHits >= minHitsTitles) |
| 147 | + return null; |
| 148 | + |
| 149 | + // collect words in titles, these shouldn't be spell-checked |
| 150 | + HashSet<String> correctWords = new HashSet<String>(); |
| 151 | + Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false); |
| 152 | + try { |
| 153 | + for(ResultSet r : res.getResults()){ |
| 154 | + Token t = null; |
| 155 | + TokenStream ts = analyzer.tokenStream("title",r.title); |
| 156 | + while( (t = ts.next()) != null ){ |
| 157 | + correctWords.add(t.termText()); |
| 158 | + } |
| 159 | + } |
| 160 | + } catch (IOException e) { |
| 161 | + log.error("I/O error trying to get list of correct words : "+e.getMessage()); |
| 162 | + e.printStackTrace(); |
| 163 | + } |
117 | 164 | |
118 | 165 | // always spell-check phrases |
119 | 166 | int minFreq = (numHits < minHitsTitles)? 0 : numHits; |
120 | 167 | ArrayList<Change> suggestions = new ArrayList<Change>(); |
121 | | - Token last = null; |
| 168 | + |
| 169 | + // add correct words |
122 | 170 | for(int i=0;i<tokens.size();i++){ |
123 | 171 | Token t = tokens.get(i); |
| 172 | + if(correctWords.contains(t.termText())){ |
| 173 | + Change c = new Change(0,1,Change.Type.TITLE_WORD); |
| 174 | + c.preserves.put(i,t.termText()); |
| 175 | + suggestions.add(c); |
| 176 | + } |
| 177 | + } |
| 178 | + |
| 179 | + for(int i=0;i<tokens.size();i++){ |
| 180 | + Token t = tokens.get(i); |
124 | 181 | String w = t.termText(); |
125 | 182 | if(!"word".equals(t.type()) && !"phrase".equals(t.type())) |
126 | 183 | continue; // ignore aliases and such |
— | — | @@ -143,64 +200,46 @@ |
144 | 201 | sc.substitutes.put(i,split.word.replace("_"," ")); |
145 | 202 | suggestions.add(sc); |
146 | 203 | } |
147 | | - |
148 | 204 | // get suggestions for pairs of words |
149 | | - if(last != null && t.type().equals(last.type())){ |
150 | | - String word1 = last.termText(); |
151 | | - String word2 = t.termText(); |
152 | | - // phrase |
153 | | - ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq); |
154 | | - if(r.size() > 0){ |
155 | | - SuggestResult res = r.get(0); |
156 | | - String[] ph = res.word.split("_"); |
157 | | - if(ph.length == 2){ |
158 | | - // figure out which words need to be changed |
159 | | - Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE); |
160 | | - if(!ph[0].equals(word1)) |
161 | | - sc.substitutes.put(i-1,ph[0]); |
162 | | - else |
163 | | - sc.preserves.put(i-1,ph[0]); |
164 | | - if(!ph[1].equals(word2)) |
165 | | - sc.substitutes.put(i,ph[1]); |
166 | | - else |
167 | | - sc.preserves.put(i,ph[1]); |
168 | | - suggestions.add(sc); |
169 | | - } else |
170 | | - log.error("Unexpected phrase in suggest result "+res); |
| 205 | + for(int j=i+1;j<tokens.size();j++){ |
| 206 | + if(!correctWords.contains(tokens.get(i)) && !correctWords.contains(tokens.get(j))){ |
| 207 | + boolean succ = addPhraseSuggestion(tokens,i,j,suggestions,nsf,minFreq); |
| 208 | + if(succ) |
| 209 | + break; |
171 | 210 | } |
172 | | - // join |
173 | | - SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq); |
174 | | - if(join != null){ |
175 | | - Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN); |
176 | | - sc.substitutes.put(i-1,""); |
177 | | - sc.substitutes.put(i,join.word); |
178 | | - suggestions.add(sc); |
179 | | - } |
180 | 211 | } |
181 | | - last = t; |
| 212 | + |
182 | 213 | } |
| 214 | + // indexes of tokens to be preserved in individual word check |
| 215 | + HashSet<Integer> preserveTokens = new HashSet<Integer>(); |
183 | 216 | if(suggestions.size() > 0){ |
184 | 217 | // found some suggestions |
185 | | - ArrayList<Entry<Integer,String>> proposedChanges = calculateChanges(suggestions,searchterm.length()/2); |
| 218 | + Object[] ret = calculateChanges(suggestions,searchterm.length()/2); |
| 219 | + ArrayList<Entry<Integer,String>> proposedChanges = (ArrayList<Entry<Integer, String>>) ret[0]; |
| 220 | + ArrayList<Entry<Integer,String>> preservedWords = (ArrayList<Entry<Integer, String>>) ret[1]; |
| 221 | + for(Entry<Integer,String> e : preservedWords) |
| 222 | + preserveTokens.add(e.getKey()); |
186 | 223 | // substitute |
187 | 224 | if(proposedChanges.size() > 0){ |
188 | 225 | for(Entry<Integer,String> e : proposedChanges){ |
189 | 226 | Token t = tokens.get(e.getKey()); |
190 | 227 | searchterm = markSuggestion(searchterm,t,e.getValue()); |
191 | 228 | } |
192 | | - return tidy(searchterm); |
| 229 | + return new SuggestQuery(tidy(searchterm)); |
193 | 230 | } |
194 | 231 | } |
195 | 232 | |
196 | 233 | // spell-check individual words |
197 | | - if(numHits < minHitsWords){ |
| 234 | + if(numHits < minHitsWords && tokens.size() != 1){ |
198 | 235 | LinkedList<Change> changes = new LinkedList<Change>(); |
199 | 236 | for(int i=0;i<tokens.size();i++){ |
200 | 237 | Token t = tokens.get(i); |
201 | 238 | String w = t.termText(); |
202 | 239 | if(w.length() < 2) |
203 | 240 | continue; |
204 | | - ArrayList<SuggestResult> sug = suggestWords(w,1); |
| 241 | + if(correctWords.contains(w) || preserveTokens.contains(i)) |
| 242 | + continue; |
| 243 | + ArrayList<SuggestResult> sug = suggestWordsFromTitle(w,nsf,1); |
205 | 244 | if(sug.size() > 0){ |
206 | 245 | SuggestResult r = sug.get(0); |
207 | 246 | if(r.word.equals(w)) |
— | — | @@ -218,13 +257,55 @@ |
219 | 258 | searchterm = markSuggestion(searchterm,t,e.getValue()); |
220 | 259 | } |
221 | 260 | } |
222 | | - return searchterm; |
| 261 | + return new SuggestQuery(tidy(searchterm),true); |
223 | 262 | } |
224 | 263 | } |
225 | 264 | |
226 | 265 | return null; |
227 | 266 | } |
228 | 267 | |
| 268 | + protected boolean addPhraseSuggestion(ArrayList<Token> tokens, int i1, int i2, ArrayList<Change> suggestions, NamespaceFilter nsf, int minFreq) { |
| 269 | + Token t1 = tokens.get(i1); |
| 270 | + Token t2 = tokens.get(i2); |
| 271 | + if(t2.type().equals(t1.type())){ |
| 272 | + String word1 = t1.termText(); |
| 273 | + String word2 = t2.termText(); |
| 274 | + if(stopWords.contains(word1) || stopWords.contains(word2)) |
| 275 | + return false; |
| 276 | + log.info("spell-check phrase \""+word1+" "+word2+"\""); |
| 277 | + // phrase |
| 278 | + ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq); |
| 279 | + if(r.size() > 0){ |
| 280 | + SuggestResult res = r.get(0); |
| 281 | + String[] ph = res.word.split("_"); |
| 282 | + if(ph.length == 2){ |
| 283 | + // figure out which words need to be changed |
| 284 | + Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE); |
| 285 | + if(!ph[0].equals(word1)) |
| 286 | + sc.substitutes.put(i1,ph[0]); |
| 287 | + else |
| 288 | + sc.preserves.put(i1,ph[0]); |
| 289 | + if(!ph[1].equals(word2)) |
| 290 | + sc.substitutes.put(i2,ph[1]); |
| 291 | + else |
| 292 | + sc.preserves.put(i2,ph[1]); |
| 293 | + suggestions.add(sc); |
| 294 | + } else |
| 295 | + log.error("Unexpected phrase in suggest result "+res); |
| 296 | + } |
| 297 | + // join |
| 298 | + SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq); |
| 299 | + if(join != null){ |
| 300 | + Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN); |
| 301 | + sc.substitutes.put(i1,""); |
| 302 | + sc.substitutes.put(i2,join.word); |
| 303 | + suggestions.add(sc); |
| 304 | + } |
| 305 | + return true; |
| 306 | + } |
| 307 | + return false; |
| 308 | + } |
| 309 | + |
229 | 310 | protected String markSuggestion(String searchterm, Token t, String newWord){ |
230 | 311 | return searchterm.substring(0,t.startOffset()) |
231 | 312 | + "<i>" + newWord + "</i>" |
— | — | @@ -233,7 +314,7 @@ |
234 | 315 | |
235 | 316 | /** tidy the query, convert double spaces into single spaces, and such... */ |
236 | 317 | protected String tidy(String searchterm){ |
237 | | - return searchterm.replaceAll(" +"," "); |
| 318 | + return searchterm.replaceAll("<i></i>","").replaceAll(" +"," ").replaceAll(";",""); |
238 | 319 | } |
239 | 320 | |
240 | 321 | /** |
— | — | @@ -242,7 +323,7 @@ |
243 | 324 | * |
244 | 325 | * @return set of token_number -> new string. |
245 | 326 | */ |
246 | | - protected ArrayList<Entry<Integer,String>> calculateChanges(ArrayList<Change> changes, int maxDist){ |
| 327 | + protected Object[] calculateChanges(ArrayList<Change> changes, int maxDist){ |
247 | 328 | // sort suggested changes by relevance |
248 | 329 | Collections.sort(changes,new Comparator<Change>() { |
249 | 330 | public int compare(Change o1, Change o2){ |
— | — | @@ -288,7 +369,9 @@ |
289 | 370 | return o2.getKey() - o1.getKey(); |
290 | 371 | } |
291 | 372 | }); |
292 | | - return proposedChanges; |
| 373 | + ArrayList<Entry<Integer,String>> preservedWords = new ArrayList<Entry<Integer,String>>(); |
| 374 | + preservedWords.addAll(preserve.entrySet()); |
| 375 | + return new Object[] {proposedChanges, preservedWords}; |
293 | 376 | } |
294 | 377 | |
295 | 378 | /** Suggest some words from the words index */ |
— | — | @@ -387,7 +470,7 @@ |
388 | 471 | protected boolean acceptWord(SuggestResult r, Metric m){ |
389 | 472 | // check metaphones: don't add if the pronunciation is something completely unrelated |
390 | 473 | if((r.distMetaphone < m.meta1.length() || r.distMetaphone2 < m.meta2.length()) && (r.distMetaphone<=3 || r.distMetaphone2<=3) |
391 | | - && (r.dist <= m.word.length()/2 || r.dist <= r.word.length()/2)) |
| 474 | + && (r.dist <= m.word.length()/2 || r.dist <= r.word.length()/2) && Math.abs(m.word.length()-r.word.length()) <= 3) |
392 | 475 | return true; |
393 | 476 | else |
394 | 477 | return false; |
— | — | @@ -435,7 +518,7 @@ |
436 | 519 | if(hits.length() > 0){ |
437 | 520 | int pfreq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf); |
438 | 521 | if(pfreq >= freq && pfreq > minFreq) |
439 | | - res.add(new SuggestResult(phrase,pfreq,1)); |
| 522 | + res.add(new SuggestResult(phrase,pfreq,2)); |
440 | 523 | } |
441 | 524 | } |
442 | 525 | if(res.size() > 0){ |
— | — | @@ -473,7 +556,7 @@ |
474 | 557 | Metric m2 = new Metric(word2); |
475 | 558 | Metric metric = new Metric(phrase); |
476 | 559 | try { |
477 | | - TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),200); |
| 560 | + TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),POOL/2); |
478 | 561 | ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
479 | 562 | int minfreq = (minFreq == 0)? -1 : minFreq; |
480 | 563 | // fetch results |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | import org.wikimedia.lsearch.analyzers.Analyzers; |
14 | 14 | import org.wikimedia.lsearch.analyzers.FieldBuilder; |
15 | 15 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
| 16 | +import org.wikimedia.lsearch.beans.SearchResults; |
16 | 17 | import org.wikimedia.lsearch.config.Configuration; |
17 | 18 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
18 | 19 | import org.wikimedia.lsearch.config.IndexId; |
— | — | @@ -25,7 +26,7 @@ |
26 | 27 | Configuration.open(); |
27 | 28 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
28 | 29 | boolean suggestOnly = false; |
29 | | - String dbname = "wikilucene"; |
| 30 | + String dbname = "enwiki"; |
30 | 31 | for(int i=0;i<args.length;i++){ |
31 | 32 | if(args[i].equals("-s")) |
32 | 33 | suggestOnly = true; |
— | — | @@ -78,7 +79,7 @@ |
79 | 80 | last = text; |
80 | 81 | } |
81 | 82 | } |
82 | | - System.out.println("#suggest: "+sc.suggest(inputtext,parser,new NamespaceFilter(ns),0)); |
| 83 | + System.out.println("#suggest: "+sc.suggest(inputtext,parser,new NamespaceFilter(ns),new SearchResults())); |
83 | 84 | System.out.println("(finished in "+(System.currentTimeMillis()-start)+" ms)"); |
84 | 85 | } |
85 | 86 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | |
37 | 37 | public CleanIndexWriter(IndexId iid) throws IOException{ |
38 | 38 | this.iid = iid; |
39 | | - this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER); |
| 39 | + this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK); |
40 | 40 | this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
41 | 41 | String pathMain = iid.getSpellWords().getTempPath(); |
42 | 42 | //String pathAll = iid.getSpellTitles().getTempPath(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestQuery.java |
— | — | @@ -0,0 +1,36 @@ |
| 2 | +package org.wikimedia.lsearch.spell; |
| 3 | + |
| 4 | +/** Result of suggestion for a query */ |
| 5 | +public class SuggestQuery { |
| 6 | + protected String searchterm; |
| 7 | + protected boolean needsCheck; |
| 8 | + public SuggestQuery(String searchterm) { |
| 9 | + this(searchterm,false); |
| 10 | + } |
| 11 | + public SuggestQuery(String searchterm, boolean needsCheck) { |
| 12 | + this.searchterm = searchterm; |
| 13 | + this.needsCheck = needsCheck; |
| 14 | + } |
 | 15 | + /** Whether the suggestion needs further checking (in case of individual word spell-check) */ |
| 16 | + public boolean needsCheck() { |
| 17 | + return needsCheck; |
| 18 | + } |
| 19 | + public void setNeedsCheck(boolean needsCheck) { |
| 20 | + this.needsCheck = needsCheck; |
| 21 | + } |
| 22 | + /** the suggested search term */ |
| 23 | + public String getSearchterm() { |
| 24 | + return searchterm; |
| 25 | + } |
| 26 | + public void setSearchterm(String searchterm) { |
| 27 | + this.searchterm = searchterm; |
| 28 | + } |
| 29 | + @Override |
| 30 | + public String toString() { |
| 31 | + return needsCheck? searchterm+" [needs check]" : searchterm; |
| 32 | + } |
| 33 | + |
| 34 | + |
| 35 | + |
| 36 | + |
| 37 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java |
— | — | @@ -107,7 +107,7 @@ |
108 | 108 | } |
109 | 109 | } |
110 | 110 | // make words index |
111 | | - log.info("Making words index"); |
| 111 | + /*log.info("Making words index"); |
112 | 112 | try { |
113 | 113 | LuceneDictionary dict = new LuceneDictionary(IndexReader.open(words.getTempPath()),"contents"); |
114 | 114 | WordsIndexer writer = new WordsIndexer(words.getImportPath(),(dbname.equals("wikilucene")? 3 : 50)); |
— | — | @@ -121,18 +121,18 @@ |
122 | 122 | log.fatal("Cannot open clean dictionary for "+words+" : "+e.getMessage()); |
123 | 123 | e.printStackTrace(); |
124 | 124 | return; |
125 | | - } |
| 125 | + }*/ |
126 | 126 | |
127 | 127 | log.info("Making suggest title index"); |
128 | 128 | // make phrase index |
129 | 129 | |
130 | 130 | TitleIndexer tInx = new TitleIndexer(titles); |
131 | | - tInx.createFromSnapshot(); |
| 131 | + tInx.createFromTempIndex(); |
132 | 132 | |
133 | 133 | long end = System.currentTimeMillis(); |
134 | 134 | |
135 | 135 | // make snapshots |
136 | | - IndexThread.makeIndexSnapshot(words,words.getImportPath()); |
| 136 | + //IndexThread.makeIndexSnapshot(words,words.getImportPath()); |
137 | 137 | IndexThread.makeIndexSnapshot(titles,titles.getImportPath()); |
138 | 138 | |
139 | 139 | System.out.println("Finished making suggest index in "+formatTime(end-start)); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NgramIndexer.java |
— | — | @@ -1,6 +1,7 @@ |
2 | 2 | package org.wikimedia.lsearch.spell.api; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.util.Collection; |
5 | 6 | |
6 | 7 | import org.apache.log4j.Logger; |
7 | 8 | import org.apache.lucene.analysis.Analyzer; |
— | — | @@ -89,7 +90,7 @@ |
90 | 91 | } |
91 | 92 | |
92 | 93 | /** Return ngrams of specific size for text */ |
93 | | - public static String[] nGrams(String text, int size) { |
| 94 | + public static String[] nGramsRegular(String text, int size) { |
94 | 95 | int len = text.length(); |
95 | 96 | String[] res = new String[len - size + 1]; |
96 | 97 | for (int i = 0; i < len - size + 1; i++) { |
— | — | @@ -98,11 +99,40 @@ |
99 | 100 | return res; |
100 | 101 | } |
101 | 102 | |
| 103 | + /** Return source with its chars in reverse order (reversal is per char, so surrogate pairs are not kept together) */ |
| 104 | + protected static String reverse(String source){ |
| 105 | + int len = source.length(); |
| 106 | + StringBuilder dest = new StringBuilder(len); /* pre-sized: the result has exactly len chars */ |
| 107 | + |
| 108 | + for (int i = (len - 1); i >= 0; i--) /* walk from the last char back to the first */ |
| 109 | + dest.append(source.charAt(i)); |
| 110 | + return dest.toString(); |
| 111 | + } |
| 112 | + |
| 113 | + /** Return ngrams of given size for text, treating text as circular: one ngram per start position, wrapping past the end back to the start. For texts of at most 6 chars and size 2, the ngrams nGramsRegular returns for the reversed text are appended after them. */ |
| 114 | + public static String[] nGrams(String text, int size) { |
| 115 | + int len = text.length(); |
| 116 | + String[] res = null; |
| 117 | + if(len <= 6 && size == 2){ // produce reversed 2-grams; NOTE(review): for empty text nGramsRegular would allocate len-size+1 = -1 elements - confirm callers never pass an empty string |
| 118 | + String[] rev = nGramsRegular(reverse(text),size); |
| 119 | + res = new String[len + rev.length]; /* first len slots are filled by the loop below, the rest hold rev */ |
| 120 | + System.arraycopy(rev,0,res,len,rev.length); |
| 121 | + } else |
| 122 | + res = new String[len]; /* circular ngrams only: one per start position */ |
| 123 | + for (int i = 0; i < len; i++) { |
| 124 | + if(i + size <= len) /* ngram fits without wrapping */ |
| 125 | + res[i] = text.substring(i, i + size); |
| 126 | + else // string is assumed to be circular: take the tail from i, then the first (i+size)%len chars; NOTE(review): when i+size >= 2*len the result is shorter than size - confirm size never exceeds len by that much |
| 127 | + res[i] = text.substring(i)+text.substring(0,(i+size)%len); |
| 128 | + } |
| 129 | + return res; |
| 130 | + } |
| 131 | + |
102 | 132 | /** Get minimal ngram size for word. the minimal size should be at least 1/2 of word length */ |
103 | 133 | public static int getMinNgram(String word){ |
104 | | - if(word.length() <= 7) |
| 134 | + if(word.length() <= 5) |
105 | 135 | return 1; |
106 | | - else if(word.length() <= 14) |
| 136 | + else if(word.length() <= 7) |
107 | 137 | return 2; |
108 | 138 | else |
109 | 139 | return 3; |
— | — | @@ -110,10 +140,12 @@ |
111 | 141 | |
112 | 142 | /** Maximal size of ngram block: 2 for words of length 4, 3 for any other word of at most 6 chars, 4 otherwise. NOTE(review): words shorter than 3 chars also get 3, which exceeds their length - confirm this is intended. */ |
113 | 143 | public static int getMaxNgram(String word){ |
114 | | - if(word.length() <= 10) |
| 144 | + if(word.length() == 4) |
115 | 145 | return 2; |
116 | | - else |
| 146 | + else if(word.length() <= 6) |
117 | 147 | return 3; |
| 148 | + else |
| 149 | + return 4; |
118 | 150 | } |
119 | 151 | |
120 | 152 | /** Get ngram field name with no prefix */ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java |
— | — | @@ -30,6 +30,7 @@ |
31 | 31 | import org.wikimedia.lsearch.search.IndexSearcherMul; |
32 | 32 | import org.wikimedia.lsearch.search.WikiSearcher; |
33 | 33 | import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
| 34 | +import org.wikimedia.lsearch.util.HighFreqTerms; |
34 | 35 | |
35 | 36 | /** |
36 | 37 | * Index words and phrases from article titles. |
— | — | @@ -284,6 +285,65 @@ |
285 | 286 | } |
286 | 287 | } |
287 | 288 | |
| 289 | + public void createFromTempIndex(){ /* build the titles suggest index at titles.getImportPath() from the "contents" terms of the temporary spell-words index; an IOException is logged and the method returns */ |
| 290 | + String path = titles.getImportPath(); // dest where to put index |
| 291 | + try { |
| 292 | + ngramWriter.createIndex(path,new SimpleAnalyzer()); |
| 293 | + IndexReader ir = IndexReader.open(iid.getSpellWords().getTempPath()); /* source: temp index of spell words */ |
| 294 | + Collection<String> mostfreq = HighFreqTerms.getHighFreqTerms(ir,"contents",50); |
| 295 | + // get at most 25 stopwords from the terms returned above, skipping terms that contain "_" (those are phrases) |
| 296 | + HashSet<String> stopWords = new HashSet<String>(); |
| 297 | + for(String w : mostfreq){ |
| 298 | + if(!w.contains("_")) |
| 299 | + stopWords.add(w); |
| 300 | + if(stopWords.size() >= 25) |
| 301 | + break; |
| 302 | + } |
| 303 | + addMetadata("stopWords",stopWords); /* stop words are stored in the index as a metadata document */ |
| 304 | + |
| 305 | + LuceneDictionary dict = new LuceneDictionary(ir,"contents"); |
| 306 | + Word word; |
| 307 | + while((word = dict.next()) != null){ /* iterate over every dictionary entry until next() returns null */ |
| 308 | + String w = word.getWord(); |
| 309 | + int freq = word.getFrequency(); |
| 310 | + if(w.contains("_")){ // phrase: an entry containing "_" is treated as a phrase whose parts are separated by "_" |
| 311 | + String[] words = w.split("_"); |
| 312 | + boolean allowed = true; |
| 313 | + for(String ww : words){ |
| 314 | + // allow only those phrases consisting of title words: reject if any part is a stop word or has no "title" term in the temp index |
| 315 | + if(stopWords.contains(ww) || ir.docFreq(new Term("title",ww)) == 0){ |
| 316 | + allowed = false; |
| 317 | + break; |
| 318 | + } |
| 319 | + } |
| 320 | + if(allowed && freq > minPhraseFreq){ /* frequency must be strictly above the threshold */ |
| 321 | + NamespaceFreq nsf = new NamespaceFreq(); |
| 322 | + nsf.setFrequency(0,freq); /* presumably records the whole frequency under namespace 0 - verify against NamespaceFreq */ |
| 323 | + ArrayList<Integer> nss = new ArrayList<Integer>(); |
| 324 | + nss.add(0); |
| 325 | + addPhrase(w,nsf,nss); |
| 326 | + } |
| 327 | + } else{ |
| 328 | + if(freq > minWordFreq){ /* single word: kept only when strictly above the word threshold */ |
| 329 | + NamespaceFreq nsf = new NamespaceFreq(); |
| 330 | + nsf.setFrequency(0,freq); |
| 331 | + ArrayList<Integer> nss = new ArrayList<Integer>(); |
| 332 | + nss.add(0); |
| 333 | + addWord(w,nsf,nss); |
| 334 | + } |
| 335 | + } |
| 336 | + } |
| 337 | + ngramWriter.closeAndOptimize(); |
| 338 | + ir.close(); /* NOTE(review): this and closeAndOptimize() are skipped when an IOException is thrown above, leaving reader and writer open */ |
| 339 | + |
| 340 | + } catch (IOException e) { |
| 341 | + log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage()); |
| 342 | + e.printStackTrace(); |
| 343 | + return; |
| 344 | + } |
| 345 | + |
| 346 | + } |
| 347 | + |
288 | 348 | /** |
289 | 349 | * Add phrase to index |
290 | 350 | * |
— | — | @@ -310,7 +370,26 @@ |
311 | 371 | ngramWriter.addDocument(doc); |
312 | 372 | } |
313 | 373 | |
314 | | - /** Add ordinary word to the index, convenient for suggesting joins |
| 374 | + /** |
| 375 | + * Add a metadata document: key goes into metadata_key, values joined by single spaces go into metadata_value. |
| 376 | + * Collection is assumed to contain words (without spaces), since a space is the separator in the joined value |
| 377 | + */ |
| 378 | + public void addMetadata(String key, Collection<String> values){ |
| 379 | + StringBuilder sb = new StringBuilder(); |
| 380 | + // serialize by joining with spaces |
| 381 | + for(String val : values){ |
| 382 | + if(sb.length() != 0) /* separator before every value once the buffer is non-empty */ |
| 383 | + sb.append(" "); |
| 384 | + sb.append(val); |
| 385 | + } |
| 386 | + Document doc = new Document(); |
| 387 | + doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED)); /* key is stored and indexed as a single untokenized term */ |
| 388 | + doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO)); /* value is stored only, not indexed */ |
| 389 | + |
| 390 | + ngramWriter.addDocument(doc); |
| 391 | + } |
| 392 | + |
| 393 | + /** Add ordinary word to the index |
315 | 394 | * |
316 | 395 | * @param word - word to add |
317 | 396 | * @param nf - frequencies in namespaces |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java |
— | — | @@ -62,7 +62,7 @@ |
63 | 63 | // article.setContents(""); |
64 | 64 | |
65 | 65 | writer.addMainArticle(article); |
66 | | - writer.addAllArticle(article); |
| 66 | + //writer.addAllArticle(article); |
67 | 67 | // generate phrases |
68 | 68 | /* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false); |
69 | 69 | ArrayList<Token> tokens = parser.parse(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java |
— | — | @@ -8,8 +8,12 @@ |
9 | 9 | int distMetaphone2=0; |
10 | 10 | |
11 | 11 | static class Comparator implements java.util.Comparator<SuggestResult> { |
12 | | - public int compare(SuggestResult o1, SuggestResult o2){ |
13 | | - if(o1.dist == o2.dist) |
| 12 | + public int compare(SuggestResult o1, SuggestResult o2){ |
| 13 | + if(o1.dist - o2.dist == -1 && o1.frequency * 100 < o2.frequency) |
| 14 | + return 1; |
| 15 | + else if(o1.dist - o2.dist == 1 && o2.frequency * 100 < o1.frequency) |
| 16 | + return -1; |
| 17 | + else if(o1.dist == o2.dist) |
14 | 18 | return o2.getFrequency() - o1.getFrequency(); |
15 | 19 | else |
16 | 20 | return o1.dist - o2.dist; |