r25117 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r25116‎ | r25117 | r25118 >
Date:13:20, 24 August 2007
Author:rainman
Status:old
Tags:
Comment:
Yet another "did you mean" implementation: more accurate, but way too slow
(150ms per query). Phrase lookup needs to be reorganized.
Added:
* PhraseFilter - outputs two-word phrases and ignores stop words
* SuggestQuery - bean to return suggest results
Modified:
* Index all phrases (w/o stopwords)
* Metric tweaks, length diff, ...
* Circular ngrams
Modified paths:
  • /branches/lucene-search-2.1/build.xml (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestQuery.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NgramIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/HighFreqTerms.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java (modified) (history)
  • /branches/lucene-search-2.1/test-data/mathfunc.test (added) (history)

Diff [purge]

Index: branches/lucene-search-2.1/build.xml
@@ -10,7 +10,7 @@
1111 <property name="binary.name" value="ls2-bin"/>
1212 <property name="jar.name" value="LuceneSearch.jar"/>
1313 <property name="include" value="src/** lib/** sql/** test-data/** webinterface/** *-example *.txt lsearch* build.xml scripts/*"/>
14 - <property name="include.src" value="src/** sql/** build.xml scripts/*"/>
 14+ <property name="include.src" value="src/** sql/** build.xml scripts/* webinterface/*"/>
1515
1616 <property file="${basedir}/hostname"/>
1717
Index: branches/lucene-search-2.1/test-data/mathfunc.test
@@ -0,0 +1,541 @@
 2+39.20291483424475
 3+13.450437958707814
 4+12.678569683286979
 5+10.25526171963254
 6+8.849258859845378
 7+8.666273372729856
 8+8.31447886618143
 9+8.197323614179476
 10+8.065610978164585
 11+7.997460285543743
 12+6.390661056615026
 13+6.195251633448001
 14+6.003035468744286
 15+5.922401025326313
 16+5.858780569202485
 17+5.7545766633852065
 18+5.663348431691695
 19+5.513502014868557
 20+5.397556486517471
 21+5.320454269824394
 22+5.118790939776451
 23+4.957945854762612
 24+4.677393185927231
 25+4.417044635347138
 26+4.286872568087402
 27+4.111160627955847
 28+4.018425808374067
 29+3.875594177139727
 30+3.855756478461852
 31+3.583812703943094
 32+3.5817677966683417
 33+3.393666733951346
 34+3.3825997856599685
 35+3.311922305543995
 36+3.284748780252251
 37+3.2803658354545093
 38+3.208116699014506
 39+3.197975985106876
 40+3.1501966327182416
 41+3.126299857812352
 42+3.063553780327862
 43+3.0467910121568567
 44+2.947827241203268
 45+2.8966477779781448
 46+2.8116941812748055
 47+2.730939712302121
 48+2.7194772730786663
 49+2.6503967112741775
 50+2.499346732261985
 51+2.4531229860503347
 52+2.4245965196546595
 53+2.4048828498563353
 54+2.398834838013775
 55+2.3983530801899637
 56+2.301883995710939
 57+2.2515294877640977
 58+2.2418150405156405
 59+2.186851347893209
 60+2.1421959631446454
 61+2.112862553279673
 62+2.111304636049398
 63+2.0884066313608103
 64+2.080225946703776
 65+2.0652098242981136
 66+2.045337183390787
 67+2.002148363977345
 68+1.9262536235110566
 69+1.9254642966646955
 70+1.8815690362186883
 71+1.8796391915958572
 72+1.8583157394056122
 73+1.842890610174396
 74+1.832874813193032
 75+1.7857804147392449
 76+1.778042188142664
 77+1.7751602358737153
 78+1.770588743574571
 79+1.7658860788389297
 80+1.7330514584559034
 81+1.676248860250089
 82+1.6632396794496727
 83+1.6529964196348452
 84+1.587700607893723
 85+1.554507271828346
 86+1.5462922600019373
 87+1.536011998632598
 88+1.499536875414111
 89+1.4827109040124988
 90+1.4618995756130717
 91+1.4561634933713434
 92+1.4442030498548373
 93+1.4232400481329681
 94+1.4148239967756024
 95+1.4125065245521127
 96+1.390309778182693
 97+1.374617285871078
 98+1.3591113373855113
 99+1.2959623367365105
 100+1.2916936614995556
 101+1.2788722876401044
 102+1.2710067417073394
 103+1.2579879901973903
 104+1.234760921054205
 105+1.232572710846885
 106+1.2239982328262375
 107+1.1960562381380218
 108+1.1909451721904833
 109+1.1883725046626885
 110+1.1870577049511857
 111+1.179821425669525
 112+1.170960754894606
 113+1.1627022555545126
 114+1.155955236116103
 115+1.151314330373941
 116+1.138548931392361
 117+1.136753034029899
 118+1.1252591078172476
 119+1.1153859351428042
 120+1.0995712079396343
 121+1.093167701863354
 122+1.0926266284365183
 123+1.0730378946931582
 124+1.0577347862606676
 125+1.0517570343151739
 126+1.043571403629543
 127+1.0363716250813024
 128+1.0339183869730422
 129+1.0273092324678479
 130+1.0272902345083126
 131+1.0255021017312675
 132+1.0095597490734871
 133+1.0045834871416266
 134+1.0032766725818
 135+1.0008620689655172
 136+0.9986067546725256
 137+0.9654483583749244
 138+0.9369909811501816
 139+0.935063552081607
 140+0.9083748494156194
 141+0.9040905868635284
 142+0.8957921591066108
 143+0.8954248366013071
 144+0.884183371098627
 145+0.884105579905624
 146+0.8835226747659162
 147+0.8119355402722593
 148+0.789096316833624
 149+0.7691353626783575
 150+0.7547464640913147
 151+0.7517606818891444
 152+0.7304977445593964
 153+0.7281849565279721
 154+0.7062913079465717
 155+0.7020404538039372
 156+0.701127126043806
 157+0.699206200140122
 158+0.685040305819171
 159+0.6655547498187092
 160+0.6532654101207102
 161+0.6310138330650028
 162+0.6246948793782561
 163+0.6069942918933324
 164+0.6067663016931207
 165+0.5853223819529878
 166+0.5670848391524492
 167+0.5555930180090919
 168+0.5545160077694642
 169+0.5519308889007148
 170+0.5317575800317539
 171+0.5251936936993156
 172+0.5187831936649074
 173+0.5156023165475659
 174+0.515510718520791
 175+0.5103972455063522
 176+0.5084803339569334
 177+0.5063598230037413
 178+0.505667612185183
 179+0.501937984496124
 180+0.4851394374544459
 181+0.48245486501300455
 182+0.47980203736009863
 183+0.4785642026738086
 184+0.47490132931301576
 185+0.47440087145969495
 186+0.470478765184812
 187+0.4653662166610065
 188+0.4584488352331244
 189+0.4483601686329818
 190+0.44831619992910315
 191+0.43756844494575
 192+0.41978485314864694
 193+0.417847075405215
 194+0.4142618474815748
 195+0.40664111545267345
 196+0.40430555555555553
 197+0.3993738819320215
 198+0.39618140428122023
 199+0.3954248366013072
 200+0.3866070684337751
 201+0.38264501470195134
 202+0.374294355322793
 203+0.3739015797363756
 204+0.3690967047524425
 205+0.369047619047619
 206+0.36553730323393024
 207+0.36265695286094
 208+0.36172088042753525
 209+0.3564633249084819
 210+0.3520672565581789
 211+0.3513354291972556
 212+0.3487797377639518
 213+0.3477145520949912
 214+0.3471940694816792
 215+0.34373979946349387
 216+0.3433980148049866
 217+0.3418468571494427
 218+0.33541054251262054
 219+0.33527131782945735
 220+0.33527131782945735
 221+0.3347176079734219
 222+0.33416587471291676
 223+0.3297251214860608
 224+0.3289868732118273
 225+0.32559026238374766
 226+0.32399276565866203
 227+0.3236174686422881
 228+0.32093503885813623
 229+0.32079037346689465
 230+0.3154645117323287
 231+0.3139680596840939
 232+0.3104207122130782
 233+0.3003896715822824
 234+0.2985506761279957
 235+0.29658825807372824
 236+0.2934782608695652
 237+0.2931321661265045
 238+0.28864928130789674
 239+0.2861340335083771
 240+0.28459669208546495
 241+0.28198129842265857
 242+0.2782558499908782
 243+0.2763939378708801
 244+0.2680798207081973
 245+0.24866758585566026
 246+0.24720389265501513
 247+0.24634287408169175
 248+0.24545454545454548
 249+0.23687033450579364
 250+0.23378644225863587
 251+0.22875816993464054
 252+0.22869471413160733
 253+0.22675400538418133
 254+0.22472044129321897
 255+0.22466775576989487
 256+0.2112033035327623
 257+0.20752025499370294
 258+0.20467549544966926
 259+0.2041005387843561
 260+0.2034160991520012
 261+0.20222535082268692
 262+0.20194931773879143
 263+0.19780433157196253
 264+0.1927963696680974
 265+0.18920010810462992
 266+0.18787808738286582
 267+0.17671003692730708
 268+0.17565274073006032
 269+0.17341231124442874
 270+0.1723925299506695
 271+0.17197609946770082
 272+0.1719272995046191
 273+0.16993464052287582
 274+0.16993464052287582
 275+0.16865044895491207
 276+0.16808637799360715
 277+0.16773504273504275
 278+0.16669077556470457
 279+0.16270324501083455
 280+0.1589700996677741
 281+0.15778214034027988
 282+0.15444497571279267
 283+0.15334213590027546
 284+0.15281709216048345
 285+0.15067748638122527
 286+0.1495469563842297
 287+0.14720196876886033
 288+0.1457772370310287
 289+0.14479512735326688
 290+0.14432989690721648
 291+0.14082592613564054
 292+0.14067196735578466
 293+0.13776405094181346
 294+0.1377070905789004
 295+0.13638598896589604
 296+0.13563103388092834
 297+0.13304514191907094
 298+0.13219008533646015
 299+0.13095759505521926
 300+0.13076649778510113
 301+0.13045367197450033
 302+0.12914131169709264
 303+0.12482312219746579
 304+0.12458333333333334
 305+0.12327018675025805
 306+0.11980187188808131
 307+0.11831331097192645
 308+0.11802553892761806
 309+0.11652894048018908
 310+0.11602674308579007
 311+0.11368723374537328
 312+0.11233370092594643
 313+0.11094249945262813
 314+0.10983825886267112
 315+0.10720114239086087
 316+0.1070457931678245
 317+0.10679419583948348
 318+0.10601680933489718
 319+0.10441831916759216
 320+0.10031529111575649
 321+0.1
 322+0.09892913376451788
 323+0.09613003410578937
 324+0.09558177667787654
 325+0.09353741496598639
 326+0.09335130310817207
 327+0.09299489506522973
 328+0.09284707540521495
 329+0.09156836165742682
 330+0.09143959488787076
 331+0.09117350127397722
 332+0.09090909090909091
 333+0.08957592659331161
 334+0.08851076659860356
 335+0.08778550096326349
 336+0.08730787885969052
 337+0.08595141700404858
 338+0.08527131782945736
 339+0.08380728650268886
 340+0.08270375576255015
 341+0.08187134502923976
 342+0.08038147306700115
 343+0.07926470588235295
 344+0.07667821777071915
 345+0.07456348210639226
 346+0.0740920438489128
 347+0.0723925299506695
 348+0.0722556667435676
 349+0.07058355935137631
 350+0.07057745902570312
 351+0.06917211328976035
 352+0.06897873003531063
 353+0.06809526321135173
 354+0.06791100973310865
 355+0.06743256743256744
 356+0.06730159311397596
 357+0.06722996202214411
 358+0.06674924924924926
 359+0.06666666666666667
 360+0.06666666666666667
 361+0.06666666666666667
 362+0.06666666666666667
 363+0.06666666666666667
 364+0.06624754009147153
 365+0.06520375576255016
 366+0.06469420951891736
 367+0.06443798449612403
 368+0.06431866694209185
 369+0.06376903553299493
 370+0.06349206349206349
 371+0.06333495599357147
 372+0.06263242909540888
 373+0.0625
 374+0.06076151390788874
 375+0.060480192024804376
 376+0.060452567221700414
 377+0.05991285403050109
 378+0.058823529411764705
 379+0.058823529411764705
 380+0.058823529411764705
 381+0.058823529411764705
 382+0.058823529411764705
 383+0.05823070839310234
 384+0.05816748407653175
 385+0.0579618045578126
 386+0.05745341614906832
 387+0.05564512855962682
 388+0.054808368686332126
 389+0.05419516301503879
 390+0.05334281650071124
 391+0.05263157894736842
 392+0.05259856630824373
 393+0.051665702718334296
 394+0.05145489270868442
 395+0.05101136441542828
 396+0.04973743435858965
 397+0.04951565046909012
 398+0.048980995248812206
 399+0.04867645430616502
 400+0.04768432483791873
 401+0.04765694031555579
 402+0.04739252995066949
 403+0.046511627906976744
 404+0.04640702183427576
 405+0.04598599183197713
 406+0.045454545454545456
 407+0.04513888888888889
 408+0.04398685228151585
 409+0.04362262093762942
 410+0.043478260869565216
 411+0.04278197208876688
 412+0.042360667607827314
 413+0.041666666666666664
 414+0.041591268270502295
 415+0.04068627450980392
 416+0.03972809052794399
 417+0.03972291040988196
 418+0.03959025470653378
 419+0.03897502153316107
 420+0.03886844013161564
 421+0.03818036293723188
 422+0.037037037037037035
 423+0.037037037037037035
 424+0.03654897909577054
 425+0.03651960692790449
 426+0.03641147396373545
 427+0.0363407371388875
 428+0.03600713012477718
 429+0.0359586316471341
 430+0.0357838573513611
 431+0.03574975173783515
 432+0.03573225548912443
 433+0.03571428571428571
 434+0.03571428571428571
 435+0.03571428571428571
 436+0.03571428571428571
 437+0.03564082975847682
 438+0.03544985471391053
 439+0.03504654092288226
 440+0.03486279802069275
 441+0.03333333333333333
 442+0.03333333333333333
 443+0.03273854270680871
 444+0.03257478689045375
 445+0.030855931055416896
 446+0.03064721823688825
 447+0.030596683035300658
 448+0.03019175846593227
 449+0.029311223675659005
 450+0.029247991816905444
 451+0.02869474125288079
 452+0.0286046511627907
 453+0.02849550837462078
 454+0.02825377396980824
 455+0.027777777777777776
 456+0.027777777777777776
 457+0.02702702702702703
 458+0.02631578947368421
 459+0.02631578947368421
 460+0.02601263470828688
 461+0.02574750830564784
 462+0.025550100018185126
 463+0.025372327930467467
 464+0.025211768224451123
 465+0.025193798449612403
 466+0.024160206718346254
 467+0.023008165967777532
 468+0.022904441700569815
 469+0.02269369606133612
 470+0.022269634413562448
 471+0.021753233319546015
 472+0.021739130434782608
 473+0.021739130434782608
 474+0.021496108285197813
 475+0.02127659574468085
 476+0.02127659574468085
 477+0.02127659574468085
 478+0.02103108133258887
 479+0.020702295544261874
 480+0.019552721786808246
 481+0.019542404334929694
 482+0.018518518518518517
 483+0.018173482676494504
 484+0.01791290238206013
 485+0.017196509385738565
 486+0.017080685702920213
 487+0.016975703819570268
 488+0.01640469265710271
 489+0.016129032258064516
 490+0.016129032258064516
 491+0.015713963183612317
 492+0.015625
 493+0.014924997483137019
 494+0.013333333333333334
 495+0.013004982761851708
 496+0.012763288406558142
 497+0.011832633972748726
 498+0.010638297872340425
 499+0.010412560767310472
 500+0.009900990099009901
 501+0.00972670763891639
 502+0.009073661840429657
 503+0.00744185191163226
 504+0.007414552803038665
 505+0.007314328582145537
 506+0.007135677344305626
 507+0.006407710219770128
 508+0.006211488769628305
 509+0.006211488769628305
 510+0.006211488769628305
 511+0.006211488769628305
 512+0.005638820977848822
 513+0.005620914150601959
 514+0.005374410612962519
 515+0.004532574995380161
 516+0.003942832324438774
 517+0.0036032634303455133
 518+0.0033840379683466597
 519+0.003220559531554977
 520+0.0028000534616412723
 521+0.00274799145526168
 522+0.0026964243582868258
 523+0.0025624990369252356
 524+0.002512562814070352
 525+0.002288062199929619
 526+0.0022123893805309734
 527+0.0022061535822785463
 528+0.0022023948610103345
 529+0.002188183807439825
 530+0.0019559542709627524
 531+0.0019559542709627524
 532+0.001949317738791423
 533+0.001937984496124031
 534+0.001937984496124031
 535+0.001937984496124031
 536+0.001937984496124031
 537+0.001937984496124031
 538+0.001937984496124031
 539+0.001937984496124031
 540+0.0016129032258064516
 541+0.0014824708759661475
 542+0.0014727540500736377
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -30,6 +30,7 @@
3131 import org.wikimedia.lsearch.frontend.SearchServer;
3232 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
3333 import org.wikimedia.lsearch.spell.Suggest;
 34+import org.wikimedia.lsearch.spell.SuggestQuery;
3435 import org.wikimedia.lsearch.util.QueryStringMap;
3536
3637 /**
@@ -148,10 +149,12 @@
149150 Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes();
150151 boolean searchAll = false;
151152 Suggest sug = null;
152 - try {
153 - sug = new Suggest(iid);
154 - } catch (IOException e1) {
155 - log.warn("Cannot open spell-suggestion indexes for "+iid+" : "+e1);
 153+ if(offset == 0){
 154+ try {
 155+ sug = new Suggest(iid);
 156+ } catch (IOException e1) {
 157+ log.warn("Cannot open spell-suggestion indexes for "+iid+" : "+e1);
 158+ }
156159 }
157160
158161 // if search is over one field, try to use filters
@@ -170,20 +173,8 @@
171174 }
172175
173176 try {
174 - if(raw){
175 - // do minimal parsing, make a raw query
176 - parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE);
177 - q = parser.parseRaw(searchterm);
178 - } else if(nsfw == null){
179 - if(searchAll)
180 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
181 - else
182 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
183 - } else{
184 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
185 - log.info("Using NamespaceFilterWrapper "+nsfw);
186 - }
187 -
 177+ q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll);
 178+
188179 TopDocs hits=null;
189180 // see if we can search only part of the index
190181 if(nsfw!=null && (iid.isMainsplit() || iid.isNssplit())){
@@ -216,8 +207,27 @@
217208 }
218209 RMIMessengerClient messenger = new RMIMessengerClient();
219210 res = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host);
220 - if(sug != null)
221 - res.setSuggest(sug.suggest(searchterm,parser,nsfw.getFilter(),res.getNumHits()));
 211+ if(sug != null){
 212+ SuggestQuery sq = sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res);
 213+ if(sq == null)
 214+ res.setSuggest(null);
 215+ else{
 216+ if(res.getNumHits() == 0){
 217+ // no hits: show the spell-checked results
 218+ SearchResults sugres = messenger.searchPart(piid,sq.getSearchterm(),q,nsfw,offset,limit,explain,host);
 219+ if(sugres.getNumHits() > 0){
 220+ res = sugres;
 221+ res.setSuggest(sq.getSearchterm());
 222+ }
 223+ } else if(sq.needsCheck()){
 224+ q = parseQuery(sq.getSearchterm(),parser,iid,raw,nsfw,searchAll);
 225+ SearchResults sugres = messenger.searchPart(piid,sq.getSearchterm(),q,nsfw,0,1,explain,host);
 226+ if(sugres.getNumHits() > 0){
 227+ res.setSuggest(sq.getSearchterm());
 228+ }
 229+ }
 230+ }
 231+ }
222232 return res;
223233 }
224234 }
@@ -226,8 +236,27 @@
227237 try{
228238 hits = searcher.search(q,nsfw,offset+limit);
229239 res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
230 - if(sug != null)
231 - res.setSuggest(sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res.getNumHits()));
 240+ if(sug != null){
 241+ SuggestQuery sq = sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res);
 242+ if(sq == null)
 243+ res.setSuggest(null);
 244+ else{
 245+ if(res.getNumHits() == 0){
 246+ // no hits: show the spell-checked results
 247+ hits = searcher.search(q,nsfw,offset+limit);
 248+ if(hits.totalHits != 0){
 249+ res = makeSearchResults(searcher,hits,offset,limit,iid,sq.getSearchterm(),q,searchStart,explain);
 250+ res.setSuggest(sq.getSearchterm());
 251+ }
 252+ } else if(sq.needsCheck()){
 253+ q = parseQuery(sq.getSearchterm(),parser,iid,raw,nsfw,searchAll);
 254+ hits = searcher.search(q,nsfw,1); // fetch only one result
 255+ if(hits.totalHits != 0){
 256+ res.setSuggest(sq.getSearchterm());
 257+ }
 258+ }
 259+ }
 260+ }
232261 return res;
233262 } catch(Exception e){
234263 e.printStackTrace();
@@ -250,6 +279,24 @@
251280 }
252281 }
253282
 283+ protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll) throws ParseException {
 284+ Query q = null;
 285+ if(raw){
 286+ // do minimal parsing, make a raw query
 287+ parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE);
 288+ q = parser.parseRaw(searchterm);
 289+ } else if(nsfw == null){
 290+ if(searchAll)
 291+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
 292+ else
 293+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
 294+ } else{
 295+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
 296+ log.info("Using NamespaceFilterWrapper "+nsfw);
 297+ }
 298+ return q;
 299+ }
 300+
254301 /** Our scores can span several orders of magnitude, transform them to be more relevant to the user */
255302 public float transformScore(double score){
256303 return (float) (Math.log10(1+score*99)/2);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java
@@ -94,7 +94,7 @@
9595 return searcher;
9696 }
9797
98 - IndexSearcherMul get(){
 98+ synchronized IndexSearcherMul get(){
9999 if(index >= searchers.length)
100100 index = 0;
101101 log.debug("Using "+iid+" searcher "+index);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -368,7 +368,7 @@
369369 continue; // ignore single quotes (it's -> its)
370370
371371 // pluses and minuses, underscores can be within words (to prevent them from being misinterpreted), *,? are for wildcard queries
372 - if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){
 372+ if(!Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != '.' && ch != ',' && ch != ';' && ch != '"'){
373373 if(length<buffer.length)
374374 buffer[length++] = ch;
375375 } else{
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -122,6 +122,10 @@
123123 return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase));
124124 }
125125
 126+ public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(String langCode){
 127+ return getSearcherAnalyzer(new FilterFactory(langCode,FilterFactory.Type.SPELL_CHECK),new FieldNameFactory());
 128+ }
 129+
126130 /**
127131 * Analyzer for search queries. Can be reused to parse many queries.
128132 *
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -1,5 +1,10 @@
22 package org.wikimedia.lsearch.analyzers;
33
 4+import java.lang.reflect.InvocationTargetException;
 5+import java.lang.reflect.Method;
 6+import java.util.ArrayList;
 7+import java.util.Set;
 8+
49 import org.apache.lucene.analysis.PorterStemFilter;
510 import org.apache.lucene.analysis.TokenFilter;
611 import org.apache.lucene.analysis.TokenStream;
@@ -16,26 +21,36 @@
1722 public class FilterFactory {
1823 protected String lang;
1924 protected String snowballName = null;
20 - protected boolean useStemmer,useCustomFilter;
 25+ protected boolean useStemmer,useLangFilter;
2126 protected Class stemmer = null;
22 - protected Class customFilter = null;
 27+ protected Class langFilter = null;
2328 protected boolean usingCJK = false;
 29+ protected ArrayList<Class> additionalFilters = null;
2430
2531 protected FilterFactory noStemmerFilterFactory=null;
2632
 33+ public enum Type { FULL, NO_STEM, SPELL_CHECK };
 34+ protected Type type = null;
 35+
2736 public FilterFactory(String lang){
 37+ this(lang,Type.FULL);
 38+ }
 39+
 40+ public FilterFactory(String lang, Type type){
2841 this.lang = lang;
 42+ this.type = type;
2943 init();
30 - noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useCustomFilter,null,customFilter);
 44+ noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters);
3145 }
3246
33 - public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useCustomFilter, Class stemmer, Class customFilter) {
 47+ public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) {
3448 this.lang = lang;
3549 this.snowballName = snowballName;
3650 this.useStemmer = useStemmer;
37 - this.useCustomFilter = useCustomFilter;
 51+ this.useLangFilter = useLangFilter;
3852 this.stemmer = stemmer;
39 - this.customFilter = customFilter;
 53+ this.langFilter = langFilter;
 54+ this.additionalFilters = additionalFilters;
4055 }
4156
4257 public FilterFactory getNoStemmerFilterFactory() {
@@ -49,51 +64,60 @@
5065 if(lang == null)
5166 lang = "en";
5267
53 - // figure out stemmer
54 - useStemmer = true;
55 - if(lang.equals("en"))
56 - snowballName = "English";
 68+ if(type == Type.FULL){
 69+ useStemmer = true;
 70+ // figure out stemmer
 71+ if(lang.equals("en"))
 72+ snowballName = "English";
5773 //stemmer = PorterStemFilter.class; // 2x faster but less accurate
58 - else if(lang.equals("da"))
59 - snowballName = "Danish";
60 - else if(lang.equals("nl"))
61 - snowballName = "Dutch";
62 - else if(lang.equals("fi"))
63 - snowballName = "Finnish";
64 - else if(lang.equals("de"))
65 - snowballName = "German";
66 - else if(lang.equals("it"))
67 - snowballName = "Italian";
68 - else if(lang.equals("no"))
69 - snowballName = "Norwegian";
70 - else if(lang.equals("pt"))
71 - snowballName = "Portuguese";
72 - else if(lang.equals("ru"))
73 - snowballName = "Russian";
74 - else if(lang.equals("es"))
75 - snowballName = "Spanish";
76 - else if(lang.equals("sv"))
77 - snowballName = "Swedish";
78 - else if(lang.equals("eo"))
79 - stemmer = EsperantoStemFilter.class;
80 - else
 74+ else if(lang.equals("da"))
 75+ snowballName = "Danish";
 76+ else if(lang.equals("nl"))
 77+ snowballName = "Dutch";
 78+ else if(lang.equals("fi"))
 79+ snowballName = "Finnish";
 80+ else if(lang.equals("de"))
 81+ snowballName = "German";
 82+ else if(lang.equals("it"))
 83+ snowballName = "Italian";
 84+ else if(lang.equals("no"))
 85+ snowballName = "Norwegian";
 86+ else if(lang.equals("pt"))
 87+ snowballName = "Portuguese";
 88+ else if(lang.equals("ru"))
 89+ snowballName = "Russian";
 90+ else if(lang.equals("es"))
 91+ snowballName = "Spanish";
 92+ else if(lang.equals("sv"))
 93+ snowballName = "Swedish";
 94+ else if(lang.equals("eo"))
 95+ stemmer = EsperantoStemFilter.class;
 96+ else
 97+ useStemmer = false;
 98+ } else
8199 useStemmer = false;
82100
83 - // figure out custom filter
84 - useCustomFilter = true;
 101+ // figure out language-dependent filters
 102+ useLangFilter = true;
85103 if(lang.equals("th"))
86 - customFilter = ThaiWordFilter.class;
 104+ langFilter = ThaiWordFilter.class;
87105 else if(lang.equals("sr"))
88 - customFilter = SerbianFilter.class;
 106+ langFilter = SerbianFilter.class;
89107 else if(lang.equals("vi"))
90 - customFilter = VietnameseFilter.class;
 108+ langFilter = VietnameseFilter.class;
91109 else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") ||
92110 lang.equals("zh-classical") || lang.equals("zh-yue")){
93 - customFilter = CJKFilter.class;
 111+ langFilter = CJKFilter.class;
94112 usingCJK = true;
95113 } else
96 - useCustomFilter = false;
 114+ useLangFilter = false;
97115
 116+ // additional filters
 117+ if(type == Type.SPELL_CHECK){
 118+ additionalFilters = new ArrayList<Class>();
 119+ additionalFilters.add(PhraseFilter.class);
 120+ }
 121+
98122 }
99123
100124 public TokenFilter makeStemmer(TokenStream in){
@@ -113,11 +137,11 @@
114138 }
115139
116140 public TokenFilter makeCustomFilter(TokenStream in){
117 - if(!useCustomFilter)
 141+ if(!useLangFilter)
118142 return null;
119 - else if(customFilter != null){
 143+ else if(langFilter != null){
120144 try {
121 - return (TokenFilter) customFilter.getConstructor(TokenStream.class).newInstance(in);
 145+ return (TokenFilter) langFilter.getConstructor(TokenStream.class).newInstance(in);
122146 } catch (Exception e) {
123147 e.printStackTrace();
124148 }
@@ -126,6 +150,26 @@
127151 return null;
128152 }
129153
 154+ public TokenStream makeAdditionalFilterChain(TokenStream in){
 155+ if(additionalFilters == null)
 156+ return in;
 157+ try {
 158+ TokenStream chain = in;
 159+ // nest additional filters, apply them as added to the list
 160+ for(Class filter : additionalFilters){
 161+ chain = (TokenStream) filter.getConstructor(TokenStream.class).newInstance(chain);
 162+ }
 163+ return chain;
 164+ } catch (Exception e) {
 165+ e.printStackTrace();
 166+ return null;
 167+ }
 168+ }
 169+
 170+ public boolean hasAdditionalFilters(){
 171+ return additionalFilters != null;
 172+ }
 173+
130174 public boolean hasStemmer(){
131175 return useStemmer;
132176 }
@@ -135,12 +179,31 @@
136180 }
137181
138182 public boolean hasCustomFilter(){
139 - return useCustomFilter;
 183+ return useLangFilter;
140184 }
141185
142186 public String getLanguage(){
143187 return lang;
144188 }
145189
 190+ public void setStopWords(Set<String> stopWords){
 191+ for(Class filter : additionalFilters){
 192+ for(Method m : filter.getMethods()){
 193+ if(m.getName().equals("setStopWords")){
 194+ try {
 195+ m.invoke(filter,new Object[] {stopWords});
 196+ } catch (IllegalArgumentException e) {
 197+ e.printStackTrace();
 198+ } catch (IllegalAccessException e) {
 199+ e.printStackTrace();
 200+ } catch (InvocationTargetException e) {
 201+ e.printStackTrace();
 202+ }
 203+ }
 204+ }
 205+
 206+ }
 207+ }
146208
 209+
147210 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java
@@ -0,0 +1,76 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.util.HashSet;
 6+import java.util.Set;
 7+
 8+import org.apache.lucene.analysis.Token;
 9+import org.apache.lucene.analysis.TokenFilter;
 10+import org.apache.lucene.analysis.TokenStream;
 11+import org.apache.lucene.index.IndexReader;
 12+import org.apache.lucene.index.Term;
 13+import org.wikimedia.lsearch.config.IndexId;
 14+import org.wikimedia.lsearch.config.IndexRegistry;
 15+
 16+/**
 17+ * Filter that outputs phrases and words mixed, e.g.
 18+ * novi sad is a city -> novi, sad, novi_sad, is, sad_is, a, is_a, city, a_city
 19+ *
 20+ * @author rainman
 21+ *
 22+ */
 23+public class PhraseFilter extends TokenFilter {
 24+ protected Set<String> stopWords = null;
 25+
 26+ public PhraseFilter(TokenStream input) {
 27+ super(input);
 28+ }
 29+
 30+ protected Token phrase1 = null, phrase2 = null;
 31+ protected boolean phraseReady = false;
 32+
 33+ protected boolean forPhrase(Token t){
 34+ if(stopWords!=null && stopWords.contains(t.termText()))
 35+ return false;
 36+ else
 37+ return true;
 38+ }
 39+
 40+ @Override
 41+ public Token next() throws IOException {
 42+ if(phraseReady){
 43+ phraseReady = false;
 44+ return new Token(phrase1.termText()+"_"+phrase2.termText(),phrase1.startOffset(),phrase2.endOffset());
 45+ }
 46+ Token t = input.next();
 47+ if(t == null)
 48+ return null; // EOS
 49+ if(!forPhrase(t))
 50+ return t; // stop word, return as word only
 51+
 52+ if(phrase1 == null){
 53+ phrase1 = t;
 54+ return t;
 55+ }
 56+ if(phrase2 == null){
 57+ phrase2 = t;
 58+ phraseReady = true;
 59+ return t;
 60+ }
 61+
 62+ phrase1 = phrase2;
 63+ phrase2 = t;
 64+ phraseReady = true;
 65+
 66+ return t; // prepared phrase, return word, phrase in next call
 67+ }
 68+
 69+ public Set<String> getStopWords() {
 70+ return stopWords;
 71+ }
 72+
 73+ public void setStopWords(Set<String> stopWords) {
 74+ this.stopWords = stopWords;
 75+ }
 76+
 77+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java
@@ -61,8 +61,12 @@
6262 if(filters.hasCustomFilter())
6363 tokens = applyCustomFilter(tokens);
6464
65 - return new AliasFilter(filters,
66 - new ArrayTokens(tokens), new ArrayTokens(tokens));
 65+ TokenStream out = new AliasFilter(filters,
 66+ new ArrayTokens(tokens), new ArrayTokens(tokens));
 67+ if(filters.hasAdditionalFilters())
 68+ return filters.makeAdditionalFilterChain(out);
 69+ else
 70+ return out;
6771 }
6872
6973 /** Filter the tokens via the custom filter. For instance, to delete
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
@@ -42,23 +42,28 @@
4343 /** default is ignore case (upper/lower), use exact_case for wiktionaries, etc */
4444 public static enum Case { IGNORE_CASE, EXACT_CASE };
4545 /** use stemmer if available, or force no stemming */
46 - public static enum Stemmer { USE_STEMMER, NO_STEMMER };
 46+ public static enum Stemmer { USE_STEMMER, NO_STEMMER };
 47+ /** additional options */
 48+ public static enum Options { NONE, SPELL_CHECK };
4749
4850 /** Construct case-insensitive field builder with stemming */
4951 public FieldBuilder(String lang){
50 - this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER);
 52+ this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE);
5153 }
5254
5355 public FieldBuilder(String lang, Case useCase){
54 - this(lang,useCase,Stemmer.USE_STEMMER);
 56+ this(lang,useCase,Stemmer.USE_STEMMER,Options.NONE);
5557 }
5658
57 - public FieldBuilder(String lang, Case useCase, Stemmer useStemmer){
 59+ public FieldBuilder(String lang, Case useCase, Stemmer useStemmer, Options options){
 60+ FilterFactory.Type type = FilterFactory.Type.FULL;
 61+ if(options == Options.SPELL_CHECK)
 62+ type = FilterFactory.Type.SPELL_CHECK;
5863 // additional exact case factory
5964 if(useCase == Case.EXACT_CASE){
60 - builders = new BuilderSet[2];
 65+ builders = new BuilderSet[2];
6166 builders[1] = new BuilderSet(
62 - new FilterFactory(lang).getNoStemmerFilterFactory(),
 67+ new FilterFactory(lang,type).getNoStemmerFilterFactory(),
6368 new FieldNameFactory(FieldNameFactory.EXACT_CASE));
6469 } else
6570 builders = new BuilderSet[1];
@@ -66,11 +71,11 @@
6772 // default factory, lowercase all data
6873 if(useStemmer == Stemmer.USE_STEMMER){
6974 builders[0] = new BuilderSet(
70 - new FilterFactory(lang),
 75+ new FilterFactory(lang,type),
7176 new FieldNameFactory());
7277 } else{
7378 builders[0] = new BuilderSet(
74 - new FilterFactory(lang).getNoStemmerFilterFactory(),
 79+ new FilterFactory(lang,type).getNoStemmerFilterFactory(),
7580 new FieldNameFactory());
7681 }
7782
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java
@@ -1,5 +1,7 @@
22 package org.wikimedia.lsearch.util;
33
 4+import org.wikimedia.lsearch.test.MathFuncTest;
 5+
46 public class MathFunc {
57
68 /** Calculate average value starting from start to end (end excluded) */
@@ -26,27 +28,34 @@
2729 // av[i] = avg(val,part[i],part[i+1]);
2830 // error
2931 double err = calcErr(part,val,num);
 32+ double err2 = calcErr2(part,val,num);
3033 // values at next iteration
3134 int[] newpart = new int[num+1];
3235 //double[] newav = new double[num];
33 - double newerr = 0;
 36+ double newerr = 0, newerr2 = 0;
3437
3538 while(true){
3639 for(int i=0;i<num-1;i++){
3740 merge(i,part,newpart,val,num);
3841 newerr = calcErr(newpart,val,num);
39 - if(newerr < err){
 42+ newerr2 = calcErr2(newpart,val,num);
 43+ if(newerr < err || (newerr == err && newerr2 < err2)){
4044 copy(newpart,part);
4145 err = newerr;
 46+ err2 = newerr2;
 47+ //MathFuncTest.print(newpart,val);
4248 continue;
4349 }
4450 }
4551 // try extending last
4652 extend(part,newpart,val,num);
4753 newerr = calcErr(newpart,val,num);
48 - if(newerr < err){
 54+ newerr2 = calcErr2(newpart,val,num);
 55+ if(newerr < err || (newerr == err && newerr2 < err2)){
4956 copy(newpart,part);
5057 err = newerr;
 58+ err2 = newerr2;
 59+ //MathFuncTest.print(newpart,val);
5160 continue;
5261 }
5362 break;
@@ -94,10 +103,24 @@
95104 double err = 0;
96105 for(int i=0;i<num;i++){
97106 // max - min value
98 - double e = val[part[i]]-val[part[i+1]-1];
 107+ double v2 = val[part[i]];
 108+ double v1 = val[part[i+1]-1];
 109+ double e = v2 - v1;
99110 if( e > err )
100111 err = e;
101112 }
102113 return err;
103114 }
 115+
 116+ private static double calcErr2(int[] part, double[] val, int num) {
 117+ double err = 0;
 118+ for(int i=0;i<num;i++){
 119+ // max - min value of the partition, weighted below by the partition width
 120+ double v2 = val[part[i]];
 121+ double v1 = val[part[i+1]-1];
 122+ double e = v2 - v1;
 123+ err += e*(part[i+1]-1-part[i]);
 124+ }
 125+ return err;
 126+ }
104127 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java
@@ -140,6 +140,10 @@
141141 if(table[ich]==null)
142142 continue;
143143 Buffer buffer = new Buffer(buf,0);
 144+ if(ich == 0xD4A){
 145+ int b = 0;
 146+ b++;
 147+ }
144148 recursiveDecompose(buffer,table,letters,(char)ich);
145149 if(buffer.len != 0){
146150 decomposition[ich]= new char[buffer.len];
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/HighFreqTerms.java
@@ -0,0 +1,83 @@
 2+package org.wikimedia.lsearch.util;
 3+
 4+/**
 5+ * Copyright 2004 The Apache Software Foundation
 6+ *
 7+ * Licensed under the Apache License, Version 2.0 (the "License");
 8+ * you may not use this file except in compliance with the License.
 9+ * You may obtain a copy of the License at
 10+ *
 11+ * http://www.apache.org/licenses/LICENSE-2.0
 12+ *
 13+ * Unless required by applicable law or agreed to in writing, software
 14+ * distributed under the License is distributed on an "AS IS" BASIS,
 15+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16+ * See the License for the specific language governing permissions and
 17+ * limitations under the License.
 18+ */
 19+
 20+import java.io.IOException;
 21+import java.util.ArrayList;
 22+import java.util.Collection;
 23+import java.util.LinkedList;
 24+
 25+import org.apache.lucene.index.IndexReader;
 26+import org.apache.lucene.index.Term;
 27+import org.apache.lucene.index.TermEnum;
 28+import org.apache.lucene.util.PriorityQueue;
 29+
 30+/**
 31+ * <code>HighFreqTerms</code> class extracts terms and their frequencies out
 32+ * of an existing Lucene index.
 33+ *
 34+ * @version $Id: HighFreqTerms.java 376393 2006-02-09 19:17:14Z otis $
 35+ */
 36+public class HighFreqTerms {
 37+
 38+ public static Collection<String> getHighFreqTerms(IndexReader reader, String field, int numTerms) throws IOException {
 39+ TermInfoQueue tiq = new TermInfoQueue(numTerms);
 40+ TermEnum terms = reader.terms();
 41+ LinkedList<String> ret = new LinkedList<String>();
 42+
 43+ if (field != null) {
 44+ // collect terms from field into priority queue
 45+ while (terms.next()) {
 46+ if (terms.term().field().equals(field)) {
 47+ tiq.insert(new TermInfo(terms.term(), terms.docFreq()));
 48+ }
 49+ }
 50+ } else {
 51+ // collect all terms
 52+ while (terms.next()) {
 53+ tiq.insert(new TermInfo(terms.term(), terms.docFreq()));
 54+ }
 55+ }
 56+
 57+ // get highest ranked
 58+ while (tiq.size() != 0) {
 59+ ret.addFirst(((TermInfo) tiq.pop()).term.text());
 60+ }
 61+
 62+ return ret;
 63+ }
 64+}
 65+
 66+final class TermInfo {
 67+ TermInfo(Term t, int df) {
 68+ term = t;
 69+ docFreq = df;
 70+ }
 71+ int docFreq;
 72+ Term term;
 73+}
 74+
 75+final class TermInfoQueue extends PriorityQueue {
 76+ TermInfoQueue(int size) {
 77+ initialize(size);
 78+ }
 79+ protected final boolean lessThan(Object a, Object b) {
 80+ TermInfo termInfoA = (TermInfo) a;
 81+ TermInfo termInfoB = (TermInfo) b;
 82+ return termInfoA.docFreq < termInfoB.docFreq;
 83+ }
 84+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java
@@ -39,7 +39,7 @@
4040 int bad=0;
4141 long start = System.currentTimeMillis();
4242 for(String[] m : DATA){
43 - ArrayList<SuggestResult> res = sc.suggestWords(m[0],5);
 43+ ArrayList<SuggestResult> res = sc.suggestWordsFromTitle(m[0],new NamespaceFilter(0),5);
4444 if(res.size() > 0){
4545 SuggestResult r = res.get(0);
4646 if(r.getWord().equals(m[1]))
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -385,6 +385,15 @@
386386 q = parser.parseFourPass("\"うろパン\"",NamespacePolicy.IGNORE,false);
387387 assertEquals("contents:\"うろ ろハ ハン\" title:\"うろ ろハ ハン\"^2.0 (alttitle1:\"うろ ろハ ハン\"^6.0 alttitle2:\"うろ ろハ ハン\"^6.0 alttitle3:\"うろ ろハ ハン\"^6.0)",q.toString());
388388
 389+
 390+ // Malayalam
 391+ analyzer = Analyzers.getSearcherAnalyzer("ml");
 392+ bs = new FieldBuilder("ml").getBuilder();
 393+ parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
 394+ q = parser.parseFourPass("കൊറിയ ",NamespacePolicy.IGNORE,false);
 395+ assertEquals("contents:കറയ title:കറയ^2.0 (alttitle1:കറയ^6.0 alttitle2:കറയ^6.0 alttitle3:കറയ^6.0)",q.toString());
 396+
 397+
389398 // Test field extraction
390399 HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
391400 assertEquals(3,fs.size());
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -5,13 +5,19 @@
66 import java.util.Collections;
77 import java.util.Comparator;
88 import java.util.HashMap;
 9+import java.util.HashSet;
910 import java.util.LinkedList;
 11+import java.util.Set;
 12+import java.util.WeakHashMap;
1013 import java.util.Map.Entry;
1114
1215 import org.apache.log4j.Logger;
 16+import org.apache.lucene.analysis.Analyzer;
1317 import org.apache.lucene.analysis.Token;
 18+import org.apache.lucene.analysis.TokenStream;
1419 import org.apache.lucene.document.Document;
1520 import org.apache.lucene.index.Term;
 21+import org.apache.lucene.index.TermDocs;
1622 import org.apache.lucene.search.BooleanClause;
1723 import org.apache.lucene.search.BooleanQuery;
1824 import org.apache.lucene.search.Hits;
@@ -20,7 +26,10 @@
2127 import org.apache.lucene.search.ScoreDoc;
2228 import org.apache.lucene.search.TermQuery;
2329 import org.apache.lucene.search.TopDocs;
 30+import org.wikimedia.lsearch.analyzers.Analyzers;
2431 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
 32+import org.wikimedia.lsearch.beans.ResultSet;
 33+import org.wikimedia.lsearch.beans.SearchResults;
2534 import org.wikimedia.lsearch.config.GlobalConfiguration;
2635 import org.wikimedia.lsearch.config.IndexId;
2736 import org.wikimedia.lsearch.search.NamespaceFilter;
@@ -38,6 +47,8 @@
3948 protected IndexSearcher titles;
4049 protected int minHitsWords;
4150 protected int minHitsTitles;
 51+ protected static WeakHashMap<IndexSearcher,Set<String>> stopWordsIndexes = new WeakHashMap<IndexSearcher,Set<String>>();
 52+ protected Set<String> stopWords;
4253
5455 /** Distance and metaphone metrics */
4455 static class Metric {
@@ -70,7 +81,7 @@
7182 }
7283
7384 /** Number of results to fetch */
74 - public static final int POOL = 300;
 85+ public static final int POOL = 150;
7586
7687 /** Lower limit to hit rate for joining */
7788 public static final int JOIN_FREQ = 1;
@@ -83,6 +94,21 @@
8495 this.titles = cache.getLocalSearcher(iid.getSpellTitles());
8596 this.minHitsWords = global.getIntDBParam(iid.getDBname(),"spell_words","minHits",20);
8697 this.minHitsTitles = global.getIntDBParam(iid.getDBname(),"spell_titles","minHits",20);
 98+
 99+ synchronized(stopWordsIndexes){
 100+ if(!stopWordsIndexes.containsKey(titles)){
 101+ Set<String> s = Collections.synchronizedSet(new HashSet<String>());
 102+ stopWordsIndexes.put(titles,s);
 103+ TermDocs d = titles.getIndexReader().termDocs(new Term("metadata_key","stopWords"));
 104+ if(d.next()){
 105+ String val = titles.doc(d.doc()).get("metadata_value");
 106+ for(String sw : val.split(" ")){
 107+ s.add(sw);
 108+ }
 109+ }
 110+ }
 111+ this.stopWords = stopWordsIndexes.get(titles);
 112+ }
87113 }
88114
89115 static class Change {
@@ -111,15 +137,46 @@
112138 *
113139 * @return suggested query, or null if no suggestions
114140 */
115 - public String suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, int numHits){
 141+ @SuppressWarnings("unchecked")
 142+ public SuggestQuery suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, SearchResults res){
116143 ArrayList<Token> tokens = parser.tokenizeBareText(searchterm);
 144+ int numHits = res.getNumHits();
 145+
 146+ if(numHits >= minHitsTitles)
 147+ return null;
 148+
 149+ // collect words in titles, these shouldn't be spell-checked
 150+ HashSet<String> correctWords = new HashSet<String>();
 151+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false);
 152+ try {
 153+ for(ResultSet r : res.getResults()){
 154+ Token t = null;
 155+ TokenStream ts = analyzer.tokenStream("title",r.title);
 156+ while( (t = ts.next()) != null ){
 157+ correctWords.add(t.termText());
 158+ }
 159+ }
 160+ } catch (IOException e) {
 161+ log.error("I/O error trying to get list of correct words : "+e.getMessage());
 162+ e.printStackTrace();
 163+ }
117164
118165 // always spell-check phrases
119166 int minFreq = (numHits < minHitsTitles)? 0 : numHits;
120167 ArrayList<Change> suggestions = new ArrayList<Change>();
121 - Token last = null;
 168+
 169+ // add correct words
122170 for(int i=0;i<tokens.size();i++){
123171 Token t = tokens.get(i);
 172+ if(correctWords.contains(t.termText())){
 173+ Change c = new Change(0,1,Change.Type.TITLE_WORD);
 174+ c.preserves.put(i,t.termText());
 175+ suggestions.add(c);
 176+ }
 177+ }
 178+
 179+ for(int i=0;i<tokens.size();i++){
 180+ Token t = tokens.get(i);
124181 String w = t.termText();
125182 if(!"word".equals(t.type()) && !"phrase".equals(t.type()))
126183 continue; // ignore aliases and such
@@ -143,64 +200,46 @@
144201 sc.substitutes.put(i,split.word.replace("_"," "));
145202 suggestions.add(sc);
146203 }
147 -
148204 // get suggestions for pairs of words
149 - if(last != null && t.type().equals(last.type())){
150 - String word1 = last.termText();
151 - String word2 = t.termText();
152 - // phrase
153 - ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq);
154 - if(r.size() > 0){
155 - SuggestResult res = r.get(0);
156 - String[] ph = res.word.split("_");
157 - if(ph.length == 2){
158 - // figure out which words need to be changed
159 - Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE);
160 - if(!ph[0].equals(word1))
161 - sc.substitutes.put(i-1,ph[0]);
162 - else
163 - sc.preserves.put(i-1,ph[0]);
164 - if(!ph[1].equals(word2))
165 - sc.substitutes.put(i,ph[1]);
166 - else
167 - sc.preserves.put(i,ph[1]);
168 - suggestions.add(sc);
169 - } else
170 - log.error("Unexpected phrase in suggest result "+res);
 205+ for(int j=i+1;j<tokens.size();j++){
 206+ if(!correctWords.contains(tokens.get(i)) && !correctWords.contains(tokens.get(j))){
 207+ boolean succ = addPhraseSuggestion(tokens,i,j,suggestions,nsf,minFreq);
 208+ if(succ)
 209+ break;
171210 }
172 - // join
173 - SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq);
174 - if(join != null){
175 - Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);
176 - sc.substitutes.put(i-1,"");
177 - sc.substitutes.put(i,join.word);
178 - suggestions.add(sc);
179 - }
180211 }
181 - last = t;
 212+
182213 }
 214+ // indexes of tokens to be preserved in individual word check
 215+ HashSet<Integer> preserveTokens = new HashSet<Integer>();
183216 if(suggestions.size() > 0){
184217 // found some suggestions
185 - ArrayList<Entry<Integer,String>> proposedChanges = calculateChanges(suggestions,searchterm.length()/2);
 218+ Object[] ret = calculateChanges(suggestions,searchterm.length()/2);
 219+ ArrayList<Entry<Integer,String>> proposedChanges = (ArrayList<Entry<Integer, String>>) ret[0];
 220+ ArrayList<Entry<Integer,String>> preservedWords = (ArrayList<Entry<Integer, String>>) ret[1];
 221+ for(Entry<Integer,String> e : preservedWords)
 222+ preserveTokens.add(e.getKey());
186223 // substitute
187224 if(proposedChanges.size() > 0){
188225 for(Entry<Integer,String> e : proposedChanges){
189226 Token t = tokens.get(e.getKey());
190227 searchterm = markSuggestion(searchterm,t,e.getValue());
191228 }
192 - return tidy(searchterm);
 229+ return new SuggestQuery(tidy(searchterm));
193230 }
194231 }
195232
196233 // spell-check individual words
197 - if(numHits < minHitsWords){
 234+ if(numHits < minHitsWords && tokens.size() != 1){
198235 LinkedList<Change> changes = new LinkedList<Change>();
199236 for(int i=0;i<tokens.size();i++){
200237 Token t = tokens.get(i);
201238 String w = t.termText();
202239 if(w.length() < 2)
203240 continue;
204 - ArrayList<SuggestResult> sug = suggestWords(w,1);
 241+ if(correctWords.contains(w) || preserveTokens.contains(i))
 242+ continue;
 243+ ArrayList<SuggestResult> sug = suggestWordsFromTitle(w,nsf,1);
205244 if(sug.size() > 0){
206245 SuggestResult r = sug.get(0);
207246 if(r.word.equals(w))
@@ -218,13 +257,55 @@
219258 searchterm = markSuggestion(searchterm,t,e.getValue());
220259 }
221260 }
222 - return searchterm;
 261+ return new SuggestQuery(tidy(searchterm),true);
223262 }
224263 }
225264
226265 return null;
227266 }
228267
 268+ protected boolean addPhraseSuggestion(ArrayList<Token> tokens, int i1, int i2, ArrayList<Change> suggestions, NamespaceFilter nsf, int minFreq) {
 269+ Token t1 = tokens.get(i1);
 270+ Token t2 = tokens.get(i2);
 271+ if(t2.type().equals(t1.type())){
 272+ String word1 = t1.termText();
 273+ String word2 = t2.termText();
 274+ if(stopWords.contains(word1) || stopWords.contains(word2))
 275+ return false;
 276+ log.info("spell-check phrase \""+word1+" "+word2+"\"");
 277+ // phrase
 278+ ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq);
 279+ if(r.size() > 0){
 280+ SuggestResult res = r.get(0);
 281+ String[] ph = res.word.split("_");
 282+ if(ph.length == 2){
 283+ // figure out which words need to be changed
 284+ Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE);
 285+ if(!ph[0].equals(word1))
 286+ sc.substitutes.put(i1,ph[0]);
 287+ else
 288+ sc.preserves.put(i1,ph[0]);
 289+ if(!ph[1].equals(word2))
 290+ sc.substitutes.put(i2,ph[1]);
 291+ else
 292+ sc.preserves.put(i2,ph[1]);
 293+ suggestions.add(sc);
 294+ } else
 295+ log.error("Unexpected phrase in suggest result "+res);
 296+ }
 297+ // join
 298+ SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq);
 299+ if(join != null){
 300+ Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);
 301+ sc.substitutes.put(i1,"");
 302+ sc.substitutes.put(i2,join.word);
 303+ suggestions.add(sc);
 304+ }
 305+ return true;
 306+ }
 307+ return false;
 308+ }
 309+
229310 protected String markSuggestion(String searchterm, Token t, String newWord){
230311 return searchterm.substring(0,t.startOffset())
231312 + "<i>" + newWord + "</i>"
@@ -233,7 +314,7 @@
234315
235316 /** tidy the query, convert double spaces into single spaces, and such... */
236317 protected String tidy(String searchterm){
237 - return searchterm.replaceAll(" +"," ");
 318+ return searchterm.replaceAll("<i></i>","").replaceAll(" +"," ").replaceAll(";","");
238319 }
239320
240321 /**
@@ -242,7 +323,7 @@
243324 *
244325 * @return set of token_number -> new string.
245326 */
246 - protected ArrayList<Entry<Integer,String>> calculateChanges(ArrayList<Change> changes, int maxDist){
 327+ protected Object[] calculateChanges(ArrayList<Change> changes, int maxDist){
247328 // sort suggested changes by relevance
248329 Collections.sort(changes,new Comparator<Change>() {
249330 public int compare(Change o1, Change o2){
@@ -288,7 +369,9 @@
289370 return o2.getKey() - o1.getKey();
290371 }
291372 });
292 - return proposedChanges;
 373+ ArrayList<Entry<Integer,String>> preservedWords = new ArrayList<Entry<Integer,String>>();
 374+ preservedWords.addAll(preserve.entrySet());
 375+ return new Object[] {proposedChanges, preservedWords};
293376 }
294377
295378 /** Suggest some words from the words index */
@@ -387,7 +470,7 @@
388471 protected boolean acceptWord(SuggestResult r, Metric m){
389472 // check metaphones: don't add if the pronunciation is something completely unrelated
390473 if((r.distMetaphone < m.meta1.length() || r.distMetaphone2 < m.meta2.length()) && (r.distMetaphone<=3 || r.distMetaphone2<=3)
391 - && (r.dist <= m.word.length()/2 || r.dist <= r.word.length()/2))
 474+ && (r.dist <= m.word.length()/2 || r.dist <= r.word.length()/2) && Math.abs(m.word.length()-r.word.length()) <= 3)
392475 return true;
393476 else
394477 return false;
@@ -435,7 +518,7 @@
436519 if(hits.length() > 0){
437520 int pfreq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf);
438521 if(pfreq >= freq && pfreq > minFreq)
439 - res.add(new SuggestResult(phrase,pfreq,1));
 522+ res.add(new SuggestResult(phrase,pfreq,2));
440523 }
441524 }
442525 if(res.size() > 0){
@@ -473,7 +556,7 @@
474557 Metric m2 = new Metric(word2);
475558 Metric metric = new Metric(phrase);
476559 try {
477 - TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),200);
 560+ TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),POOL/2);
478561 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
479562 int minfreq = (minFreq == 0)? -1 : minFreq;
480563 // fetch results
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -12,6 +12,7 @@
1313 import org.wikimedia.lsearch.analyzers.Analyzers;
1414 import org.wikimedia.lsearch.analyzers.FieldBuilder;
1515 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
 16+import org.wikimedia.lsearch.beans.SearchResults;
1617 import org.wikimedia.lsearch.config.Configuration;
1718 import org.wikimedia.lsearch.config.GlobalConfiguration;
1819 import org.wikimedia.lsearch.config.IndexId;
@@ -25,7 +26,7 @@
2627 Configuration.open();
2728 GlobalConfiguration global = GlobalConfiguration.getInstance();
2829 boolean suggestOnly = false;
29 - String dbname = "wikilucene";
 30+ String dbname = "enwiki";
3031 for(int i=0;i<args.length;i++){
3132 if(args[i].equals("-s"))
3233 suggestOnly = true;
@@ -78,7 +79,7 @@
7980 last = text;
8081 }
8182 }
82 - System.out.println("#suggest: "+sc.suggest(inputtext,parser,new NamespaceFilter(ns),0));
 83+ System.out.println("#suggest: "+sc.suggest(inputtext,parser,new NamespaceFilter(ns),new SearchResults()));
8384 System.out.println("(finished in "+(System.currentTimeMillis()-start)+" ms)");
8485 }
8586
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java
@@ -35,7 +35,7 @@
3636
3737 public CleanIndexWriter(IndexId iid) throws IOException{
3838 this.iid = iid;
39 - this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER);
 39+ this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
4040 this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
4141 String pathMain = iid.getSpellWords().getTempPath();
4242 //String pathAll = iid.getSpellTitles().getTempPath();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestQuery.java
@@ -0,0 +1,36 @@
 2+package org.wikimedia.lsearch.spell;
 3+
 4+/** Result of suggestion for a query */
 5+public class SuggestQuery {
 6+ protected String searchterm;
 7+ protected boolean needsCheck;
 8+ public SuggestQuery(String searchterm) {
 9+ this(searchterm,false);
 10+ }
 11+ public SuggestQuery(String searchterm, boolean needsCheck) {
 12+ this.searchterm = searchterm;
 13+ this.needsCheck = needsCheck;
 14+ }
 15+ /** Whether suggestion needs further checking (in case of individual word spell-check) */
 16+ public boolean needsCheck() {
 17+ return needsCheck;
 18+ }
 19+ public void setNeedsCheck(boolean needsCheck) {
 20+ this.needsCheck = needsCheck;
 21+ }
 22+ /** the suggested search term */
 23+ public String getSearchterm() {
 24+ return searchterm;
 25+ }
 26+ public void setSearchterm(String searchterm) {
 27+ this.searchterm = searchterm;
 28+ }
 29+ @Override
 30+ public String toString() {
 31+ return needsCheck? searchterm+" [needs check]" : searchterm;
 32+ }
 33+
 34+
 35+
 36+
 37+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java
@@ -107,7 +107,7 @@
108108 }
109109 }
110110 // make words index
111 - log.info("Making words index");
 111+ /*log.info("Making words index");
112112 try {
113113 LuceneDictionary dict = new LuceneDictionary(IndexReader.open(words.getTempPath()),"contents");
114114 WordsIndexer writer = new WordsIndexer(words.getImportPath(),(dbname.equals("wikilucene")? 3 : 50));
@@ -121,18 +121,18 @@
122122 log.fatal("Cannot open clean dictionary for "+words+" : "+e.getMessage());
123123 e.printStackTrace();
124124 return;
125 - }
 125+ }*/
126126
127127 log.info("Making suggest title index");
128128 // make phrase index
129129
130130 TitleIndexer tInx = new TitleIndexer(titles);
131 - tInx.createFromSnapshot();
 131+ tInx.createFromTempIndex();
132132
133133 long end = System.currentTimeMillis();
134134
135135 // make snapshots
136 - IndexThread.makeIndexSnapshot(words,words.getImportPath());
 136+ //IndexThread.makeIndexSnapshot(words,words.getImportPath());
137137 IndexThread.makeIndexSnapshot(titles,titles.getImportPath());
138138
139139 System.out.println("Finished making suggest index in "+formatTime(end-start));
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NgramIndexer.java
@@ -1,6 +1,7 @@
22 package org.wikimedia.lsearch.spell.api;
33
44 import java.io.IOException;
 5+import java.util.Collection;
56
67 import org.apache.log4j.Logger;
78 import org.apache.lucene.analysis.Analyzer;
@@ -89,7 +90,7 @@
9091 }
9192
9293 /** Return ngrams of specific size for text */
93 - public static String[] nGrams(String text, int size) {
 94+ public static String[] nGramsRegular(String text, int size) {
9495 int len = text.length();
9596 String[] res = new String[len - size + 1];
9697 for (int i = 0; i < len - size + 1; i++) {
@@ -98,11 +99,40 @@
99100 return res;
100101 }
101102
 103+ /** Reverse a string */
 104+ protected static String reverse(String source){
 105+ int len = source.length();
 106+ StringBuilder dest = new StringBuilder(len);
 107+
 108+ for (int i = (len - 1); i >= 0; i--)
 109+ dest.append(source.charAt(i));
 110+ return dest.toString();
 111+ }
 112+
 113+ /** Return ngrams of specific size for text, assuming circular string */
 114+ public static String[] nGrams(String text, int size) {
 115+ int len = text.length();
 116+ String[] res = null;
 117+ if(len <= 6 && size == 2){ // produce reversed 2-grams
 118+ String[] rev = nGramsRegular(reverse(text),size);
 119+ res = new String[len + rev.length];
 120+ System.arraycopy(rev,0,res,len,rev.length);
 121+ } else
 122+ res = new String[len];
 123+ for (int i = 0; i < len; i++) {
 124+ if(i + size <= len)
 125+ res[i] = text.substring(i, i + size);
 126+ else // string is assumed to be circular
 127+ res[i] = text.substring(i)+text.substring(0,(i+size)%len);
 128+ }
 129+ return res;
 130+ }
 131+
102132 /** Get minimal ngram size for word. the minimal size should be at least 1/2 of word length */
103133 public static int getMinNgram(String word){
104 - if(word.length() <= 7)
 134+ if(word.length() <= 5)
105135 return 1;
106 - else if(word.length() <= 14)
 136+ else if(word.length() <= 7)
107137 return 2;
108138 else
109139 return 3;
@@ -110,10 +140,12 @@
111141
112142 /** Maximal size of ngram block, at most the length of word */
113143 public static int getMaxNgram(String word){
114 - if(word.length() <= 10)
 144+ if(word.length() == 4)
115145 return 2;
116 - else
 146+ else if(word.length() <= 6)
117147 return 3;
 148+ else
 149+ return 4;
118150 }
119151
120152 /** Get ngram field name with no prefix */
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java
@@ -30,6 +30,7 @@
3131 import org.wikimedia.lsearch.search.IndexSearcherMul;
3232 import org.wikimedia.lsearch.search.WikiSearcher;
3333 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
 34+import org.wikimedia.lsearch.util.HighFreqTerms;
3435
3536 /**
3637 * Index words and phrases from article titles.
@@ -284,6 +285,65 @@
285286 }
286287 }
287288
 289+ public void createFromTempIndex(){
 290+ String path = titles.getImportPath(); // dest where to put index
 291+ try {
 292+ ngramWriter.createIndex(path,new SimpleAnalyzer());
 293+ IndexReader ir = IndexReader.open(iid.getSpellWords().getTempPath());
 294+ Collection<String> mostfreq = HighFreqTerms.getHighFreqTerms(ir,"contents",50);
 295+ // get at most 25 stopwords
 296+ HashSet<String> stopWords = new HashSet<String>();
 297+ for(String w : mostfreq){
 298+ if(!w.contains("_"))
 299+ stopWords.add(w);
 300+ if(stopWords.size() >= 25)
 301+ break;
 302+ }
 303+ addMetadata("stopWords",stopWords);
 304+
 305+ LuceneDictionary dict = new LuceneDictionary(ir,"contents");
 306+ Word word;
 307+ while((word = dict.next()) != null){
 308+ String w = word.getWord();
 309+ int freq = word.getFrequency();
 310+ if(w.contains("_")){ // phrase
 311+ String[] words = w.split("_");
 312+ boolean allowed = true;
 313+ for(String ww : words){
 314+ // allow only those phrases consisting of title words
 315+ if(stopWords.contains(ww) || ir.docFreq(new Term("title",ww)) == 0){
 316+ allowed = false;
 317+ break;
 318+ }
 319+ }
 320+ if(allowed && freq > minPhraseFreq){
 321+ NamespaceFreq nsf = new NamespaceFreq();
 322+ nsf.setFrequency(0,freq);
 323+ ArrayList<Integer> nss = new ArrayList<Integer>();
 324+ nss.add(0);
 325+ addPhrase(w,nsf,nss);
 326+ }
 327+ } else{
 328+ if(freq > minWordFreq){
 329+ NamespaceFreq nsf = new NamespaceFreq();
 330+ nsf.setFrequency(0,freq);
 331+ ArrayList<Integer> nss = new ArrayList<Integer>();
 332+ nss.add(0);
 333+ addWord(w,nsf,nss);
 334+ }
 335+ }
 336+ }
 337+ ngramWriter.closeAndOptimize();
 338+ ir.close();
 339+
 340+ } catch (IOException e) {
 341+ log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
 342+ e.printStackTrace();
 343+ return;
 344+ }
 345+
 346+ }
 347+
288348 /**
289349 * Add phrase to index
290350 *
@@ -310,7 +370,26 @@
311371 ngramWriter.addDocument(doc);
312372 }
313373
314 - /** Add ordinary word to the index, convenient for suggesting joins
 374+ /**
 375+ * Add into metadata_key and metadata_value.
 376+ * Collection is assumed to contain words (without spaces)
 377+ */
 378+ public void addMetadata(String key, Collection<String> values){
 379+ StringBuilder sb = new StringBuilder();
 380+ // serialize by joining with spaces
 381+ for(String val : values){
 382+ if(sb.length() != 0)
 383+ sb.append(" ");
 384+ sb.append(val);
 385+ }
 386+ Document doc = new Document();
 387+ doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED));
 388+ doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO));
 389+
 390+ ngramWriter.addDocument(doc);
 391+ }
 392+
 393+ /** Add ordinary word to the index
315394 *
316395 * @param word - word to add
317396 * @param nf - frequencies in namespaces
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java
@@ -62,7 +62,7 @@
6363 // article.setContents("");
6464
6565 writer.addMainArticle(article);
66 - writer.addAllArticle(article);
 66+ //writer.addAllArticle(article);
6767 // generate phrases
6868 /* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false);
6969 ArrayList<Token> tokens = parser.parse();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java
@@ -8,8 +8,12 @@
99 int distMetaphone2=0;
1010
1111 static class Comparator implements java.util.Comparator<SuggestResult> {
12 - public int compare(SuggestResult o1, SuggestResult o2){
13 - if(o1.dist == o2.dist)
 12+ public int compare(SuggestResult o1, SuggestResult o2){
 13+ if(o1.dist - o2.dist == -1 && o1.frequency * 100 < o2.frequency)
 14+ return 1;
 15+ else if(o1.dist - o2.dist == 1 && o2.frequency * 100 < o1.frequency)
 16+ return -1;
 17+ else if(o1.dist == o2.dist)
1418 return o2.getFrequency() - o1.getFrequency();
1519 else
1620 return o1.dist - o2.dist;

Status & tagging log