r25117 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r25116 | r25117 | r25118 >
Date:	13:20, 24 August 2007
Author:	rainman
Status:	old
Tags:
Comment:	Yet another "did you mean" implementation, more accurate but way too slow (150ms per query). Phrase lookup needs to be reorganized.
Added:
* PhraseFilter - outputs two-word phrases and ignores stop words
* SuggestQuery - bean to return suggest results
Modified:
* Index all phrases (w/o stopwords)
* Metric tweaks, length diff, ...
* Circular ngrams
Modified paths:
/branches/lucene-search-2.1/build.xml (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java (added) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestQuery.java (added) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NgramIndexer.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/HighFreqTerms.java (added) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java (modified) (history)
/branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java (modified) (history)
/branches/lucene-search-2.1/test-data/mathfunc.test (added) (history)

Diff [purge]

Index: branches/lucene-search-2.1/build.xml
—	—	@@ -10,7 +10,7 @@
11	11	<property name="binary.name" value="ls2-bin"/>
12	12	<property name="jar.name" value="LuceneSearch.jar"/>
13	13	<property name="include" value="src/ lib/ sql/ test-data/ webinterface/** -example .txt lsearch* build.xml scripts/*"/>
14		~~- <property name="include.src" value="src/ sql/ build.xml scripts/*"/>~~
	14	+ <property name="include.src" value="src/ sql/ build.xml scripts/* webinterface/*"/>
15	15
16	16	<property file="${basedir}/hostname"/>
17	17
Index: branches/lucene-search-2.1/test-data/mathfunc.test
—	—	@@ -0,0 +1,541 @@
	2	+39.20291483424475
	3	+13.450437958707814
	4	+12.678569683286979
	5	+10.25526171963254
	6	+8.849258859845378
	7	+8.666273372729856
	8	+8.31447886618143
	9	+8.197323614179476
	10	+8.065610978164585
	11	+7.997460285543743
	12	+6.390661056615026
	13	+6.195251633448001
	14	+6.003035468744286
	15	+5.922401025326313
	16	+5.858780569202485
	17	+5.7545766633852065
	18	+5.663348431691695
	19	+5.513502014868557
	20	+5.397556486517471
	21	+5.320454269824394
	22	+5.118790939776451
	23	+4.957945854762612
	24	+4.677393185927231
	25	+4.417044635347138
	26	+4.286872568087402
	27	+4.111160627955847
	28	+4.018425808374067
	29	+3.875594177139727
	30	+3.855756478461852
	31	+3.583812703943094
	32	+3.5817677966683417
	33	+3.393666733951346
	34	+3.3825997856599685
	35	+3.311922305543995
	36	+3.284748780252251
	37	+3.2803658354545093
	38	+3.208116699014506
	39	+3.197975985106876
	40	+3.1501966327182416
	41	+3.126299857812352
	42	+3.063553780327862
	43	+3.0467910121568567
	44	+2.947827241203268
	45	+2.8966477779781448
	46	+2.8116941812748055
	47	+2.730939712302121
	48	+2.7194772730786663
	49	+2.6503967112741775
	50	+2.499346732261985
	51	+2.4531229860503347
	52	+2.4245965196546595
	53	+2.4048828498563353
	54	+2.398834838013775
	55	+2.3983530801899637
	56	+2.301883995710939
	57	+2.2515294877640977
	58	+2.2418150405156405
	59	+2.186851347893209
	60	+2.1421959631446454
	61	+2.112862553279673
	62	+2.111304636049398
	63	+2.0884066313608103
	64	+2.080225946703776
	65	+2.0652098242981136
	66	+2.045337183390787
	67	+2.002148363977345
	68	+1.9262536235110566
	69	+1.9254642966646955
	70	+1.8815690362186883
	71	+1.8796391915958572
	72	+1.8583157394056122
	73	+1.842890610174396
	74	+1.832874813193032
	75	+1.7857804147392449
	76	+1.778042188142664
	77	+1.7751602358737153
	78	+1.770588743574571
	79	+1.7658860788389297
	80	+1.7330514584559034
	81	+1.676248860250089
	82	+1.6632396794496727
	83	+1.6529964196348452
	84	+1.587700607893723
	85	+1.554507271828346
	86	+1.5462922600019373
	87	+1.536011998632598
	88	+1.499536875414111
	89	+1.4827109040124988
	90	+1.4618995756130717
	91	+1.4561634933713434
	92	+1.4442030498548373
	93	+1.4232400481329681
	94	+1.4148239967756024
	95	+1.4125065245521127
	96	+1.390309778182693
	97	+1.374617285871078
	98	+1.3591113373855113
	99	+1.2959623367365105
	100	+1.2916936614995556
	101	+1.2788722876401044
	102	+1.2710067417073394
	103	+1.2579879901973903
	104	+1.234760921054205
	105	+1.232572710846885
	106	+1.2239982328262375
	107	+1.1960562381380218
	108	+1.1909451721904833
	109	+1.1883725046626885
	110	+1.1870577049511857
	111	+1.179821425669525
	112	+1.170960754894606
	113	+1.1627022555545126
	114	+1.155955236116103
	115	+1.151314330373941
	116	+1.138548931392361
	117	+1.136753034029899
	118	+1.1252591078172476
	119	+1.1153859351428042
	120	+1.0995712079396343
	121	+1.093167701863354
	122	+1.0926266284365183
	123	+1.0730378946931582
	124	+1.0577347862606676
	125	+1.0517570343151739
	126	+1.043571403629543
	127	+1.0363716250813024
	128	+1.0339183869730422
	129	+1.0273092324678479
	130	+1.0272902345083126
	131	+1.0255021017312675
	132	+1.0095597490734871
	133	+1.0045834871416266
	134	+1.0032766725818
	135	+1.0008620689655172
	136	+0.9986067546725256
	137	+0.9654483583749244
	138	+0.9369909811501816
	139	+0.935063552081607
	140	+0.9083748494156194
	141	+0.9040905868635284
	142	+0.8957921591066108
	143	+0.8954248366013071
	144	+0.884183371098627
	145	+0.884105579905624
	146	+0.8835226747659162
	147	+0.8119355402722593
	148	+0.789096316833624
	149	+0.7691353626783575
	150	+0.7547464640913147
	151	+0.7517606818891444
	152	+0.7304977445593964
	153	+0.7281849565279721
	154	+0.7062913079465717
	155	+0.7020404538039372
	156	+0.701127126043806
	157	+0.699206200140122
	158	+0.685040305819171
	159	+0.6655547498187092
	160	+0.6532654101207102
	161	+0.6310138330650028
	162	+0.6246948793782561
	163	+0.6069942918933324
	164	+0.6067663016931207
	165	+0.5853223819529878
	166	+0.5670848391524492
	167	+0.5555930180090919
	168	+0.5545160077694642
	169	+0.5519308889007148
	170	+0.5317575800317539
	171	+0.5251936936993156
	172	+0.5187831936649074
	173	+0.5156023165475659
	174	+0.515510718520791
	175	+0.5103972455063522
	176	+0.5084803339569334
	177	+0.5063598230037413
	178	+0.505667612185183
	179	+0.501937984496124
	180	+0.4851394374544459
	181	+0.48245486501300455
	182	+0.47980203736009863
	183	+0.4785642026738086
	184	+0.47490132931301576
	185	+0.47440087145969495
	186	+0.470478765184812
	187	+0.4653662166610065
	188	+0.4584488352331244
	189	+0.4483601686329818
	190	+0.44831619992910315
	191	+0.43756844494575
	192	+0.41978485314864694
	193	+0.417847075405215
	194	+0.4142618474815748
	195	+0.40664111545267345
	196	+0.40430555555555553
	197	+0.3993738819320215
	198	+0.39618140428122023
	199	+0.3954248366013072
	200	+0.3866070684337751
	201	+0.38264501470195134
	202	+0.374294355322793
	203	+0.3739015797363756
	204	+0.3690967047524425
	205	+0.369047619047619
	206	+0.36553730323393024
	207	+0.36265695286094
	208	+0.36172088042753525
	209	+0.3564633249084819
	210	+0.3520672565581789
	211	+0.3513354291972556
	212	+0.3487797377639518
	213	+0.3477145520949912
	214	+0.3471940694816792
	215	+0.34373979946349387
	216	+0.3433980148049866
	217	+0.3418468571494427
	218	+0.33541054251262054
	219	+0.33527131782945735
	220	+0.33527131782945735
	221	+0.3347176079734219
	222	+0.33416587471291676
	223	+0.3297251214860608
	224	+0.3289868732118273
	225	+0.32559026238374766
	226	+0.32399276565866203
	227	+0.3236174686422881
	228	+0.32093503885813623
	229	+0.32079037346689465
	230	+0.3154645117323287
	231	+0.3139680596840939
	232	+0.3104207122130782
	233	+0.3003896715822824
	234	+0.2985506761279957
	235	+0.29658825807372824
	236	+0.2934782608695652
	237	+0.2931321661265045
	238	+0.28864928130789674
	239	+0.2861340335083771
	240	+0.28459669208546495
	241	+0.28198129842265857
	242	+0.2782558499908782
	243	+0.2763939378708801
	244	+0.2680798207081973
	245	+0.24866758585566026
	246	+0.24720389265501513
	247	+0.24634287408169175
	248	+0.24545454545454548
	249	+0.23687033450579364
	250	+0.23378644225863587
	251	+0.22875816993464054
	252	+0.22869471413160733
	253	+0.22675400538418133
	254	+0.22472044129321897
	255	+0.22466775576989487
	256	+0.2112033035327623
	257	+0.20752025499370294
	258	+0.20467549544966926
	259	+0.2041005387843561
	260	+0.2034160991520012
	261	+0.20222535082268692
	262	+0.20194931773879143
	263	+0.19780433157196253
	264	+0.1927963696680974
	265	+0.18920010810462992
	266	+0.18787808738286582
	267	+0.17671003692730708
	268	+0.17565274073006032
	269	+0.17341231124442874
	270	+0.1723925299506695
	271	+0.17197609946770082
	272	+0.1719272995046191
	273	+0.16993464052287582
	274	+0.16993464052287582
	275	+0.16865044895491207
	276	+0.16808637799360715
	277	+0.16773504273504275
	278	+0.16669077556470457
	279	+0.16270324501083455
	280	+0.1589700996677741
	281	+0.15778214034027988
	282	+0.15444497571279267
	283	+0.15334213590027546
	284	+0.15281709216048345
	285	+0.15067748638122527
	286	+0.1495469563842297
	287	+0.14720196876886033
	288	+0.1457772370310287
	289	+0.14479512735326688
	290	+0.14432989690721648
	291	+0.14082592613564054
	292	+0.14067196735578466
	293	+0.13776405094181346
	294	+0.1377070905789004
	295	+0.13638598896589604
	296	+0.13563103388092834
	297	+0.13304514191907094
	298	+0.13219008533646015
	299	+0.13095759505521926
	300	+0.13076649778510113
	301	+0.13045367197450033
	302	+0.12914131169709264
	303	+0.12482312219746579
	304	+0.12458333333333334
	305	+0.12327018675025805
	306	+0.11980187188808131
	307	+0.11831331097192645
	308	+0.11802553892761806
	309	+0.11652894048018908
	310	+0.11602674308579007
	311	+0.11368723374537328
	312	+0.11233370092594643
	313	+0.11094249945262813
	314	+0.10983825886267112
	315	+0.10720114239086087
	316	+0.1070457931678245
	317	+0.10679419583948348
	318	+0.10601680933489718
	319	+0.10441831916759216
	320	+0.10031529111575649
	321	+0.1
	322	+0.09892913376451788
	323	+0.09613003410578937
	324	+0.09558177667787654
	325	+0.09353741496598639
	326	+0.09335130310817207
	327	+0.09299489506522973
	328	+0.09284707540521495
	329	+0.09156836165742682
	330	+0.09143959488787076
	331	+0.09117350127397722
	332	+0.09090909090909091
	333	+0.08957592659331161
	334	+0.08851076659860356
	335	+0.08778550096326349
	336	+0.08730787885969052
	337	+0.08595141700404858
	338	+0.08527131782945736
	339	+0.08380728650268886
	340	+0.08270375576255015
	341	+0.08187134502923976
	342	+0.08038147306700115
	343	+0.07926470588235295
	344	+0.07667821777071915
	345	+0.07456348210639226
	346	+0.0740920438489128
	347	+0.0723925299506695
	348	+0.0722556667435676
	349	+0.07058355935137631
	350	+0.07057745902570312
	351	+0.06917211328976035
	352	+0.06897873003531063
	353	+0.06809526321135173
	354	+0.06791100973310865
	355	+0.06743256743256744
	356	+0.06730159311397596
	357	+0.06722996202214411
	358	+0.06674924924924926
	359	+0.06666666666666667
	360	+0.06666666666666667
	361	+0.06666666666666667
	362	+0.06666666666666667
	363	+0.06666666666666667
	364	+0.06624754009147153
	365	+0.06520375576255016
	366	+0.06469420951891736
	367	+0.06443798449612403
	368	+0.06431866694209185
	369	+0.06376903553299493
	370	+0.06349206349206349
	371	+0.06333495599357147
	372	+0.06263242909540888
	373	+0.0625
	374	+0.06076151390788874
	375	+0.060480192024804376
	376	+0.060452567221700414
	377	+0.05991285403050109
	378	+0.058823529411764705
	379	+0.058823529411764705
	380	+0.058823529411764705
	381	+0.058823529411764705
	382	+0.058823529411764705
	383	+0.05823070839310234
	384	+0.05816748407653175
	385	+0.0579618045578126
	386	+0.05745341614906832
	387	+0.05564512855962682
	388	+0.054808368686332126
	389	+0.05419516301503879
	390	+0.05334281650071124
	391	+0.05263157894736842
	392	+0.05259856630824373
	393	+0.051665702718334296
	394	+0.05145489270868442
	395	+0.05101136441542828
	396	+0.04973743435858965
	397	+0.04951565046909012
	398	+0.048980995248812206
	399	+0.04867645430616502
	400	+0.04768432483791873
	401	+0.04765694031555579
	402	+0.04739252995066949
	403	+0.046511627906976744
	404	+0.04640702183427576
	405	+0.04598599183197713
	406	+0.045454545454545456
	407	+0.04513888888888889
	408	+0.04398685228151585
	409	+0.04362262093762942
	410	+0.043478260869565216
	411	+0.04278197208876688
	412	+0.042360667607827314
	413	+0.041666666666666664
	414	+0.041591268270502295
	415	+0.04068627450980392
	416	+0.03972809052794399
	417	+0.03972291040988196
	418	+0.03959025470653378
	419	+0.03897502153316107
	420	+0.03886844013161564
	421	+0.03818036293723188
	422	+0.037037037037037035
	423	+0.037037037037037035
	424	+0.03654897909577054
	425	+0.03651960692790449
	426	+0.03641147396373545
	427	+0.0363407371388875
	428	+0.03600713012477718
	429	+0.0359586316471341
	430	+0.0357838573513611
	431	+0.03574975173783515
	432	+0.03573225548912443
	433	+0.03571428571428571
	434	+0.03571428571428571
	435	+0.03571428571428571
	436	+0.03571428571428571
	437	+0.03564082975847682
	438	+0.03544985471391053
	439	+0.03504654092288226
	440	+0.03486279802069275
	441	+0.03333333333333333
	442	+0.03333333333333333
	443	+0.03273854270680871
	444	+0.03257478689045375
	445	+0.030855931055416896
	446	+0.03064721823688825
	447	+0.030596683035300658
	448	+0.03019175846593227
	449	+0.029311223675659005
	450	+0.029247991816905444
	451	+0.02869474125288079
	452	+0.0286046511627907
	453	+0.02849550837462078
	454	+0.02825377396980824
	455	+0.027777777777777776
	456	+0.027777777777777776
	457	+0.02702702702702703
	458	+0.02631578947368421
	459	+0.02631578947368421
	460	+0.02601263470828688
	461	+0.02574750830564784
	462	+0.025550100018185126
	463	+0.025372327930467467
	464	+0.025211768224451123
	465	+0.025193798449612403
	466	+0.024160206718346254
	467	+0.023008165967777532
	468	+0.022904441700569815
	469	+0.02269369606133612
	470	+0.022269634413562448
	471	+0.021753233319546015
	472	+0.021739130434782608
	473	+0.021739130434782608
	474	+0.021496108285197813
	475	+0.02127659574468085
	476	+0.02127659574468085
	477	+0.02127659574468085
	478	+0.02103108133258887
	479	+0.020702295544261874
	480	+0.019552721786808246
	481	+0.019542404334929694
	482	+0.018518518518518517
	483	+0.018173482676494504
	484	+0.01791290238206013
	485	+0.017196509385738565
	486	+0.017080685702920213
	487	+0.016975703819570268
	488	+0.01640469265710271
	489	+0.016129032258064516
	490	+0.016129032258064516
	491	+0.015713963183612317
	492	+0.015625
	493	+0.014924997483137019
	494	+0.013333333333333334
	495	+0.013004982761851708
	496	+0.012763288406558142
	497	+0.011832633972748726
	498	+0.010638297872340425
	499	+0.010412560767310472
	500	+0.009900990099009901
	501	+0.00972670763891639
	502	+0.009073661840429657
	503	+0.00744185191163226
	504	+0.007414552803038665
	505	+0.007314328582145537
	506	+0.007135677344305626
	507	+0.006407710219770128
	508	+0.006211488769628305
	509	+0.006211488769628305
	510	+0.006211488769628305
	511	+0.006211488769628305
	512	+0.005638820977848822
	513	+0.005620914150601959
	514	+0.005374410612962519
	515	+0.004532574995380161
	516	+0.003942832324438774
	517	+0.0036032634303455133
	518	+0.0033840379683466597
	519	+0.003220559531554977
	520	+0.0028000534616412723
	521	+0.00274799145526168
	522	+0.0026964243582868258
	523	+0.0025624990369252356
	524	+0.002512562814070352
	525	+0.002288062199929619
	526	+0.0022123893805309734
	527	+0.0022061535822785463
	528	+0.0022023948610103345
	529	+0.002188183807439825
	530	+0.0019559542709627524
	531	+0.0019559542709627524
	532	+0.001949317738791423
	533	+0.001937984496124031
	534	+0.001937984496124031
	535	+0.001937984496124031
	536	+0.001937984496124031
	537	+0.001937984496124031
	538	+0.001937984496124031
	539	+0.001937984496124031
	540	+0.0016129032258064516
	541	+0.0014824708759661475
	542	+0.0014727540500736377
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
—	—	@@ -30,6 +30,7 @@
31	31	import org.wikimedia.lsearch.frontend.SearchServer;
32	32	import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
33	33	import org.wikimedia.lsearch.spell.Suggest;
	34	+import org.wikimedia.lsearch.spell.SuggestQuery;
34	35	import org.wikimedia.lsearch.util.QueryStringMap;
35	36
36	37	/**
—	—	@@ -148,10 +149,12 @@
149	150	Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes();
150	151	boolean searchAll = false;
151	152	Suggest sug = null;
152		~~- try {~~
153		~~- sug = new Suggest(iid);~~
154		~~- } catch (IOException e1) {~~
155		~~- log.warn("Cannot open spell-suggestion indexes for "+iid+" : "+e1);~~
	153	+ if(offset == 0){
	154	+ try {
	155	+ sug = new Suggest(iid);
	156	+ } catch (IOException e1) {
	157	+ log.warn("Cannot open spell-suggestion indexes for "+iid+" : "+e1);
	158	+ }
156	159	}
157	160
158	161	// if search is over one field, try to use filters
—	—	@@ -170,20 +173,8 @@
171	174	}
172	175
173	176	try {
174		~~- if(raw){~~
175		~~- // do minimal parsing, make a raw query~~
176		~~- parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE);~~
177		~~- q = parser.parseRaw(searchterm);~~
178		~~- } else if(nsfw == null){~~
179		~~- if(searchAll)~~
180		~~- q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());~~
181		~~- else~~
182		~~- q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());~~
183		~~- } else{~~
184		~~- q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());~~
185		~~- log.info("Using NamespaceFilterWrapper "+nsfw);~~
186		~~- }~~
187		-
	177	+ q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll);
	178	+
188	179	TopDocs hits=null;
189	180	// see if we can search only part of the index
190	181	if(nsfw!=null && (iid.isMainsplit() || iid.isNssplit())){
—	—	@@ -216,8 +207,27 @@
217	208	}
218	209	RMIMessengerClient messenger = new RMIMessengerClient();
219	210	res = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host);
220		~~- if(sug != null)~~
221		~~- res.setSuggest(sug.suggest(searchterm,parser,nsfw.getFilter(),res.getNumHits()));~~
	211	+ if(sug != null){
	212	+ SuggestQuery sq = sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res);
	213	+ if(sq == null)
	214	+ res.setSuggest(null);
	215	+ else{
	216	+ if(res.getNumHits() == 0){
	217	+ // no hits: show the spell-checked results
	218	+ SearchResults sugres = messenger.searchPart(piid,sq.getSearchterm(),q,nsfw,offset,limit,explain,host);
	219	+ if(sugres.getNumHits() > 0){
	220	+ res = sugres;
	221	+ res.setSuggest(sq.getSearchterm());
	222	+ }
	223	+ } else if(sq.needsCheck()){
	224	+ q = parseQuery(sq.getSearchterm(),parser,iid,raw,nsfw,searchAll);
	225	+ SearchResults sugres = messenger.searchPart(piid,sq.getSearchterm(),q,nsfw,0,1,explain,host);
	226	+ if(sugres.getNumHits() > 0){
	227	+ res.setSuggest(sq.getSearchterm());
	228	+ }
	229	+ }
	230	+ }
	231	+ }
222	232	return res;
223	233	}
224	234	}
—	—	@@ -226,8 +236,27 @@
227	237	try{
228	238	hits = searcher.search(q,nsfw,offset+limit);
229	239	res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
230		~~- if(sug != null)~~
231		~~- res.setSuggest(sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res.getNumHits()));~~
	240	+ if(sug != null){
	241	+ SuggestQuery sq = sug.suggest(searchterm,parser,(nsfw==null)? null : nsfw.getFilter(),res);
	242	+ if(sq == null)
	243	+ res.setSuggest(null);
	244	+ else{
	245	+ if(res.getNumHits() == 0){
	246	+ // no hits: show the spell-checked results
	247	+ hits = searcher.search(q,nsfw,offset+limit);
	248	+ if(hits.totalHits != 0){
	249	+ res = makeSearchResults(searcher,hits,offset,limit,iid,sq.getSearchterm(),q,searchStart,explain);
	250	+ res.setSuggest(sq.getSearchterm());
	251	+ }
	252	+ } else if(sq.needsCheck()){
	253	+ q = parseQuery(sq.getSearchterm(),parser,iid,raw,nsfw,searchAll);
	254	+ hits = searcher.search(q,nsfw,1); // fetch only one result
	255	+ if(hits.totalHits != 0){
	256	+ res.setSuggest(sq.getSearchterm());
	257	+ }
	258	+ }
	259	+ }
	260	+ }
232	261	return res;
233	262	} catch(Exception e){
234	263	e.printStackTrace();
—	—	@@ -250,6 +279,24 @@
251	280	}
252	281	}
253	282
	283	+ protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll) throws ParseException {
	284	+ Query q = null;
	285	+ if(raw){
	286	+ // do minimal parsing, make a raw query
	287	+ parser.setNamespacePolicy(WikiQueryParser.NamespacePolicy.LEAVE);
	288	+ q = parser.parseRaw(searchterm);
	289	+ } else if(nsfw == null){
	290	+ if(searchAll)
	291	+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
	292	+ else
	293	+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
	294	+ } else{
	295	+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
	296	+ log.info("Using NamespaceFilterWrapper "+nsfw);
	297	+ }
	298	+ return q;
	299	+ }
	300	+
254	301	/** Our scores can span several orders of magnitude, transform them to be more relevant to the user */
255	302	public float transformScore(double score){
256	303	return (float) (Math.log10(1+score*99)/2);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java
—	—	@@ -94,7 +94,7 @@
95	95	return searcher;
96	96	}
97	97
98		~~- IndexSearcherMul get(){~~
	98	+ synchronized IndexSearcherMul get(){
99	99	if(index >= searchers.length)
100	100	index = 0;
101	101	log.debug("Using "+iid+" searcher "+index);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
—	—	@@ -368,7 +368,7 @@
369	369	continue; // ignore single quotes (it's -> its)
370	370
371	371	// pluses and minuses, underscores can be within words (to prevent to be missinterpeted), *,? are for wildcard queries
372		~~- if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){~~
	372	+ if(!Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != '.' && ch != ',' && ch != ';' && ch != '"'){
373	373	if(length<buffer.length)
374	374	buffer[length++] = ch;
375	375	} else{
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
—	—	@@ -122,6 +122,10 @@
123	123	return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase));
124	124	}
125	125
	126	+ public static PerFieldAnalyzerWrapper getSpellCheckAnalyzer(String langCode){
	127	+ return getSearcherAnalyzer(new FilterFactory(langCode,FilterFactory.Type.SPELL_CHECK),new FieldNameFactory());
	128	+ }
	129	+
126	130	/**
127	131	* Analyzer for search queries. Can be reused to parse many queries.
128	132	*
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
—	—	@@ -1,5 +1,10 @@
2	2	package org.wikimedia.lsearch.analyzers;
3	3
	4	+import java.lang.reflect.InvocationTargetException;
	5	+import java.lang.reflect.Method;
	6	+import java.util.ArrayList;
	7	+import java.util.Set;
	8	+
4	9	import org.apache.lucene.analysis.PorterStemFilter;
5	10	import org.apache.lucene.analysis.TokenFilter;
6	11	import org.apache.lucene.analysis.TokenStream;
—	—	@@ -16,26 +21,36 @@
17	22	public class FilterFactory {
18	23	protected String lang;
19	24	protected String snowballName = null;
20		~~- protected boolean useStemmer,useCustomFilter;~~
	25	+ protected boolean useStemmer,useLangFilter;
21	26	protected Class stemmer = null;
22		~~- protected Class customFilter = null;~~
	27	+ protected Class langFilter = null;
23	28	protected boolean usingCJK = false;
	29	+ protected ArrayList<Class> additionalFilters = null;
24	30
25	31	protected FilterFactory noStemmerFilterFactory=null;
26	32
	33	+ public enum Type { FULL, NO_STEM, SPELL_CHECK };
	34	+ protected Type type = null;
	35	+
27	36	public FilterFactory(String lang){
	37	+ this(lang,Type.FULL);
	38	+ }
	39	+
	40	+ public FilterFactory(String lang, Type type){
28	41	this.lang = lang;
	42	+ this.type = type;
29	43	init();
30		~~- noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useCustomFilter,null,customFilter);~~
	44	+ noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useLangFilter,null,langFilter,additionalFilters);
31	45	}
32	46
33		~~- public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useCustomFilter, Class stemmer, Class customFilter) {~~
	47	+ public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useLangFilter, Class stemmer, Class langFilter, ArrayList<Class> additionalFilters) {
34	48	this.lang = lang;
35	49	this.snowballName = snowballName;
36	50	this.useStemmer = useStemmer;
37		~~- this.useCustomFilter = useCustomFilter;~~
	51	+ this.useLangFilter = useLangFilter;
38	52	this.stemmer = stemmer;
39		~~- this.customFilter = customFilter;~~
	53	+ this.langFilter = langFilter;
	54	+ this.additionalFilters = additionalFilters;
40	55	}
41	56
42	57	public FilterFactory getNoStemmerFilterFactory() {
—	—	@@ -49,51 +64,60 @@
50	65	if(lang == null)
51	66	lang = "en";
52	67
53		~~- // figure out stemmer~~
54		~~- useStemmer = true;~~
55		~~- if(lang.equals("en"))~~
56		~~- snowballName = "English";~~
	68	+ if(type == Type.FULL){
	69	+ useStemmer = true;
	70	+ // figure out stemmer
	71	+ if(lang.equals("en"))
	72	+ snowballName = "English";
57	73	//stemmer = PorterStemFilter.class; // 2x faster but less accurate
58		~~- else if(lang.equals("da"))~~
59		~~- snowballName = "Danish";~~
60		~~- else if(lang.equals("nl"))~~
61		~~- snowballName = "Dutch";~~
62		~~- else if(lang.equals("fi"))~~
63		~~- snowballName = "Finnish";~~
64		~~- else if(lang.equals("de"))~~
65		~~- snowballName = "German";~~
66		~~- else if(lang.equals("it"))~~
67		~~- snowballName = "Italian";~~
68		~~- else if(lang.equals("no"))~~
69		~~- snowballName = "Norwegian";~~
70		~~- else if(lang.equals("pt"))~~
71		~~- snowballName = "Portuguese";~~
72		~~- else if(lang.equals("ru"))~~
73		~~- snowballName = "Russian";~~
74		~~- else if(lang.equals("es"))~~
75		~~- snowballName = "Spanish";~~
76		~~- else if(lang.equals("sv"))~~
77		~~- snowballName = "Swedish";~~
78		~~- else if(lang.equals("eo"))~~
79		~~- stemmer = EsperantoStemFilter.class;~~
80		~~- else~~
	74	+ else if(lang.equals("da"))
	75	+ snowballName = "Danish";
	76	+ else if(lang.equals("nl"))
	77	+ snowballName = "Dutch";
	78	+ else if(lang.equals("fi"))
	79	+ snowballName = "Finnish";
	80	+ else if(lang.equals("de"))
	81	+ snowballName = "German";
	82	+ else if(lang.equals("it"))
	83	+ snowballName = "Italian";
	84	+ else if(lang.equals("no"))
	85	+ snowballName = "Norwegian";
	86	+ else if(lang.equals("pt"))
	87	+ snowballName = "Portuguese";
	88	+ else if(lang.equals("ru"))
	89	+ snowballName = "Russian";
	90	+ else if(lang.equals("es"))
	91	+ snowballName = "Spanish";
	92	+ else if(lang.equals("sv"))
	93	+ snowballName = "Swedish";
	94	+ else if(lang.equals("eo"))
	95	+ stemmer = EsperantoStemFilter.class;
	96	+ else
	97	+ useStemmer = false;
	98	+ } else
81	99	useStemmer = false;
82	100
83		~~- // figure out custom filter~~
84		~~- useCustomFilter = true;~~
	101	+ // figure out language-dependent filters
	102	+ useLangFilter = true;
85	103	if(lang.equals("th"))
86		~~- customFilter = ThaiWordFilter.class;~~
	104	+ langFilter = ThaiWordFilter.class;
87	105	else if(lang.equals("sr"))
88		~~- customFilter = SerbianFilter.class;~~
	106	+ langFilter = SerbianFilter.class;
89	107	else if(lang.equals("vi"))
90		~~- customFilter = VietnameseFilter.class;~~
	108	+ langFilter = VietnameseFilter.class;
91	109	else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") ||
92	110	lang.equals("zh-classical") \|\| lang.equals("zh-yue")){
93		~~- customFilter = CJKFilter.class;~~
	111	+ langFilter = CJKFilter.class;
94	112	usingCJK = true;
95	113	} else
96		~~- useCustomFilter = false;~~
	114	+ useLangFilter = false;
97	115
	116	+ // additional filters
	117	+ if(type == Type.SPELL_CHECK){
	118	+ additionalFilters = new ArrayList<Class>();
	119	+ additionalFilters.add(PhraseFilter.class);
	120	+ }
	121	+
98	122	}
99	123
100	124	public TokenFilter makeStemmer(TokenStream in){
—	—	@@ -113,11 +137,11 @@
114	138	}
115	139
116	140	public TokenFilter makeCustomFilter(TokenStream in){
117		~~- if(!useCustomFilter)~~
	141	+ if(!useLangFilter)
118	142	return null;
119		~~- else if(customFilter != null){~~
	143	+ else if(langFilter != null){
120	144	try {
121		~~- return (TokenFilter) customFilter.getConstructor(TokenStream.class).newInstance(in);~~
	145	+ return (TokenFilter) langFilter.getConstructor(TokenStream.class).newInstance(in);
122	146	} catch (Exception e) {
123	147	e.printStackTrace();
124	148	}
—	—	@@ -126,6 +150,26 @@
127	151	return null;
128	152	}
129	153
	154	+ public TokenStream makeAdditionalFilterChain(TokenStream in){
	155	+ if(additionalFilters == null)
	156	+ return in;
	157	+ try {
	158	+ TokenStream chain = in;
	159	+ // nest additional filters, apply them as added to the list
	160	+ for(Class filter : additionalFilters){
	161	+ chain = (TokenStream) filter.getConstructor(TokenStream.class).newInstance(chain);
	162	+ }
	163	+ return chain;
	164	+ } catch (Exception e) {
	165	+ e.printStackTrace();
	166	+ return null;
	167	+ }
	168	+ }
	169	+
	170	+ public boolean hasAdditionalFilters(){
	171	+ return additionalFilters != null;
	172	+ }
	173	+
130	174	public boolean hasStemmer(){
131	175	return useStemmer;
132	176	}
—	—	@@ -135,12 +179,31 @@
136	180	}
137	181
138	182	public boolean hasCustomFilter(){
139		~~- return useCustomFilter;~~
	183	+ return useLangFilter;
140	184	}
141	185
142	186	public String getLanguage(){
143	187	return lang;
144	188	}
145	189
	190	+ public void setStopWords(Set<String> stopWords){
	191	+ for(Class filter : additionalFilters){
	192	+ for(Method m : filter.getMethods()){
	193	+ if(m.getName().equals("setStopWords")){
	194	+ try {
	195	+ m.invoke(filter,new Object[] {stopWords});
	196	+ } catch (IllegalArgumentException e) {
	197	+ e.printStackTrace();
	198	+ } catch (IllegalAccessException e) {
	199	+ e.printStackTrace();
	200	+ } catch (InvocationTargetException e) {
	201	+ e.printStackTrace();
	202	+ }
	203	+ }
	204	+ }
	205	+
	206	+ }
	207	+ }
146	208
	209	+
147	210	}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java
—	—	@@ -0,0 +1,76 @@
	2	+package org.wikimedia.lsearch.analyzers;
	3	+
	4	+import java.io.IOException;
	5	+import java.util.HashSet;
	6	+import java.util.Set;
	7	+
	8	+import org.apache.lucene.analysis.Token;
	9	+import org.apache.lucene.analysis.TokenFilter;
	10	+import org.apache.lucene.analysis.TokenStream;
	11	+import org.apache.lucene.index.IndexReader;
	12	+import org.apache.lucene.index.Term;
	13	+import org.wikimedia.lsearch.config.IndexId;
	14	+import org.wikimedia.lsearch.config.IndexRegistry;
	15	+
	16	+/**
	17	+ * Filter that outputs phrases and words mixed, e.g.
	18	+ * novi sad is a city -> novi, sad, novi_sad, is, sad_is, a, is_a, city, a_city
	19	+ *
	20	+ * @author rainman
	21	+ *
	22	+ */
	23	+public class PhraseFilter extends TokenFilter {
	24	+ protected Set<String> stopWords = null;
	25	+
	26	+ public PhraseFilter(TokenStream input) {
	27	+ super(input);
	28	+ }
	29	+
	30	+ protected Token phrase1 = null, phrase2 = null;
	31	+ protected boolean phraseReady = false;
	32	+
	33	+ protected boolean forPhrase(Token t){
	34	+ if(stopWords!=null && stopWords.contains(t.termText()))
	35	+ return false;
	36	+ else
	37	+ return true;
	38	+ }
	39	+
	40	+ @Override
	41	+ public Token next() throws IOException {
	42	+ if(phraseReady){
	43	+ phraseReady = false;
	44	+ return new Token(phrase1.termText()+"_"+phrase2.termText(),phrase1.startOffset(),phrase2.endOffset());
	45	+ }
	46	+ Token t = input.next();
	47	+ if(t == null)
	48	+ return null; // EOS
	49	+ if(!forPhrase(t))
	50	+ return t; // stop word, return as word only
	51	+
	52	+ if(phrase1 == null){
	53	+ phrase1 = t;
	54	+ return t;
	55	+ }
	56	+ if(phrase2 == null){
	57	+ phrase2 = t;
	58	+ phraseReady = true;
	59	+ return t;
	60	+ }
	61	+
	62	+ phrase1 = phrase2;
	63	+ phrase2 = t;
	64	+ phraseReady = true;
	65	+
	66	+ return t; // prepared phrase, return word, phrase in next call
	67	+ }
	68	+
	69	+ public Set<String> getStopWords() {
	70	+ return stopWords;
	71	+ }
	72	+
	73	+ public void setStopWords(Set<String> stopWords) {
	74	+ this.stopWords = stopWords;
	75	+ }
	76	+
	77	+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java
—	—	@@ -61,8 +61,12 @@
62	62	if(filters.hasCustomFilter())
63	63	tokens = applyCustomFilter(tokens);
64	64
65		~~- return new AliasFilter(filters,~~
66		~~- new ArrayTokens(tokens), new ArrayTokens(tokens));~~
	65	+ TokenStream out = new AliasFilter(filters,
	66	+ new ArrayTokens(tokens), new ArrayTokens(tokens));
	67	+ if(filters.hasAdditionalFilters())
	68	+ return filters.makeAdditionalFilterChain(out);
	69	+ else
	70	+ return out;
67	71	}
68	72
69	73	/** Filter the tokens via the custom filter. For instance, to delete
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
—	—	@@ -42,23 +42,28 @@
43	43	/** default is ignore case (upper/lower), use exact_case for wiktionaries, etc */
44	44	public static enum Case { IGNORE_CASE, EXACT_CASE };
45	45	/** use stemmer if available, or force no stemming */
46		~~- public static enum Stemmer { USE_STEMMER, NO_STEMMER };~~
	46	+ public static enum Stemmer { USE_STEMMER, NO_STEMMER };
	47	+ /** additional options */
	48	+ public static enum Options { NONE, SPELL_CHECK };
47	49
48	50	/** Construct case-insensitive field builder with stemming */
49	51	public FieldBuilder(String lang){
50		~~- this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER);~~
	52	+ this(lang,Case.IGNORE_CASE,Stemmer.USE_STEMMER,Options.NONE);
51	53	}
52	54
53	55	public FieldBuilder(String lang, Case useCase){
54		~~- this(lang,useCase,Stemmer.USE_STEMMER);~~
	56	+ this(lang,useCase,Stemmer.USE_STEMMER,Options.NONE);
55	57	}
56	58
57		~~- public FieldBuilder(String lang, Case useCase, Stemmer useStemmer){~~
	59	+ public FieldBuilder(String lang, Case useCase, Stemmer useStemmer, Options options){
	60	+ FilterFactory.Type type = FilterFactory.Type.FULL;
	61	+ if(options == Options.SPELL_CHECK)
	62	+ type = FilterFactory.Type.SPELL_CHECK;
58	63	// additional exact case factory
59	64	if(useCase == Case.EXACT_CASE){
60		~~- builders = new BuilderSet[2];~~
	65	+ builders = new BuilderSet[2];
61	66	builders[1] = new BuilderSet(
62		~~- new FilterFactory(lang).getNoStemmerFilterFactory(),~~
	67	+ new FilterFactory(lang,type).getNoStemmerFilterFactory(),
63	68	new FieldNameFactory(FieldNameFactory.EXACT_CASE));
64	69	} else
65	70	builders = new BuilderSet[1];
—	—	@@ -66,11 +71,11 @@
67	72	// default factory, lowercase all data
68	73	if(useStemmer == Stemmer.USE_STEMMER){
69	74	builders[0] = new BuilderSet(
70		~~- new FilterFactory(lang),~~
	75	+ new FilterFactory(lang,type),
71	76	new FieldNameFactory());
72	77	} else{
73	78	builders[0] = new BuilderSet(
74		~~- new FilterFactory(lang).getNoStemmerFilterFactory(),~~
	79	+ new FilterFactory(lang,type).getNoStemmerFilterFactory(),
75	80	new FieldNameFactory());
76	81	}
77	82
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java
—	—	@@ -1,5 +1,7 @@
2	2	package org.wikimedia.lsearch.util;
3	3
	4	+import org.wikimedia.lsearch.test.MathFuncTest;
	5	+
4	6	public class MathFunc {
5	7
6	8	/** Calculate average value starting from start to end (end excluded) */
—	—	@@ -26,27 +28,34 @@
27	29	// av[i] = avg(val,part[i],part[i+1]);
28	30	// error
29	31	double err = calcErr(part,val,num);
	32	+ double err2 = calcErr2(part,val,num);
30	33	// values at next iteration
31	34	int[] newpart = new int[num+1];
32	35	//double[] newav = new double[num];
33		~~- double newerr = 0;~~
	36	+ double newerr = 0, newerr2 = 0;
34	37
35	38	while(true){
36	39	for(int i=0;i<num-1;i++){
37	40	merge(i,part,newpart,val,num);
38	41	newerr = calcErr(newpart,val,num);
39		~~- if(newerr < err){~~
	42	+ newerr2 = calcErr2(newpart,val,num);
	43	+ if(newerr < err \|\| (newerr == err && newerr2 < err2)){
40	44	copy(newpart,part);
41	45	err = newerr;
	46	+ err2 = newerr2;
	47	+ //MathFuncTest.print(newpart,val);
42	48	continue;
43	49	}
44	50	}
45	51	// try extending last
46	52	extend(part,newpart,val,num);
47	53	newerr = calcErr(newpart,val,num);
48		~~- if(newerr < err){~~
	54	+ newerr2 = calcErr2(newpart,val,num);
	55	+ if(newerr < err \|\| (newerr == err && newerr2 < err2)){
49	56	copy(newpart,part);
50	57	err = newerr;
	58	+ err2 = newerr2;
	59	+ //MathFuncTest.print(newpart,val);
51	60	continue;
52	61	}
53	62	break;
—	—	@@ -94,10 +103,24 @@
95	104	double err = 0;
96	105	for(int i=0;i<num;i++){
97	106	// max - min value
98		~~- double e = val[part[i]]-val[part[i+1]-1];~~
	107	+ double v2 = val[part[i]];
	108	+ double v1 = val[part[i+1]-1];
	109	+ double e = v2 - v1;
99	110	if( e > err )
100	111	err = e;
101	112	}
102	113	return err;
103	114	}
	115	+
	116	+ private static double calcErr2(int[] part, double[] val, int num) {
	117	+ double err = 0;
	118	+ for(int i=0;i<num;i++){
	119	+ // max - min value
	120	+ double v2 = val[part[i]];
	121	+ double v1 = val[part[i+1]-1];
	122	+ double e = v2 - v1;
	123	+ err += e*(part[i+1]-1-part[i]);
	124	+ }
	125	+ return err;
	126	+ }
104	127	}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java
—	—	@@ -140,6 +140,10 @@
141	141	if(table[ich]==null)
142	142	continue;
143	143	Buffer buffer = new Buffer(buf,0);
	144	+ if(ich == 0xD4A){
	145	+ int b = 0;
	146	+ b++;
	147	+ }
144	148	recursiveDecompose(buffer,table,letters,(char)ich);
145	149	if(buffer.len != 0){
146	150	decomposition[ich]= new char[buffer.len];
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/HighFreqTerms.java
—	—	@@ -0,0 +1,83 @@
	2	+package org.wikimedia.lsearch.util;
	3	+
	4	+/**
	5	+ * Copyright 2004 The Apache Software Foundation
	6	+ *
	7	+ * Licensed under the Apache License, Version 2.0 (the "License");
	8	+ * you may not use this file except in compliance with the License.
	9	+ * You may obtain a copy of the License at
	10	+ *
	11	+ * http://www.apache.org/licenses/LICENSE-2.0
	12	+ *
	13	+ * Unless required by applicable law or agreed to in writing, software
	14	+ * distributed under the License is distributed on an "AS IS" BASIS,
	15	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	16	+ * See the License for the specific language governing permissions and
	17	+ * limitations under the License.
	18	+ */
	19	+
	20	+import java.io.IOException;
	21	+import java.util.ArrayList;
	22	+import java.util.Collection;
	23	+import java.util.LinkedList;
	24	+
	25	+import org.apache.lucene.index.IndexReader;
	26	+import org.apache.lucene.index.Term;
	27	+import org.apache.lucene.index.TermEnum;
	28	+import org.apache.lucene.util.PriorityQueue;
	29	+
	30	+/**
	31	+ * <code>HighFreqTerms</code> class extracts terms and their frequencies out
	32	+ * of an existing Lucene index.
	33	+ *
	34	+ * @version $Id: HighFreqTerms.java 376393 2006-02-09 19:17:14Z otis $
	35	+ */
	36	+public class HighFreqTerms {
	37	+
	38	+ public static Collection<String> getHighFreqTerms(IndexReader reader, String field, int numTerms) throws IOException {
	39	+ TermInfoQueue tiq = new TermInfoQueue(numTerms);
	40	+ TermEnum terms = reader.terms();
	41	+ LinkedList<String> ret = new LinkedList<String>();
	42	+
	43	+ if (field != null) {
	44	+ // collect terms from field into priority queue
	45	+ while (terms.next()) {
	46	+ if (terms.term().field().equals(field)) {
	47	+ tiq.insert(new TermInfo(terms.term(), terms.docFreq()));
	48	+ }
	49	+ }
	50	+ } else {
	51	+ // collect all terms
	52	+ while (terms.next()) {
	53	+ tiq.insert(new TermInfo(terms.term(), terms.docFreq()));
	54	+ }
	55	+ }
	56	+
	57	+ // get higest ranked
	58	+ while (tiq.size() != 0) {
	59	+ ret.addFirst(((TermInfo) tiq.pop()).term.text());
	60	+ }
	61	+
	62	+ return ret;
	63	+ }
	64	+}
	65	+
	66	+final class TermInfo {
	67	+ TermInfo(Term t, int df) {
	68	+ term = t;
	69	+ docFreq = df;
	70	+ }
	71	+ int docFreq;
	72	+ Term term;
	73	+}
	74	+
	75	+final class TermInfoQueue extends PriorityQueue {
	76	+ TermInfoQueue(int size) {
	77	+ initialize(size);
	78	+ }
	79	+ protected final boolean lessThan(Object a, Object b) {
	80	+ TermInfo termInfoA = (TermInfo) a;
	81	+ TermInfo termInfoB = (TermInfo) b;
	82	+ return termInfoA.docFreq < termInfoB.docFreq;
	83	+ }
	84	+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java
—	—	@@ -39,7 +39,7 @@
40	40	int bad=0;
41	41	long start = System.currentTimeMillis();
42	42	for(String[] m : DATA){
43		~~- ArrayList<SuggestResult> res = sc.suggestWords(m[0],5);~~
	43	+ ArrayList<SuggestResult> res = sc.suggestWordsFromTitle(m[0],new NamespaceFilter(0),5);
44	44	if(res.size() > 0){
45	45	SuggestResult r = res.get(0);
46	46	if(r.getWord().equals(m[1]))
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
—	—	@@ -385,6 +385,15 @@
386	386	q = parser.parseFourPass("\"うろパン\"",NamespacePolicy.IGNORE,false);
387	387	assertEquals("contents:\"うろろハハン\" title:\"うろろハハン\"^2.0 (alttitle1:\"うろろハハン\"^6.0 alttitle2:\"うろろハハン\"^6.0 alttitle3:\"うろろハハン\"^6.0)",q.toString());
388	388
	389	+
	390	+ // Malayalam
	391	+ analyzer = Analyzers.getSearcherAnalyzer("ml");
	392	+ bs = new FieldBuilder("ml").getBuilder();
	393	+ parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
	394	+ q = parser.parseFourPass("കൊറിയ ",NamespacePolicy.IGNORE,false);
	395	+ assertEquals("contents:കറയ title:കറയ^2.0 (alttitle1:കറയ^6.0 alttitle2:കറയ^6.0 alttitle3:കറയ^6.0)",q.toString());
	396	+
	397	+
389	398	// Test field extraction
390	399	HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
391	400	assertEquals(3,fs.size());
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
—	—	@@ -5,13 +5,19 @@
6	6	import java.util.Collections;
7	7	import java.util.Comparator;
8	8	import java.util.HashMap;
	9	+import java.util.HashSet;
9	10	import java.util.LinkedList;
	11	+import java.util.Set;
	12	+import java.util.WeakHashMap;
10	13	import java.util.Map.Entry;
11	14
12	15	import org.apache.log4j.Logger;
	16	+import org.apache.lucene.analysis.Analyzer;
13	17	import org.apache.lucene.analysis.Token;
	18	+import org.apache.lucene.analysis.TokenStream;
14	19	import org.apache.lucene.document.Document;
15	20	import org.apache.lucene.index.Term;
	21	+import org.apache.lucene.index.TermDocs;
16	22	import org.apache.lucene.search.BooleanClause;
17	23	import org.apache.lucene.search.BooleanQuery;
18	24	import org.apache.lucene.search.Hits;
—	—	@@ -20,7 +26,10 @@
21	27	import org.apache.lucene.search.ScoreDoc;
22	28	import org.apache.lucene.search.TermQuery;
23	29	import org.apache.lucene.search.TopDocs;
	30	+import org.wikimedia.lsearch.analyzers.Analyzers;
24	31	import org.wikimedia.lsearch.analyzers.WikiQueryParser;
	32	+import org.wikimedia.lsearch.beans.ResultSet;
	33	+import org.wikimedia.lsearch.beans.SearchResults;
25	34	import org.wikimedia.lsearch.config.GlobalConfiguration;
26	35	import org.wikimedia.lsearch.config.IndexId;
27	36	import org.wikimedia.lsearch.search.NamespaceFilter;
—	—	@@ -38,6 +47,8 @@
39	48	protected IndexSearcher titles;
40	49	protected int minHitsWords;
41	50	protected int minHitsTitles;
	51	+ protected static WeakHashMap<IndexSearcher,Set<String>> stopWordsIndexes = new WeakHashMap<IndexSearcher,Set<String>>();
	52	+ protected Set<String> stopWords;
42	53
43	54	/** Distance and metaphone metrics */
44	55	static class Metric {
—	—	@@ -70,7 +81,7 @@
71	82	}
72	83
73	84	/** Number of results to fetch */
74		~~- public static final int POOL = 300;~~
	85	+ public static final int POOL = 150;
75	86
76	87	/** Lower limit to hit rate for joining */
77	88	public static final int JOIN_FREQ = 1;
—	—	@@ -83,6 +94,21 @@
84	95	this.titles = cache.getLocalSearcher(iid.getSpellTitles());
85	96	this.minHitsWords = global.getIntDBParam(iid.getDBname(),"spell_words","minHits",20);
86	97	this.minHitsTitles = global.getIntDBParam(iid.getDBname(),"spell_titles","minHits",20);
	98	+
	99	+ synchronized(stopWordsIndexes){
	100	+ if(!stopWordsIndexes.containsKey(titles)){
	101	+ Set<String> s = Collections.synchronizedSet(new HashSet<String>());
	102	+ stopWordsIndexes.put(titles,s);
	103	+ TermDocs d = titles.getIndexReader().termDocs(new Term("metadata_key","stopWords"));
	104	+ if(d.next()){
	105	+ String val = titles.doc(d.doc()).get("metadata_value");
	106	+ for(String sw : val.split(" ")){
	107	+ s.add(sw);
	108	+ }
	109	+ }
	110	+ }
	111	+ this.stopWords = stopWordsIndexes.get(titles);
	112	+ }
87	113	}
88	114
89	115	static class Change {
—	—	@@ -111,15 +137,46 @@
112	138	*
113	139	* @return suggested query, or null if no suggestions
114	140	*/
115		~~- public String suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, int numHits){~~
	141	+ @SuppressWarnings("unchecked")
	142	+ public SuggestQuery suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, SearchResults res){
116	143	ArrayList<Token> tokens = parser.tokenizeBareText(searchterm);
	144	+ int numHits = res.getNumHits();
	145	+
	146	+ if(numHits >= minHitsTitles)
	147	+ return null;
	148	+
	149	+ // collect words in titles, these shouldn't be spell-checked
	150	+ HashSet<String> correctWords = new HashSet<String>();
	151	+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false);
	152	+ try {
	153	+ for(ResultSet r : res.getResults()){
	154	+ Token t = null;
	155	+ TokenStream ts = analyzer.tokenStream("title",r.title);
	156	+ while( (t = ts.next()) != null ){
	157	+ correctWords.add(t.termText());
	158	+ }
	159	+ }
	160	+ } catch (IOException e) {
	161	+ log.error("I/O error trying to get list of correct words : "+e.getMessage());
	162	+ e.printStackTrace();
	163	+ }
117	164
118	165	// always spell-check phrases
119	166	int minFreq = (numHits < minHitsTitles)? 0 : numHits;
120	167	ArrayList<Change> suggestions = new ArrayList<Change>();
121		~~- Token last = null;~~
	168	+
	169	+ // add correct words
122	170	for(int i=0;i<tokens.size();i++){
123	171	Token t = tokens.get(i);
	172	+ if(correctWords.contains(t.termText())){
	173	+ Change c = new Change(0,1,Change.Type.TITLE_WORD);
	174	+ c.preserves.put(i,t.termText());
	175	+ suggestions.add(c);
	176	+ }
	177	+ }
	178	+
	179	+ for(int i=0;i<tokens.size();i++){
	180	+ Token t = tokens.get(i);
124	181	String w = t.termText();
125	182	if(!"word".equals(t.type()) && !"phrase".equals(t.type()))
126	183	continue; // ignore aliases and such
—	—	@@ -143,64 +200,46 @@
144	201	sc.substitutes.put(i,split.word.replace("_"," "));
145	202	suggestions.add(sc);
146	203	}
147		-
148	204	// get suggestions for pairs of words
149		~~- if(last != null && t.type().equals(last.type())){~~
150		~~- String word1 = last.termText();~~
151		~~- String word2 = t.termText();~~
152		~~- // phrase~~
153		~~- ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq);~~
154		~~- if(r.size() > 0){~~
155		~~- SuggestResult res = r.get(0);~~
156		~~- String[] ph = res.word.split("_");~~
157		~~- if(ph.length == 2){~~
158		~~- // figure out which words need to be changed~~
159		~~- Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE);~~
160		~~- if(!ph[0].equals(word1))~~
161		~~- sc.substitutes.put(i-1,ph[0]);~~
162		~~- else~~
163		~~- sc.preserves.put(i-1,ph[0]);~~
164		~~- if(!ph[1].equals(word2))~~
165		~~- sc.substitutes.put(i,ph[1]);~~
166		~~- else~~
167		~~- sc.preserves.put(i,ph[1]);~~
168		~~- suggestions.add(sc);~~
169		~~- } else~~
170		~~- log.error("Unexpected phrase in suggest result "+res);~~
	205	+ for(int j=i+1;j<tokens.size();j++){
	206	+ if(!correctWords.contains(tokens.get(i)) && !correctWords.contains(tokens.get(j))){
	207	+ boolean succ = addPhraseSuggestion(tokens,i,j,suggestions,nsf,minFreq);
	208	+ if(succ)
	209	+ break;
171	210	}
172		~~- // join~~
173		~~- SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq);~~
174		~~- if(join != null){~~
175		~~- Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);~~
176		~~- sc.substitutes.put(i-1,"");~~
177		~~- sc.substitutes.put(i,join.word);~~
178		~~- suggestions.add(sc);~~
179		~~- }~~
180	211	}
181		~~- last = t;~~
	212	+
182	213	}
	214	+ // indexes of tokens to be preserved in individual word check
	215	+ HashSet<Integer> preserveTokens = new HashSet<Integer>();
183	216	if(suggestions.size() > 0){
184	217	// found some suggestions
185		~~- ArrayList<Entry<Integer,String>> proposedChanges = calculateChanges(suggestions,searchterm.length()/2);~~
	218	+ Object[] ret = calculateChanges(suggestions,searchterm.length()/2);
	219	+ ArrayList<Entry<Integer,String>> proposedChanges = (ArrayList<Entry<Integer, String>>) ret[0];
	220	+ ArrayList<Entry<Integer,String>> preservedWords = (ArrayList<Entry<Integer, String>>) ret[1];
	221	+ for(Entry<Integer,String> e : preservedWords)
	222	+ preserveTokens.add(e.getKey());
186	223	// substitute
187	224	if(proposedChanges.size() > 0){
188	225	for(Entry<Integer,String> e : proposedChanges){
189	226	Token t = tokens.get(e.getKey());
190	227	searchterm = markSuggestion(searchterm,t,e.getValue());
191	228	}
192		~~- return tidy(searchterm);~~
	229	+ return new SuggestQuery(tidy(searchterm));
193	230	}
194	231	}
195	232
196	233	// spell-check individual words
197		~~- if(numHits < minHitsWords){~~
	234	+ if(numHits < minHitsWords && tokens.size() != 1){
198	235	LinkedList<Change> changes = new LinkedList<Change>();
199	236	for(int i=0;i<tokens.size();i++){
200	237	Token t = tokens.get(i);
201	238	String w = t.termText();
202	239	if(w.length() < 2)
203	240	continue;
204		~~- ArrayList<SuggestResult> sug = suggestWords(w,1);~~
	241	+ if(correctWords.contains(w) \|\| preserveTokens.contains(i))
	242	+ continue;
	243	+ ArrayList<SuggestResult> sug = suggestWordsFromTitle(w,nsf,1);
205	244	if(sug.size() > 0){
206	245	SuggestResult r = sug.get(0);
207	246	if(r.word.equals(w))
—	—	@@ -218,13 +257,55 @@
219	258	searchterm = markSuggestion(searchterm,t,e.getValue());
220	259	}
221	260	}
222		~~- return searchterm;~~
	261	+ return new SuggestQuery(tidy(searchterm),true);
223	262	}
224	263	}
225	264
226	265	return null;
227	266	}
228	267
	268	+ protected boolean addPhraseSuggestion(ArrayList<Token> tokens, int i1, int i2, ArrayList<Change> suggestions, NamespaceFilter nsf, int minFreq) {
	269	+ Token t1 = tokens.get(i1);
	270	+ Token t2 = tokens.get(i2);
	271	+ if(t2.type().equals(t1.type())){
	272	+ String word1 = t1.termText();
	273	+ String word2 = t2.termText();
	274	+ if(stopWords.contains(word1) \|\| stopWords.contains(word2))
	275	+ return false;
	276	+ log.info("spell-check phrase \""+word1+" "+word2+"\"");
	277	+ // phrase
	278	+ ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq);
	279	+ if(r.size() > 0){
	280	+ SuggestResult res = r.get(0);
	281	+ String[] ph = res.word.split("_");
	282	+ if(ph.length == 2){
	283	+ // figure out which words need to be changed
	284	+ Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE);
	285	+ if(!ph[0].equals(word1))
	286	+ sc.substitutes.put(i1,ph[0]);
	287	+ else
	288	+ sc.preserves.put(i1,ph[0]);
	289	+ if(!ph[1].equals(word2))
	290	+ sc.substitutes.put(i2,ph[1]);
	291	+ else
	292	+ sc.preserves.put(i2,ph[1]);
	293	+ suggestions.add(sc);
	294	+ } else
	295	+ log.error("Unexpected phrase in suggest result "+res);
	296	+ }
	297	+ // join
	298	+ SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq);
	299	+ if(join != null){
	300	+ Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);
	301	+ sc.substitutes.put(i1,"");
	302	+ sc.substitutes.put(i2,join.word);
	303	+ suggestions.add(sc);
	304	+ }
	305	+ return true;
	306	+ }
	307	+ return false;
	308	+ }
	309	+
229	310	protected String markSuggestion(String searchterm, Token t, String newWord){
230	311	return searchterm.substring(0,t.startOffset())
231	312	+ "<i>" + newWord + "</i>"
—	—	@@ -233,7 +314,7 @@
234	315
235	316	/** tidy the query, convert double spaces into single spaces, and such... */
236	317	protected String tidy(String searchterm){
237		~~- return searchterm.replaceAll(" +"," ");~~
	318	+ return searchterm.replaceAll("<i></i>","").replaceAll(" +"," ").replaceAll(";","");
238	319	}
239	320
240	321	/**
—	—	@@ -242,7 +323,7 @@
243	324	*
244	325	* @return set of token_number -> new string.
245	326	*/
246		~~- protected ArrayList<Entry<Integer,String>> calculateChanges(ArrayList<Change> changes, int maxDist){~~
	327	+ protected Object[] calculateChanges(ArrayList<Change> changes, int maxDist){
247	328	// sort suggested changes by relevance
248	329	Collections.sort(changes,new Comparator<Change>() {
249	330	public int compare(Change o1, Change o2){
—	—	@@ -288,7 +369,9 @@
289	370	return o2.getKey() - o1.getKey();
290	371	}
291	372	});
292		~~- return proposedChanges;~~
	373	+ ArrayList<Entry<Integer,String>> preservedWords = new ArrayList<Entry<Integer,String>>();
	374	+ preservedWords.addAll(preserve.entrySet());
	375	+ return new Object[] {proposedChanges, preservedWords};
293	376	}
294	377
295	378	/** Suggest some words from the words index */
—	—	@@ -387,7 +470,7 @@
388	471	protected boolean acceptWord(SuggestResult r, Metric m){
389	472	// check metaphones: don't add if the pronunciation is something completely unrelated
390	473	if((r.distMetaphone < m.meta1.length() \|\| r.distMetaphone2 < m.meta2.length()) && (r.distMetaphone<=3 \|\| r.distMetaphone2<=3)
391		~~- && (r.dist <= m.word.length()/2 \|\| r.dist <= r.word.length()/2))~~
	474	+ && (r.dist <= m.word.length()/2 \|\| r.dist <= r.word.length()/2) && Math.abs(m.word.length()-r.word.length()) <= 3)
392	475	return true;
393	476	else
394	477	return false;
—	—	@@ -435,7 +518,7 @@
436	519	if(hits.length() > 0){
437	520	int pfreq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf);
438	521	if(pfreq >= freq && pfreq > minFreq)
439		~~- res.add(new SuggestResult(phrase,pfreq,1));~~
	522	+ res.add(new SuggestResult(phrase,pfreq,2));
440	523	}
441	524	}
442	525	if(res.size() > 0){
—	—	@@ -473,7 +556,7 @@
474	557	Metric m2 = new Metric(word2);
475	558	Metric metric = new Metric(phrase);
476	559	try {
477		~~- TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),200);~~
	560	+ TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),POOL/2);
478	561	ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
479	562	int minfreq = (minFreq == 0)? -1 : minFreq;
480	563	// fetch results
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
—	—	@@ -12,6 +12,7 @@
13	13	import org.wikimedia.lsearch.analyzers.Analyzers;
14	14	import org.wikimedia.lsearch.analyzers.FieldBuilder;
15	15	import org.wikimedia.lsearch.analyzers.WikiQueryParser;
	16	+import org.wikimedia.lsearch.beans.SearchResults;
16	17	import org.wikimedia.lsearch.config.Configuration;
17	18	import org.wikimedia.lsearch.config.GlobalConfiguration;
18	19	import org.wikimedia.lsearch.config.IndexId;
—	—	@@ -25,7 +26,7 @@
26	27	Configuration.open();
27	28	GlobalConfiguration global = GlobalConfiguration.getInstance();
28	29	boolean suggestOnly = false;
29		~~- String dbname = "wikilucene";~~
	30	+ String dbname = "enwiki";
30	31	for(int i=0;i<args.length;i++){
31	32	if(args[i].equals("-s"))
32	33	suggestOnly = true;
—	—	@@ -78,7 +79,7 @@
79	80	last = text;
80	81	}
81	82	}
82		~~- System.out.println("#suggest: "+sc.suggest(inputtext,parser,new NamespaceFilter(ns),0));~~
	83	+ System.out.println("#suggest: "+sc.suggest(inputtext,parser,new NamespaceFilter(ns),new SearchResults()));
83	84	System.out.println("(finished in "+(System.currentTimeMillis()-start)+" ms)");
84	85	}
85	86
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java
—	—	@@ -35,7 +35,7 @@
36	36
37	37	public CleanIndexWriter(IndexId iid) throws IOException{
38	38	this.iid = iid;
39		~~- this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER);~~
	39	+ this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
40	40	this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
41	41	String pathMain = iid.getSpellWords().getTempPath();
42	42	//String pathAll = iid.getSpellTitles().getTempPath();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestQuery.java
—	—	@@ -0,0 +1,36 @@
	2	+package org.wikimedia.lsearch.spell;
	3	+
	/** Bean holding the suggested query together with a flag telling if it needs further checking */
	public class SuggestQuery {
		protected String searchterm;
		protected boolean needsCheck;

		/** Make a suggestion that needs no further checking */
		public SuggestQuery(String searchterm) {
			this(searchterm,false);
		}

		public SuggestQuery(String searchterm, boolean needsCheck) {
			this.searchterm = searchterm;
			this.needsCheck = needsCheck;
		}

		/** Whether the suggestion needs further checking (in case of individual word spell-check) */
		public boolean needsCheck() {
			return needsCheck;
		}

		public void setNeedsCheck(boolean needsCheck) {
			this.needsCheck = needsCheck;
		}

		/** The suggested search term */
		public String getSearchterm() {
			return searchterm;
		}

		public void setSearchterm(String searchterm) {
			this.searchterm = searchterm;
		}

		/** The search term, followed by " [needs check]" when the flag is set */
		@Override
		public String toString() {
			if(needsCheck)
				return searchterm+" [needs check]";
			return searchterm;
		}
	}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java
—	—	@@ -107,7 +107,7 @@
108	108	}
109	109	}
110	110	// make words index
111		~~- log.info("Making words index");~~
	111	+ /*log.info("Making words index");
112	112	try {
113	113	LuceneDictionary dict = new LuceneDictionary(IndexReader.open(words.getTempPath()),"contents");
114	114	WordsIndexer writer = new WordsIndexer(words.getImportPath(),(dbname.equals("wikilucene")? 3 : 50));
—	—	@@ -121,18 +121,18 @@
122	122	log.fatal("Cannot open clean dictionary for "+words+" : "+e.getMessage());
123	123	e.printStackTrace();
124	124	return;
125		~~- }~~
	125	+ }*/
126	126
127	127	log.info("Making suggest title index");
128	128	// make phrase index
129	129
130	130	TitleIndexer tInx = new TitleIndexer(titles);
131		~~- tInx.createFromSnapshot();~~
	131	+ tInx.createFromTempIndex();
132	132
133	133	long end = System.currentTimeMillis();
134	134
135	135	// make snapshots
136		~~- IndexThread.makeIndexSnapshot(words,words.getImportPath());~~
	136	+ //IndexThread.makeIndexSnapshot(words,words.getImportPath());
137	137	IndexThread.makeIndexSnapshot(titles,titles.getImportPath());
138	138
139	139	System.out.println("Finished making suggest index in "+formatTime(end-start));
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NgramIndexer.java
—	—	@@ -1,6 +1,7 @@
2	2	package org.wikimedia.lsearch.spell.api;
3	3
4	4	import java.io.IOException;
	5	+import java.util.Collection;
5	6
6	7	import org.apache.log4j.Logger;
7	8	import org.apache.lucene.analysis.Analyzer;
—	—	@@ -89,7 +90,7 @@
90	91	}
91	92
92	93	/** Return ngrams of specific size for text */
93		~~- public static String[] nGrams(String text, int size) {~~
	94	+ public static String[] nGramsRegular(String text, int size) {
94	95	int len = text.length();
95	96	String[] res = new String[len - size + 1];
96	97	for (int i = 0; i < len - size + 1; i++) {
—	—	@@ -98,11 +99,40 @@
99	100	return res;
100	101	}
101	102
	103	+ /** Reverse a string */
	104	+ protected static String reverse(String source){
	105	+ int len = source.length();
	106	+ StringBuilder dest = new StringBuilder(len);
	107	+
	108	+ for (int i = (len - 1); i >= 0; i--)
	109	+ dest.append(source.charAt(i));
	110	+ return dest.toString();
	111	+ }
	112	+
	113	+ /** Return ngrams of specific size for text, assuming circular string */
	114	+ public static String[] nGrams(String text, int size) {
	115	+ int len = text.length();
	116	+ String[] res = null;
	117	+ if(len <= 6 && size == 2){ // produce reversed 2-grams
	118	+ String[] rev = nGramsRegular(reverse(text),size);
	119	+ res = new String[len + rev.length];
	120	+ System.arraycopy(rev,0,res,len,rev.length);
	121	+ } else
	122	+ res = new String[len];
	123	+ for (int i = 0; i < len; i++) {
	124	+ if(i + size <= len)
	125	+ res[i] = text.substring(i, i + size);
	126	+ else // string is assumed to be circular
	127	+ res[i] = text.substring(i)+text.substring(0,(i+size)%len);
	128	+ }
	129	+ return res;
	130	+ }
	131	+
102	132	/** Get minimal ngram size for word. the minimal size should be at least 1/2 of word length */
103	133	public static int getMinNgram(String word){
104		~~- if(word.length() <= 7)~~
	134	+ if(word.length() <= 5)
105	135	return 1;
106		~~- else if(word.length() <= 14)~~
	136	+ else if(word.length() <= 7)
107	137	return 2;
108	138	else
109	139	return 3;
—	—	@@ -110,10 +140,12 @@
111	141
112	142	/** Maximal size of ngram block, at most the length of word */
113	143	public static int getMaxNgram(String word){
114		~~- if(word.length() <= 10)~~
	144	+ if(word.length() == 4)
115	145	return 2;
116		~~- else~~
	146	+ else if(word.length() <= 6)
117	147	return 3;
	148	+ else
	149	+ return 4;
118	150	}
119	151
120	152	/** Get ngram field name with no prefix */
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java
—	—	@@ -30,6 +30,7 @@
31	31	import org.wikimedia.lsearch.search.IndexSearcherMul;
32	32	import org.wikimedia.lsearch.search.WikiSearcher;
33	33	import org.wikimedia.lsearch.spell.api.Dictionary.Word;
	34	+import org.wikimedia.lsearch.util.HighFreqTerms;
34	35
35	36	/**
36	37	* Index words and phrases from article titles.
—	—	@@ -284,6 +285,65 @@
285	286	}
286	287	}
287	288
	289	+ public void createFromTempIndex(){
	290	+ String path = titles.getImportPath(); // dest where to put index
	291	+ try {
	292	+ ngramWriter.createIndex(path,new SimpleAnalyzer());
	293	+ IndexReader ir = IndexReader.open(iid.getSpellWords().getTempPath());
	294	+ Collection<String> mostfreq = HighFreqTerms.getHighFreqTerms(ir,"contents",50);
	295	+ // get at most 25 stopwords
	296	+ HashSet<String> stopWords = new HashSet<String>();
	297	+ for(String w : mostfreq){
	298	+ if(!w.contains("_"))
	299	+ stopWords.add(w);
	300	+ if(stopWords.size() >= 25)
	301	+ break;
	302	+ }
	303	+ addMetadata("stopWords",stopWords);
	304	+
	305	+ LuceneDictionary dict = new LuceneDictionary(ir,"contents");
	306	+ Word word;
	307	+ while((word = dict.next()) != null){
	308	+ String w = word.getWord();
	309	+ int freq = word.getFrequency();
	310	+ if(w.contains("_")){ // phrase
	311	+ String[] words = w.split("_");
	312	+ boolean allowed = true;
	313	+ for(String ww : words){
	314	+ // allow only those phrases consisting of title words
	315	+ if(stopWords.contains(ww) || ir.docFreq(new Term("title",ww)) == 0){
	316	+ allowed = false;
	317	+ break;
	318	+ }
	319	+ }
	320	+ if(allowed && freq > minPhraseFreq){
	321	+ NamespaceFreq nsf = new NamespaceFreq();
	322	+ nsf.setFrequency(0,freq);
	323	+ ArrayList<Integer> nss = new ArrayList<Integer>();
	324	+ nss.add(0);
	325	+ addPhrase(w,nsf,nss);
	326	+ }
	327	+ } else{
	328	+ if(freq > minWordFreq){
	329	+ NamespaceFreq nsf = new NamespaceFreq();
	330	+ nsf.setFrequency(0,freq);
	331	+ ArrayList<Integer> nss = new ArrayList<Integer>();
	332	+ nss.add(0);
	333	+ addWord(w,nsf,nss);
	334	+ }
	335	+ }
	336	+ }
	337	+ ngramWriter.closeAndOptimize();
	338	+ ir.close();
	339	+
	340	+ } catch (IOException e) {
	341	+ log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
	342	+ e.printStackTrace();
	343	+ return;
	344	+ }
	345	+
	346	+ }
	347	+
288	348	/**
289	349	* Add phrase to index
290	350	*
—	—	@@ -310,7 +370,26 @@
311	371	ngramWriter.addDocument(doc);
312	372	}
313	373
314		~~- /** Add ordinary word to the index, convenient for suggesting joins~~
	374	+ /**
	375	+ * Add into metadata_key and metadata_value.
	376	+ * Collection is assumed to contain words (without spaces)
	377	+ */
	378	+ public void addMetadata(String key, Collection<String> values){
	379	+ StringBuilder sb = new StringBuilder();
	380	+ // serialize by joining with spaces
	381	+ for(String val : values){
	382	+ if(sb.length() != 0)
	383	+ sb.append(" ");
	384	+ sb.append(val);
	385	+ }
	386	+ Document doc = new Document();
	387	+ doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED));
	388	+ doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO));
	389	+
	390	+ ngramWriter.addDocument(doc);
	391	+ }
	392	+
	393	+ /** Add ordinary word to the index
315	394	*
316	395	* @param word - word to add
317	396	* @param nf - frequencies in namespaces
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java
—	—	@@ -62,7 +62,7 @@
63	63	// article.setContents("");
64	64
65	65	writer.addMainArticle(article);
66		~~- writer.addAllArticle(article);~~
	66	+ //writer.addAllArticle(article);
67	67	// generate phrases
68	68	/* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false);
69	69	ArrayList<Token> tokens = parser.parse();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java
—	—	@@ -8,8 +8,12 @@
9	9	int distMetaphone2=0;
10	10
11	11	static class Comparator implements java.util.Comparator<SuggestResult> {
12		~~- public int compare(SuggestResult o1, SuggestResult o2){~~
13		~~- if(o1.dist == o2.dist)~~
	12	+ public int compare(SuggestResult o1, SuggestResult o2){
	13	+ if(o1.dist - o2.dist == -1 && o1.frequency * 100 < o2.frequency)
	14	+ return 1;
	15	+ else if(o1.dist - o2.dist == 1 && o2.frequency * 100 < o1.frequency)
	16	+ return -1;
	17	+ else if(o1.dist == o2.dist)
14	18	return o2.getFrequency() - o1.getFrequency();
15	19	else
16	20	return o1.dist - o2.dist;

Status & tagging log

15:20, 12 September 2011 Meno25 (talk | contribs) changed the status of r25117 [removed: ok added: old]