Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -494,7 +494,7 @@ |
495 | 495 | if(ranks.get(i) == 0) |
496 | 496 | break; // we don't want redirects with zero links |
497 | 497 | //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i)); |
498 | | - Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED); |
| 498 | + Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.YES, Field.Index.TOKENIZED); |
499 | 499 | alttitle.setBoost(calculateArticleRank(ranks.get(i))); |
500 | 500 | doc.add(alttitle); |
501 | 501 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java |
— | — | @@ -6,6 +6,7 @@ |
7 | 7 | int dist=0; |
8 | 8 | int distMetaphone=0; |
9 | 9 | int distMetaphone2=0; |
| 10 | + boolean sameLetters=false; |
10 | 11 | |
11 | 12 | static class Comparator implements java.util.Comparator<SuggestResult> { |
12 | 13 | public int compare(SuggestResult o1, SuggestResult o2){ |
— | — | @@ -13,18 +14,28 @@ |
14 | 15 | return 1; |
15 | 16 | else if(o1.dist - o2.dist == 1 && o2.frequency * 100 < o1.frequency) |
16 | 17 | return -1; |
17 | | - else if(o1.dist == o2.dist) |
18 | | - return o2.getFrequency() - o1.getFrequency(); |
19 | | - else |
| 18 | + else if(o1.dist == o2.dist){ |
| 19 | + if(!o1.sameLetters && o2.sameLetters) |
| 20 | + return 1; |
| 21 | + else if(o1.sameLetters && !o2.sameLetters) |
| 22 | + return -1; |
| 23 | + else |
| 24 | + return o2.getFrequency() - o1.getFrequency(); |
| 25 | + } else |
20 | 26 | return o1.dist - o2.dist; |
21 | 27 | } |
22 | 28 | } |
23 | 29 | |
24 | 30 | static class ComparatorNoCommonMisspell implements java.util.Comparator<SuggestResult> { |
25 | 31 | public int compare(SuggestResult o1, SuggestResult o2){ |
26 | | - if(o1.dist == o2.dist) |
27 | | - return o2.getFrequency() - o1.getFrequency(); |
28 | | - else |
| 32 | + if(o1.dist == o2.dist){ |
| 33 | + if(!o1.sameLetters && o2.sameLetters) |
| 34 | + return 1; |
| 35 | + else if(o1.sameLetters && !o2.sameLetters) |
| 36 | + return -1; |
| 37 | + else |
| 38 | + return o2.getFrequency() - o1.getFrequency(); |
| 39 | + } else |
29 | 40 | return o1.dist - o2.dist; |
30 | 41 | } |
31 | 42 | } |
— | — | @@ -43,6 +54,7 @@ |
44 | 55 | this.dist = metric.distance(word); |
45 | 56 | this.distMetaphone = metric.meta1Distance(word); |
46 | 57 | this.distMetaphone2 = metric.meta2Distance(word); |
| 58 | + this.sameLetters = metric.hasSameLetters(word); |
47 | 59 | } |
48 | 60 | |
49 | 61 | /** Initialize all atributes using suggestion metrics */ |
— | — | @@ -52,6 +64,7 @@ |
53 | 65 | this.dist = metric.distance(word); |
54 | 66 | this.distMetaphone = metric.sdmeta1.getDistance(meta1); |
55 | 67 | this.distMetaphone2 = metric.sdmeta2.getDistance(meta2); |
| 68 | + this.sameLetters = metric.hasSameLetters(word); |
56 | 69 | } |
57 | 70 | |
58 | 71 | public int getDist() { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -84,6 +84,10 @@ |
85 | 85 | public int meta2Distance(String w){ |
86 | 86 | return sdmeta2.getDistance(dmeta.doubleMetaphone(w,true)); |
87 | 87 | } |
	| 88 | + /** Returns true if the string differs only in the duplication of some letters */ |
| 89 | + public boolean hasSameLetters(String w){ |
| 90 | + return sd.hasSameLetters(w); |
| 91 | + } |
88 | 92 | } |
89 | 93 | |
90 | 94 | /** Number of results to fetch */ |
— | — | @@ -153,21 +157,21 @@ |
154 | 158 | @SuppressWarnings("unchecked") |
155 | 159 | public SuggestQuery suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, SearchResults res) throws IOException{ |
156 | 160 | ArrayList<Token> tokens = parser.tokenizeBareText(searchterm); |
157 | | - int numHits = res.getNumHits(); |
158 | 161 | |
159 | | - //if(numHits >= minHitsTitles) |
160 | | - //return null; |
161 | | - |
162 | 162 | // collect words in titles, these shouldn't be spell-checked |
| 163 | + ArrayList<HashSet<String>> titles = new ArrayList<HashSet<String>>(); |
163 | 164 | HashSet<String> correctWords = new HashSet<String>(); |
164 | 165 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false); |
165 | 166 | try { |
166 | 167 | for(ResultSet r : res.getResults()){ |
| 168 | + HashSet<String> title = new HashSet<String>(); |
167 | 169 | Token t = null; |
168 | 170 | TokenStream ts = analyzer.tokenStream("title",r.title); |
169 | 171 | while( (t = ts.next()) != null ){ |
170 | 172 | correctWords.add(t.termText()); |
| 173 | + title.add(t.termText()); |
171 | 174 | } |
| 175 | + titles.add(title); |
172 | 176 | } |
173 | 177 | } catch (IOException e) { |
174 | 178 | log.error("I/O error trying to get list of correct words : "+e.getMessage()); |
— | — | @@ -223,7 +227,15 @@ |
224 | 228 | if(titlesReader.docFreq(new Term("phrase",phrase)) != 0){ |
225 | 229 | correctPhrases.add(i); |
226 | 230 | correctPhrases.add(i2); |
227 | | - } |
| 231 | + } else if(correctWords.contains(w) && correctWords.contains(w2)){ |
| 232 | + for(HashSet<String> title : titles){ |
| 233 | + if(title.contains(w) && title.contains(w2)){ |
| 234 | + correctPhrases.add(i); |
| 235 | + correctPhrases.add(i2); |
| 236 | + break; |
| 237 | + } |
| 238 | + } |
| 239 | + } |
228 | 240 | } |
229 | 241 | if(correctPhrases.size()+numStopWords >= tokens.size() |
230 | 242 | && correctWords.size()+numStopWords >= tokens.size()){ |
— | — | @@ -410,7 +422,7 @@ |
411 | 423 | } else if(tokens.size() == 1 && wordSug.get(0)!=null |
412 | 424 | && wordSug.get(0).size() > 0 && !correctWords.contains(tokens.get(0).termText())){ |
413 | 425 | // only one token, try different spell-checks for title |
414 | | - ArrayList<SuggestResult> sg = wordSug.get(0); |
| 426 | + ArrayList<SuggestResult> sg = (ArrayList<SuggestResult>) wordSug.get(0).clone(); |
415 | 427 | Collections.sort(sg,new SuggestResult.ComparatorNoCommonMisspell()); |
416 | 428 | Token t = tokens.get(0); |
417 | 429 | int maxdist = sg.get(0).getDist(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/dist/EditDistance.java |
— | — | @@ -29,7 +29,6 @@ |
30 | 30 | final int n; |
31 | 31 | final int[][][] cache=new int[30][][]; |
32 | 32 | |
33 | | - |
34 | 33 | /** |
35 | 34 | * Optimized to run a bit faster than the static getDistance(). |
36 | 35 | * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster. |
— | — | @@ -38,6 +37,33 @@ |
39 | 38 | sa=target.toCharArray(); |
40 | 39 | n=sa.length; |
41 | 40 | } |
| 41 | + |
| 42 | + /** Check if only difference is duplication of some letters */ |
| 43 | + public boolean hasSameLetters(String other){ |
| 44 | + final char[] ta=other.toCharArray(); |
| 45 | + final int m=ta.length; |
| 46 | + int i=0,j=0; |
| 47 | + for(;i<n && j<m;i++,j++){ |
| 48 | + if(sa[i]!=ta[j]){ |
| 49 | + if(i>0 && sa[i-1] == ta[j]){ |
| 50 | + i--; |
| 51 | + continue; |
| 52 | + } else if(j>0 && sa[i] == ta[j-1]){ |
| 53 | + j--; |
| 54 | + continue; |
| 55 | + } else |
| 56 | + return false; |
| 57 | + } |
| 58 | + if(i == n - 1 && j < m - 1) |
| 59 | + i--; |
| 60 | + else if(j == m - 1 && i < n - 1) |
| 61 | + j--; |
| 62 | + } |
| 63 | + if(i == n && j == m) |
| 64 | + return true; |
| 65 | + |
| 66 | + return false; |
| 67 | + } |
42 | 68 | |
43 | 69 | |
44 | 70 | //***************************** |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java |
— | — | @@ -24,10 +24,12 @@ |
25 | 25 | import org.apache.lucene.search.Searcher; |
26 | 26 | import org.apache.lucene.search.TermQuery; |
27 | 27 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 28 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
28 | 29 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
29 | 30 | import org.wikimedia.lsearch.config.IndexId; |
30 | 31 | import org.wikimedia.lsearch.config.IndexRegistry; |
31 | 32 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
| 33 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
32 | 34 | import org.wikimedia.lsearch.search.IndexSearcherMul; |
33 | 35 | import org.wikimedia.lsearch.search.WikiSearcher; |
34 | 36 | import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
— | — | @@ -290,35 +292,37 @@ |
291 | 293 | |
292 | 294 | public void createFromTempIndex(){ |
293 | 295 | String path = titles.getImportPath(); // dest where to put index |
| 296 | + FieldNameFactory fields = new FieldNameFactory(); |
| 297 | + final String title = fields.title(); |
| 298 | + final String contents = fields.contents(); |
| 299 | + final String alttitle = fields.alttitle(); |
294 | 300 | try { |
295 | 301 | ngramWriter.createIndex(path,new SimpleAnalyzer()); |
296 | 302 | IndexReader ir = IndexReader.open(iid.getSpellWords().getTempPath()); |
297 | | - /*Collection<String> mostfreq = HighFreqTerms.getHighFreqTerms(iid,"contents",50); |
298 | | - // get at most 25 stopwords |
299 | 303 | HashSet<String> stopWords = new HashSet<String>(); |
300 | | - for(String w : mostfreq){ |
301 | | - if(!w.contains("_")) |
302 | | - stopWords.add(w); |
303 | | - if(stopWords.size() >= 25) |
304 | | - break; |
305 | | - } */ |
306 | | - HashSet<String> stopWords = new HashSet<String>(); |
307 | 304 | TermDocs td = ir.termDocs(new Term("metadata_key","stopWords")); |
308 | 305 | if(td.next()){ |
309 | 306 | for(String s : ir.document(td.doc()).get("metadata_value").split(" ")) |
310 | 307 | stopWords.add(s); |
311 | 308 | } |
312 | 309 | addMetadata("stopWords",stopWords); |
| 310 | + |
313 | 311 | // add all titles |
314 | 312 | for(int i=0;i<ir.maxDoc();i++){ |
315 | 313 | if(ir.isDeleted(i)) |
316 | 314 | continue; |
317 | | - String title = ir.document(i).get("title"); |
318 | | - if(title != null) |
319 | | - addTitle(title); |
| 315 | + String titleText = ir.document(i).get(title); |
| 316 | + if(titleText != null) |
| 317 | + addTitle(titleText); |
	| 318 | + // FIXME: alttitle field is not generated! |
| 319 | + for(int j=0;j<WikiIndexModifier.ALT_TITLES;j++){ |
| 320 | + String altTitleText = ir.document(i).get(alttitle+j); |
| 321 | + if(altTitleText != null) |
| 322 | + addTitle(altTitleText); |
| 323 | + } |
320 | 324 | } |
321 | 325 | |
322 | | - LuceneDictionary dict = new LuceneDictionary(ir,"contents"); |
| 326 | + LuceneDictionary dict = new LuceneDictionary(ir,contents); |
323 | 327 | Word word; |
324 | 328 | while((word = dict.next()) != null){ |
325 | 329 | String w = word.getWord(); |
— | — | @@ -330,13 +334,13 @@ |
331 | 335 | boolean allowed = true; |
332 | 336 | for(String ww : words){ |
333 | 337 | // allow only those phrases consisting of title words |
334 | | - if(ir.docFreq(new Term("title",ww)) == 0){ |
| 338 | + if(ir.docFreq(new Term(title,ww)) == 0){ |
335 | 339 | allowed = false; |
336 | 340 | break; |
337 | 341 | } |
338 | 342 | } |
339 | 343 | if(allowed && freq > minPhraseFreq){ |
340 | | - boolean inTitle = ir.docFreq(new Term("title",w))!= 0; |
| 344 | + boolean inTitle = ir.docFreq(new Term(title,w))!= 0; |
341 | 345 | NamespaceFreq nsf = new NamespaceFreq(); |
342 | 346 | nsf.setFrequency(0,freq); |
343 | 347 | ArrayList<Integer> nss = new ArrayList<Integer>(); |
— | — | @@ -357,7 +361,7 @@ |
358 | 362 | //ngramWriter.reopenIndex(path,new SimpleAnalyzer()); |
359 | 363 | //IndexReader ngramReader = ngramWriter.getReader(); |
360 | 364 | // add stuff from titles with stop words |
361 | | - dict = new LuceneDictionary(ir,"title"); |
| 365 | + dict = new LuceneDictionary(ir,title); |
362 | 366 | while((word = dict.next()) != null){ |
363 | 367 | String w = word.getWord(); |
364 | 368 | if(w.contains("_")){ // phrase |
— | — | @@ -370,16 +374,8 @@ |
371 | 375 | nss.add(0); |
372 | 376 | addPhrase(w,nsf,nss,true); |
373 | 377 | } |
374 | | - } /* else if(ngramReader.docFreq(new Term("word",w))==0){ |
375 | | - // add words from titles |
376 | | - int freq = ir.docFreq(new Term("contents",w)); |
377 | | - NamespaceFreq nsf = new NamespaceFreq(); |
378 | | - nsf.setFrequency(0,freq); |
379 | | - ArrayList<Integer> nss = new ArrayList<Integer>(); |
380 | | - nss.add(0); |
381 | | - addWord(w,nsf,nss); |
382 | | - } */ |
383 | | - } |
| 378 | + } |
| 379 | + } |
384 | 380 | ngramWriter.closeAndOptimize(); |
385 | 381 | ir.close(); |
386 | 382 | |
— | — | @@ -390,8 +386,7 @@ |
391 | 387 | } |
392 | 388 | |
393 | 389 | } |
394 | | - |
395 | | - |
| 390 | + |
396 | 391 | /** |
397 | 392 | * Register a title in the index, without tokenization, just lowercase. |
398 | 393 | * |