r25294 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r25293 | r25294 | r25295 >
Date:19:58, 29 August 2007
Author:rainman
Status:old
Tags:
Comment:
More minor did-you-mean tweaks:
* if edit dist is same, prefer those suggestions that have same letters
* use good redirects from alttitle as valid spellcheck titles
Modified paths:
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/dist/EditDistance.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -494,7 +494,7 @@
495495 if(ranks.get(i) == 0)
496496 break; // we don't want redirects with zero links
497497 //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i));
498 - Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED);
 498+ Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.YES, Field.Index.TOKENIZED);
499499 alttitle.setBoost(calculateArticleRank(ranks.get(i)));
500500 doc.add(alttitle);
501501 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestResult.java
@@ -6,6 +6,7 @@
77 int dist=0;
88 int distMetaphone=0;
99 int distMetaphone2=0;
 10+ boolean sameLetters=false;
1011
1112 static class Comparator implements java.util.Comparator<SuggestResult> {
1213 public int compare(SuggestResult o1, SuggestResult o2){
@@ -13,18 +14,28 @@
1415 return 1;
1516 else if(o1.dist - o2.dist == 1 && o2.frequency * 100 < o1.frequency)
1617 return -1;
17 - else if(o1.dist == o2.dist)
18 - return o2.getFrequency() - o1.getFrequency();
19 - else
 18+ else if(o1.dist == o2.dist){
 19+ if(!o1.sameLetters && o2.sameLetters)
 20+ return 1;
 21+ else if(o1.sameLetters && !o2.sameLetters)
 22+ return -1;
 23+ else
 24+ return o2.getFrequency() - o1.getFrequency();
 25+ } else
2026 return o1.dist - o2.dist;
2127 }
2228 }
2329
2430 static class ComparatorNoCommonMisspell implements java.util.Comparator<SuggestResult> {
2531 public int compare(SuggestResult o1, SuggestResult o2){
26 - if(o1.dist == o2.dist)
27 - return o2.getFrequency() - o1.getFrequency();
28 - else
 32+ if(o1.dist == o2.dist){
 33+ if(!o1.sameLetters && o2.sameLetters)
 34+ return 1;
 35+ else if(o1.sameLetters && !o2.sameLetters)
 36+ return -1;
 37+ else
 38+ return o2.getFrequency() - o1.getFrequency();
 39+ } else
2940 return o1.dist - o2.dist;
3041 }
3142 }
@@ -43,6 +54,7 @@
4455 this.dist = metric.distance(word);
4556 this.distMetaphone = metric.meta1Distance(word);
4657 this.distMetaphone2 = metric.meta2Distance(word);
 58+ this.sameLetters = metric.hasSameLetters(word);
4759 }
4860
4961 /** Initialize all atributes using suggestion metrics */
@@ -52,6 +64,7 @@
5365 this.dist = metric.distance(word);
5466 this.distMetaphone = metric.sdmeta1.getDistance(meta1);
5567 this.distMetaphone2 = metric.sdmeta2.getDistance(meta2);
 68+ this.sameLetters = metric.hasSameLetters(word);
5669 }
5770
5871 public int getDist() {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -84,6 +84,10 @@
8585 public int meta2Distance(String w){
8686 return sdmeta2.getDistance(dmeta.doubleMetaphone(w,true));
8787 }
 88+ /** If string differs only in duplication of some letters */
 89+ public boolean hasSameLetters(String w){
 90+ return sd.hasSameLetters(w);
 91+ }
8892 }
8993
9094 /** Number of results to fetch */
@@ -153,21 +157,21 @@
154158 @SuppressWarnings("unchecked")
155159 public SuggestQuery suggest(String searchterm, WikiQueryParser parser, NamespaceFilter nsf, SearchResults res) throws IOException{
156160 ArrayList<Token> tokens = parser.tokenizeBareText(searchterm);
157 - int numHits = res.getNumHits();
158161
159 - //if(numHits >= minHitsTitles)
160 - //return null;
161 -
162162 // collect words in titles, these shouldn't be spell-checked
 163+ ArrayList<HashSet<String>> titles = new ArrayList<HashSet<String>>();
163164 HashSet<String> correctWords = new HashSet<String>();
164165 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,false);
165166 try {
166167 for(ResultSet r : res.getResults()){
 168+ HashSet<String> title = new HashSet<String>();
167169 Token t = null;
168170 TokenStream ts = analyzer.tokenStream("title",r.title);
169171 while( (t = ts.next()) != null ){
170172 correctWords.add(t.termText());
 173+ title.add(t.termText());
171174 }
 175+ titles.add(title);
172176 }
173177 } catch (IOException e) {
174178 log.error("I/O error trying to get list of correct words : "+e.getMessage());
@@ -223,7 +227,15 @@
224228 if(titlesReader.docFreq(new Term("phrase",phrase)) != 0){
225229 correctPhrases.add(i);
226230 correctPhrases.add(i2);
227 - }
 231+ } else if(correctWords.contains(w) && correctWords.contains(w2)){
 232+ for(HashSet<String> title : titles){
 233+ if(title.contains(w) && title.contains(w2)){
 234+ correctPhrases.add(i);
 235+ correctPhrases.add(i2);
 236+ break;
 237+ }
 238+ }
 239+ }
228240 }
229241 if(correctPhrases.size()+numStopWords >= tokens.size()
230242 && correctWords.size()+numStopWords >= tokens.size()){
@@ -410,7 +422,7 @@
411423 } else if(tokens.size() == 1 && wordSug.get(0)!=null
412424 && wordSug.get(0).size() > 0 && !correctWords.contains(tokens.get(0).termText())){
413425 // only one token, try different spell-checks for title
414 - ArrayList<SuggestResult> sg = wordSug.get(0);
 426+ ArrayList<SuggestResult> sg = (ArrayList<SuggestResult>) wordSug.get(0).clone();
415427 Collections.sort(sg,new SuggestResult.ComparatorNoCommonMisspell());
416428 Token t = tokens.get(0);
417429 int maxdist = sg.get(0).getDist();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/dist/EditDistance.java
@@ -29,7 +29,6 @@
3030 final int n;
3131 final int[][][] cache=new int[30][][];
3232
33 -
3433 /**
3534 * Optimized to run a bit faster than the static getDistance().
3635 * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
@@ -38,6 +37,33 @@
3938 sa=target.toCharArray();
4039 n=sa.length;
4140 }
 41+
 42+ /** Check if only difference is duplication of some letters */
 43+ public boolean hasSameLetters(String other){
 44+ final char[] ta=other.toCharArray();
 45+ final int m=ta.length;
 46+ int i=0,j=0;
 47+ for(;i<n && j<m;i++,j++){
 48+ if(sa[i]!=ta[j]){
 49+ if(i>0 && sa[i-1] == ta[j]){
 50+ i--;
 51+ continue;
 52+ } else if(j>0 && sa[i] == ta[j-1]){
 53+ j--;
 54+ continue;
 55+ } else
 56+ return false;
 57+ }
 58+ if(i == n - 1 && j < m - 1)
 59+ i--;
 60+ else if(j == m - 1 && i < n - 1)
 61+ j--;
 62+ }
 63+ if(i == n && j == m)
 64+ return true;
 65+
 66+ return false;
 67+ }
4268
4369
4470 //*****************************
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java
@@ -24,10 +24,12 @@
2525 import org.apache.lucene.search.Searcher;
2626 import org.apache.lucene.search.TermQuery;
2727 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 28+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
2829 import org.wikimedia.lsearch.config.GlobalConfiguration;
2930 import org.wikimedia.lsearch.config.IndexId;
3031 import org.wikimedia.lsearch.config.IndexRegistry;
3132 import org.wikimedia.lsearch.index.IndexUpdateRecord;
 33+import org.wikimedia.lsearch.index.WikiIndexModifier;
3234 import org.wikimedia.lsearch.search.IndexSearcherMul;
3335 import org.wikimedia.lsearch.search.WikiSearcher;
3436 import org.wikimedia.lsearch.spell.api.Dictionary.Word;
@@ -290,35 +292,37 @@
291293
292294 public void createFromTempIndex(){
293295 String path = titles.getImportPath(); // dest where to put index
 296+ FieldNameFactory fields = new FieldNameFactory();
 297+ final String title = fields.title();
 298+ final String contents = fields.contents();
 299+ final String alttitle = fields.alttitle();
294300 try {
295301 ngramWriter.createIndex(path,new SimpleAnalyzer());
296302 IndexReader ir = IndexReader.open(iid.getSpellWords().getTempPath());
297 - /*Collection<String> mostfreq = HighFreqTerms.getHighFreqTerms(iid,"contents",50);
298 - // get at most 25 stopwords
299303 HashSet<String> stopWords = new HashSet<String>();
300 - for(String w : mostfreq){
301 - if(!w.contains("_"))
302 - stopWords.add(w);
303 - if(stopWords.size() >= 25)
304 - break;
305 - } */
306 - HashSet<String> stopWords = new HashSet<String>();
307304 TermDocs td = ir.termDocs(new Term("metadata_key","stopWords"));
308305 if(td.next()){
309306 for(String s : ir.document(td.doc()).get("metadata_value").split(" "))
310307 stopWords.add(s);
311308 }
312309 addMetadata("stopWords",stopWords);
 310+
313311 // add all titles
314312 for(int i=0;i<ir.maxDoc();i++){
315313 if(ir.isDeleted(i))
316314 continue;
317 - String title = ir.document(i).get("title");
318 - if(title != null)
319 - addTitle(title);
 315+ String titleText = ir.document(i).get(title);
 316+ if(titleText != null)
 317+ addTitle(titleText);
 318+ // FIXME: alttitle fiels is not generated!
 319+ for(int j=0;j<WikiIndexModifier.ALT_TITLES;j++){
 320+ String altTitleText = ir.document(i).get(alttitle+j);
 321+ if(altTitleText != null)
 322+ addTitle(altTitleText);
 323+ }
320324 }
321325
322 - LuceneDictionary dict = new LuceneDictionary(ir,"contents");
 326+ LuceneDictionary dict = new LuceneDictionary(ir,contents);
323327 Word word;
324328 while((word = dict.next()) != null){
325329 String w = word.getWord();
@@ -330,13 +334,13 @@
331335 boolean allowed = true;
332336 for(String ww : words){
333337 // allow only those phrases consisting of title words
334 - if(ir.docFreq(new Term("title",ww)) == 0){
 338+ if(ir.docFreq(new Term(title,ww)) == 0){
335339 allowed = false;
336340 break;
337341 }
338342 }
339343 if(allowed && freq > minPhraseFreq){
340 - boolean inTitle = ir.docFreq(new Term("title",w))!= 0;
 344+ boolean inTitle = ir.docFreq(new Term(title,w))!= 0;
341345 NamespaceFreq nsf = new NamespaceFreq();
342346 nsf.setFrequency(0,freq);
343347 ArrayList<Integer> nss = new ArrayList<Integer>();
@@ -357,7 +361,7 @@
358362 //ngramWriter.reopenIndex(path,new SimpleAnalyzer());
359363 //IndexReader ngramReader = ngramWriter.getReader();
360364 // add stuff from titles with stop words
361 - dict = new LuceneDictionary(ir,"title");
 365+ dict = new LuceneDictionary(ir,title);
362366 while((word = dict.next()) != null){
363367 String w = word.getWord();
364368 if(w.contains("_")){ // phrase
@@ -370,16 +374,8 @@
371375 nss.add(0);
372376 addPhrase(w,nsf,nss,true);
373377 }
374 - } /* else if(ngramReader.docFreq(new Term("word",w))==0){
375 - // add words from titles
376 - int freq = ir.docFreq(new Term("contents",w));
377 - NamespaceFreq nsf = new NamespaceFreq();
378 - nsf.setFrequency(0,freq);
379 - ArrayList<Integer> nss = new ArrayList<Integer>();
380 - nss.add(0);
381 - addWord(w,nsf,nss);
382 - } */
383 - }
 378+ }
 379+ }
384380 ngramWriter.closeAndOptimize();
385381 ir.close();
386382
@@ -390,8 +386,7 @@
391387 }
392388
393389 }
394 -
395 -
 390+
396391 /**
397392 * Register a title in the index, without tokenization, just lowercase.
398393 *

Status & tagging log