r109911 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r109910‎ | r109911 | r109912 >
Date:10:11, 24 January 2012
Author:oren
Status:ok (Comments)
Tags:
Comment:
Replaced the old Field.Index constants with their new names:
*TOKENIZED --> ANALYZED
*UN_TOKENIZED --> NOT_ANALYZED
*NO_NORMS --> NOT_ANALYZED_NO_NORMS
The new names are clearer; the old ones became deprecated in version 3.0.
Modified paths:
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified) (history)
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified) (history)
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/api/NgramIndexer.java (modified) (history)
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (modified) (history)
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/api/TitleNgramIndexer.java (modified) (history)
  • /trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/storage/RelatedStorage.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/CleanIndexWriter.java
@@ -158,9 +158,9 @@
159159 /** Add title/redirect with ranks information only */
160160 protected void addTitleOnly(Article article) {
161161 Document doc = new Document();
162 - doc.add(new Field("key",article.getIndexKey(),Store.NO,Index.UN_TOKENIZED));
163 - doc.add(new Field("ns_title",article.getTitle(),Store.YES,Index.TOKENIZED));
164 - doc.add(new Field("ns_namespace",article.getNamespace(),Store.YES,Index.UN_TOKENIZED));
 162+ doc.add(new Field("key",article.getIndexKey(),Store.NO,Index.NOT_ANALYZED));
 163+ doc.add(new Field("ns_title",article.getTitle(),Store.YES,Index.ANALYZED));
 164+ doc.add(new Field("ns_namespace",article.getNamespace(),Store.YES,Index.NOT_ANALYZED));
165165 doc.add(new Field("ns_rank",Integer.toString(article.getReferences()),Store.YES,Index.NO));
166166 if(article.isRedirect())
167167 doc.add(new Field("ns_redirect",article.getRedirectTarget(),Store.YES,Index.NO));
@@ -202,7 +202,7 @@
203203 sb.append(val);
204204 }
205205 Document doc = new Document();
206 - doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED));
 206+ doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.NOT_ANALYZED));
207207 doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO));
208208
209209 try {
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/api/NgramIndexer.java
@@ -246,8 +246,8 @@
247247 for(int j=0 ; j<ngrams.length ; j++){
248248 String ngram = ngrams[j];
249249 if(j==0)
250 - doc.add(new Field(startField+i, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
251 - doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.UN_TOKENIZED));
 250+ doc.add(new Field(startField+i, ngram, Field.Store.NO, Field.Index.NOT_ANALYZED));
 251+ doc.add(new Field(field, ngram, Field.Store.NO, Field.Index.NOT_ANALYZED));
252252 }
253253 }
254254 }
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/api/TitleNgramIndexer.java
@@ -97,9 +97,9 @@
9898
9999 Document doc = new Document();
100100 // pageId is primary key
101 - doc.add(new Field("pageid", pageId, Field.Store.NO, Field.Index.UN_TOKENIZED));
 101+ doc.add(new Field("pageid", pageId, Field.Store.NO, Field.Index.NOT_ANALYZED));
102102 if(!ns.equals("0"))
103 - doc.add(new Field("namespace", ns, Field.Store.NO, Field.Index.UN_TOKENIZED));
 103+ doc.add(new Field("namespace", ns, Field.Store.NO, Field.Index.NOT_ANALYZED));
104104 doc.add(new Field("key", ns+":"+title, Field.Store.YES, Field.Index.NO));
105105 doc.add(new Field(field, decomposed, Field.Store.YES, Field.Index.NO));
106106 if(redirectTo != null)
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -299,9 +299,9 @@
300300 String normalized = FastWikiTokenizerEngine.normalize(title.toLowerCase());
301301 String decomposed = FastWikiTokenizerEngine.decompose(normalized);
302302 // doc.add(new Field("title", ns+":"+title, Field.Store.YES, Field.Index.NO));
303 - doc.add(new Field("title", normalized, Field.Store.YES, Field.Index.UN_TOKENIZED));
 303+ doc.add(new Field("title", normalized, Field.Store.YES, Field.Index.NOT_ANALYZED));
304304 if(decomposed != normalized)
305 - doc.add(new Field("title", decomposed, Field.Store.NO, Field.Index.UN_TOKENIZED));
 305+ doc.add(new Field("title", decomposed, Field.Store.NO, Field.Index.NOT_ANALYZED));
306306 doc.add(new Field("rank", rank, Field.Store.YES, Field.Index.NO));
307307 if(redirect!=null){
308308 String redirectNormalized = FastWikiTokenizerEngine.normalize(redirect.substring(redirect.indexOf(':')+1).toLowerCase());
@@ -320,10 +320,10 @@
321321 String normalized = FastWikiTokenizerEngine.normalize(title.toLowerCase());
322322 String decomposed = FastWikiTokenizerEngine.decompose(normalized);
323323 //doc.add(new Field("ns_title", ns+":"+title, Field.Store.YES, Field.Index.NO));
324 - doc.add(new Field("ns_title", ns+":"+normalized, Field.Store.YES, Field.Index.UN_TOKENIZED));
 324+ doc.add(new Field("ns_title", ns+":"+normalized, Field.Store.YES, Field.Index.NOT_ANALYZED));
325325 if(decomposed != normalized)
326 - doc.add(new Field("ns_title", ns+":"+decomposed, Field.Store.NO, Field.Index.UN_TOKENIZED));
327 - doc.add(new Field("ns_namespace", ns, Field.Store.YES, Field.Index.UN_TOKENIZED));
 326+ doc.add(new Field("ns_title", ns+":"+decomposed, Field.Store.NO, Field.Index.NOT_ANALYZED));
 327+ doc.add(new Field("ns_namespace", ns, Field.Store.YES, Field.Index.NOT_ANALYZED));
328328 doc.add(new Field("ns_rank", rank, Field.Store.YES, Field.Index.NO));
329329 if(redirect!=null && redirect.substring(0,redirect.indexOf(':')).equals(ns)){
330330 String redirectNormalized = FastWikiTokenizerEngine.normalize(redirect.substring(redirect.indexOf(':')+1).toLowerCase());
@@ -374,13 +374,13 @@
375375 HashMap<String,SimpleInt> freq = getFrequencies(phrase,ir);
376376
377377 Document doc = new Document();
378 - doc.add(new Field("ns_phrase", phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
 378+ doc.add(new Field("ns_phrase", phrase, Field.Store.YES, Field.Index.NOT_ANALYZED));
379379 doc.add(new Field("ns_namespace", new StringTokenStream(freq.keySet())));
380380 for(Entry<String,SimpleInt> e : freq.entrySet()){
381381 doc.add(new Field("ns_freq_"+e.getKey(), Integer.toString(e.getValue().count), Field.Store.YES, Field.Index.NO));
382382 }
383383 if(inTitle){
384 - doc.add(new Field("ns_intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED));
 384+ doc.add(new Field("ns_intitle","1", Field.Store.YES, Field.Index.NOT_ANALYZED));
385385 }
386386 setOmitNorms(doc);
387387 ngramWriter.addDocument(doc);
@@ -397,9 +397,9 @@
398398 Document doc = new Document();
399399 String decomposed = FastWikiTokenizerEngine.decompose(word);
400400 ngramWriter.createNgramFields(doc,"ns_word",decomposed,NgramIndexer.Type.WORDS);
401 - doc.add(new Field("ns_word",word, Field.Store.YES, Field.Index.UN_TOKENIZED));
 401+ doc.add(new Field("ns_word",word, Field.Store.YES, Field.Index.NOT_ANALYZED));
402402 if(decomposed != word)
403 - doc.add(new Field("ns_word",decomposed, Field.Store.NO, Field.Index.UN_TOKENIZED));
 403+ doc.add(new Field("ns_word",decomposed, Field.Store.NO, Field.Index.NOT_ANALYZED));
404404 for(Entry<String,SimpleInt> e : freq.entrySet())
405405 doc.add(new Field("ns_freq_"+e.getKey(), Integer.toString(e.getValue().count), Field.Store.YES, Field.Index.NO));
406406 doc.add(new Field("ns_freq",Integer.toString(freqSum),Field.Store.YES, Field.Index.NO));
@@ -424,10 +424,10 @@
425425 }
426426 Document doc = new Document();
427427 //ngramWriter.createNgramFields(doc,"phrase",phrase);
428 - doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
 428+ doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.NOT_ANALYZED));
429429 doc.add(new Field("freq",Integer.toString(freq), Field.Store.YES, Field.Index.NO));
430430 if(inTitle){
431 - doc.add(new Field("intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED));
 431+ doc.add(new Field("intitle","1", Field.Store.YES, Field.Index.NOT_ANALYZED));
432432 }
433433 if(corrected != null){
434434 doc.add(new Field("misspell",corrected, Field.Store.YES, Field.Index.NO));
@@ -451,7 +451,7 @@
452452 sb.append(val);
453453 }
454454 Document doc = new Document();
455 - doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED));
 455+ doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.NOT_ANALYZED));
456456 doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO));
457457
458458 setOmitNorms(doc);
@@ -470,9 +470,9 @@
471471 Document doc = new Document();
472472 String decomposed = FastWikiTokenizerEngine.decompose(word);
473473 ngramWriter.createNgramFields(doc,"word",decomposed,NgramIndexer.Type.WORDS);
474 - doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED));
 474+ doc.add(new Field("word",word, Field.Store.YES, Field.Index.NOT_ANALYZED));
475475 if(decomposed != word)
476 - doc.add(new Field("word",decomposed, Field.Store.NO, Field.Index.UN_TOKENIZED));
 476+ doc.add(new Field("word",decomposed, Field.Store.NO, Field.Index.NOT_ANALYZED));
477477 doc.add(new Field("freq",Integer.toString(freq), Field.Store.YES, Field.Index.NO));
478478 doc.add(new Field("meta1",dmeta.doubleMetaphone(decomposed), Field.Store.YES, Field.Index.NO));
479479 doc.add(new Field("meta2",dmeta.doubleMetaphone(decomposed,true), Field.Store.YES, Field.Index.NO));
@@ -485,7 +485,7 @@
486486 if(context == null)
487487 return;
488488 Document doc = new Document();
489 - doc.add(new Field("context_key",key, Field.Store.NO, Field.Index.UN_TOKENIZED));
 489+ doc.add(new Field("context_key",key, Field.Store.NO, Field.Index.NOT_ANALYZED));
490490 doc.add(new Field("context", context, Field.Store.YES, Field.Index.NO));
491491 setOmitNorms(doc);
492492 ngramWriter.addDocument(doc);
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -245,7 +245,7 @@
246246 }
247247 }
248248 Document d = new Document();
249 - d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.NO_NORMS));
 249+ d.add(new Field("prefix",prefix,Field.Store.NO,Field.Index.NOT_ANALYZED_NO_NORMS));
250250 d.add(new Field("articles",new StringList(selected).toString(),Field.Store.YES,Field.Index.NO));
251251 setOmitNorms(d);
252252 writer.addDocument(d);
@@ -268,7 +268,7 @@
269269 d.add(new Field("article",serialize(key,ref,redirect),Field.Store.YES,Field.Index.NO));
270270 ArrayList<Token> canonized = canonize(key,iid,filters);
271271 for(Token t : canonized){
272 - d.add(new Field("key",t.termText(),Field.Store.NO,Field.Index.TOKENIZED));
 272+ d.add(new Field("key",t.termText(),Field.Store.NO,Field.Index.ANALYZED));
273273 }
274274 setOmitNorms(d);
275275 writer.addDocument(d);
@@ -387,11 +387,11 @@
388388 return; // ignore redirects like byzantine -> byzantine empire
389389 // add to index
390390 Document d = new Document();
391 - d.add(new Field("pageid",pageId,Field.Store.NO,Field.Index.UN_TOKENIZED));
392 - d.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
 391+ d.add(new Field("pageid",pageId,Field.Store.NO,Field.Index.NOT_ANALYZED));
 392+ d.add(new Field("key",key,Field.Store.YES,Field.Index.NOT_ANALYZED));
393393 ArrayList<Token> canonized = canonize(key,iid,filters);
394394 for(Token t : canonized){
395 - d.add(new Field("key",t.termText(),Field.Store.NO,Field.Index.TOKENIZED));
 395+ d.add(new Field("key",t.termText(),Field.Store.NO,Field.Index.ANALYZED));
396396 }
397397 if(redirect!=null && !redirect.equals("")){ // redirect target and its rank
398398 d.add(new Field("redirect",redirect,Field.Store.YES,Field.Index.NO));
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/storage/RelatedStorage.java
@@ -34,7 +34,7 @@
3535 ensureWrite();
3636 StringList sl = new StringList(CompactRelated.convertToStringList(rel));
3737 Document doc = new Document();
38 - doc.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
 38+ doc.add(new Field("key",key,Field.Store.YES,Field.Index.NOT_ANALYZED));
3939 doc.add(new Field("related",sl.toString(),Field.Store.COMPRESS,Field.Index.NO));
4040 writer.addDocument(doc);
4141 }
@@ -43,7 +43,7 @@
4444 ensureWrite();
4545 StringList sl = new StringList(Related.convertToStringList(rel));
4646 Document doc = new Document();
47 - doc.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
 47+ doc.add(new Field("key",key,Field.Store.YES,Field.Index.NOT_ANALYZED));
4848 doc.add(new Field("related",sl.toString(),Field.Store.COMPRESS,Field.Index.NO));
4949 writer.addDocument(doc);
5050 }
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -682,10 +682,10 @@
683683 NamespaceFilter contentNamespaces = iid.getContentNamespaces();
684684
685685 // page_id from database, used to look up and replace entries on index updates
686 - doc.add(new Field("key", article.getIndexKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
 686+ doc.add(new Field("key", article.getIndexKey(), Field.Store.YES, Field.Index.NOT_ANALYZED));
687687
688688 // namespace, returned with results
689 - doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED));
 689+ doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.NOT_ANALYZED));
690690
691691 // raw rank value
692692 doc.add(new Field("rank",Integer.toString(article.getRank()),
@@ -694,7 +694,7 @@
695695 // redirect namespace
696696 if(article.isRedirect()){
697697 doc.add(new Field("redirect_namespace",Integer.toString(article.getRedirectTargetNamespace()),
698 - Field.Store.NO, Field.Index.UN_TOKENIZED));
 698+ Field.Store.NO, Field.Index.NOT_ANALYZED));
699699 }
700700
701701 if(contentNamespaces.contains(article.getNamespace())){
@@ -710,7 +710,7 @@
711711 float rankBoost = transformRank(article.getRank());
712712
713713 // prefix title for prefix: searches
714 - Field prefix = new Field("prefix", article.getNsTitleKey().toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED);
 714+ Field prefix = new Field("prefix", article.getNsTitleKey().toLowerCase(), Field.Store.NO, Field.Index.NOT_ANALYZED);
715715 prefix.setBoost(rankBoost);
716716 doc.add(prefix);
717717
@@ -737,7 +737,7 @@
738738 tokenizer.tokenize();
739739
740740 // title
741 - Field title = new Field(fields.title(), article.getTitle(), Field.Store.YES, Field.Index.TOKENIZED);
 741+ Field title = new Field(fields.title(), article.getTitle(), Field.Store.YES, Field.Index.ANALYZED);
742742 title.setBoost(rankBoost);
743743 doc.add(title);
744744
@@ -766,7 +766,7 @@
767767 }
768768
769769 // reverse title for wildcard searches
770 - Field rtitle = new Field(fields.reverse_title(), StringUtils.reverseString(article.getTitle()), Field.Store.NO, Field.Index.TOKENIZED);
 770+ Field rtitle = new Field(fields.reverse_title(), StringUtils.reverseString(article.getTitle()), Field.Store.NO, Field.Index.ANALYZED);
771771 rtitle.setBoost(rankBoost);
772772 doc.add(rtitle);
773773
@@ -775,7 +775,7 @@
776776 while (e.hasMoreElements()) {
777777 String key = (String)e.nextElement();
778778 String value = article.DiscussionThreadingInfo.get(key);
779 - doc.add( new Field( key, value, Store.YES, Index.UN_TOKENIZED) );
 779+ doc.add( new Field( key, value, Store.YES, Index.NOT_ANALYZED) );
780780 }
781781
782782 // extra info (for spellcheck indexes)
@@ -819,8 +819,8 @@
820820 SimpleDateFormat isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
821821 isoDate.setTimeZone(TimeZone.getTimeZone("GMT"));
822822 Document doc = new Document();
823 - doc.add(new Field("pageid",article.getPageIdStr(),Store.NO,Index.UN_TOKENIZED));
824 - doc.add(new Field("key",key,Store.NO,Index.UN_TOKENIZED));
 823+ doc.add(new Field("pageid",article.getPageIdStr(),Store.NO,Index.NOT_ANALYZED));
 824+ doc.add(new Field("key",key,Store.NO,Index.NOT_ANALYZED));
825825 for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
826826 FieldNameFactory fields = bs.getFields();
827827 FilterFactory filters = bs.getFilters();
@@ -845,15 +845,15 @@
846846 float rankBoost = transformRank(article.getRank());
847847 Document doc = new Document();
848848 log.debug("Adding interwiki title pageid="+suffix+":"+article.getPageIdStr()+", key="+suffix+":"+key);
849 - doc.add(new Field("pageid",suffix+":"+article.getPageIdStr(),Store.NO,Index.UN_TOKENIZED));
850 - doc.add(new Field("key",suffix+":"+key,Store.NO,Index.UN_TOKENIZED));
851 - doc.add(new Field("suffix",suffix,Store.YES,Index.UN_TOKENIZED));
852 - doc.add(new Field("dbname",dbname,Store.NO,Index.UN_TOKENIZED));
853 - doc.add(new Field("namespace",article.getNamespace(),Store.YES,Index.UN_TOKENIZED));
 849+ doc.add(new Field("pageid",suffix+":"+article.getPageIdStr(),Store.NO,Index.NOT_ANALYZED));
 850+ doc.add(new Field("key",suffix+":"+key,Store.NO,Index.NOT_ANALYZED));
 851+ doc.add(new Field("suffix",suffix,Store.YES,Index.NOT_ANALYZED));
 852+ doc.add(new Field("dbname",dbname,Store.NO,Index.NOT_ANALYZED));
 853+ doc.add(new Field("namespace",article.getNamespace(),Store.YES,Index.NOT_ANALYZED));
854854 // redirect namespace
855855 if(article.isRedirect()){
856856 doc.add(new Field("redirect_namespace",Integer.toString(article.getRedirectTargetNamespace()),
857 - Field.Store.NO, Field.Index.UN_TOKENIZED));
 857+ Field.Store.NO, Field.Index.NOT_ANALYZED));
858858 }
859859 Field title = new Field("title",article.getTitle(),Store.YES, Index.NO);
860860 title.setBoost(rankBoost);
Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/ranks/Links.java
@@ -361,16 +361,16 @@
362362 StringList ak = new StringList(anchors);
363363 Analyzer an = new SplitAnalyzer(1,false);
364364 Document doc = new Document();
365 - doc.add(new Field("article_pageid",pageId,Field.Store.YES,Field.Index.UN_TOKENIZED));
 365+ doc.add(new Field("article_pageid",pageId,Field.Store.YES,Field.Index.NOT_ANALYZED));
366366 // ns:title
367 - doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
 367+ doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.NOT_ANALYZED));
368368 if(redirectsTo != null)
369369 // redirect_ns:title|target_ns:title
370 - doc.add(new Field("redirect",redirectsTo+"|"+t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
 370+ doc.add(new Field("redirect",redirectsTo+"|"+t.getKey(),Field.Store.YES,Field.Index.NOT_ANALYZED));
371371 else{
372372 // a list of all links/anchors
373 - doc.add(new Field("links",lk.toString(),Field.Store.NO,Field.Index.TOKENIZED));
374 - doc.add(new Field("anchors",ak.toString(),Field.Store.NO,Field.Index.TOKENIZED));
 373+ doc.add(new Field("links",lk.toString(),Field.Store.NO,Field.Index.ANALYZED));
 374+ doc.add(new Field("anchors",ak.toString(),Field.Store.NO,Field.Index.ANALYZED));
375375 }
376376
377377 writer.addDocument(doc,an);

Comments

#Comment by Nikerabbit (talk | contribs)   12:59, 24 January 2012

You probably mean that the new ones are clearer, and the old ones are deprecated.

Status & tagging log