r22838 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r22837 | r22838 | r22839 >
Date:14:51, 8 June 2007
Author:rainman
Status:old
Tags:
Comment:
Introduced new fields:
* with stemmed titles
* multiple fields for redirects and keywords
* alttitle field for best ranked redirects
Modified paths:
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -17,13 +17,13 @@
1818 # host : db1.part db2.part
1919 # Multiple hosts can search multiple dbs (N-N mapping)
2020 [Search-Group]
21 -oblak : wikilucene, wikidev
 21+oblak : wikilucene wikidev
2222
2323 # Index nodes
2424 # host: db1.part db2.part
2525 # Each db.part can be indexed by only one host
2626 [Index]
27 -oblak: wikilucene, wikidev
 27+oblak: wikilucene wikidev
2828
2929 # Rsync path where indexes are on hosts, after default value put
3030 # hosts where the location differs
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -14,6 +14,7 @@
1515 import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy;
1616 import org.wikimedia.lsearch.config.Configuration;
1717 import org.wikimedia.lsearch.config.GlobalConfiguration;
 18+import org.wikimedia.lsearch.index.WikiIndexModifier;
1819 import org.wikimedia.lsearch.search.NamespaceFilter;
1920
2021 /**
@@ -31,6 +32,10 @@
3233 Configuration.setConfigFile(System.getProperty("user.dir")+"/test-data/mwsearch.conf.test");
3334 Configuration.open();
3435 WikiQueryParser.TITLE_BOOST = 2;
 36+ WikiQueryParser.REDIRECT_BOOST = 0.2f;
 37+ WikiQueryParser.ALT_TITLE_BOOST = 6;
 38+ WikiQueryParser.KEYWORD_BOOST = 0.05f;
 39+ WikiIndexModifier.ALT_TITLES = 3;
3540 try{
3641 WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer());
3742 Query q;
@@ -249,10 +254,10 @@
250255 assertEquals("(+(+namespace:0 +(+contents:1991 +category:\"olympic cities\")) -contents:1990) (+(+namespace:0 +(+title:1991^2.0 +category:\"olympic cities\")) -title:1990^2.0)",q.toString());
251256
252257 q = parser.parseTwoPass("main:ba*",NamespacePolicy.IGNORE);
253 - assertEquals("contents:ba* title:ba*^2.0",q.toString());
 258+ assertEquals("contents:ba title:ba*^2.0",q.toString());
254259
255260 q = parser.parseTwoPass("main:ba* all:lele",NamespacePolicy.REWRITE);
256 - assertEquals("(+(+namespace:0 +contents:ba*) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString());
 261+ assertEquals("(+(+namespace:0 +contents:ba) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString());
257262
258263 q = parser.parseTwoPass("main:ba*beans",NamespacePolicy.IGNORE);
259264 assertEquals("(+contents:ba +(contents:beans contents:bean^0.5)) (+title:ba^2.0 +title:beans^2.0)",q.toString());
@@ -279,27 +284,28 @@
280285 q = parser.parseTwoPass("[1,a12]:beans",NamespacePolicy.IGNORE);
281286 assertEquals("(+contents:1 +contents:a12 +(contents:beans contents:bean^0.5)) (+title:1^2.0 +title:a12^2.0 +title:beans^2.0)",q.toString());
282287
283 - // Redirect third pass tests
284 - q = parser.parseThreePass("beans",NamespacePolicy.IGNORE);
285 - assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 redirect:beans^2.0",q.toString());
 288+ // Redirect third/fourth pass tests
 289+ q = parser.parseFourPass("beans",NamespacePolicy.IGNORE,true);
 290+ assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 (alttitle1:beans^6.0 alttitle2:beans^6.0 alttitle3:beans^6.0 redirect1:beans^0.2 redirect2:beans^0.1 redirect3:beans^0.06666667 redirect4:beans^0.05 redirect5:beans^0.04) (keyword1:beans^0.05 keyword2:beans^0.025 keyword3:beans^0.016666668 keyword4:beans^0.0125 keyword5:beans^0.01)",q.toString());
286291
287 - q = parser.parseThreePass("beans everyone",NamespacePolicy.IGNORE);
288 - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false)",q.toString());
 292+ q = parser.parseFourPass("beans everyone",NamespacePolicy.IGNORE,true);
 293+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0) spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04) (spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01)",q.toString());
289294
290 - q = parser.parseThreePass("beans everyone incategory:mouse",NamespacePolicy.IGNORE);
291 - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) (+spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false) +category:mouse)",q.toString());
 295+ // TODO: check if this query will be optimized by lucene (categories)
 296+ q = parser.parseFourPass("beans everyone incategory:mouse",NamespacePolicy.IGNORE,true);
 297+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0 +category:mouse) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0 +category:mouse) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0 +category:mouse) (+spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 +category:mouse) (+spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 +category:mouse) (+spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 +category:mouse) (+spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 +category:mouse) (+spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04 +category:mouse)) ((+spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 +category:mouse) (+spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 +category:mouse) (+spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 +category:mouse) (+spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 +category:mouse) (+spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01 +category:mouse))",q.toString());
292298
293 - q = parser.parseThreePass("beans OR everyone",NamespacePolicy.IGNORE);
294 - assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0)",q.toString());
 299+ q = parser.parseFourPass("beans OR everyone",NamespacePolicy.IGNORE,true);
 300+ assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0) ((alttitle1:beans^6.0 alttitle1:everyone^6.0) (alttitle2:beans^6.0 alttitle2:everyone^6.0) (alttitle3:beans^6.0 alttitle3:everyone^6.0))",q.toString());
295301
296 - q = parser.parseThreePass("beans -everyone",NamespacePolicy.IGNORE);
297 - assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0)",q.toString());
 302+ q = parser.parseFourPass("beans -everyone",NamespacePolicy.IGNORE,true);
 303+ assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0) ((+alttitle1:beans^6.0 -alttitle1:everyone^6.0) (+alttitle2:beans^6.0 -alttitle2:everyone^6.0) (+alttitle3:beans^6.0 -alttitle3:everyone^6.0))",q.toString());
298304
299 - q = parser.parseThreePass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE);
300 - assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false))",q.toString());
 305+ q = parser.parseFourPass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE,true);
 306+ assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04)) ((+(namespace:0 namespace:1 namespace:2) +spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01))",q.toString());
301307
302 - q = parser.parseThreePass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE);
303 - assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0))",q.toString());
 308+ q = parser.parseFourPass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE,true);
 309+ assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0)) (((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+namespace:0 +alttitle1:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+namespace:0 +alttitle2:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+namespace:0 +alttitle3:mainly^6.0)))",q.toString());
304310
305311 // Test field extraction
306312 HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java
@@ -52,19 +52,9 @@
5353 public void writeEndPage() throws IOException {
5454 ArticleLinks r = links.get(page.Title.Namespace+":"+page.Title.Text);
5555 // register redirect
56 - String redirect = Localization.getRedirectTarget(revision.Text,langCode);
 56+ Title redirect = Localization.getRedirectTitle(revision.Text,langCode);
5757 if( redirect !=null ){
58 - int ns = 0;
59 - String title = redirect;
60 - String[] parts = redirect.split(":",2);
61 - if(parts.length == 2 && parts[0].length()>1){
62 - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
63 - if(inx != null){
64 - ns = inx;
65 - title = parts[1];
66 - }
67 - }
68 - r.redirectsTo = findArticleLinks(ns,title);
 58+ r.redirectsTo = findArticleLinks(redirect.getNamespace(),redirect.getTitle());
6959 } else // process links
7060 processLinks(revision.Text,page.Title.Namespace);
7161 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -96,7 +96,7 @@
9797 long start = System.currentTimeMillis();
9898
9999 // regenerate link and redirect information
100 - HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile),langCode);
 100+ HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile,langCode),langCode);
101101
102102 log.info("Third pass, indexing articles...");
103103
@@ -179,7 +179,7 @@
180180 return links;
181181 }
182182
183 - private static HashMap<String,ArticleLinks> getTitles(String inputfile) {
 183+ private static HashMap<String,ArticleLinks> getTitles(String inputfile,String langCode) {
184184 log.info("First pass, getting a list of valid articles...");
185185 InputStream input = null;
186186 try {
@@ -189,7 +189,7 @@
190190 return null;
191191 }
192192 // first pass, get titles
193 - TitleReader tr = new TitleReader();
 193+ TitleReader tr = new TitleReader(langCode);
194194 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
195195 try {
196196 reader.readDump();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java
@@ -4,12 +4,15 @@
55 import java.util.ArrayList;
66 import java.util.HashMap;
77 import java.util.HashSet;
 8+import java.util.Iterator;
 9+import java.util.Map.Entry;
810
911 import org.mediawiki.importer.DumpWriter;
1012 import org.mediawiki.importer.Page;
1113 import org.mediawiki.importer.Revision;
1214 import org.mediawiki.importer.Siteinfo;
1315 import org.wikimedia.lsearch.beans.ArticleLinks;
 16+import org.wikimedia.lsearch.util.Localization;
1417
1518 /**
1619 * Read a HashSet of titles from dump
@@ -21,6 +24,11 @@
2225 Page page;
2326 Revision revision;
2427 HashMap<String,ArticleLinks> titles = new HashMap<String,ArticleLinks>();
 28+ protected String langCode;
 29+
 30+ public TitleReader(String langCode){
 31+ this.langCode = langCode;
 32+ }
2533
2634 public void writeRevision(Revision revision) throws IOException {
2735 this.revision = revision;
@@ -42,7 +50,12 @@
4351 // nop
4452 }
4553 public void writeSiteinfo(Siteinfo info) throws IOException {
46 - // nop
 54+ // write siteinfo to localization
 55+ Iterator it = info.Namespaces.orderedEntries();
 56+ while(it.hasNext()){
 57+ Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
 58+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
 59+ }
4760 }
4861 public void writeStartWiki() throws IOException {
4962 // nop
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -3,6 +3,7 @@
44 import java.io.IOException;
55 import java.util.ArrayList;
66 import java.util.HashMap;
 7+import java.util.Iterator;
78 import java.util.Map.Entry;
89 import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
910 import java.util.regex.Matcher;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -106,7 +106,7 @@
107107 IndexWriter writer = indexes.get(target.toString());
108108 if(writer == null)
109109 return;
110 - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters);
 110+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters,iid);
111111 Document doc = (Document) ret[0];
112112 Analyzer analyzer = (Analyzer) ret[1];
113113 try {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -10,6 +10,7 @@
1111 import org.apache.log4j.Logger;
1212 import org.apache.lucene.analysis.Analyzer;
1313 import org.apache.lucene.document.Document;
 14+import org.apache.lucene.index.IndexReader;
1415 import org.apache.lucene.queryParser.ParseException;
1516 import org.apache.lucene.search.Hits;
1617 import org.apache.lucene.search.Query;
@@ -117,13 +118,15 @@
118119 Query q = null;
119120 SearchResults res = null;
120121 long searchStart = System.currentTimeMillis();
121 - Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes();
 122+ Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes();
 123+ boolean searchAll = false;
122124
123125 // if search is over one field, try to use filters
124126 if(fields.size()==1){
125 - if(fields.contains(new NamespaceFilter()))
 127+ if(fields.contains(new NamespaceFilter())){
126128 nsfw = null; // empty filter: "all" keyword
127 - else if(!fields.contains(nsDefault)){
 129+ searchAll = true;
 130+ } else if(!fields.contains(nsDefault)){
128131 // use the specified prefix in the query (if it can be cached)
129132 NamespaceFilter f = fields.toArray(new NamespaceFilter[] {})[0];
130133 if(f.cardinality()==1 || NamespaceCache.isComposable(f))
@@ -135,7 +138,10 @@
136139
137140 try {
138141 if(nsfw == null){
139 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
 142+ if(searchAll)
 143+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
 144+ else
 145+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
140146 }
141147 else{
142148 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -10,6 +10,7 @@
1111 import java.util.Arrays;
1212 import java.util.Collection;
1313 import java.util.Collections;
 14+import java.util.Comparator;
1415 import java.util.HashSet;
1516 import java.util.Hashtable;
1617 import java.util.Set;
@@ -28,10 +29,12 @@
2930 import org.wikimedia.lsearch.analyzers.Analyzers;
3031 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
3132 import org.wikimedia.lsearch.analyzers.FilterFactory;
 33+import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer;
3234 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
3335 import org.wikimedia.lsearch.beans.Article;
3436 import org.wikimedia.lsearch.beans.IndexReportCard;
3537 import org.wikimedia.lsearch.beans.Redirect;
 38+import org.wikimedia.lsearch.beans.Title;
3639 import org.wikimedia.lsearch.config.GlobalConfiguration;
3740 import org.wikimedia.lsearch.config.IndexId;
3841 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
@@ -54,6 +57,8 @@
5558 }
5659
5760 static public final int MAX_FIELD_LENGTH = 100000;
 61+ /** number of additional title1, title2, .. etc fields to be filled in with redirects */
 62+ static public int ALT_TITLES = 3;
5863 /** Simple implementation of batch addition and deletion */
5964 class SimpleIndexModifier {
6065 protected IndexId iid;
@@ -179,7 +184,7 @@
180185 if(!checkPreconditions(rec))
181186 continue; // article shouldn't be added for some (heuristic) reason
182187 IndexReportCard card = getReportCard(rec);
183 - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters);
 188+ Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters,iid);
184189 Document doc = (Document) ret[0];
185190 Analyzer analyzer = (Analyzer) ret[1];
186191 try {
@@ -223,15 +228,17 @@
224229 * @return
225230 */
226231 public static boolean checkAddPreconditions(Article ar, String langCode){
227 - if(ar.getNamespace().equals("0")){
228 - String redirect = Localization.getRedirectTarget(ar.getContents(),langCode);
229 - if(redirect != null)
230 - return false; // don't add redirects
231 - /*if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
 232+ Title redirect = Localization.getRedirectTitle(ar.getContents(),langCode);
 233+ int ns = Integer.parseInt(ar.getNamespace());
 234+ if(redirect!=null && redirect.getNamespace() == ns){
 235+ return false; // don't add redirects to same namespace, always add as redirect field
 236+ }
 237+
 238+ /*if(ar.getNamespace().equals("0")){
 239+ if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
232240 log.debug("Not adding "+ar+" into index: "+ar.getContents());
233241 return false;
234242 } */
235 - }
236243 return true;
237244 }
238245
@@ -243,24 +250,43 @@
244251 */
245252 protected static void transformArticleForIndexing(Article ar) {
246253 ArrayList<Redirect> redirects = ar.getRedirects();
 254+ // sort redirects by their rank
 255+ Collections.sort(redirects,new Comparator<Redirect>() {
 256+ public int compare(Redirect o1,Redirect o2){
 257+ return o2.getReferences() - o1.getReferences();
 258+ }
 259+ });
247260 int ns = Integer.parseInt(ar.getNamespace());
248261 ar.setRank(ar.getReferences()); // base rank value
249262 if(redirects != null){
250263 ArrayList<String> filtered = new ArrayList<String>();
 264+ ArrayList<Integer> ranks = new ArrayList<Integer>();
251265 // index only redirects from the same namespace
252266 // to avoid a lot of unusable redirects from/to
253267 // user namespace, but always index redirect FROM main
254268 for(Redirect r : redirects){
255 - if((ns == 0 && r.getNamespace() == 0) || ns != 0){
 269+ if(ns == r.getNamespace()){
256270 filtered.add(r.getTitle());
 271+ ranks.add(r.getReferences());
257272 ar.addToRank(r.getReferences()+1);
258273 } else
259274 log.debug("Ignoring redirect "+r+" to "+ar);
260275 }
261276 ar.setRedirectKeywords(filtered);
 277+ ar.setRedirectKeywordRanks(ranks);
262278 }
263279 }
264280
 281+ /** Check if for this article for this db we should extract keywords */
 282+ public static boolean checkKeywordPreconditions(Article article, IndexId iid) {
 283+ if(global == null)
 284+ global = GlobalConfiguration.getInstance();
 285+ if(article.getNamespace().equals("0") && global.useKeywordScoring(iid.getDBname()))
 286+ return true;
 287+ else
 288+ return false;
 289+ }
 290+
265291 /**
266292 * Create necessary directories for index
267293 * @param dbname
@@ -372,7 +398,7 @@
373399 * @param languageAnalyzer
374400 * @return array { document, analyzer }
375401 */
376 - public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){
 402+ public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters, IndexId iid){
377403 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
378404 WikiTokenizer tokenizer = null;
379405 Document doc = new Document();
@@ -387,24 +413,27 @@
388414 doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED));
389415
390416 // boost document title with it's article rank
391 - Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
 417+ Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
392418 //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
393419 float rankBoost = calculateArticleRank(article.getRank());
394420 title.setBoost(rankBoost);
395421 doc.add(title);
396422
 423+ Field stemtitle = new Field("stemtitle", article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED);
 424+ //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
 425+ stemtitle.setBoost(rankBoost);
 426+ doc.add(stemtitle);
 427+
 428+ // put the best redirects as alternative titles
 429+ makeAltTitles(doc,"alttitle",article);
 430+
397431 // add titles of redirects, generated from analyzer
398 - Field redirect = new Field("redirect", "",
399 - Field.Store.NO, Field.Index.TOKENIZED);
400 - redirect.setBoost(rankBoost);
401 - doc.add(redirect);
 432+ makeKeywordField(doc,"redirect",rankBoost);
402433
403 - // most significat words in the text, gets extra score, from analyzer
404 - Field keyword = new Field("keyword", "",
405 - Field.Store.NO, Field.Index.TOKENIZED);
406 - keyword.setBoost(rankBoost);
407 - doc.add(keyword);
408 -
 434+ if(checkKeywordPreconditions(article,iid))
 435+ // most significant words in the text, gets extra score, from analyzer
 436+ makeKeywordField(doc,"keyword",rankBoost);
 437+
409438 // the next fields are generated using wikitokenizer
410439 doc.add(new Field("contents", "",
411440 Field.Store.NO, Field.Index.TOKENIZED));
@@ -425,7 +454,35 @@
426455
427456 return new Object[] { doc, perFieldAnalyzer };
428457 }
429 -
 458+
 459+ /** Make a multiple keyword field, e.g. redirect1, redirect2, redirect3 ... */
 460+ protected static void makeKeywordField(Document doc, String prefix, float boost) {
 461+ for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
 462+ Field keyfield = new Field(prefix+i, "",
 463+ Field.Store.NO, Field.Index.TOKENIZED);
 464+ keyfield.setBoost(boost);
 465+ doc.add(keyfield);
 466+ }
 467+
 468+ }
 469+
 470+ protected static void makeAltTitles(Document doc, String prefix, Article article) {
 471+ // the redirects, rank list are sorted..
 472+ final ArrayList<String> redirects = article.getRedirectKeywords();
 473+ final ArrayList<Integer> ranks = article.getRedirectKeywordRanks();
 474+ if(redirects.size() == 0)
 475+ return;
 476+ // add alternative titles alttitle1, alttitle2 ...
 477+ for(int i=0;i<ALT_TITLES && i<redirects.size();i++){
 478+ if(ranks.get(i) == 0)
 479+ break; // we don't want redirects with zero links
 480+ //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i));
 481+ Field alttitle = new Field("alttitle"+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED);
 482+ alttitle.setBoost(calculateArticleRank(ranks.get(i)));
 483+ doc.add(alttitle);
 484+ }
 485+ }
 486+
430487 /**
431488 *
432489 * Calculate document boost (article rank) from number of
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -16,12 +16,12 @@
1717 /**
1818 * For content:
1919 * * length norm is a linear function, with f(1) = 1
20 - * and f(10000) = 0.2
 20+ * and f(10000) = 0.5
2121 *
22 - * For titles:
 22+ * For titles / title aliases:
2323 * * 1/sqrt(term^3)
2424 *
25 - * For redirect:
 25+ * For redirect / keywords:
2626 * * no length norm
2727 *
2828 */
@@ -35,11 +35,11 @@
3636 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
3737 return f;
3838 }
39 - } else if(fieldName.equals("title")){
 39+ } else if(fieldName.equals("title") || fieldName.startsWith("alttitle")){
4040 float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
4141 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
4242 return f;
43 - } else if(fieldName.equals("redirect") || fieldName.equals("keyword")){
 43+ } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword")){
4444 return 1;
4545 } else
4646 return super.lengthNorm(fieldName,numTokens);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -22,6 +22,7 @@
2323 import org.apache.lucene.search.spans.SpanQuery;
2424 import org.apache.lucene.search.spans.SpanTermQuery;
2525 import org.wikimedia.lsearch.config.GlobalConfiguration;
 26+import org.wikimedia.lsearch.index.WikiIndexModifier;
2627 import org.wikimedia.lsearch.search.NamespaceFilter;
2728 import org.wikimedia.lsearch.util.UnicodeDecomposer;
2829
@@ -57,6 +58,7 @@
5859 private String field; // current field
5960 private String defaultField; // the default field value
6061 private float defaultBoost = 1;
 62+ private float defaultAliasBoost = ALIAS_BOOST;
6163 protected enum TokenType {WORD, FIELD, AND, OR, EOF };
6264
6365 private TokenStream tokenStream;
@@ -70,11 +72,16 @@
7173 protected boolean disableTitleAliases;
7274
7375 /** boost for alias words from analyzer */
74 - public final float ALIAS_BOOST = 0.5f;
 76+ public static float ALIAS_BOOST = 0.5f;
7577 /** boost for title field */
76 - public static float TITLE_BOOST = 8;
 78+ public static float TITLE_BOOST = 6;
 79+ public static float TITLE_ALIAS_BOOST = 0.2f;
 80+ public static float STEM_TITLE_BOOST = 2;
 81+ public static float STEM_TITLE_ALIAS_BOOST = 0.4f;
7782 public static float REDIRECT_BOOST = 0.2f;
78 - public static float KEYWORD_BOOST = 0.05f;
 83+ public static float ALT_TITLE_BOOST = 2;
 84+ public static float ALT_TITLE_ALIAS_BOOST = 0.4f;
 85+ public static float KEYWORD_BOOST = 0.02f;
7986
8087 /** Policies in treating field names:
8188 *
@@ -663,9 +670,10 @@
664671 return new TermQuery(makeTerm());
665672 }
666673
667 - // check for wildcard seaches, they are also not analyzed/stemmed
 674+ // check for wildcard searches, they are also not analyzed/stemmed, only for titles
668675 // wildcard signs are allowed only at the end of the word, minimum one letter word
669 - if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?')){
 676+ if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?') &&
 677+ defaultField.equals("title")){
670678 Query ret = new WildcardQuery(makeTerm());
671679 ret.setBoost(defaultBoost);
672680 return ret;
@@ -691,12 +699,12 @@
692700 else if(token.type().equals("stemmed")){
693701 // stemmed word
694702 t = new TermQuery(makeTerm(token));
695 - t.setBoost(ALIAS_BOOST*defaultBoost);
 703+ t.setBoost(defaultAliasBoost*defaultBoost);
696704 cur.add(t,aliasOccur);
697705 } else if(token.type().equals("alias")){
698706 // produced by alias engine (e.g. for sr)
699707 t = new TermQuery(makeTerm(token));
700 - t.setBoost(ALIAS_BOOST*defaultBoost);
 708+ t.setBoost(defaultAliasBoost*defaultBoost);
701709 cur.add(t,aliasOccur);
702710 }
703711 if( cur != bq) // returned from nested query
@@ -763,7 +771,7 @@
764772
765773 /** Duplicate a term query, setting "title" as field */
766774 private TermQuery makeTitleTermQuery(TermQuery tq){
767 - if(disableTitleAliases && tq.getBoost()==ALIAS_BOOST)
 775+ if(disableTitleAliases && tq.getBoost()==defaultAliasBoost)
768776 return null;
769777 Term term = tq.getTerm();
770778 if(term.field().equals(defaultField)){
@@ -778,7 +786,7 @@
779787
780788 /** Duplicate a phrase query, setting "title" as field */
781789 private PhraseQuery makeTitlePhraseQuery(PhraseQuery pq){
782 - if(disableTitleAliases && pq.getBoost()==ALIAS_BOOST)
 790+ if(disableTitleAliases && pq.getBoost()==defaultAliasBoost)
783791 return null;
784792 PhraseQuery pq2 = new PhraseQuery();
785793 Term[] terms = pq.getTerms();
@@ -1011,7 +1019,7 @@
10121020 span = spans.get(0);
10131021 else{
10141022 // make a span-near query that has a slop 1/2 of tokenGap
1015 - span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.tokenGap-1)/2,false);
 1023+ span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.TOKEN_GAP-1)/2,false);
10161024 span.setBoost(boost);
10171025 }
10181026 }
@@ -1028,7 +1036,81 @@
10291037 }
10301038 return null;
10311039 }
 1040+
 1041+ protected BooleanQuery multiplySpans(Query query, int level, String fieldName, float boost){
 1042+ BooleanQuery bq = new BooleanQuery(true);
 1043+ for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
 1044+ Query q = extractSpans(query,0,fieldName+i,boost/i);
 1045+ if(q != null)
 1046+ bq.add(q,BooleanClause.Occur.SHOULD);
 1047+ }
 1048+
 1049+ if(bq.getClauses() == null || bq.getClauses().length==0)
 1050+ return null;
 1051+ else
 1052+ return bq;
 1053+ }
10321054
 1055+ /** Make a redirect query in format altitle1:query altitle2:query ... redirect:spanquery */
 1056+ protected BooleanQuery makeRedirectQuery(String queryText, Query qt) {
 1057+ BooleanQuery bq = new BooleanQuery(true);
 1058+ float olfDefaultBoost = defaultBoost;
 1059+ String contentField = defaultField;
 1060+ defaultBoost = ALT_TITLE_BOOST;
 1061+ defaultAliasBoost = ALT_TITLE_ALIAS_BOOST;
 1062+ for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){
 1063+ defaultField = "alttitle"+i;
 1064+ Query q = parseRaw(queryText);
 1065+ if(q != null)
 1066+ bq.add(q,BooleanClause.Occur.SHOULD);
 1067+ }
 1068+ // pop stack
 1069+ defaultField = contentField;
 1070+ defaultBoost = olfDefaultBoost;
 1071+ defaultAliasBoost = ALIAS_BOOST;
 1072+
 1073+ Query qs = multiplySpans(qt,0,"redirect",REDIRECT_BOOST);
 1074+ // merge queries
 1075+ if(qs != null){
 1076+ bq.add(qs,BooleanClause.Occur.SHOULD);
 1077+ }
 1078+ if(bq.getClauses() == null || bq.getClauses().length==0)
 1079+ return null;
 1080+ else
 1081+ return bq;
 1082+
 1083+ }
 1084+
 1085+ /** Make title query in format: title:query stemtitle:stemmedquery */
 1086+ protected Query makeTitleQuery(String queryText) {
 1087+ String contentField = defaultField;
 1088+ float olfDefaultBoost = defaultBoost;
 1089+ defaultField = "title"; // now parse the title part
 1090+ defaultBoost = TITLE_BOOST;
 1091+ defaultAliasBoost = TITLE_ALIAS_BOOST;
 1092+ Query qt = parseRaw(queryText);
 1093+ // stemmed title
 1094+ defaultField = "stemtitle";
 1095+ defaultBoost = STEM_TITLE_BOOST;
 1096+ defaultAliasBoost = STEM_TITLE_ALIAS_BOOST;
 1097+ Query qs = parseRaw(queryText);
 1098+ // pop stack
 1099+ defaultField = contentField;
 1100+ defaultBoost = olfDefaultBoost;
 1101+ defaultAliasBoost = ALIAS_BOOST;
 1102+
 1103+ if(qt == qs) // either null, or category query
 1104+ return qt;
 1105+ if(qt == null)
 1106+ return qs;
 1107+ if(qs == null)
 1108+ return qt;
 1109+ BooleanQuery bq = new BooleanQuery(true);
 1110+ bq.add(qt,BooleanClause.Occur.SHOULD);
 1111+ bq.add(qs,BooleanClause.Occur.SHOULD);
 1112+ return bq;
 1113+ }
 1114+
10331115 /**
10341116 * Main function for multi-pass parsing.
10351117 *
@@ -1039,17 +1121,12 @@
10401122 */
10411123 protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){
10421124 if(policy != null)
1043 - this.namespacePolicy = policy;
1044 - float olfDefaultBoost = defaultBoost;
 1125+ this.namespacePolicy = policy;
10451126 defaultBoost = 1;
1046 - Query qc = parseRaw(queryText);
1047 - String contentField = defaultField;
1048 - defaultField = "title"; // now parse the title part
1049 - defaultBoost = TITLE_BOOST;
1050 - Query qt = parseRaw(queryText);
1051 - // pop stack
1052 - defaultField = contentField;
1053 - defaultBoost = olfDefaultBoost;
 1127+ defaultAliasBoost = ALIAS_BOOST;
 1128+ Query qc = parseRaw(queryText);
 1129+
 1130+ Query qt = makeTitleQuery(queryText);
10541131 if(qc == null || qt == null)
10551132 return new BooleanQuery();
10561133 if(qc.equals(qt))
@@ -1058,15 +1135,23 @@
10591136 bq.add(qc,BooleanClause.Occur.SHOULD);
10601137 bq.add(qt,BooleanClause.Occur.SHOULD);
10611138
 1139+ Query nostem = null;
 1140+ if(makeRedirect || makeKeywords){
 1141+ String contentField = defaultField;
 1142+ defaultField = "keyword"; // this field is never stemmed
 1143+ nostem = parseRaw(queryText);
 1144+ defaultField = contentField;
 1145+ }
 1146+
10621147 // redirect pass
1063 - if(makeRedirect){
1064 - Query qr = extractSpans(qt,0,"redirect",REDIRECT_BOOST);
 1148+ if(makeRedirect && nostem!=null){
 1149+ BooleanQuery qr = makeRedirectQuery(queryText,nostem);
10651150 if(qr != null)
10661151 bq.add(qr,BooleanClause.Occur.SHOULD);
10671152 }
10681153 // keyword pass
1069 - if(makeKeywords){
1070 - Query qk = extractSpans(qt,0,"keyword",KEYWORD_BOOST);
 1154+ if(makeKeywords && nostem!=null){
 1155+ Query qk = multiplySpans(nostem,0,"keyword",KEYWORD_BOOST);
10711156 if(qk != null)
10721157 bq.add(qk,BooleanClause.Occur.SHOULD);
10731158 }
@@ -1074,7 +1159,7 @@
10751160 return bq;
10761161
10771162 }
1078 -
 1163+
10791164 /**
10801165 * Three parse pases: contents, title, redirect
10811166 *
@@ -1099,6 +1184,10 @@
11001185 return parseMultiPass(queryText,policy,true,makeKeywords);
11011186 }
11021187
/**
 * Four-pass parse: contents, title, redirect, and optionally keywords.
 * NOTE(review): currently delegates to parseMultiPass with the exact same
 * arguments as parseThreePass — confirm whether a distinct fourth pass
 * (e.g. forcing makeKeywords) was intended here.
 *
 * @throws ParseException declared for API symmetry with the other parse entry points
 */
public Query parseFourPass(String queryText, NamespacePolicy policy, boolean makeKeywords) throws ParseException{
	return parseMultiPass(queryText,policy,true,makeKeywords);
}
 1191+
11031192 /**
11041193 * Parse the query according to policy. Instead of rewrite phrase, simply pass
11051194 * twice the query with different default fields.
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -14,6 +14,7 @@
1515 import org.apache.lucene.analysis.th.ThaiWordFilter;
1616 import org.wikimedia.lsearch.config.GlobalConfiguration;
1717 import org.wikimedia.lsearch.config.IndexId;
 18+import org.wikimedia.lsearch.index.WikiIndexModifier;
1819 import org.wikimedia.lsearch.test.AliasPorterStemFilter;
1920
2021 /**
@@ -63,13 +64,29 @@
6465 new CategoryAnalyzer(categories));
6566 perFieldAnalyzer.addAnalyzer("title",
6667 getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
67 - perFieldAnalyzer.addAnalyzer("redirect",
68 - new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory()));
69 - perFieldAnalyzer.addAnalyzer("keyword",
70 - new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory()));
 68+ perFieldAnalyzer.addAnalyzer("stemtitle",
 69+ getTitleAnalyzer(filters));
 70+ setAltTitleAnalyzer(perFieldAnalyzer,"alttitle",
 71+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
 72+ setKeywordAnalyzer(perFieldAnalyzer,"redirect",
 73+ new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),"redirect"));
 74+ setKeywordAnalyzer(perFieldAnalyzer,"keyword",
 75+ new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),"keyword"));
7176 return new Object[] {perFieldAnalyzer,tokenizer};
7277 }
7378
 79+ protected static void setAltTitleAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, Analyzer analyzer) {
 80+ for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){
 81+ perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
 82+ }
 83+ }
 84+
 85+ protected static void setKeywordAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, KeywordsAnalyzer analyzer) {
 86+ for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
 87+ perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
 88+ }
 89+ }
 90+
7491 public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
7592 if(global == null)
7693 global = GlobalConfiguration.getInstance();
@@ -95,6 +112,12 @@
96113 new QueryLanguageAnalyzer(filters));
97114 perFieldAnalyzer.addAnalyzer("title",
98115 getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
 116+ perFieldAnalyzer.addAnalyzer("stemtitle",
 117+ getTitleAnalyzer(filters));
 118+ setAltTitleAnalyzer(perFieldAnalyzer,"alttitle",
 119+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
 120+ perFieldAnalyzer.addAnalyzer("keyword",
 121+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
99122
100123 return perFieldAnalyzer;
101124 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -25,31 +25,68 @@
2626 *
2727 */
2828 public class KeywordsAnalyzer extends Analyzer{
29 - static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
30 - protected ArrayList<String> keywords;
31 - protected FilterFactory filters;
32 - protected KeywordsTokenStream tokens;
 29+ static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
 30+ protected KeywordsTokenStream[] tokensBySize = null;
 31+ protected String prefix;
 32+
 33+ /** number of fields to be generated, e.g. keyword1 for single-word keywords,
 34+ * keyword2 for two-word keywords, etc ... the last field has all the remaining keys
 35+ */
 36+ public static final int KEYWORD_LEVELS = 5;
 37+ /** positional increment between different redirects */
 38+ public static final int TOKEN_GAP = 201;
3339
34 - public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters){
 40+ public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters, String prefix){
3541 ArrayList<String> k = new ArrayList<String>();
3642 if(keywords != null)
3743 k.addAll(keywords);
38 - tokens = new KeywordsTokenStream(k,filters);
 44+ init(k,filters,prefix);
3945 }
 46+ public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters, String prefix){
 47+ init(keywords,filters,prefix);
 48+ }
4049
41 - public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters){
42 - tokens = new KeywordsTokenStream(keywords,filters);
 50+ protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix) {
 51+ this.prefix = prefix;
 52+ tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS];
 53+ if(keywords == null){
 54+ // init empty token streams
 55+ for(int i=0; i< KEYWORD_LEVELS; i++){
 56+ tokensBySize[i] = new KeywordsTokenStream(null,filters);
 57+ }
 58+ return;
 59+ }
 60+ ArrayList<ArrayList<String>> keywordsBySize = new ArrayList<ArrayList<String>>();
 61+ for(int i=0;i<KEYWORD_LEVELS;i++)
 62+ keywordsBySize.add(new ArrayList<String>());
 63+ // arange keywords into a list by token number
 64+ for(String k : keywords){
 65+ ArrayList<Token> parsed = new FastWikiTokenizerEngine(k).parse();
 66+ if(parsed.size() == 0)
 67+ continue;
 68+ else if(parsed.size() < KEYWORD_LEVELS)
 69+ keywordsBySize.get(parsed.size()-1).add(k);
 70+ else
 71+ keywordsBySize.get(KEYWORD_LEVELS-1).add(k);
 72+ }
 73+ for(int i=0; i< KEYWORD_LEVELS; i++){
 74+ tokensBySize[i] = new KeywordsTokenStream(keywordsBySize.get(i),filters);
 75+ }
4376 }
44 - /** positional increment between different redirects */
45 - public static final int tokenGap = 201;
4677
4778 @Override
4879 public TokenStream tokenStream(String fieldName, Reader reader) {
49 - return tokens;
 80+ if(fieldName.startsWith(prefix)){
 81+ int inx = Integer.parseInt(fieldName.substring(prefix.length()));
 82+ return tokensBySize[inx-1];
 83+ } else{
 84+ log.error("Trying to get tokenStream for wrong field "+fieldName);
 85+ return null;
 86+ }
5087 }
@Override
public TokenStream tokenStream(String fieldName, String text) {
	// Delegate to the Reader variant; the reader argument is unused there
	// because the keyword token streams are pre-built per level.
	return tokenStream(fieldName,(Reader)null);
}
5592
5693 class KeywordsTokenStream extends TokenStream {
@@ -80,7 +117,7 @@
81118 if(t == null){
82119 t = openNext();
83120 if(t != null)
84 - t.setPositionIncrement(tokenGap);
 121+ t.setPositionIncrement(TOKEN_GAP);
85122 }
86123 return t;
87124 } else{
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java
@@ -29,6 +29,11 @@
3030 public void setTitle(String title) {
3131 this.title = title;
3232 }
 33+ @Override
 34+ public String toString() {
 35+ return namespace+":"+title+" ("+references+")";
 36+ }
3337
3438
 39+
3540 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java
@@ -43,6 +43,8 @@
4444 private ArrayList<Redirect> redirects;
4545 /** generated before indexing from the list of redirects */
4646 private transient ArrayList<String> redirectKeywords;
 47+ /** paired with previous list, ranks for each redirect */
 48+ private transient ArrayList<Integer> redirectKeywordRanks;
4648 /** generated before indexing from the references to this article, and references from redirects */
4850 private transient int rank;
4951
@@ -182,6 +184,16 @@
183185 public void setRedirectKeywords(ArrayList<String> redirectKeywords) {
184186 this.redirectKeywords = redirectKeywords;
185187 }
 188+
 189+ public ArrayList<Integer> getRedirectKeywordRanks() {
 190+ return redirectKeywordRanks;
 191+ }
 192+
 193+ public void setRedirectKeywordRanks(ArrayList<Integer> redirectKeywordRanks) {
 194+ this.redirectKeywordRanks = redirectKeywordRanks;
 195+ }
186196
187197
 198+
 199+
188200 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -2,6 +2,7 @@
33
44 import java.io.IOException;
55 import java.util.ArrayList;
 6+import java.util.Iterator;
67
78 import org.apache.log4j.Logger;
89 import org.mediawiki.importer.DumpWriter;
@@ -11,8 +12,10 @@
1213 import org.mediawiki.importer.Title;
1314 import org.wikimedia.lsearch.beans.Article;
1415 import org.wikimedia.lsearch.beans.Redirect;
 16+import org.wikimedia.lsearch.config.GlobalConfiguration;
1517 import org.wikimedia.lsearch.config.IndexId;
1618 import org.wikimedia.lsearch.index.IndexUpdateRecord;
 19+import org.wikimedia.lsearch.util.Localization;
1720
1821 public class IndexUpdatesCollector implements DumpWriter {
1922 Logger log = Logger.getLogger(DumpWriter.class);
@@ -23,9 +26,11 @@
2427 protected int references = 0;
2528 protected ArrayList<Redirect> redirects = new ArrayList<Redirect>();
2629 protected Siteinfo info = null;
 30+ protected String langCode;
2731
public IndexUpdatesCollector(IndexId iid){
	this.iid = iid;
	// resolve the content language for this index's database; used later to
	// register namespace prefixes with the localization tables
	this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
}
3136
3237 public void addRedirect(String redirectTitle, int references) {
@@ -69,6 +74,13 @@
7075
7176 public void writeSiteinfo(Siteinfo info) throws IOException {
7277 this.info = info;
 78+ // write to localization
 79+ Iterator it = info.Namespaces.orderedEntries();
 80+ while(it.hasNext()){
 81+ Integer inx = (Integer)it.next();
 82+ String prefix = info.Namespaces.getPrefix(inx);
 83+ Localization.addCustomMapping(prefix,inx,langCode);
 84+ }
7385 }
7486
7587 public void close() throws IOException {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
@@ -10,6 +10,7 @@
1111 import java.util.Map.Entry;
1212
1313 import org.apache.log4j.Logger;
 14+import org.wikimedia.lsearch.beans.Title;
1415 import org.wikimedia.lsearch.config.Configuration;
1516
1617 /**
@@ -27,7 +28,40 @@
2829 /** Languages for which loading of localization failed */
2930 protected static HashSet<String> badLocalizations = new HashSet<String>();
3031 protected static HashSet<String> interwiki = null;
 32+ /** lowercased canonical names of namespaces */
 33+ protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>();
// Canonical (English) namespace names mapped to MediaWiki namespace indexes.
// Keys are lowercase with spaces as underscores; the main namespace (0) has
// no prefix and is intentionally absent — unprefixed titles fall back to it.
static{
	canonicalNamespaces.put("media",-2);
	canonicalNamespaces.put("special",-1);
	canonicalNamespaces.put("talk",1);
	canonicalNamespaces.put("user",2);
	canonicalNamespaces.put("user_talk",3);
	canonicalNamespaces.put("project",4);
	canonicalNamespaces.put("project_talk",5);
	canonicalNamespaces.put("image",6);
	canonicalNamespaces.put("image_talk",7);
	canonicalNamespaces.put("mediawiki",8);
	canonicalNamespaces.put("mediawiki_talk",9);
	canonicalNamespaces.put("template",10);
	canonicalNamespaces.put("template_talk",11);
	canonicalNamespaces.put("help",12);
	canonicalNamespaces.put("help_talk",13);
	canonicalNamespaces.put("category",14);
	canonicalNamespaces.put("category_talk",15);
}
3153
 54+ /** Add custom mapping not found in localization files from other source, e.g. project name, etc.. */
 55+ public static void addCustomMapping(String namespace, int index, String langCode){
 56+ synchronized(lock){
 57+ Hashtable<String,Integer> map = namespaces.get(langCode);
 58+ if(map == null){
 59+ map = new Hashtable<String,Integer>();
 60+ namespaces.put(langCode,map);
 61+ }
 62+ map.put(namespace.toLowerCase(),index);
 63+ }
 64+ }
 65+
3266 public static HashSet<String> getLocalizedImage(String langCode){
3367 return getLocalizedNamespace(langCode,6);
3468 }
@@ -169,6 +203,25 @@
170204 return null;
171205 }
172206
 207+ public static Title getRedirectTitle(String text, String lang){
 208+ String full = getRedirectTarget(text,lang);
 209+ if(full == null)
 210+ return null;
 211+ String[] parts = full.split(":",2);
 212+ if(parts.length == 2){
 213+ String ns = parts[0].toLowerCase();
 214+ // check canonical
 215+ if(canonicalNamespaces.containsKey(ns))
 216+ return new Title(canonicalNamespaces.get(ns),parts[1]);
 217+ // check lang namespaces
 218+ Hashtable<String,Integer> map = namespaces.get(lang);
 219+ if(map.containsKey(ns))
 220+ return new Title(map.get(ns),parts[1]);
 221+ }
 222+ // not recognized namespace, using main
 223+ return new Title(0,full);
 224+ }
 225+
173226 /** Loads interwiki from default location lib/interwiki.map */
174227 public static void loadInterwiki(){
175228 if(interwiki != null)

Status & tagging log