Index: trunk/lucene-search-2.0/lsearch-global.conf |
— | — | @@ -17,13 +17,13 @@ |
18 | 18 | # host : db1.part db2.part |
19 | 19 | # Multiple hosts can search multiple dbs (N-N mapping) |
20 | 20 | [Search-Group] |
21 | | -oblak : wikilucene, wikidev |
| 21 | +oblak : wikilucene wikidev |
22 | 22 | |
23 | 23 | # Index nodes |
24 | 24 | # host: db1.part db2.part |
25 | 25 | # Each db.part can be indexed by only one host |
26 | 26 | [Index] |
27 | | -oblak: wikilucene, wikidev |
| 27 | +oblak: wikilucene wikidev |
28 | 28 | |
29 | 29 | # Rsync path where indexes are on hosts, after default value put |
30 | 30 | # hosts where the location differs |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -14,6 +14,7 @@ |
15 | 15 | import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy; |
16 | 16 | import org.wikimedia.lsearch.config.Configuration; |
17 | 17 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 18 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
18 | 19 | import org.wikimedia.lsearch.search.NamespaceFilter; |
19 | 20 | |
20 | 21 | /** |
— | — | @@ -31,6 +32,10 @@ |
32 | 33 | Configuration.setConfigFile(System.getProperty("user.dir")+"/test-data/mwsearch.conf.test"); |
33 | 34 | Configuration.open(); |
34 | 35 | WikiQueryParser.TITLE_BOOST = 2; |
| 36 | + WikiQueryParser.REDIRECT_BOOST = 0.2f; |
| 37 | + WikiQueryParser.ALT_TITLE_BOOST = 6; |
| 38 | + WikiQueryParser.KEYWORD_BOOST = 0.05f; |
| 39 | + WikiIndexModifier.ALT_TITLES = 3; |
35 | 40 | try{ |
36 | 41 | WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer()); |
37 | 42 | Query q; |
— | — | @@ -249,10 +254,10 @@ |
250 | 255 | assertEquals("(+(+namespace:0 +(+contents:1991 +category:\"olympic cities\")) -contents:1990) (+(+namespace:0 +(+title:1991^2.0 +category:\"olympic cities\")) -title:1990^2.0)",q.toString()); |
251 | 256 | |
252 | 257 | q = parser.parseTwoPass("main:ba*",NamespacePolicy.IGNORE); |
253 | | - assertEquals("contents:ba* title:ba*^2.0",q.toString()); |
| 258 | + assertEquals("contents:ba title:ba*^2.0",q.toString()); |
254 | 259 | |
255 | 260 | q = parser.parseTwoPass("main:ba* all:lele",NamespacePolicy.REWRITE); |
256 | | - assertEquals("(+(+namespace:0 +contents:ba*) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString()); |
| 261 | + assertEquals("(+(+namespace:0 +contents:ba) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString()); |
257 | 262 | |
258 | 263 | q = parser.parseTwoPass("main:ba*beans",NamespacePolicy.IGNORE); |
259 | 264 | assertEquals("(+contents:ba +(contents:beans contents:bean^0.5)) (+title:ba^2.0 +title:beans^2.0)",q.toString()); |
— | — | @@ -279,27 +284,28 @@ |
280 | 285 | q = parser.parseTwoPass("[1,a12]:beans",NamespacePolicy.IGNORE); |
281 | 286 | assertEquals("(+contents:1 +contents:a12 +(contents:beans contents:bean^0.5)) (+title:1^2.0 +title:a12^2.0 +title:beans^2.0)",q.toString()); |
282 | 287 | |
283 | | - // Redirect third pass tests |
284 | | - q = parser.parseThreePass("beans",NamespacePolicy.IGNORE); |
285 | | - assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 redirect:beans^2.0",q.toString()); |
| 288 | + // Redirect third/fourth pass tests |
| 289 | + q = parser.parseFourPass("beans",NamespacePolicy.IGNORE,true); |
| 290 | + assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 (alttitle1:beans^6.0 alttitle2:beans^6.0 alttitle3:beans^6.0 redirect1:beans^0.2 redirect2:beans^0.1 redirect3:beans^0.06666667 redirect4:beans^0.05 redirect5:beans^0.04) (keyword1:beans^0.05 keyword2:beans^0.025 keyword3:beans^0.016666668 keyword4:beans^0.0125 keyword5:beans^0.01)",q.toString()); |
286 | 291 | |
287 | | - q = parser.parseThreePass("beans everyone",NamespacePolicy.IGNORE); |
288 | | - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false)",q.toString()); |
| 292 | + q = parser.parseFourPass("beans everyone",NamespacePolicy.IGNORE,true); |
| 293 | + assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0) spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04) (spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01)",q.toString()); |
289 | 294 | |
290 | | - q = parser.parseThreePass("beans everyone incategory:mouse",NamespacePolicy.IGNORE); |
291 | | - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) (+spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false) +category:mouse)",q.toString()); |
| 295 | + // TODO: check if this query will be optimized by lucene (categories) |
| 296 | + q = parser.parseFourPass("beans everyone incategory:mouse",NamespacePolicy.IGNORE,true); |
| 297 | + assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0 +category:mouse) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0 +category:mouse) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0 +category:mouse) (+spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 +category:mouse) (+spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 +category:mouse) (+spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 +category:mouse) (+spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 +category:mouse) (+spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04 +category:mouse)) ((+spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 +category:mouse) (+spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 +category:mouse) (+spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 +category:mouse) (+spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 +category:mouse) (+spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01 +category:mouse))",q.toString()); |
292 | 298 | |
293 | | - q = parser.parseThreePass("beans OR everyone",NamespacePolicy.IGNORE); |
294 | | - assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0)",q.toString()); |
| 299 | + q = parser.parseFourPass("beans OR everyone",NamespacePolicy.IGNORE,true); |
| 300 | + assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0) ((alttitle1:beans^6.0 alttitle1:everyone^6.0) (alttitle2:beans^6.0 alttitle2:everyone^6.0) (alttitle3:beans^6.0 alttitle3:everyone^6.0))",q.toString()); |
295 | 301 | |
296 | | - q = parser.parseThreePass("beans -everyone",NamespacePolicy.IGNORE); |
297 | | - assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0)",q.toString()); |
| 302 | + q = parser.parseFourPass("beans -everyone",NamespacePolicy.IGNORE,true); |
| 303 | + assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0) ((+alttitle1:beans^6.0 -alttitle1:everyone^6.0) (+alttitle2:beans^6.0 -alttitle2:everyone^6.0) (+alttitle3:beans^6.0 -alttitle3:everyone^6.0))",q.toString()); |
298 | 304 | |
299 | | - q = parser.parseThreePass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE); |
300 | | - assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false))",q.toString()); |
| 305 | + q = parser.parseFourPass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE,true); |
| 306 | + assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04)) ((+(namespace:0 namespace:1 namespace:2) +spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01))",q.toString()); |
301 | 307 | |
302 | | - q = parser.parseThreePass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE); |
303 | | - assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0))",q.toString()); |
| 308 | + q = parser.parseFourPass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE,true); |
| 309 | + assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0)) (((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+namespace:0 +alttitle1:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+namespace:0 +alttitle2:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+namespace:0 +alttitle3:mainly^6.0)))",q.toString()); |
304 | 310 | |
305 | 311 | // Test field extraction |
306 | 312 | HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja"); |
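
A note on the boost patterns asserted above: the numbered redirect and keyword clauses decay harmonically, with redirectN carrying REDIRECT_BOOST/N and keywordN carrying KEYWORD_BOOST/N (REDIRECT_BOOST = 0.2f and KEYWORD_BOOST = 0.05f as set in the test fixture). A minimal sketch that reproduces the asserted figures; the helper is illustrative only, not part of the patch:

    // Illustrative helper: per-level boosts as they appear in the expected query strings.
    static float[] levelBoosts(float base, int levels) {
        float[] boosts = new float[levels];
        for (int i = 1; i <= levels; i++)
            boosts[i - 1] = base / i; // harmonic decay per keyword level
        return boosts;
    }
    // levelBoosts(0.2f, 5)  -> 0.2, 0.1, 0.06666667, 0.05, 0.04       (redirect1..5)
    // levelBoosts(0.05f, 5) -> 0.05, 0.025, 0.016666668, 0.0125, 0.01 (keyword1..5)
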
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java |
— | — | @@ -52,19 +52,9 @@ |
53 | 53 | public void writeEndPage() throws IOException { |
54 | 54 | ArticleLinks r = links.get(page.Title.Namespace+":"+page.Title.Text); |
55 | 55 | // register redirect |
56 | | - String redirect = Localization.getRedirectTarget(revision.Text,langCode); |
| 56 | + Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
57 | 57 | if( redirect !=null ){ |
58 | | - int ns = 0; |
59 | | - String title = redirect; |
60 | | - String[] parts = redirect.split(":",2); |
61 | | - if(parts.length == 2 && parts[0].length()>1){ |
62 | | - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
63 | | - if(inx != null){ |
64 | | - ns = inx; |
65 | | - title = parts[1]; |
66 | | - } |
67 | | - } |
68 | | - r.redirectsTo = findArticleLinks(ns,title); |
| 58 | + r.redirectsTo = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
69 | 59 | } else // process links |
70 | 60 | processLinks(revision.Text,page.Title.Namespace); |
71 | 61 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -96,7 +96,7 @@ |
97 | 97 | long start = System.currentTimeMillis(); |
98 | 98 | |
99 | 99 | // regenerate link and redirect information |
100 | | - HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile),langCode); |
| 100 | + HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile,langCode),langCode); |
101 | 101 | |
102 | 102 | log.info("Third pass, indexing articles..."); |
103 | 103 | |
— | — | @@ -179,7 +179,7 @@ |
180 | 180 | return links; |
181 | 181 | } |
182 | 182 | |
183 | | - private static HashMap<String,ArticleLinks> getTitles(String inputfile) { |
| 183 | + private static HashMap<String,ArticleLinks> getTitles(String inputfile,String langCode) { |
184 | 184 | log.info("First pass, getting a list of valid articles..."); |
185 | 185 | InputStream input = null; |
186 | 186 | try { |
— | — | @@ -189,7 +189,7 @@ |
190 | 190 | return null; |
191 | 191 | } |
192 | 192 | // first pass, get titles |
193 | | - TitleReader tr = new TitleReader(); |
| 193 | + TitleReader tr = new TitleReader(langCode); |
194 | 194 | XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
195 | 195 | try { |
196 | 196 | reader.readDump(); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java |
— | — | @@ -4,12 +4,15 @@ |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.HashMap; |
7 | 7 | import java.util.HashSet; |
| 8 | +import java.util.Iterator; |
| 9 | +import java.util.Map.Entry; |
8 | 10 | |
9 | 11 | import org.mediawiki.importer.DumpWriter; |
10 | 12 | import org.mediawiki.importer.Page; |
11 | 13 | import org.mediawiki.importer.Revision; |
12 | 14 | import org.mediawiki.importer.Siteinfo; |
13 | 15 | import org.wikimedia.lsearch.beans.ArticleLinks; |
| 16 | +import org.wikimedia.lsearch.util.Localization; |
14 | 17 | |
15 | 18 | /** |
16 | 19 | * Read a HashSet of titles from dump |
— | — | @@ -21,6 +24,11 @@ |
22 | 25 | Page page; |
23 | 26 | Revision revision; |
24 | 27 | HashMap<String,ArticleLinks> titles = new HashMap<String,ArticleLinks>(); |
| 28 | + protected String langCode; |
| 29 | + |
| 30 | + public TitleReader(String langCode){ |
| 31 | + this.langCode = langCode; |
| 32 | + } |
25 | 33 | |
26 | 34 | public void writeRevision(Revision revision) throws IOException { |
27 | 35 | this.revision = revision; |
— | — | @@ -42,7 +50,12 @@ |
43 | 51 | // nop |
44 | 52 | } |
45 | 53 | public void writeSiteinfo(Siteinfo info) throws IOException { |
46 | | - // nop |
| 54 | + // write siteinfo to localization |
| 55 | + Iterator it = info.Namespaces.orderedEntries(); |
| 56 | + while(it.hasNext()){ |
| 57 | + Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
| 58 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
| 59 | + } |
47 | 60 | } |
48 | 61 | public void writeStartWiki() throws IOException { |
49 | 62 | // nop |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.HashMap; |
| 7 | +import java.util.Iterator; |
7 | 8 | import java.util.Map.Entry; |
8 | 9 | import java.util.concurrent.ThreadPoolExecutor.AbortPolicy; |
9 | 10 | import java.util.regex.Matcher; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -106,7 +106,7 @@ |
107 | 107 | IndexWriter writer = indexes.get(target.toString()); |
108 | 108 | if(writer == null) |
109 | 109 | return; |
110 | | - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters); |
| 110 | + Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters,iid); |
111 | 111 | Document doc = (Document) ret[0]; |
112 | 112 | Analyzer analyzer = (Analyzer) ret[1]; |
113 | 113 | try { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.log4j.Logger; |
12 | 12 | import org.apache.lucene.analysis.Analyzer; |
13 | 13 | import org.apache.lucene.document.Document; |
| 14 | +import org.apache.lucene.index.IndexReader; |
14 | 15 | import org.apache.lucene.queryParser.ParseException; |
15 | 16 | import org.apache.lucene.search.Hits; |
16 | 17 | import org.apache.lucene.search.Query; |
— | — | @@ -117,13 +118,15 @@ |
118 | 119 | Query q = null; |
119 | 120 | SearchResults res = null; |
120 | 121 | long searchStart = System.currentTimeMillis(); |
121 | | - Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes(); |
| 122 | + Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes(); |
| 123 | + boolean searchAll = false; |
122 | 124 | |
123 | 125 | // if search is over one field, try to use filters |
124 | 126 | if(fields.size()==1){ |
125 | | - if(fields.contains(new NamespaceFilter())) |
| 127 | + if(fields.contains(new NamespaceFilter())){ |
126 | 128 | nsfw = null; // empty filter: "all" keyword |
127 | | - else if(!fields.contains(nsDefault)){ |
| 129 | + searchAll = true; |
| 130 | + } else if(!fields.contains(nsDefault)){ |
128 | 131 | // use the specified prefix in the query (if it can be cached) |
129 | 132 | NamespaceFilter f = fields.toArray(new NamespaceFilter[] {})[0]; |
130 | 133 | if(f.cardinality()==1 || NamespaceCache.isComposable(f)) |
— | — | @@ -135,7 +138,10 @@ |
136 | 139 | |
137 | 140 | try { |
138 | 141 | if(nsfw == null){ |
139 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname()); |
| 142 | + if(searchAll) |
| 143 | + q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
| 144 | + else |
| 145 | + q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname()); |
140 | 146 | } |
141 | 147 | else{ |
142 | 148 | q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
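
The new searchAll flag separates two cases that previously shared the nsfw == null branch: an explicit "all" keyword (empty NamespaceFilter) should skip namespace rewriting entirely, while the default case still rewrites namespaces into the query. A condensed sketch of the resulting policy selection; variable names follow the hunk above:

    // Condensed view of the branch above (not the full search method).
    WikiQueryParser.NamespacePolicy policy;
    if (nsfw == null)
        // "all" keyword: no namespace clauses; otherwise rewrite them into the query
        policy = searchAll ? WikiQueryParser.NamespacePolicy.IGNORE
                           : WikiQueryParser.NamespacePolicy.REWRITE;
    else
        // a namespace filter wrapper is applied, so the parser can ignore namespaces
        policy = WikiQueryParser.NamespacePolicy.IGNORE;
    Query q = parser.parseFourPass(searchterm, policy, iid.getDBname());
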
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import java.util.Arrays; |
12 | 12 | import java.util.Collection; |
13 | 13 | import java.util.Collections; |
| 14 | +import java.util.Comparator; |
14 | 15 | import java.util.HashSet; |
15 | 16 | import java.util.Hashtable; |
16 | 17 | import java.util.Set; |
— | — | @@ -28,10 +29,12 @@ |
29 | 30 | import org.wikimedia.lsearch.analyzers.Analyzers; |
30 | 31 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
31 | 32 | import org.wikimedia.lsearch.analyzers.FilterFactory; |
| 33 | +import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer; |
32 | 34 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
33 | 35 | import org.wikimedia.lsearch.beans.Article; |
34 | 36 | import org.wikimedia.lsearch.beans.IndexReportCard; |
35 | 37 | import org.wikimedia.lsearch.beans.Redirect; |
| 38 | +import org.wikimedia.lsearch.beans.Title; |
36 | 39 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
37 | 40 | import org.wikimedia.lsearch.config.IndexId; |
38 | 41 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
— | — | @@ -54,6 +57,8 @@ |
55 | 58 | } |
56 | 59 | |
57 | 60 | static public final int MAX_FIELD_LENGTH = 100000; |
| 61 | + /** number of additional alttitle1, alttitle2, ... fields to be filled in with redirects */ |
| 62 | + static public int ALT_TITLES = 3; |
58 | 63 | /** Simple implementation of batch addition and deletion */ |
59 | 64 | class SimpleIndexModifier { |
60 | 65 | protected IndexId iid; |
— | — | @@ -179,7 +184,7 @@ |
180 | 185 | if(!checkPreconditions(rec)) |
181 | 186 | continue; // article shouldn't be added for some (heuristic) reason |
182 | 187 | IndexReportCard card = getReportCard(rec); |
183 | | - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters); |
| 188 | + Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters,iid); |
184 | 189 | Document doc = (Document) ret[0]; |
185 | 190 | Analyzer analyzer = (Analyzer) ret[1]; |
186 | 191 | try { |
— | — | @@ -223,15 +228,17 @@ |
224 | 229 | * @return |
225 | 230 | */ |
226 | 231 | public static boolean checkAddPreconditions(Article ar, String langCode){ |
227 | | - if(ar.getNamespace().equals("0")){ |
228 | | - String redirect = Localization.getRedirectTarget(ar.getContents(),langCode); |
229 | | - if(redirect != null) |
230 | | - return false; // don't add redirects |
231 | | - /*if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){ |
| 232 | + Title redirect = Localization.getRedirectTitle(ar.getContents(),langCode); |
| 233 | + int ns = Integer.parseInt(ar.getNamespace()); |
| 234 | + if(redirect!=null && redirect.getNamespace() == ns){ |
| 235 | + return false; // don't add redirects within the same namespace; they are indexed via the redirect field |
| 236 | + } |
| 237 | + |
| 238 | + /*if(ar.getNamespace().equals("0")){ |
| 239 | + if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){ |
232 | 240 | log.debug("Not adding "+ar+" into index: "+ar.getContents()); |
233 | 241 | return false; |
234 | 242 | } */ |
235 | | - } |
236 | 243 | return true; |
237 | 244 | } |
238 | 245 | |
— | — | @@ -243,24 +250,43 @@ |
244 | 251 | */ |
245 | 252 | protected static void transformArticleForIndexing(Article ar) { |
246 | 253 | ArrayList<Redirect> redirects = ar.getRedirects(); |
| 254 | + // sort redirects by their rank; guard against a null list |
| 255 | + if(redirects != null) Collections.sort(redirects,new Comparator<Redirect>() { |
| 256 | + public int compare(Redirect o1,Redirect o2){ |
| 257 | + return o2.getReferences() - o1.getReferences(); |
| 258 | + } |
| 259 | + }); |
247 | 260 | int ns = Integer.parseInt(ar.getNamespace()); |
248 | 261 | ar.setRank(ar.getReferences()); // base rank value |
249 | 262 | if(redirects != null){ |
250 | 263 | ArrayList<String> filtered = new ArrayList<String>(); |
| 264 | + ArrayList<Integer> ranks = new ArrayList<Integer>(); |
251 | 265 | // index only redirects from the same namespace |
252 | 266 | // to avoid a lot of unusable redirects from/to |
253 | 267 | // user namespace, but always index redirect FROM main |
254 | 268 | for(Redirect r : redirects){ |
255 | | - if((ns == 0 && r.getNamespace() == 0) || ns != 0){ |
| 269 | + if(ns == r.getNamespace()){ |
256 | 270 | filtered.add(r.getTitle()); |
| 271 | + ranks.add(r.getReferences()); |
257 | 272 | ar.addToRank(r.getReferences()+1); |
258 | 273 | } else |
259 | 274 | log.debug("Ignoring redirect "+r+" to "+ar); |
260 | 275 | } |
261 | 276 | ar.setRedirectKeywords(filtered); |
| 277 | + ar.setRedirectKeywordRanks(ranks); |
262 | 278 | } |
263 | 279 | } |
264 | 280 | |
| 281 | + /** Check whether keywords should be extracted for this article in this db */ |
| 282 | + public static boolean checkKeywordPreconditions(Article article, IndexId iid) { |
| 283 | + if(global == null) |
| 284 | + global = GlobalConfiguration.getInstance(); |
| 285 | + if(article.getNamespace().equals("0") && global.useKeywordScoring(iid.getDBname())) |
| 286 | + return true; |
| 287 | + else |
| 288 | + return false; |
| 289 | + } |
| 290 | + |
265 | 291 | /** |
266 | 292 | * Create necessary directories for index |
267 | 293 | * @param dbname |
— | — | @@ -372,7 +398,7 @@ |
373 | 399 | * @param languageAnalyzer |
374 | 400 | * @return array { document, analyzer } |
375 | 401 | */ |
376 | | - public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){ |
| 402 | + public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters, IndexId iid){ |
377 | 403 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
378 | 404 | WikiTokenizer tokenizer = null; |
379 | 405 | Document doc = new Document(); |
— | — | @@ -387,24 +413,27 @@ |
388 | 414 | doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
389 | 415 | |
390 | 416 | // boost document title with its article rank |
391 | | - Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED); |
| 417 | + Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED); |
392 | 418 | //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
393 | 419 | float rankBoost = calculateArticleRank(article.getRank()); |
394 | 420 | title.setBoost(rankBoost); |
395 | 421 | doc.add(title); |
396 | 422 | |
| 423 | + Field stemtitle = new Field("stemtitle", article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED); |
| 424 | + //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
| 425 | + stemtitle.setBoost(rankBoost); |
| 426 | + doc.add(stemtitle); |
| 427 | + |
| 428 | + // put the best redirects as alternative titles |
| 429 | + makeAltTitles(doc,"alttitle",article); |
| 430 | + |
397 | 431 | // add titles of redirects, generated from analyzer |
398 | | - Field redirect = new Field("redirect", "", |
399 | | - Field.Store.NO, Field.Index.TOKENIZED); |
400 | | - redirect.setBoost(rankBoost); |
401 | | - doc.add(redirect); |
| 432 | + makeKeywordField(doc,"redirect",rankBoost); |
402 | 433 | |
403 | | - // most significat words in the text, gets extra score, from analyzer |
404 | | - Field keyword = new Field("keyword", "", |
405 | | - Field.Store.NO, Field.Index.TOKENIZED); |
406 | | - keyword.setBoost(rankBoost); |
407 | | - doc.add(keyword); |
408 | | - |
| 434 | + if(checkKeywordPreconditions(article,iid)) |
| 435 | + // most significant words in the text get extra score, from analyzer |
| 436 | + makeKeywordField(doc,"keyword",rankBoost); |
| 437 | + |
409 | 438 | // the next fields are generated using wikitokenizer |
410 | 439 | doc.add(new Field("contents", "", |
411 | 440 | Field.Store.NO, Field.Index.TOKENIZED)); |
— | — | @@ -425,7 +454,35 @@ |
426 | 455 | |
427 | 456 | return new Object[] { doc, perFieldAnalyzer }; |
428 | 457 | } |
429 | | - |
| 458 | + |
| 459 | + /** Make multiple keyword fields, e.g. redirect1, redirect2, redirect3 ... */ |
| 460 | + protected static void makeKeywordField(Document doc, String prefix, float boost) { |
| 461 | + for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){ |
| 462 | + Field keyfield = new Field(prefix+i, "", |
| 463 | + Field.Store.NO, Field.Index.TOKENIZED); |
| 464 | + keyfield.setBoost(boost); |
| 465 | + doc.add(keyfield); |
| 466 | + } |
| 467 | + |
| 468 | + } |
| 469 | + |
| 470 | + protected static void makeAltTitles(Document doc, String prefix, Article article) { |
| 471 | + // the redirects and ranks lists are sorted by rank |
| 472 | + final ArrayList<String> redirects = article.getRedirectKeywords(); |
| 473 | + final ArrayList<Integer> ranks = article.getRedirectKeywordRanks(); |
| 474 | + if(redirects.size() == 0) |
| 475 | + return; |
| 476 | + // add alternative titles alttitle1, alttitle2 ... |
| 477 | + for(int i=0;i<ALT_TITLES && i<redirects.size();i++){ |
| 478 | + if(ranks.get(i) == 0) |
| 479 | + break; // we don't want redirects with zero links |
| 480 | + //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i)); |
| 481 | + Field alttitle = new Field("alttitle"+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED); |
| 482 | + alttitle.setBoost(calculateArticleRank(ranks.get(i))); |
| 483 | + doc.add(alttitle); |
| 484 | + } |
| 485 | + } |
| 486 | + |
430 | 487 | /** |
431 | 488 | * |
432 | 489 | * Calculate document boost (article rank) from number of |
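
Taken together, makeDocumentAndAnalyzer now emits a family of numbered fields per document: title and stemtitle (both boosted by the article rank), up to ALT_TITLES alttitle fields holding the best-ranked same-namespace redirects, and KEYWORD_LEVELS numbered redirect fields, plus keyword fields when checkKeywordPreconditions holds. A minimal sketch of how the empty numbered fields are attached, mirroring makeKeywordField; the analyzer fills their token streams later:

    // Sketch: attach redirect1..redirect5 as empty tokenized fields (content comes
    // from the per-field KeywordsAnalyzer at indexing time).
    Document doc = new Document();
    float rankBoost = 1.0f; // calculateArticleRank(article.getRank()) in the real code
    for (int i = 1; i <= KeywordsAnalyzer.KEYWORD_LEVELS; i++) {
        Field f = new Field("redirect" + i, "", Field.Store.NO, Field.Index.TOKENIZED);
        f.setBoost(rankBoost);
        doc.add(f);
    }
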
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java |
— | — | @@ -16,12 +16,12 @@ |
17 | 17 | /** |
18 | 18 | * For content: |
19 | 19 | * * length norm is a linear function, with f(1) = 1 |
20 | | - * and f(10000) = 0.2 |
| 20 | + * and f(10000) = 0.5 |
21 | 21 | * |
22 | | - * For titles: |
| 22 | + * For titles / title aliases: |
23 | 23 | * * 1/sqrt(term^3) |
24 | 24 | * |
25 | | - * For redirect: |
| 25 | + * For redirect / keywords: |
26 | 26 | * * no length norm |
27 | 27 | * |
28 | 28 | */ |
— | — | @@ -35,11 +35,11 @@ |
36 | 36 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
37 | 37 | return f; |
38 | 38 | } |
39 | | - } else if(fieldName.equals("title")){ |
| 39 | + } else if(fieldName.equals("title") || fieldName.startsWith("alttitle")){ |
40 | 40 | float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
41 | 41 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
42 | 42 | return f; |
43 | | - } else if(fieldName.equals("redirect") || fieldName.equals("keyword")){ |
| 43 | + } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword")){ |
44 | 44 | return 1; |
45 | 45 | } else |
46 | 46 | return super.lengthNorm(fieldName,numTokens); |
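
The updated class comment pins the content length norm to a line through f(1) = 1 and f(10000) = 0.5; the body of the content branch is elided from this hunk, so the formula below is an assumption interpolated from those two points rather than the committed code:

    // Assumed linear length norm consistent with f(1) = 1 and f(10000) = 0.5
    // (the actual content branch is not shown in the hunk above).
    float contentLengthNorm(int numTokens) {
        if (numTokens >= 10000)
            return 0.5f;
        return 1.0f - 0.5f * (numTokens - 1) / 9999f;
    }
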
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -22,6 +22,7 @@ |
23 | 23 | import org.apache.lucene.search.spans.SpanQuery; |
24 | 24 | import org.apache.lucene.search.spans.SpanTermQuery; |
25 | 25 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 26 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
26 | 27 | import org.wikimedia.lsearch.search.NamespaceFilter; |
27 | 28 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
28 | 29 | |
— | — | @@ -57,6 +58,7 @@ |
58 | 59 | private String field; // current field |
59 | 60 | private String defaultField; // the default field value |
60 | 61 | private float defaultBoost = 1; |
| 62 | + private float defaultAliasBoost = ALIAS_BOOST; |
61 | 63 | protected enum TokenType {WORD, FIELD, AND, OR, EOF }; |
62 | 64 | |
63 | 65 | private TokenStream tokenStream; |
— | — | @@ -70,11 +72,16 @@ |
71 | 73 | protected boolean disableTitleAliases; |
72 | 74 | |
73 | 75 | /** boost for alias words from analyzer */ |
74 | | - public final float ALIAS_BOOST = 0.5f; |
| 76 | + public static float ALIAS_BOOST = 0.5f; |
75 | 77 | /** boost for title field */ |
76 | | - public static float TITLE_BOOST = 8; |
| 78 | + public static float TITLE_BOOST = 6; |
| 79 | + public static float TITLE_ALIAS_BOOST = 0.2f; |
| 80 | + public static float STEM_TITLE_BOOST = 2; |
| 81 | + public static float STEM_TITLE_ALIAS_BOOST = 0.4f; |
77 | 82 | public static float REDIRECT_BOOST = 0.2f; |
78 | | - public static float KEYWORD_BOOST = 0.05f; |
| 83 | + public static float ALT_TITLE_BOOST = 2; |
| 84 | + public static float ALT_TITLE_ALIAS_BOOST = 0.4f; |
| 85 | + public static float KEYWORD_BOOST = 0.02f; |
79 | 86 | |
80 | 87 | /** Policies in treating field names: |
81 | 88 | * |
— | — | @@ -663,9 +670,10 @@ |
664 | 671 | return new TermQuery(makeTerm()); |
665 | 672 | } |
666 | 673 | |
667 | | - // check for wildcard seaches, they are also not analyzed/stemmed |
| 674 | + // check for wildcard searches; they are also not analyzed/stemmed, only for titles |
668 | 675 | // wildcard signs are allowed only at the end of the word, minimum one letter word |
669 | | - if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?')){ |
| 676 | + if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?') && |
| 677 | + defaultField.equals("title")){ |
670 | 678 | Query ret = new WildcardQuery(makeTerm()); |
671 | 679 | ret.setBoost(defaultBoost); |
672 | 680 | return ret; |
— | — | @@ -691,12 +699,12 @@ |
692 | 700 | else if(token.type().equals("stemmed")){ |
693 | 701 | // stemmed word |
694 | 702 | t = new TermQuery(makeTerm(token)); |
695 | | - t.setBoost(ALIAS_BOOST*defaultBoost); |
| 703 | + t.setBoost(defaultAliasBoost*defaultBoost); |
696 | 704 | cur.add(t,aliasOccur); |
697 | 705 | } else if(token.type().equals("alias")){ |
698 | 706 | // produced by alias engine (e.g. for sr) |
699 | 707 | t = new TermQuery(makeTerm(token)); |
700 | | - t.setBoost(ALIAS_BOOST*defaultBoost); |
| 708 | + t.setBoost(defaultAliasBoost*defaultBoost); |
701 | 709 | cur.add(t,aliasOccur); |
702 | 710 | } |
703 | 711 | if( cur != bq) // returned from nested query |
— | — | @@ -763,7 +771,7 @@ |
764 | 772 | |
765 | 773 | /** Duplicate a term query, setting "title" as field */ |
766 | 774 | private TermQuery makeTitleTermQuery(TermQuery tq){ |
767 | | - if(disableTitleAliases && tq.getBoost()==ALIAS_BOOST) |
| 775 | + if(disableTitleAliases && tq.getBoost()==defaultAliasBoost) |
768 | 776 | return null; |
769 | 777 | Term term = tq.getTerm(); |
770 | 778 | if(term.field().equals(defaultField)){ |
— | — | @@ -778,7 +786,7 @@ |
779 | 787 | |
780 | 788 | /** Duplicate a phrase query, setting "title" as field */ |
781 | 789 | private PhraseQuery makeTitlePhraseQuery(PhraseQuery pq){ |
782 | | - if(disableTitleAliases && pq.getBoost()==ALIAS_BOOST) |
| 790 | + if(disableTitleAliases && pq.getBoost()==defaultAliasBoost) |
783 | 791 | return null; |
784 | 792 | PhraseQuery pq2 = new PhraseQuery(); |
785 | 793 | Term[] terms = pq.getTerms(); |
— | — | @@ -1011,7 +1019,7 @@ |
1012 | 1020 | span = spans.get(0); |
1013 | 1021 | else{ |
1014 | 1022 | // make a span-near query that has a slop 1/2 of tokenGap |
1015 | | - span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.tokenGap-1)/2,false); |
| 1023 | + span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.TOKEN_GAP-1)/2,false); |
1016 | 1024 | span.setBoost(boost); |
1017 | 1025 | } |
1018 | 1026 | } |
— | — | @@ -1028,7 +1036,81 @@ |
1029 | 1037 | } |
1030 | 1038 | return null; |
1031 | 1039 | } |
| 1040 | + |
| 1041 | + protected BooleanQuery multiplySpans(Query query, int level, String fieldName, float boost){ |
| 1042 | + BooleanQuery bq = new BooleanQuery(true); |
| 1043 | + for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){ |
| 1044 | + Query q = extractSpans(query,0,fieldName+i,boost/i); |
| 1045 | + if(q != null) |
| 1046 | + bq.add(q,BooleanClause.Occur.SHOULD); |
| 1047 | + } |
| 1048 | + |
| 1049 | + if(bq.getClauses() == null || bq.getClauses().length==0) |
| 1050 | + return null; |
| 1051 | + else |
| 1052 | + return bq; |
| 1053 | + } |
1032 | 1054 | |
| 1055 | + /** Make a redirect query in format alttitle1:query alttitle2:query ... redirect:spanquery */ |
| 1056 | + protected BooleanQuery makeRedirectQuery(String queryText, Query qt) { |
| 1057 | + BooleanQuery bq = new BooleanQuery(true); |
| 1058 | + float oldDefaultBoost = defaultBoost; |
| 1059 | + String contentField = defaultField; |
| 1060 | + defaultBoost = ALT_TITLE_BOOST; |
| 1061 | + defaultAliasBoost = ALT_TITLE_ALIAS_BOOST; |
| 1062 | + for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){ |
| 1063 | + defaultField = "alttitle"+i; |
| 1064 | + Query q = parseRaw(queryText); |
| 1065 | + if(q != null) |
| 1066 | + bq.add(q,BooleanClause.Occur.SHOULD); |
| 1067 | + } |
| 1068 | + // pop stack |
| 1069 | + defaultField = contentField; |
| 1070 | + defaultBoost = oldDefaultBoost; |
| 1071 | + defaultAliasBoost = ALIAS_BOOST; |
| 1072 | + |
| 1073 | + Query qs = multiplySpans(qt,0,"redirect",REDIRECT_BOOST); |
| 1074 | + // merge queries |
| 1075 | + if(qs != null){ |
| 1076 | + bq.add(qs,BooleanClause.Occur.SHOULD); |
| 1077 | + } |
| 1078 | + if(bq.getClauses() == null || bq.getClauses().length==0) |
| 1079 | + return null; |
| 1080 | + else |
| 1081 | + return bq; |
| 1082 | + |
| 1083 | + } |
| 1084 | + |
| 1085 | + /** Make title query in format: title:query stemtitle:stemmedquery */ |
| 1086 | + protected Query makeTitleQuery(String queryText) { |
| 1087 | + String contentField = defaultField; |
| 1088 | + float oldDefaultBoost = defaultBoost; |
| 1089 | + defaultField = "title"; // now parse the title part |
| 1090 | + defaultBoost = TITLE_BOOST; |
| 1091 | + defaultAliasBoost = TITLE_ALIAS_BOOST; |
| 1092 | + Query qt = parseRaw(queryText); |
| 1093 | + // stemmed title |
| 1094 | + defaultField = "stemtitle"; |
| 1095 | + defaultBoost = STEM_TITLE_BOOST; |
| 1096 | + defaultAliasBoost = STEM_TITLE_ALIAS_BOOST; |
| 1097 | + Query qs = parseRaw(queryText); |
| 1098 | + // pop stack |
| 1099 | + defaultField = contentField; |
| 1100 | + defaultBoost = oldDefaultBoost; |
| 1101 | + defaultAliasBoost = ALIAS_BOOST; |
| 1102 | + |
| 1103 | + if(qt == qs) // either null, or category query |
| 1104 | + return qt; |
| 1105 | + if(qt == null) |
| 1106 | + return qs; |
| 1107 | + if(qs == null) |
| 1108 | + return qt; |
| 1109 | + BooleanQuery bq = new BooleanQuery(true); |
| 1110 | + bq.add(qt,BooleanClause.Occur.SHOULD); |
| 1111 | + bq.add(qs,BooleanClause.Occur.SHOULD); |
| 1112 | + return bq; |
| 1113 | + } |
| 1114 | + |
1033 | 1115 | /** |
1034 | 1116 | * Main function for multi-pass parsing. |
1035 | 1117 | * |
— | — | @@ -1039,17 +1121,12 @@ |
1040 | 1122 | */ |
1041 | 1123 | protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){ |
1042 | 1124 | if(policy != null) |
1043 | | - this.namespacePolicy = policy; |
1044 | | - float olfDefaultBoost = defaultBoost; |
| 1125 | + this.namespacePolicy = policy; |
1045 | 1126 | defaultBoost = 1; |
1046 | | - Query qc = parseRaw(queryText); |
1047 | | - String contentField = defaultField; |
1048 | | - defaultField = "title"; // now parse the title part |
1049 | | - defaultBoost = TITLE_BOOST; |
1050 | | - Query qt = parseRaw(queryText); |
1051 | | - // pop stack |
1052 | | - defaultField = contentField; |
1053 | | - defaultBoost = olfDefaultBoost; |
| 1127 | + defaultAliasBoost = ALIAS_BOOST; |
| 1128 | + Query qc = parseRaw(queryText); |
| 1129 | + |
| 1130 | + Query qt = makeTitleQuery(queryText); |
1054 | 1131 | if(qc == null || qt == null) |
1055 | 1132 | return new BooleanQuery(); |
1056 | 1133 | if(qc.equals(qt)) |
— | — | @@ -1058,15 +1135,23 @@ |
1059 | 1136 | bq.add(qc,BooleanClause.Occur.SHOULD); |
1060 | 1137 | bq.add(qt,BooleanClause.Occur.SHOULD); |
1061 | 1138 | |
| 1139 | + Query nostem = null; |
| 1140 | + if(makeRedirect || makeKeywords){ |
| 1141 | + String contentField = defaultField; |
| 1142 | + defaultField = "keyword"; // this field is never stemmed |
| 1143 | + nostem = parseRaw(queryText); |
| 1144 | + defaultField = contentField; |
| 1145 | + } |
| 1146 | + |
1062 | 1147 | // redirect pass |
1063 | | - if(makeRedirect){ |
1064 | | - Query qr = extractSpans(qt,0,"redirect",REDIRECT_BOOST); |
| 1148 | + if(makeRedirect && nostem!=null){ |
| 1149 | + BooleanQuery qr = makeRedirectQuery(queryText,nostem); |
1065 | 1150 | if(qr != null) |
1066 | 1151 | bq.add(qr,BooleanClause.Occur.SHOULD); |
1067 | 1152 | } |
1068 | 1153 | // keyword pass |
1069 | | - if(makeKeywords){ |
1070 | | - Query qk = extractSpans(qt,0,"keyword",KEYWORD_BOOST); |
| 1154 | + if(makeKeywords && nostem!=null){ |
| 1155 | + Query qk = multiplySpans(nostem,0,"keyword",KEYWORD_BOOST); |
1071 | 1156 | if(qk != null) |
1072 | 1157 | bq.add(qk,BooleanClause.Occur.SHOULD); |
1073 | 1158 | } |
— | — | @@ -1074,7 +1159,7 @@ |
1075 | 1160 | return bq; |
1076 | 1161 | |
1077 | 1162 | } |
1078 | | - |
| 1163 | + |
1079 | 1164 | /** |
1080 | 1165 | * Three parse pases: contents, title, redirect |
1081 | 1166 | * |
— | — | @@ -1099,6 +1184,10 @@ |
1100 | 1185 | return parseMultiPass(queryText,policy,true,makeKeywords); |
1101 | 1186 | } |
1102 | 1187 | |
| 1188 | + public Query parseFourPass(String queryText, NamespacePolicy policy, boolean makeKeywords) throws ParseException{ |
| 1189 | + return parseMultiPass(queryText,policy,true,makeKeywords); |
| 1190 | + } |
| 1191 | + |
1103 | 1192 | /** |
1104 | 1193 | * Parse the query according to policy. Instead of rewrite phrase, simply pass |
1105 | 1194 | * twice the query with different default fields. |
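
In summary, parseMultiPass now assembles up to four SHOULD clauses: the raw contents query, the title pass (title plus stemtitle via makeTitleQuery), the redirect pass (alttitle1..ALT_TITLES plus redirect span queries via makeRedirectQuery), and the keyword pass (multiplySpans over keyword1..KEYWORD_LEVELS). A brief usage sketch matching the test fixture above; the setup lines are assumptions:

    // Sketch: four-pass parsing as exercised by WikiQueryParserTest.
    WikiQueryParser parser = new WikiQueryParser("contents", new SimpleAnalyzer());
    Query q = parser.parseFourPass("beans everyone",
            WikiQueryParser.NamespacePolicy.IGNORE, true /* makeKeywords */);
    // q combines the contents, title, alttitle/redirect and keyword passes
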
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -14,6 +14,7 @@ |
15 | 15 | import org.apache.lucene.analysis.th.ThaiWordFilter; |
16 | 16 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
17 | 17 | import org.wikimedia.lsearch.config.IndexId; |
| 18 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
18 | 19 | import org.wikimedia.lsearch.test.AliasPorterStemFilter; |
19 | 20 | |
20 | 21 | /** |
— | — | @@ -63,13 +64,29 @@ |
64 | 65 | new CategoryAnalyzer(categories)); |
65 | 66 | perFieldAnalyzer.addAnalyzer("title", |
66 | 67 | getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
67 | | - perFieldAnalyzer.addAnalyzer("redirect", |
68 | | - new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory())); |
69 | | - perFieldAnalyzer.addAnalyzer("keyword", |
70 | | - new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory())); |
| 68 | + perFieldAnalyzer.addAnalyzer("stemtitle", |
| 69 | + getTitleAnalyzer(filters)); |
| 70 | + setAltTitleAnalyzer(perFieldAnalyzer,"alttitle", |
| 71 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
| 72 | + setKeywordAnalyzer(perFieldAnalyzer,"redirect", |
| 73 | + new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),"redirect")); |
| 74 | + setKeywordAnalyzer(perFieldAnalyzer,"keyword", |
| 75 | + new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),"keyword")); |
71 | 76 | return new Object[] {perFieldAnalyzer,tokenizer}; |
72 | 77 | } |
73 | 78 | |
| 79 | + protected static void setAltTitleAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, Analyzer analyzer) { |
| 80 | + for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){ |
| 81 | + perFieldAnalyzer.addAnalyzer(prefix+i,analyzer); |
| 82 | + } |
| 83 | + } |
| 84 | + |
| 85 | + protected static void setKeywordAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, KeywordsAnalyzer analyzer) { |
| 86 | + for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){ |
| 87 | + perFieldAnalyzer.addAnalyzer(prefix+i,analyzer); |
| 88 | + } |
| 89 | + } |
| 90 | + |
74 | 91 | public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){ |
75 | 92 | if(global == null) |
76 | 93 | global = GlobalConfiguration.getInstance(); |
— | — | @@ -95,6 +112,12 @@ |
96 | 113 | new QueryLanguageAnalyzer(filters)); |
97 | 114 | perFieldAnalyzer.addAnalyzer("title", |
98 | 115 | getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
| 116 | + perFieldAnalyzer.addAnalyzer("stemtitle", |
| 117 | + getTitleAnalyzer(filters)); |
| 118 | + setAltTitleAnalyzer(perFieldAnalyzer,"alttitle", |
| 119 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
| 120 | + perFieldAnalyzer.addAnalyzer("keyword", |
| 121 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
99 | 122 | |
100 | 123 | return perFieldAnalyzer; |
101 | 124 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -25,31 +25,68 @@ |
26 | 26 | * |
27 | 27 | */ |
28 | 28 | public class KeywordsAnalyzer extends Analyzer{ |
29 | | - static Logger log = Logger.getLogger(KeywordsAnalyzer.class); |
30 | | - protected ArrayList<String> keywords; |
31 | | - protected FilterFactory filters; |
32 | | - protected KeywordsTokenStream tokens; |
| 29 | + static Logger log = Logger.getLogger(KeywordsAnalyzer.class); |
| 30 | + protected KeywordsTokenStream[] tokensBySize = null; |
| 31 | + protected String prefix; |
| 32 | + |
| 33 | + /** number of fields to be generated, e.g. keyword1 for single-word keywords, |
| 34 | + * keyword2 for two-word keywords, etc ... the last field has all the remaining keywords |
| 35 | + */ |
| 36 | + public static final int KEYWORD_LEVELS = 5; |
| 37 | + /** positional increment between different redirects */ |
| 38 | + public static final int TOKEN_GAP = 201; |
33 | 39 | |
34 | | - public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters){ |
| 40 | + public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters, String prefix){ |
35 | 41 | ArrayList<String> k = new ArrayList<String>(); |
36 | 42 | if(keywords != null) |
37 | 43 | k.addAll(keywords); |
38 | | - tokens = new KeywordsTokenStream(k,filters); |
| 44 | + init(k,filters,prefix); |
39 | 45 | } |
| 46 | + public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters, String prefix){ |
| 47 | + init(keywords,filters,prefix); |
| 48 | + } |
40 | 49 | |
41 | | - public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters){ |
42 | | - tokens = new KeywordsTokenStream(keywords,filters); |
| 50 | + protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix) { |
| 51 | + this.prefix = prefix; |
| 52 | + tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS]; |
| 53 | + if(keywords == null){ |
| 54 | + // init empty token streams |
| 55 | + for(int i=0; i< KEYWORD_LEVELS; i++){ |
| 56 | + tokensBySize[i] = new KeywordsTokenStream(null,filters); |
| 57 | + } |
| 58 | + return; |
| 59 | + } |
| 60 | + ArrayList<ArrayList<String>> keywordsBySize = new ArrayList<ArrayList<String>>(); |
| 61 | + for(int i=0;i<KEYWORD_LEVELS;i++) |
| 62 | + keywordsBySize.add(new ArrayList<String>()); |
| 63 | + // arrange keywords into lists by token count |
| 64 | + for(String k : keywords){ |
| 65 | + ArrayList<Token> parsed = new FastWikiTokenizerEngine(k).parse(); |
| 66 | + if(parsed.size() == 0) |
| 67 | + continue; |
| 68 | + else if(parsed.size() < KEYWORD_LEVELS) |
| 69 | + keywordsBySize.get(parsed.size()-1).add(k); |
| 70 | + else |
| 71 | + keywordsBySize.get(KEYWORD_LEVELS-1).add(k); |
| 72 | + } |
| 73 | + for(int i=0; i< KEYWORD_LEVELS; i++){ |
| 74 | + tokensBySize[i] = new KeywordsTokenStream(keywordsBySize.get(i),filters); |
| 75 | + } |
43 | 76 | } |
44 | | - /** positional increment between different redirects */ |
45 | | - public static final int tokenGap = 201; |
46 | 77 | |
47 | 78 | @Override |
48 | 79 | public TokenStream tokenStream(String fieldName, Reader reader) { |
49 | | - return tokens; |
| 80 | + if(fieldName.startsWith(prefix)){ |
| 81 | + int inx = Integer.parseInt(fieldName.substring(prefix.length())); |
| 82 | + return tokensBySize[inx-1]; |
| 83 | + } else{ |
| 84 | + log.error("Trying to get tokenStream for wrong field "+fieldName); |
| 85 | + return null; |
| 86 | + } |
50 | 87 | } |
51 | 88 | @Override |
52 | 89 | public TokenStream tokenStream(String fieldName, String text) { |
53 | | - return tokens; |
| 90 | + return tokenStream(fieldName,(Reader)null); |
54 | 91 | } |
55 | 92 | |
56 | 93 | class KeywordsTokenStream extends TokenStream { |
— | — | @@ -80,7 +117,7 @@ |
81 | 118 | if(t == null){ |
82 | 119 | t = openNext(); |
83 | 120 | if(t != null) |
84 | | - t.setPositionIncrement(tokenGap); |
| 121 | + t.setPositionIncrement(TOKEN_GAP); |
85 | 122 | } |
86 | 123 | return t; |
87 | 124 | } else{ |
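
init() distributes the incoming keywords over KEYWORD_LEVELS token streams by parsed token count: one-word keywords feed the field with suffix 1, two-word keywords suffix 2, and everything with KEYWORD_LEVELS or more tokens lands in the last stream; tokenStream() then selects the bucket from the numeric suffix of the field name. The level rule, as a one-line sketch:

    // Level assignment implied by init(): a keyword of n tokens goes to prefix+level(n).
    static int level(int nTokens) {
        return Math.min(nTokens, KeywordsAnalyzer.KEYWORD_LEVELS); // 1..5
    }
    // "beans" -> redirect1 / keyword1; "olympic cities" -> level 2;
    // anything of 5+ tokens -> level 5 (the catch-all last stream)
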
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java |
— | — | @@ -29,6 +29,11 @@ |
30 | 30 | public void setTitle(String title) { |
31 | 31 | this.title = title; |
32 | 32 | } |
| 33 | + @Override |
| 34 | + public String toString() { |
| 35 | + return namespace+":"+title+" ("+references+")"; |
| 36 | + } |
33 | 37 | |
34 | 38 | |
| 39 | + |
35 | 40 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -43,6 +43,8 @@ |
44 | 44 | private ArrayList<Redirect> redirects; |
45 | 45 | /** generated before indexing from the list of redirects */ |
46 | 46 | private transient ArrayList<String> redirectKeywords; |
| 47 | + /** paired with previous list, ranks for each redirect */ |
| 48 | + private transient ArrayList<Integer> redirectKeywordRanks; |
47 | 49 | /** generated before indexing from the reference sto this article, and references from redirects */ |
48 | 50 | private transient int rank; |
49 | 51 | |
— | — | @@ -182,6 +184,16 @@ |
183 | 185 | public void setRedirectKeywords(ArrayList<String> redirectKeywords) { |
184 | 186 | this.redirectKeywords = redirectKeywords; |
185 | 187 | } |
| 188 | + |
| 189 | + public ArrayList<Integer> getRedirectKeywordRanks() { |
| 190 | + return redirectKeywordRanks; |
| 191 | + } |
| 192 | + |
| 193 | + public void setRedirectKeywordRanks(ArrayList<Integer> redirectKeywordRanks) { |
| 194 | + this.redirectKeywordRanks = redirectKeywordRanks; |
| 195 | + } |
186 | 196 | |
187 | 197 | |
| 198 | + |
| 199 | + |
188 | 200 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -2,6 +2,7 @@ |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
| 6 | +import java.util.Iterator; |
6 | 7 | |
7 | 8 | import org.apache.log4j.Logger; |
8 | 9 | import org.mediawiki.importer.DumpWriter; |
— | — | @@ -11,8 +12,10 @@ |
12 | 13 | import org.mediawiki.importer.Title; |
13 | 14 | import org.wikimedia.lsearch.beans.Article; |
14 | 15 | import org.wikimedia.lsearch.beans.Redirect; |
| 16 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
15 | 17 | import org.wikimedia.lsearch.config.IndexId; |
16 | 18 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
| 19 | +import org.wikimedia.lsearch.util.Localization; |
17 | 20 | |
18 | 21 | public class IndexUpdatesCollector implements DumpWriter { |
19 | 22 | Logger log = Logger.getLogger(DumpWriter.class); |
— | — | @@ -23,9 +26,11 @@ |
24 | 27 | protected int references = 0; |
25 | 28 | protected ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
26 | 29 | protected Siteinfo info = null; |
| 30 | + protected String langCode; |
27 | 31 | |
28 | 32 | public IndexUpdatesCollector(IndexId iid){ |
29 | 33 | this.iid = iid; |
| 34 | + this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
30 | 35 | } |
31 | 36 | |
32 | 37 | public void addRedirect(String redirectTitle, int references) { |
— | — | @@ -69,6 +74,13 @@ |
70 | 75 | |
71 | 76 | public void writeSiteinfo(Siteinfo info) throws IOException { |
72 | 77 | this.info = info; |
| 78 | + // write to localization |
| 79 | + Iterator it = info.Namespaces.orderedEntries(); |
| 80 | + while(it.hasNext()){ |
| 81 | + Integer inx = (Integer)it.next(); |
| 82 | + String prefix = info.Namespaces.getPrefix(inx); |
| 83 | + Localization.addCustomMapping(prefix,inx,langCode); |
| 84 | + } |
73 | 85 | } |
74 | 86 | |
75 | 87 | public void close() throws IOException { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import java.util.Map.Entry; |
12 | 12 | |
13 | 13 | import org.apache.log4j.Logger; |
| 14 | +import org.wikimedia.lsearch.beans.Title; |
14 | 15 | import org.wikimedia.lsearch.config.Configuration; |
15 | 16 | |
16 | 17 | /** |
— | — | @@ -27,7 +28,40 @@ |
28 | 29 | /** Languages for which loading of localization failed */ |
29 | 30 | protected static HashSet<String> badLocalizations = new HashSet<String>(); |
30 | 31 | protected static HashSet<String> interwiki = null; |
| 32 | + /** lowercased canonical names of namespaces */ |
| 33 | + protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>(); |
| 34 | + static{ |
| 35 | + canonicalNamespaces.put("media",-2); |
| 36 | + canonicalNamespaces.put("special",-1); |
| 37 | + canonicalNamespaces.put("talk",1); |
| 38 | + canonicalNamespaces.put("user",2); |
| 39 | + canonicalNamespaces.put("user_talk",3); |
| 40 | + canonicalNamespaces.put("project",4); |
| 41 | + canonicalNamespaces.put("project_talk",5); |
| 42 | + canonicalNamespaces.put("image",6); |
| 43 | + canonicalNamespaces.put("image_talk",7); |
| 44 | + canonicalNamespaces.put("mediawiki",8); |
| 45 | + canonicalNamespaces.put("mediawiki_talk",9); |
| 46 | + canonicalNamespaces.put("template",10); |
| 47 | + canonicalNamespaces.put("template_talk",11); |
| 48 | + canonicalNamespaces.put("help",12); |
| 49 | + canonicalNamespaces.put("help_talk",13); |
| 50 | + canonicalNamespaces.put("category",14); |
| 51 | + canonicalNamespaces.put("category_talk",15); |
| 52 | + } |
31 | 53 | |
| 54 | + /** Add a custom mapping not found in the localization files, from another source, e.g. siteinfo project names, etc. */ |
| 55 | + public static void addCustomMapping(String namespace, int index, String langCode){ |
| 56 | + synchronized(lock){ |
| 57 | + Hashtable<String,Integer> map = namespaces.get(langCode); |
| 58 | + if(map == null){ |
| 59 | + map = new Hashtable<String,Integer>(); |
| 60 | + namespaces.put(langCode,map); |
| 61 | + } |
| 62 | + map.put(namespace.toLowerCase(),index); |
| 63 | + } |
| 64 | + } |
| 65 | + |
32 | 66 | public static HashSet<String> getLocalizedImage(String langCode){ |
33 | 67 | return getLocalizedNamespace(langCode,6); |
34 | 68 | } |
— | — | @@ -169,6 +203,25 @@ |
170 | 204 | return null; |
171 | 205 | } |
172 | 206 | |
| 207 | + public static Title getRedirectTitle(String text, String lang){ |
| 208 | + String full = getRedirectTarget(text,lang); |
| 209 | + if(full == null) |
| 210 | + return null; |
| 211 | + String[] parts = full.split(":",2); |
| 212 | + if(parts.length == 2){ |
| 213 | + String ns = parts[0].toLowerCase(); |
| 214 | + // check canonical |
| 215 | + if(canonicalNamespaces.containsKey(ns)) |
| 216 | + return new Title(canonicalNamespaces.get(ns),parts[1]); |
| 217 | + // check lang namespaces |
| 218 | + Hashtable<String,Integer> map = namespaces.get(lang); |
| 219 | + if(map != null && map.containsKey(ns)) |
| 220 | + return new Title(map.get(ns),parts[1]); |
| 221 | + } |
| 222 | + // unrecognized namespace, use main |
| 223 | + return new Title(0,full); |
| 224 | + } |
| 225 | + |
173 | 226 | /** Loads interwiki from default location lib/interwiki.map */ |
174 | 227 | public static void loadInterwiki(){ |
175 | 228 | if(interwiki != null) |
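
For context, getRedirectTitle resolves a redirect target into a (namespace, title) pair, checking the canonical table first and then the per-language map populated by addCustomMapping; unrecognized prefixes fall back to the main namespace. A hedged usage sketch, assuming getRedirectTarget extracts the link target from the wikitext:

    // Illustrative only; the input text and the "en" mapping are made up.
    Localization.addCustomMapping("Wikipedia", 4, "en"); // e.g. fed from siteinfo
    Title t = Localization.getRedirectTitle("#REDIRECT [[Category:Foo]]", "en");
    // t.getNamespace() == 14 and t.getTitle().equals("Foo");
    // "#REDIRECT [[NoSuchPrefix:Bar]]" would yield namespace 0, title "NoSuchPrefix:Bar"
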