r22838 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r22837 | r22838 | r22839 >
Date:14:51, 8 June 2007
Author:rainman
Status:old
Tags:
Comment:
Introduced new fields:
* with stemmed titles
* multiple fields for redirects and keywords
* alttitle field for best ranked redirects
Modified paths:
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -17,13 +17,13 @@
1818 # host : db1.part db2.part
1919 # Multiple hosts can search multiple dbs (N-N mapping)
2020 [Search-Group]
21 -oblak : wikilucene, wikidev
 21+oblak : wikilucene wikidev
2222
2323 # Index nodes
2424 # host: db1.part db2.part
2525 # Each db.part can be indexed by only one host
2626 [Index]
27 -oblak: wikilucene, wikidev
 27+oblak: wikilucene wikidev
2828
2929 # Rsync path where indexes are on hosts, after default value put
3030 # hosts where the location differs
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -14,6 +14,7 @@
1515 import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy;
1616 import org.wikimedia.lsearch.config.Configuration;
1717 import org.wikimedia.lsearch.config.GlobalConfiguration;
 18+import org.wikimedia.lsearch.index.WikiIndexModifier;
1819 import org.wikimedia.lsearch.search.NamespaceFilter;
1920
2021 /**
@@ -31,6 +32,10 @@
3233 Configuration.setConfigFile(System.getProperty("user.dir")+"/test-data/mwsearch.conf.test");
3334 Configuration.open();
3435 WikiQueryParser.TITLE_BOOST = 2;
 36+ WikiQueryParser.REDIRECT_BOOST = 0.2f;
 37+ WikiQueryParser.ALT_TITLE_BOOST = 6;
 38+ WikiQueryParser.KEYWORD_BOOST = 0.05f;
 39+ WikiIndexModifier.ALT_TITLES = 3;
3540 try{
3641 WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer());
3742 Query q;
@@ -249,10 +254,10 @@
250255 assertEquals("(+(+namespace:0 +(+contents:1991 +category:\"olympic cities\")) -contents:1990) (+(+namespace:0 +(+title:1991^2.0 +category:\"olympic cities\")) -title:1990^2.0)",q.toString());
251256
252257 q = parser.parseTwoPass("main:ba*",NamespacePolicy.IGNORE);
253 - assertEquals("contents:ba* title:ba*^2.0",q.toString());
 258+ assertEquals("contents:ba title:ba*^2.0",q.toString());
254259
255260 q = parser.parseTwoPass("main:ba* all:lele",NamespacePolicy.REWRITE);
256 - assertEquals("(+(+namespace:0 +contents:ba*) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString());
 261+ assertEquals("(+(+namespace:0 +contents:ba) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString());
257262
258263 q = parser.parseTwoPass("main:ba*beans",NamespacePolicy.IGNORE);
259264 assertEquals("(+contents:ba +(contents:beans contents:bean^0.5)) (+title:ba^2.0 +title:beans^2.0)",q.toString());
@@ -279,27 +284,28 @@
280285 q = parser.parseTwoPass("[1,a12]:beans",NamespacePolicy.IGNORE);
281286 assertEquals("(+contents:1 +contents:a12 +(contents:beans contents:bean^0.5)) (+title:1^2.0 +title:a12^2.0 +title:beans^2.0)",q.toString());
282287
283 - // Redirect third pass tests
284 - q = parser.parseThreePass("beans",NamespacePolicy.IGNORE);
285 - assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 redirect:beans^2.0",q.toString());
 288+ // Redirect third/fourth pass tests
 289+ q = parser.parseFourPass("beans",NamespacePolicy.IGNORE,true);
 290+ assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 (alttitle1:beans^6.0 alttitle2:beans^6.0 alttitle3:beans^6.0 redirect1:beans^0.2 redirect2:beans^0.1 redirect3:beans^0.06666667 redirect4:beans^0.05 redirect5:beans^0.04) (keyword1:beans^0.05 keyword2:beans^0.025 keyword3:beans^0.016666668 keyword4:beans^0.0125 keyword5:beans^0.01)",q.toString());
286291
287 - q = parser.parseThreePass("beans everyone",NamespacePolicy.IGNORE);
288 - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false)",q.toString());
 292+ q = parser.parseFourPass("beans everyone",NamespacePolicy.IGNORE,true);
 293+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0) spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04) (spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01)",q.toString());
289294
290 - q = parser.parseThreePass("beans everyone incategory:mouse",NamespacePolicy.IGNORE);
291 - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) (+spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false) +category:mouse)",q.toString());
 295+ // TODO: check if this query will be optimized by lucene (categories)
 296+ q = parser.parseFourPass("beans everyone incategory:mouse",NamespacePolicy.IGNORE,true);
 297+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0 +category:mouse) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0 +category:mouse) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0 +category:mouse) (+spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 +category:mouse) (+spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 +category:mouse) (+spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 +category:mouse) (+spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 +category:mouse) (+spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04 +category:mouse)) ((+spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 +category:mouse) (+spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 +category:mouse) (+spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 +category:mouse) (+spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 +category:mouse) (+spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01 +category:mouse))",q.toString());
292298
293 - q = parser.parseThreePass("beans OR everyone",NamespacePolicy.IGNORE);
294 - assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0)",q.toString());
 299+ q = parser.parseFourPass("beans OR everyone",NamespacePolicy.IGNORE,true);
 300+ assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0) ((alttitle1:beans^6.0 alttitle1:everyone^6.0) (alttitle2:beans^6.0 alttitle2:everyone^6.0) (alttitle3:beans^6.0 alttitle3:everyone^6.0))",q.toString());
295301
296 - q = parser.parseThreePass("beans -everyone",NamespacePolicy.IGNORE);
297 - assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0)",q.toString());
 302+ q = parser.parseFourPass("beans -everyone",NamespacePolicy.IGNORE,true);
 303+ assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0) ((+alttitle1:beans^6.0 -alttitle1:everyone^6.0) (+alttitle2:beans^6.0 -alttitle2:everyone^6.0) (+alttitle3:beans^6.0 -alttitle3:everyone^6.0))",q.toString());
298304
299 - q = parser.parseThreePass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE);
300 - assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false))",q.toString());
 305+ q = parser.parseFourPass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE,true);
 306+ assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04)) ((+(namespace:0 namespace:1 namespace:2) +spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01))",q.toString());
301307
302 - q = parser.parseThreePass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE);
303 - assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0))",q.toString());
 308+ q = parser.parseFourPass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE,true);
 309+ assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0)) (((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+namespace:0 +alttitle1:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+namespace:0 +alttitle2:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+namespace:0 +alttitle3:mainly^6.0)))",q.toString());
304310
305311 // Test field extraction
306312 HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java
@@ -52,19 +52,9 @@
5353 public void writeEndPage() throws IOException {
5454 ArticleLinks r = links.get(page.Title.Namespace+":"+page.Title.Text);
5555 // register redirect
56 - String redirect = Localization.getRedirectTarget(revision.Text,langCode);
 56+ Title redirect = Localization.getRedirectTitle(revision.Text,langCode);
5757 if( redirect !=null ){
58 - int ns = 0;
59 - String title = redirect;
60 - String[] parts = redirect.split(":",2);
61 - if(parts.length == 2 && parts[0].length()>1){
62 - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
63 - if(inx != null){
64 - ns = inx;
65 - title = parts[1];
66 - }
67 - }
68 - r.redirectsTo = findArticleLinks(ns,title);
 58+ r.redirectsTo = findArticleLinks(redirect.getNamespace(),redirect.getTitle());
6959 } else // process links
7060 processLinks(revision.Text,page.Title.Namespace);
7161 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -96,7 +96,7 @@
9797 long start = System.currentTimeMillis();
9898
9999 // regenerate link and redirect information
100 - HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile),langCode);
 100+ HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile,langCode),langCode);
101101
102102 log.info("Third pass, indexing articles...");
103103
@@ -179,7 +179,7 @@
180180 return links;
181181 }
182182
183 - private static HashMap<String,ArticleLinks> getTitles(String inputfile) {
 183+ private static HashMap<String,ArticleLinks> getTitles(String inputfile,String langCode) {
184184 log.info("First pass, getting a list of valid articles...");
185185 InputStream input = null;
186186 try {
@@ -189,7 +189,7 @@
190190 return null;
191191 }
192192 // first pass, get titles
193 - TitleReader tr = new TitleReader();
 193+ TitleReader tr = new TitleReader(langCode);
194194 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
195195 try {
196196 reader.readDump();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java
@@ -4,12 +4,15 @@
55 import java.util.ArrayList;
66 import java.util.HashMap;
77 import java.util.HashSet;
 8+import java.util.Iterator;
 9+import java.util.Map.Entry;
810
911 import org.mediawiki.importer.DumpWriter;
1012 import org.mediawiki.importer.Page;
1113 import org.mediawiki.importer.Revision;
1214 import org.mediawiki.importer.Siteinfo;
1315 import org.wikimedia.lsearch.beans.ArticleLinks;
 16+import org.wikimedia.lsearch.util.Localization;
1417
1518 /**
1619 * Read a HashSet of titles from dump
@@ -21,6 +24,11 @@
2225 Page page;
2326 Revision revision;
2427 HashMap<String,ArticleLinks> titles = new HashMap<String,ArticleLinks>();
 28+ protected String langCode;
 29+
 30+ public TitleReader(String langCode){
 31+ this.langCode = langCode;
 32+ }
2533
2634 public void writeRevision(Revision revision) throws IOException {
2735 this.revision = revision;
@@ -42,7 +50,12 @@
4351 // nop
4452 }
4553 public void writeSiteinfo(Siteinfo info) throws IOException {
46 - // nop
 54+ // write siteinfo to localization
 55+ Iterator it = info.Namespaces.orderedEntries();
 56+ while(it.hasNext()){
 57+ Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
 58+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
 59+ }
4760 }
4861 public void writeStartWiki() throws IOException {
4962 // nop
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -3,6 +3,7 @@
44 import java.io.IOException;
55 import java.util.ArrayList;
66 import java.util.HashMap;
 7+import java.util.Iterator;
78 import java.util.Map.Entry;
89 import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
910 import java.util.regex.Matcher;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -106,7 +106,7 @@
107107 IndexWriter writer = indexes.get(target.toString());
108108 if(writer == null)
109109 return;
110 - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters);
 110+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters,iid);
111111 Document doc = (Document) ret[0];
112112 Analyzer analyzer = (Analyzer) ret[1];
113113 try {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -10,6 +10,7 @@
1111 import org.apache.log4j.Logger;
1212 import org.apache.lucene.analysis.Analyzer;
1313 import org.apache.lucene.document.Document;
 14+import org.apache.lucene.index.IndexReader;
1415 import org.apache.lucene.queryParser.ParseException;
1516 import org.apache.lucene.search.Hits;
1617 import org.apache.lucene.search.Query;
@@ -117,13 +118,15 @@
118119 Query q = null;
119120 SearchResults res = null;
120121 long searchStart = System.currentTimeMillis();
121 - Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes();
 122+ Hashtable<String,NamespaceFilter> cachedFilters = GlobalConfiguration.getInstance().getNamespacePrefixes();
 123+ boolean searchAll = false;
122124
123125 // if search is over one field, try to use filters
124126 if(fields.size()==1){
125 - if(fields.contains(new NamespaceFilter()))
 127+ if(fields.contains(new NamespaceFilter())){
126128 nsfw = null; // empty filter: "all" keyword
127 - else if(!fields.contains(nsDefault)){
 129+ searchAll = true;
 130+ } else if(!fields.contains(nsDefault)){
128131 // use the specified prefix in the query (if it can be cached)
129132 NamespaceFilter f = fields.toArray(new NamespaceFilter[] {})[0];
130133 if(f.cardinality()==1 || NamespaceCache.isComposable(f))
@@ -135,7 +138,10 @@
136139
137140 try {
138141 if(nsfw == null){
139 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
 142+ if(searchAll)
 143+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
 144+ else
 145+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
140146 }
141147 else{
142148 q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -10,6 +10,7 @@
1111 import java.util.Arrays;
1212 import java.util.Collection;
1313 import java.util.Collections;
 14+import java.util.Comparator;
1415 import java.util.HashSet;
1516 import java.util.Hashtable;
1617 import java.util.Set;
@@ -28,10 +29,12 @@
2930 import org.wikimedia.lsearch.analyzers.Analyzers;
3031 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
3132 import org.wikimedia.lsearch.analyzers.FilterFactory;
 33+import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer;
3234 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
3335 import org.wikimedia.lsearch.beans.Article;
3436 import org.wikimedia.lsearch.beans.IndexReportCard;
3537 import org.wikimedia.lsearch.beans.Redirect;
 38+import org.wikimedia.lsearch.beans.Title;
3639 import org.wikimedia.lsearch.config.GlobalConfiguration;
3740 import org.wikimedia.lsearch.config.IndexId;
3841 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
@@ -54,6 +57,8 @@
5558 }
5659
5760 static public final int MAX_FIELD_LENGTH = 100000;
 61+ /** number of additional title1, title2, .. etc fields to be filled in with redirects */
 62+ static public int ALT_TITLES = 3;
5863 /** Simple implementation of batch addition and deletion */
5964 class SimpleIndexModifier {
6065 protected IndexId iid;
@@ -179,7 +184,7 @@
180185 if(!checkPreconditions(rec))
181186 continue; // article shouldn't be added for some (heuristic) reason
182187 IndexReportCard card = getReportCard(rec);
183 - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters);
 188+ Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters,iid);
184189 Document doc = (Document) ret[0];
185190 Analyzer analyzer = (Analyzer) ret[1];
186191 try {
@@ -223,15 +228,17 @@
224229 * @return
225230 */
226231 public static boolean checkAddPreconditions(Article ar, String langCode){
227 - if(ar.getNamespace().equals("0")){
228 - String redirect = Localization.getRedirectTarget(ar.getContents(),langCode);
229 - if(redirect != null)
230 - return false; // don't add redirects
231 - /*if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
 232+ Title redirect = Localization.getRedirectTitle(ar.getContents(),langCode);
 233+ int ns = Integer.parseInt(ar.getNamespace());
 234+ if(redirect!=null && redirect.getNamespace() == ns){
 235+ return false; // don't add redirects to same namespace, always add as redirect field
 236+ }
 237+
 238+ /*if(ar.getNamespace().equals("0")){
 239+ if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
232240 log.debug("Not adding "+ar+" into index: "+ar.getContents());
233241 return false;
234242 } */
235 - }
236243 return true;
237244 }
238245
@@ -243,24 +250,43 @@
244251 */
245252 protected static void transformArticleForIndexing(Article ar) {
246253 ArrayList<Redirect> redirects = ar.getRedirects();
 254+ // sort redirects by their rank
 255+ Collections.sort(redirects,new Comparator<Redirect>() {
 256+ public int compare(Redirect o1,Redirect o2){
 257+ return o2.getReferences() - o1.getReferences();
 258+ }
 259+ });
247260 int ns = Integer.parseInt(ar.getNamespace());
248261 ar.setRank(ar.getReferences()); // base rank value
249262 if(redirects != null){
250263 ArrayList<String> filtered = new ArrayList<String>();
 264+ ArrayList<Integer> ranks = new ArrayList<Integer>();
251265 // index only redirects from the same namespace
252266 // to avoid a lot of unusable redirects from/to
253267 // user namespace, but always index redirect FROM main
254268 for(Redirect r : redirects){
255 - if((ns == 0 && r.getNamespace() == 0) || ns != 0){
 269+ if(ns == r.getNamespace()){
256270 filtered.add(r.getTitle());
 271+ ranks.add(r.getReferences());
257272 ar.addToRank(r.getReferences()+1);
258273 } else
259274 log.debug("Ignoring redirect "+r+" to "+ar);
260275 }
261276 ar.setRedirectKeywords(filtered);
 277+ ar.setRedirectKeywordRanks(ranks);
262278 }
263279 }
264280
 281+ /** Check if for this article for this db we should extract keywords */
 282+ public static boolean checkKeywordPreconditions(Article article, IndexId iid) {
 283+ if(global == null)
 284+ global = GlobalConfiguration.getInstance();
 285+ if(article.getNamespace().equals("0") && global.useKeywordScoring(iid.getDBname()))
 286+ return true;
 287+ else
 288+ return false;
 289+ }
 290+
265291 /**
266292 * Create necessary directories for index
267293 * @param dbname
@@ -372,7 +398,7 @@
373399 * @param languageAnalyzer
374400 * @return array { document, analyzer }
375401 */
376 - public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){
 402+ public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters, IndexId iid){
377403 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
378404 WikiTokenizer tokenizer = null;
379405 Document doc = new Document();
@@ -387,24 +413,27 @@
388414 doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED));
389415
390416 // boost document title with it's article rank
391 - Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
 417+ Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
392418 //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
393419 float rankBoost = calculateArticleRank(article.getRank());
394420 title.setBoost(rankBoost);
395421 doc.add(title);
396422
 423+ Field stemtitle = new Field("stemtitle", article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED);
 424+ //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
 425+ stemtitle.setBoost(rankBoost);
 426+ doc.add(stemtitle);
 427+
 428+ // put the best redirects as alternative titles
 429+ makeAltTitles(doc,"alttitle",article);
 430+
397431 // add titles of redirects, generated from analyzer
398 - Field redirect = new Field("redirect", "",
399 - Field.Store.NO, Field.Index.TOKENIZED);
400 - redirect.setBoost(rankBoost);
401 - doc.add(redirect);
 432+ makeKeywordField(doc,"redirect",rankBoost);
402433
403 - // most significat words in the text, gets extra score, from analyzer
404 - Field keyword = new Field("keyword", "",
405 - Field.Store.NO, Field.Index.TOKENIZED);
406 - keyword.setBoost(rankBoost);
407 - doc.add(keyword);
408 -
 434+ if(checkKeywordPreconditions(article,iid))
 435+ // most significant words in the text, gets extra score, from analyzer
 436+ makeKeywordField(doc,"keyword",rankBoost);
 437+
409438 // the next fields are generated using wikitokenizer
410439 doc.add(new Field("contents", "",
411440 Field.Store.NO, Field.Index.TOKENIZED));
@@ -425,7 +454,35 @@
426455
427456 return new Object[] { doc, perFieldAnalyzer };
428457 }
429 -
 458+
 459+ /** Make a multiple keyword field, e.g. redirect1, redirect2, redirect3 ... */
 460+ protected static void makeKeywordField(Document doc, String prefix, float boost) {
 461+ for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
 462+ Field keyfield = new Field(prefix+i, "",
 463+ Field.Store.NO, Field.Index.TOKENIZED);
 464+ keyfield.setBoost(boost);
 465+ doc.add(keyfield);
 466+ }
 467+
 468+ }
 469+
 470+ protected static void makeAltTitles(Document doc, String prefix, Article article) {
 471+ // the redirects, rank list are sorted..
 472+ final ArrayList<String> redirects = article.getRedirectKeywords();
 473+ final ArrayList<Integer> ranks = article.getRedirectKeywordRanks();
 474+ if(redirects.size() == 0)
 475+ return;
 476+ // add alternative titles alttitle1, alttitle2 ...
 477+ for(int i=0;i<ALT_TITLES && i<redirects.size();i++){
 478+ if(ranks.get(i) == 0)
 479+ break; // we don't want redirects with zero links
 480+ //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i));
 481+ Field alttitle = new Field("alttitle"+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED);
 482+ alttitle.setBoost(calculateArticleRank(ranks.get(i)));
 483+ doc.add(alttitle);
 484+ }
 485+ }
 486+
430487 /**
431488 *
432489 * Calculate document boost (article rank) from number of
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -16,12 +16,12 @@
1717 /**
1818 * For content:
1919 * * length norm is a linear function, with f(1) = 1
20 - * and f(10000) = 0.2
 20+ * and f(10000) = 0.5
2121 *
22 - * For titles:
 22+ * For titles / title aliases:
2323 * * 1/sqrt(term^3)
2424 *
25 - * For redirect:
 25+ * For redirect / keywords:
2626 * * no length norm
2727 *
2828 */
@@ -35,11 +35,11 @@
3636 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
3737 return f;
3838 }
39 - } else if(fieldName.equals("title")){
 39+ } else if(fieldName.equals("title") || fieldName.startsWith("alttitle")){
4040 float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
4141 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
4242 return f;
43 - } else if(fieldName.equals("redirect") || fieldName.equals("keyword")){
 43+ } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword")){
4444 return 1;
4545 } else
4646 return super.lengthNorm(fieldName,numTokens);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -22,6 +22,7 @@
2323 import org.apache.lucene.search.spans.SpanQuery;
2424 import org.apache.lucene.search.spans.SpanTermQuery;
2525 import org.wikimedia.lsearch.config.GlobalConfiguration;
 26+import org.wikimedia.lsearch.index.WikiIndexModifier;
2627 import org.wikimedia.lsearch.search.NamespaceFilter;
2728 import org.wikimedia.lsearch.util.UnicodeDecomposer;
2829
@@ -57,6 +58,7 @@
5859 private String field; // current field
5960 private String defaultField; // the default field value
6061 private float defaultBoost = 1;
 62+ private float defaultAliasBoost = ALIAS_BOOST;
6163 protected enum TokenType {WORD, FIELD, AND, OR, EOF };
6264
6365 private TokenStream tokenStream;
@@ -70,11 +72,16 @@
7173 protected boolean disableTitleAliases;
7274
7375 /** boost for alias words from analyzer */
74 - public final float ALIAS_BOOST = 0.5f;
 76+ public static float ALIAS_BOOST = 0.5f;
7577 /** boost for title field */
76 - public static float TITLE_BOOST = 8;
 78+ public static float TITLE_BOOST = 6;
 79+ public static float TITLE_ALIAS_BOOST = 0.2f;
 80+ public static float STEM_TITLE_BOOST = 2;
 81+ public static float STEM_TITLE_ALIAS_BOOST = 0.4f;
7782 public static float REDIRECT_BOOST = 0.2f;
78 - public static float KEYWORD_BOOST = 0.05f;
 83+ public static float ALT_TITLE_BOOST = 2;
 84+ public static float ALT_TITLE_ALIAS_BOOST = 0.4f;
 85+ public static float KEYWORD_BOOST = 0.02f;
7986
8087 /** Policies in treating field names:
8188 *
@@ -663,9 +670,10 @@
664671 return new TermQuery(makeTerm());
665672 }
666673
667 - // check for wildcard seaches, they are also not analyzed/stemmed
 674+ // check for wildcard searches, they are also not analyzed/stemmed, only for titles
668675 // wildcard signs are allowed only at the end of the word, minimum one letter word
669 - if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?')){
 676+ if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?') &&
 677+ defaultField.equals("title")){
670678 Query ret = new WildcardQuery(makeTerm());
671679 ret.setBoost(defaultBoost);
672680 return ret;
@@ -691,12 +699,12 @@
692700 else if(token.type().equals("stemmed")){
693701 // stemmed word
694702 t = new TermQuery(makeTerm(token));
695 - t.setBoost(ALIAS_BOOST*defaultBoost);
 703+ t.setBoost(defaultAliasBoost*defaultBoost);
696704 cur.add(t,aliasOccur);
697705 } else if(token.type().equals("alias")){
698706 // produced by alias engine (e.g. for sr)
699707 t = new TermQuery(makeTerm(token));
700 - t.setBoost(ALIAS_BOOST*defaultBoost);
 708+ t.setBoost(defaultAliasBoost*defaultBoost);
701709 cur.add(t,aliasOccur);
702710 }
703711 if( cur != bq) // returned from nested query
@@ -763,7 +771,7 @@
764772
765773 /** Duplicate a term query, setting "title" as field */
766774 private TermQuery makeTitleTermQuery(TermQuery tq){
767 - if(disableTitleAliases && tq.getBoost()==ALIAS_BOOST)
 775+ if(disableTitleAliases && tq.getBoost()==defaultAliasBoost)
768776 return null;
769777 Term term = tq.getTerm();
770778 if(term.field().equals(defaultField)){
@@ -778,7 +786,7 @@
779787
780788 /** Duplicate a phrase query, setting "title" as field */
781789 private PhraseQuery makeTitlePhraseQuery(PhraseQuery pq){
782 - if(disableTitleAliases && pq.getBoost()==ALIAS_BOOST)
 790+ if(disableTitleAliases && pq.getBoost()==defaultAliasBoost)
783791 return null;
784792 PhraseQuery pq2 = new PhraseQuery();
785793 Term[] terms = pq.getTerms();
@@ -1011,7 +1019,7 @@
10121020 span = spans.get(0);
10131021 else{
10141022 // make a span-near query that has a slop 1/2 of tokenGap
1015 - span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.tokenGap-1)/2,false);
 1023+ span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.TOKEN_GAP-1)/2,false);
10161024 span.setBoost(boost);
10171025 }
10181026 }
@@ -1028,7 +1036,81 @@
10291037 }
10301038 return null;
10311039 }
 1040+
 1041+ protected BooleanQuery multiplySpans(Query query, int level, String fieldName, float boost){
 1042+ BooleanQuery bq = new BooleanQuery(true);
 1043+ for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
 1044+ Query q = extractSpans(query,0,fieldName+i,boost/i);
 1045+ if(q != null)
 1046+ bq.add(q,BooleanClause.Occur.SHOULD);
 1047+ }
 1048+
 1049+ if(bq.getClauses() == null || bq.getClauses().length==0)
 1050+ return null;
 1051+ else
 1052+ return bq;
 1053+ }
10321054
 1055+ /** Make a redirect query in format altitle1:query altitle2:query ... redirect:spanquery */
 1056+ protected BooleanQuery makeRedirectQuery(String queryText, Query qt) {
 1057+ BooleanQuery bq = new BooleanQuery(true);
 1058+ float olfDefaultBoost = defaultBoost;
 1059+ String contentField = defaultField;
 1060+ defaultBoost = ALT_TITLE_BOOST;
 1061+ defaultAliasBoost = ALT_TITLE_ALIAS_BOOST;
 1062+ for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){
 1063+ defaultField = "alttitle"+i;
 1064+ Query q = parseRaw(queryText);
 1065+ if(q != null)
 1066+ bq.add(q,BooleanClause.Occur.SHOULD);
 1067+ }
 1068+ // pop stack
 1069+ defaultField = contentField;
 1070+ defaultBoost = olfDefaultBoost;
 1071+ defaultAliasBoost = ALIAS_BOOST;
 1072+
 1073+ Query qs = multiplySpans(qt,0,"redirect",REDIRECT_BOOST);
 1074+ // merge queries
 1075+ if(qs != null){
 1076+ bq.add(qs,BooleanClause.Occur.SHOULD);
 1077+ }
 1078+ if(bq.getClauses() == null || bq.getClauses().length==0)
 1079+ return null;
 1080+ else
 1081+ return bq;
 1082+
 1083+ }
 1084+
 1085+ /** Make title query in format: title:query stemtitle:stemmedquery */
 1086+ protected Query makeTitleQuery(String queryText) {
 1087+ String contentField = defaultField;
 1088+ float olfDefaultBoost = defaultBoost;
 1089+ defaultField = "title"; // now parse the title part
 1090+ defaultBoost = TITLE_BOOST;
 1091+ defaultAliasBoost = TITLE_ALIAS_BOOST;
 1092+ Query qt = parseRaw(queryText);
 1093+ // stemmed title
 1094+ defaultField = "stemtitle";
 1095+ defaultBoost = STEM_TITLE_BOOST;
 1096+ defaultAliasBoost = STEM_TITLE_ALIAS_BOOST;
 1097+ Query qs = parseRaw(queryText);
 1098+ // pop stack
 1099+ defaultField = contentField;
 1100+ defaultBoost = olfDefaultBoost;
 1101+ defaultAliasBoost = ALIAS_BOOST;
 1102+
 1103+ if(qt == qs) // either null, or category query
 1104+ return qt;
 1105+ if(qt == null)
 1106+ return qs;
 1107+ if(qs == null)
 1108+ return qt;
 1109+ BooleanQuery bq = new BooleanQuery(true);
 1110+ bq.add(qt,BooleanClause.Occur.SHOULD);
 1111+ bq.add(qs,BooleanClause.Occur.SHOULD);
 1112+ return bq;
 1113+ }
 1114+
10331115 /**
10341116 * Main function for multi-pass parsing.
10351117 *
@@ -1039,17 +1121,12 @@
10401122 */
10411123 protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){
10421124 if(policy != null)
1043 - this.namespacePolicy = policy;
1044 - float olfDefaultBoost = defaultBoost;
 1125+ this.namespacePolicy = policy;
10451126 defaultBoost = 1;
1046 - Query qc = parseRaw(queryText);
1047 - String contentField = defaultField;
1048 - defaultField = "title"; // now parse the title part
1049 - defaultBoost = TITLE_BOOST;
1050 - Query qt = parseRaw(queryText);
1051 - // pop stack
1052 - defaultField = contentField;
1053 - defaultBoost = olfDefaultBoost;
 1127+ defaultAliasBoost = ALIAS_BOOST;
 1128+ Query qc = parseRaw(queryText);
 1129+
 1130+ Query qt = makeTitleQuery(queryText);
10541131 if(qc == null || qt == null)
10551132 return new BooleanQuery();
10561133 if(qc.equals(qt))
@@ -1058,15 +1135,23 @@
10591136 bq.add(qc,BooleanClause.Occur.SHOULD);
10601137 bq.add(qt,BooleanClause.Occur.SHOULD);
10611138
 1139+ Query nostem = null;
 1140+ if(makeRedirect || makeKeywords){
 1141+ String contentField = defaultField;
 1142+ defaultField = "keyword"; // this field is never stemmed
 1143+ nostem = parseRaw(queryText);
 1144+ defaultField = contentField;
 1145+ }
 1146+
10621147 // redirect pass
1063 - if(makeRedirect){
1064 - Query qr = extractSpans(qt,0,"redirect",REDIRECT_BOOST);
 1148+ if(makeRedirect && nostem!=null){
 1149+ BooleanQuery qr = makeRedirectQuery(queryText,nostem);
10651150 if(qr != null)
10661151 bq.add(qr,BooleanClause.Occur.SHOULD);
10671152 }
10681153 // keyword pass
1069 - if(makeKeywords){
1070 - Query qk = extractSpans(qt,0,"keyword",KEYWORD_BOOST);
 1154+ if(makeKeywords && nostem!=null){
 1155+ Query qk = multiplySpans(nostem,0,"keyword",KEYWORD_BOOST);
10711156 if(qk != null)
10721157 bq.add(qk,BooleanClause.Occur.SHOULD);
10731158 }
@@ -1074,7 +1159,7 @@
10751160 return bq;
10761161
10771162 }
1078 -
 1163+
10791164 /**
10801165 * Three parse pases: contents, title, redirect
10811166 *
@@ -1099,6 +1184,10 @@
11001185 return parseMultiPass(queryText,policy,true,makeKeywords);
11011186 }
11021187
/**
 * Four-pass parse: contents, title, redirect, and optionally keywords.
 * NOTE(review): currently delegates to parseMultiPass with the exact same
 * arguments as parseThreePass — confirm whether a distinct fourth pass
 * (e.g. forcing makeKeywords) was intended here.
 *
 * @throws ParseException declared for API symmetry with the other parse entry points
 */
public Query parseFourPass(String queryText, NamespacePolicy policy, boolean makeKeywords) throws ParseException{
	return parseMultiPass(queryText,policy,true,makeKeywords);
}
 1191+
11031192 /**
11041193 * Parse the query according to policy. Instead of rewrite phrase, simply pass
11051194 * twice the query with different default fields.
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -14,6 +14,7 @@
1515 import org.apache.lucene.analysis.th.ThaiWordFilter;
1616 import org.wikimedia.lsearch.config.GlobalConfiguration;
1717 import org.wikimedia.lsearch.config.IndexId;
 18+import org.wikimedia.lsearch.index.WikiIndexModifier;
1819 import org.wikimedia.lsearch.test.AliasPorterStemFilter;
1920
2021 /**
@@ -63,13 +64,29 @@
6465 new CategoryAnalyzer(categories));
6566 perFieldAnalyzer.addAnalyzer("title",
6667 getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
67 - perFieldAnalyzer.addAnalyzer("redirect",
68 - new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory()));
69 - perFieldAnalyzer.addAnalyzer("keyword",
70 - new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory()));
 68+ perFieldAnalyzer.addAnalyzer("stemtitle",
 69+ getTitleAnalyzer(filters));
 70+ setAltTitleAnalyzer(perFieldAnalyzer,"alttitle",
 71+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
 72+ setKeywordAnalyzer(perFieldAnalyzer,"redirect",
 73+ new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),"redirect"));
 74+ setKeywordAnalyzer(perFieldAnalyzer,"keyword",
 75+ new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),"keyword"));
7176 return new Object[] {perFieldAnalyzer,tokenizer};
7277 }
7378
 79+ protected static void setAltTitleAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, Analyzer analyzer) {
 80+ for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){
 81+ perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
 82+ }
 83+ }
 84+
 85+ protected static void setKeywordAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, KeywordsAnalyzer analyzer) {
 86+ for(int i=1;i<=KeywordsAnalyzer.KEYWORD_LEVELS;i++){
 87+ perFieldAnalyzer.addAnalyzer(prefix+i,analyzer);
 88+ }
 89+ }
 90+
7491 public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
7592 if(global == null)
7693 global = GlobalConfiguration.getInstance();
@@ -95,6 +112,12 @@
96113 new QueryLanguageAnalyzer(filters));
97114 perFieldAnalyzer.addAnalyzer("title",
98115 getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
 116+ perFieldAnalyzer.addAnalyzer("stemtitle",
 117+ getTitleAnalyzer(filters));
 118+ setAltTitleAnalyzer(perFieldAnalyzer,"alttitle",
 119+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
 120+ perFieldAnalyzer.addAnalyzer("keyword",
 121+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
99122
100123 return perFieldAnalyzer;
101124 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -25,31 +25,68 @@
2626 *
2727 */
2828 public class KeywordsAnalyzer extends Analyzer{
29 - static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
30 - protected ArrayList<String> keywords;
31 - protected FilterFactory filters;
32 - protected KeywordsTokenStream tokens;
 29+ static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
 30+ protected KeywordsTokenStream[] tokensBySize = null;
 31+ protected String prefix;
 32+
 33+ /** number of fields to be generated, e.g. keyword1 for single-word keywords,
 34+ * keyword2 for two-word keywords, etc ... the last field has all the remaining keys
 35+ */
 36+ public static final int KEYWORD_LEVELS = 5;
 37+ /** positional increment between different redirects */
 38+ public static final int TOKEN_GAP = 201;
3339
34 - public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters){
 40+ public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters, String prefix){
3541 ArrayList<String> k = new ArrayList<String>();
3642 if(keywords != null)
3743 k.addAll(keywords);
38 - tokens = new KeywordsTokenStream(k,filters);
 44+ init(k,filters,prefix);
3945 }
 46+ public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters, String prefix){
 47+ init(keywords,filters,prefix);
 48+ }
4049
41 - public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters){
42 - tokens = new KeywordsTokenStream(keywords,filters);
 50+ protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix) {
 51+ this.prefix = prefix;
 52+ tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS];
 53+ if(keywords == null){
 54+ // init empty token streams
 55+ for(int i=0; i< KEYWORD_LEVELS; i++){
 56+ tokensBySize[i] = new KeywordsTokenStream(null,filters);
 57+ }
 58+ return;
 59+ }
 60+ ArrayList<ArrayList<String>> keywordsBySize = new ArrayList<ArrayList<String>>();
 61+ for(int i=0;i<KEYWORD_LEVELS;i++)
 62+ keywordsBySize.add(new ArrayList<String>());
 63+ // arange keywords into a list by token number
 64+ for(String k : keywords){
 65+ ArrayList<Token> parsed = new FastWikiTokenizerEngine(k).parse();
 66+ if(parsed.size() == 0)
 67+ continue;
 68+ else if(parsed.size() < KEYWORD_LEVELS)
 69+ keywordsBySize.get(parsed.size()-1).add(k);
 70+ else
 71+ keywordsBySize.get(KEYWORD_LEVELS-1).add(k);
 72+ }
 73+ for(int i=0; i< KEYWORD_LEVELS; i++){
 74+ tokensBySize[i] = new KeywordsTokenStream(keywordsBySize.get(i),filters);
 75+ }
4376 }
44 - /** positional increment between different redirects */
45 - public static final int tokenGap = 201;
4677
4778 @Override
4879 public TokenStream tokenStream(String fieldName, Reader reader) {
49 - return tokens;
 80+ if(fieldName.startsWith(prefix)){
 81+ int inx = Integer.parseInt(fieldName.substring(prefix.length()));
 82+ return tokensBySize[inx-1];
 83+ } else{
 84+ log.error("Trying to get tokenStream for wrong field "+fieldName);
 85+ return null;
 86+ }
5087 }
@Override
public TokenStream tokenStream(String fieldName, String text) {
	// Delegate to the Reader variant; the reader argument is unused there
	// because the keyword token streams are pre-built per level.
	return tokenStream(fieldName,(Reader)null);
}
5592
5693 class KeywordsTokenStream extends TokenStream {
@@ -80,7 +117,7 @@
81118 if(t == null){
82119 t = openNext();
83120 if(t != null)
84 - t.setPositionIncrement(tokenGap);
 121+ t.setPositionIncrement(TOKEN_GAP);
85122 }
86123 return t;
87124 } else{
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java
@@ -29,6 +29,11 @@
3030 public void setTitle(String title) {
3131 this.title = title;
3232 }
 33+ @Override
 34+ public String toString() {
 35+ return namespace+":"+title+" ("+references+")";
 36+ }
3337
3438
 39+
3540 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java
@@ -43,6 +43,8 @@
4444 private ArrayList<Redirect> redirects;
4545 /** generated before indexing from the list of redirects */
4646 private transient ArrayList<String> redirectKeywords;
 47+ /** paired with previous list, ranks for each redirect */
 48+ private transient ArrayList<Integer> redirectKeywordRanks;
4648 /** generated before indexing from the references to this article, and references from redirects */
4850 private transient int rank;
4951
@@ -182,6 +184,16 @@
183185 public void setRedirectKeywords(ArrayList<String> redirectKeywords) {
184186 this.redirectKeywords = redirectKeywords;
185187 }
 188+
 189+ public ArrayList<Integer> getRedirectKeywordRanks() {
 190+ return redirectKeywordRanks;
 191+ }
 192+
 193+ public void setRedirectKeywordRanks(ArrayList<Integer> redirectKeywordRanks) {
 194+ this.redirectKeywordRanks = redirectKeywordRanks;
 195+ }
186196
187197
 198+
 199+
188200 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -2,6 +2,7 @@
33
44 import java.io.IOException;
55 import java.util.ArrayList;
 6+import java.util.Iterator;
67
78 import org.apache.log4j.Logger;
89 import org.mediawiki.importer.DumpWriter;
@@ -11,8 +12,10 @@
1213 import org.mediawiki.importer.Title;
1314 import org.wikimedia.lsearch.beans.Article;
1415 import org.wikimedia.lsearch.beans.Redirect;
 16+import org.wikimedia.lsearch.config.GlobalConfiguration;
1517 import org.wikimedia.lsearch.config.IndexId;
1618 import org.wikimedia.lsearch.index.IndexUpdateRecord;
 19+import org.wikimedia.lsearch.util.Localization;
1720
1821 public class IndexUpdatesCollector implements DumpWriter {
1922 Logger log = Logger.getLogger(DumpWriter.class);
@@ -23,9 +26,11 @@
2427 protected int references = 0;
2528 protected ArrayList<Redirect> redirects = new ArrayList<Redirect>();
2629 protected Siteinfo info = null;
 30+ protected String langCode;
2731
public IndexUpdatesCollector(IndexId iid){
	this.iid = iid;
	// resolve the content language for this index's database; used later to
	// register namespace prefixes with the localization tables
	this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
}
3136
3237 public void addRedirect(String redirectTitle, int references) {
@@ -69,6 +74,13 @@
7075
7176 public void writeSiteinfo(Siteinfo info) throws IOException {
7277 this.info = info;
 78+ // write to localization
 79+ Iterator it = info.Namespaces.orderedEntries();
 80+ while(it.hasNext()){
 81+ Integer inx = (Integer)it.next();
 82+ String prefix = info.Namespaces.getPrefix(inx);
 83+ Localization.addCustomMapping(prefix,inx,langCode);
 84+ }
7385 }
7486
7587 public void close() throws IOException {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
@@ -10,6 +10,7 @@
1111 import java.util.Map.Entry;
1212
1313 import org.apache.log4j.Logger;
 14+import org.wikimedia.lsearch.beans.Title;
1415 import org.wikimedia.lsearch.config.Configuration;
1516
1617 /**
@@ -27,7 +28,40 @@
2829 /** Languages for which loading of localization failed */
2930 protected static HashSet<String> badLocalizations = new HashSet<String>();
3031 protected static HashSet<String> interwiki = null;
 32+ /** lowercased canonical names of namespaces */
 33+ protected static Hashtable<String,Integer> canonicalNamespaces = new Hashtable<String,Integer>();
// Canonical (English) namespace names mapped to MediaWiki namespace indexes.
// Keys are lowercase with spaces as underscores; the main namespace (0) has
// no prefix and is intentionally absent — unprefixed titles fall back to it.
static{
	canonicalNamespaces.put("media",-2);
	canonicalNamespaces.put("special",-1);
	canonicalNamespaces.put("talk",1);
	canonicalNamespaces.put("user",2);
	canonicalNamespaces.put("user_talk",3);
	canonicalNamespaces.put("project",4);
	canonicalNamespaces.put("project_talk",5);
	canonicalNamespaces.put("image",6);
	canonicalNamespaces.put("image_talk",7);
	canonicalNamespaces.put("mediawiki",8);
	canonicalNamespaces.put("mediawiki_talk",9);
	canonicalNamespaces.put("template",10);
	canonicalNamespaces.put("template_talk",11);
	canonicalNamespaces.put("help",12);
	canonicalNamespaces.put("help_talk",13);
	canonicalNamespaces.put("category",14);
	canonicalNamespaces.put("category_talk",15);
}
3153
 54+ /** Add custom mapping not found in localization files from other source, e.g. project name, etc.. */
 55+ public static void addCustomMapping(String namespace, int index, String langCode){
 56+ synchronized(lock){
 57+ Hashtable<String,Integer> map = namespaces.get(langCode);
 58+ if(map == null){
 59+ map = new Hashtable<String,Integer>();
 60+ namespaces.put(langCode,map);
 61+ }
 62+ map.put(namespace.toLowerCase(),index);
 63+ }
 64+ }
 65+
3266 public static HashSet<String> getLocalizedImage(String langCode){
3367 return getLocalizedNamespace(langCode,6);
3468 }
@@ -169,6 +203,25 @@
170204 return null;
171205 }
172206
 207+ public static Title getRedirectTitle(String text, String lang){
 208+ String full = getRedirectTarget(text,lang);
 209+ if(full == null)
 210+ return null;
 211+ String[] parts = full.split(":",2);
 212+ if(parts.length == 2){
 213+ String ns = parts[0].toLowerCase();
 214+ // check canonical
 215+ if(canonicalNamespaces.containsKey(ns))
 216+ return new Title(canonicalNamespaces.get(ns),parts[1]);
 217+ // check lang namespaces
 218+ Hashtable<String,Integer> map = namespaces.get(lang);
 219+ if(map.containsKey(ns))
 220+ return new Title(map.get(ns),parts[1]);
 221+ }
 222+ // not recognized namespace, using main
 223+ return new Title(0,full);
 224+ }
 225+
173226 /** Loads interwiki from default location lib/interwiki.map */
174227 public static void loadInterwiki(){
175228 if(interwiki != null)

Status & tagging log