Index: branches/lucene-search-2.1/lib/dict/wordnet-en.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalMultiQuery.java |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | protected PositionalOptions options; |
22 | 22 | protected int stopWordCount = 0; |
23 | 23 | protected ArrayList<ArrayList<Float>> boosts = new ArrayList<ArrayList<Float>>(); |
| 24 | + protected boolean scaledBoosts = false; |
24 | 25 | |
25 | 26 | public PositionalMultiQuery(PositionalOptions options){ |
26 | 27 | this.options = options; |
— | — | @@ -109,18 +110,17 @@ |
110 | 111 | av /= terms.length; |
111 | 112 | idf += av; |
112 | 113 | |
113 | | - // rescale boosts to reinstall right idfs per term |
114 | | - ArrayList<Float> fb = boosts.get(count); |
115 | | - for(int j=0; j<idfs.length; j++){ |
116 | | - fb.set(j,fb.get(j)*(idfs[j]/av)); |
117 | | - } |
| 114 | + if(!scaledBoosts){ |
| 115 | + // rescale boosts to reinstall right idfs per term |
| 116 | + ArrayList<Float> fb = boosts.get(count); |
| 117 | + for(int j=0; j<idfs.length; j++){ |
| 118 | + fb.set(j,fb.get(j)*(idfs[j]/av)); |
| 119 | + } |
| 120 | + } |
118 | 121 | count++; |
119 | 122 | } |
| 123 | + scaledBoosts = true; |
120 | 124 | } |
121 | | - |
122 | | - private final float sq(float x){ |
123 | | - return x*x; |
124 | | - } |
125 | 125 | |
126 | 126 | public Scorer scorer(IndexReader reader) throws IOException { |
127 | 127 | if (termArrays.size() == 0) // optimize zero-term case |
— | — | @@ -224,7 +224,8 @@ |
225 | 225 | } |
226 | 226 | |
227 | 227 | public Query rewrite(IndexReader reader) { |
228 | | - if (termArrays.size() == 1) { // optimize one-term case |
| 228 | + // optimize one-term case |
| 229 | + if (termArrays.size() == 1 && (options==null || !options.takeMaxScore)) { |
229 | 230 | Term[] terms = (Term[])termArrays.get(0); |
230 | 231 | ArrayList<Float> boost = boosts.get(0); |
231 | 232 | if(terms.length == 1){ |
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java |
— | — | @@ -67,6 +67,7 @@ |
68 | 68 | public Alttitle(){ |
69 | 69 | aggregateMeta = new AggregateInfoImpl(); |
70 | 70 | takeMaxScore = true; |
| 71 | + //exactBoost = 2; |
71 | 72 | //wholeBoost = 10; |
72 | 73 | } |
73 | 74 | } |
— | — | @@ -96,6 +97,7 @@ |
97 | 98 | public Related(){ |
98 | 99 | aggregateMeta = new AggregateInfoImpl(); |
99 | 100 | takeMaxScore = true; |
| 101 | + //exactBoost = 2; |
100 | 102 | } |
101 | 103 | } |
102 | 104 | |
— | — | @@ -157,6 +159,14 @@ |
158 | 160 | } |
159 | 161 | } |
160 | 162 | |
| 163 | + /** Near-match phrases, used when more than 50% of non-stopwords are matched */ |
| 164 | + public static class AlttitleNearMatch extends PositionalOptions { |
| 165 | + public AlttitleNearMatch(){ |
| 166 | + aggregateMeta = new AggregateInfoImpl(); |
| 167 | + takeMaxScore = true; |
| 168 | + } |
| 169 | + } |
| 170 | + |
161 | 171 | public abstract static class NamespaceBoost implements Serializable { |
162 | 172 | public abstract float getBoost(int namespace); |
163 | 173 | |
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java |
— | — | @@ -444,7 +444,6 @@ |
445 | 445 | */ |
446 | 446 | protected final float phraseFreq() throws IOException { |
447 | 447 | int end = initPhrasePositionsBoost(); |
448 | | - |
449 | 448 | float freq = 0.0f; |
450 | 449 | boolean done = (end<0); |
451 | 450 | while (!done) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -305,7 +305,7 @@ |
306 | 306 | } |
307 | 307 | if(templateLevel == 0 && tableLevel == 0) |
308 | 308 | keywordTokens+=gap; // inc by gap (usually 1, can be more before paragraphs and sections) |
309 | | - |
| 309 | + |
310 | 310 | // add exact token |
311 | 311 | Token exact; |
312 | 312 | if(options.exactCase) |
— | — | @@ -322,6 +322,14 @@ |
323 | 323 | exact.setType("titlecase"); |
324 | 324 | } |
325 | 325 | addToTokens(exact); |
| 326 | + |
| 327 | + // extra uppercase token, prevent exact-matches for titles |
| 328 | + if(options.extraUpperCaseToken && allUpperCase){ |
| 329 | + Token t = makeToken(new String(buffer, 0, length), start, start + length, false); |
| 330 | + t.setPositionIncrement(0); |
| 331 | + t.setType(exact.type()); |
| 332 | + addToTokens(t); |
| 333 | + } |
326 | 334 | |
327 | 335 | if(!options.noAliases){ |
328 | 336 | // add decomposed token to stream |
— | — | @@ -650,8 +658,7 @@ |
651 | 659 | prefixLen = 0; |
652 | 660 | semicolonInx = -1; |
653 | 661 | break; |
654 | | - } |
655 | | - if(Character.isLetter(lc)){ |
| 662 | + } else{ |
656 | 663 | prefixBuf[ prefixLen++ ] = Character.toLowerCase(lc); |
657 | 664 | } |
658 | 665 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -1187,7 +1187,7 @@ |
1188 | 1188 | |
1189 | 1189 | BooleanQuery wrap = new BooleanQuery(true); |
1190 | 1190 | wrap.add(full,Occur.SHOULD); |
1191 | | - wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD); |
| 1191 | + wrap.add(makeComplete(expandedWordsTitle,expandedBoostTitle,expandedTypes),Occur.SHOULD); |
1192 | 1192 | if(forbidden != null) |
1193 | 1193 | wrap.add(forbidden,Occur.MUST_NOT); |
1194 | 1194 | |
— | — | @@ -1197,7 +1197,7 @@ |
1198 | 1198 | AgeScaling age = iid.getAgeScaling(); |
1199 | 1199 | if(age != AgeScaling.NONE){ |
1200 | 1200 | switch(age){ |
1201 | | - case STRONG: scale = new ArticleScaling.SqrtScale(0.3f,1); break; |
| 1201 | + case STRONG: scale = new ArticleScaling.StepScale(0.3f,1); break; |
1202 | 1202 | case MEDIUM: scale = new ArticleScaling.StepScale(0.6f,1); break; |
1203 | 1203 | case WEAK: scale = new ArticleScaling.StepScale(0.9f,1); break; |
1204 | 1204 | default: throw new RuntimeException("Unsupported age scaling "+age); |
— | — | @@ -1231,14 +1231,15 @@ |
1232 | 1232 | } |
1233 | 1233 | |
1234 | 1234 | /** Make alternate "complete" query that will match redirects not in contents like los angles -> los angeles */ |
1235 | | - private Query makeComplete(ArrayList<ArrayList<String>> expanded) { |
1236 | | - PositionalQuery pq = new PositionalQuery(new PositionalOptions.RedirectComplete()); |
| 1235 | + private Query makeComplete(ArrayList<ArrayList<String>> expanded, ArrayList<ArrayList<Float>> boosts, ArrayList<ExpandedType> types) { |
| 1236 | + return makePositionalMulti(expanded,boosts,types,fields.alttitle(),new PositionalOptions.RedirectComplete(),0,1); |
| 1237 | + /* PositionalQuery pq = new PositionalQuery(new PositionalOptions.RedirectComplete()); |
1237 | 1238 | for(int i=0;i<expanded.size();i++){ |
1238 | 1239 | for(String w : expanded.get(i)){ |
1239 | 1240 | pq.add(new Term(fields.alttitle(),w),i,stopWords.contains(w)); |
1240 | 1241 | } |
1241 | 1242 | } |
1242 | | - return pq; |
| 1243 | + return pq; */ |
1243 | 1244 | } |
1244 | 1245 | |
1245 | 1246 | private ArrayList<String> cleanupWords(ArrayList<String> words) { |
— | — | @@ -1475,8 +1476,12 @@ |
1476 | 1477 | return query; |
1477 | 1478 | BooleanQuery bq = new BooleanQuery(true); |
1478 | 1479 | bq.add(query,Occur.SHOULD); |
1479 | | - for(Query q : additional) |
1480 | | - bq.add(q,Occur.SHOULD); |
| 1480 | + for(Query q : additional){ |
| 1481 | + if(q != null) |
| 1482 | + bq.add(q,Occur.SHOULD); |
| 1483 | + } |
| 1484 | + if(bq.clauses().size()==1) |
| 1485 | + return query; |
1481 | 1486 | return bq; |
1482 | 1487 | } |
1483 | 1488 | |
— | — | @@ -1637,6 +1642,15 @@ |
1638 | 1643 | return bq; |
1639 | 1644 | } |
1640 | 1645 | |
| 1646 | + private int countNonStopWords(ArrayList<String> words){ |
| 1647 | + int count = 0; |
| 1648 | + for(String w : words){ |
| 1649 | + if(!stopWords.contains(w)) |
| 1650 | + count++; |
| 1651 | + } |
| 1652 | + return count; |
| 1653 | + } |
| 1654 | + |
1641 | 1655 | /** Make query with short subphrases anchored in non-stop words */ |
1642 | 1656 | protected Query makeAnchoredQueryMulti(ArrayList<ArrayList<String>> words, ArrayList<ArrayList<Float>> boosts, ArrayList<ExpandedType> types, |
1643 | 1657 | String field, PositionalOptions options, PositionalOptions whole, PositionalOptions wholeSloppy, |
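The countNonStopWords helper presumably supports the "more than 50% of non-stopwords matched" threshold described on the new AlttitleNearMatch options. A hypothetical wiring, for illustration only (matched and makeQuery are assumed names, not from the patch):

    // hypothetical: only build the near-match alttitle query when more than
    // half of the non-stopwords in the query were matched
    if (matched > countNonStopWords(words) / 2)
        q = makeQuery(words, new PositionalOptions.AlttitleNearMatch());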
— | — | @@ -1768,9 +1782,7 @@ |
1769 | 1783 | |
1770 | 1784 | Query q = parseRaw(queryText); |
1771 | 1785 | |
1772 | | - ArrayList<String> words = wordsFromParser; |
1773 | | - if(words == null || words.size() == 0) |
1774 | | - return q; |
| 1786 | + ArrayList<String> words = wordsFromParser; |
1775 | 1787 | |
1776 | 1788 | this.builder = oldBuilder; |
1777 | 1789 | this.defaultField = oldDefaultField; |
— | — | @@ -1786,31 +1798,33 @@ |
1787 | 1799 | BooleanQuery full = new BooleanQuery(true); |
1788 | 1800 | full.add(q,Occur.MUST); |
1789 | 1801 | |
1790 | | - // main relevance |
1791 | | - Query redirects = makeAlttitleForRedirects(words,20,1); |
1792 | | - if(redirects != null) |
1793 | | - full.add(redirects,Occur.SHOULD); |
| 1802 | + /*if(words != null && words.size() > 0){ |
| 1803 | + // main relevance |
| 1804 | + Query redirects = makeAlttitleForRedirects(words,20,1); |
| 1805 | + if(redirects != null) |
| 1806 | + full.add(redirects,Occur.SHOULD); |
| 1807 | + |
| 1808 | + // singular words |
| 1809 | + ArrayList<String> singularWords = makeSingularWords(words); |
| 1810 | + if(singularWords != null){ |
| 1811 | + Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f); |
| 1812 | + if(redirectsSing != null) |
| 1813 | + full.add(redirectsSing,Occur.SHOULD); |
| 1814 | + } |
| 1815 | + } */ |
1794 | 1816 | |
1795 | | - // singular words |
1796 | | - ArrayList<String> singularWords = makeSingularWords(words); |
1797 | | - if(singularWords != null){ |
1798 | | - Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f); |
1799 | | - if(redirectsSing != null) |
1800 | | - full.add(redirectsSing,Occur.SHOULD); |
1801 | | - } |
1802 | | - |
1803 | 1817 | // fuzzy & wildcards |
1804 | 1818 | // NOTE: for these to work parseForTitles needs to called after parse() |
1805 | | - if(hasWildcards() || hasFuzzy()){ |
1806 | | - Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f); |
1807 | | - if(redirectsMulti != null) |
1808 | | - full.add(redirectsMulti,Occur.SHOULD); |
1809 | | - } |
| 1819 | + //if(hasWildcards() || hasFuzzy()){ |
| 1820 | + Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f); |
| 1821 | + if(redirectsMulti != null) |
| 1822 | + full.add(redirectsMulti,Occur.SHOULD); |
| 1823 | + //} |
1810 | 1824 | |
1811 | 1825 | // add another for complete matches |
1812 | 1826 | BooleanQuery wrap = new BooleanQuery(true); |
1813 | 1827 | wrap.add(full,Occur.SHOULD); |
1814 | | - wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD); |
| 1828 | + wrap.add(makeComplete(expandedWordsTitle,expandedBoostTitle,expandedTypes),Occur.SHOULD); |
1815 | 1829 | if(forbidden != null) |
1816 | 1830 | wrap.add(forbidden,Occur.MUST_NOT); |
1817 | 1831 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -35,6 +35,7 @@ |
36 | 36 | protected ArrayList<Class> additionalFilters = null; |
37 | 37 | protected Singular singular = null; |
38 | 38 | protected boolean hasCanonicalFilter = false; |
| 39 | + protected boolean hasLanguageVariants = false; |
39 | 40 | |
40 | 41 | protected FilterFactory noStemmerFilterFactory=null; |
41 | 42 | protected Set<String> stopWords; |
— | — | @@ -146,6 +147,9 @@ |
147 | 148 | if(lang.equals("sr")) |
148 | 149 | hasCanonicalFilter = true; |
149 | 150 | |
| 151 | + // variants (TODO: add zh) |
| 152 | + if(lang.equals("sr")) |
| 153 | + hasLanguageVariants = true; |
150 | 154 | } |
151 | 155 | |
152 | 156 | public static boolean isCJKLanguage(String lang){ |
— | — | @@ -368,4 +372,14 @@ |
369 | 373 | public boolean isSpellCheck(){ |
370 | 374 | return type == Type.SPELL_CHECK; |
371 | 375 | } |
| 376 | + |
| 377 | + /** Convert a word into its language variants, if any */ |
| 378 | + public ArrayList<String> getVariants(String word){ |
| 379 | + if(!hasLanguageVariants) |
| 380 | + return null; |
| 381 | + if(lang.equals("sr")){ |
| 382 | + return SerbianFilter.getVariants(word); |
| 383 | + } else |
| 384 | + return null; |
| 385 | + } |
372 | 386 | } |
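A usage sketch for getVariants (outputs follow from the SerbianFilter tables added below; only "sr" is wired up so far, zh is still a TODO):

    // assuming iid is the IndexId of a Serbian-language wiki, as in Links.java
    FilterFactory filters = new FilterFactory(iid.getDB());
    ArrayList<String> variants = filters.getVariants("Beograd");
    // -> ["Beograd", "Београд"]; returns null for languages without variants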
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java |
— | — | @@ -28,6 +28,8 @@ |
29 | 29 | boolean extendedTrailing = false; |
30 | 30 | /** if to split tokens with apostrophes and points in them */ |
31 | 31 | boolean split = true; |
| 32 | + /** generate extra original token if the word is in upper case */ |
| 33 | + boolean extraUpperCaseToken = false; |
32 | 34 | |
33 | 35 | public TokenizerOptions(boolean exactCase){ |
34 | 36 | this.exactCase = exactCase; |
— | — | @@ -53,10 +55,11 @@ |
54 | 56 | relocationParsing = false; |
55 | 57 | noCaseDetection = true; |
56 | 58 | extendedTrailing = true; |
| 59 | + extraUpperCaseToken = true; |
57 | 60 | } |
58 | 61 | } |
59 | 62 | |
60 | | - public static class TitleNoSplit extends Title{ |
| 63 | + public static class TitleNoSplit extends Title { |
61 | 64 | public TitleNoSplit(boolean exactCase){ |
62 | 65 | super(exactCase); |
63 | 66 | this.split = false; |
— | — | @@ -111,6 +114,7 @@ |
112 | 115 | super(false); |
113 | 116 | noAliases = true; |
114 | 117 | noTrailing = true; |
| 118 | + extraUpperCaseToken = false; |
115 | 119 | } |
116 | 120 | } |
117 | 121 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SerbianFilter.java |
— | — | @@ -1,6 +1,8 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.BitSet; |
5 | 7 | import java.util.HashMap; |
6 | 8 | |
7 | 9 | import org.apache.lucene.analysis.Token; |
— | — | @@ -20,10 +22,13 @@ |
21 | 23 | private final char[] buffer = new char[MAX_WORD_LEN+1]; |
22 | 24 | private int length; |
23 | 25 | protected static String[] conv = null; |
| 26 | + protected static String[] toLatin = null, toCyrillic = null; |
| 27 | + protected static HashMap<String,String> toCyrillicMap = null; |
| 28 | + protected static BitSet toCyrillicTwo = null; // pairs of two chars |
24 | 29 | protected Token nextToken; |
25 | 30 | protected boolean aliasDiff; |
26 | 31 | |
27 | | - public void init(){ |
| 32 | + public static synchronized void init(){ |
28 | 33 | conv = new String[65536]; |
29 | 34 | |
30 | 35 | for(int i=0;i<65536;i++) |
— | — | @@ -46,6 +51,110 @@ |
47 | 52 | conv['đ'] = "dj"; conv['Đ']="Dj"; |
48 | 53 | } |
49 | 54 | |
| 55 | + public static synchronized void initVariants(){ |
| 56 | + toLatin = new String[65536]; |
| 57 | + toCyrillic = new String[65536]; |
| 58 | + toCyrillicMap = new HashMap<String,String>(); |
| 59 | + toCyrillicTwo = new BitSet(); |
| 60 | + |
| 61 | + for(int i=0;i<65536;i++){ |
| 62 | + toLatin[i] = null; |
| 63 | + toCyrillic[i] = null; |
| 64 | + } |
| 65 | + |
| 66 | + toLatin['а']="a"; toLatin['б']="b"; toLatin['в']="v"; toLatin['г']="g"; toLatin['д']="d"; |
| 67 | + toLatin['ђ']="đ"; toLatin['е']="e"; toLatin['ж']="ž"; toLatin['з']="z"; toLatin['и']="i"; |
| 68 | + toLatin['ј']="j"; toLatin['к']="k"; toLatin['л']="l"; toLatin['љ']="lj"; toLatin['м']="m"; |
| 69 | + toLatin['н']="n"; toLatin['њ']="nj"; toLatin['о']="o"; toLatin['п']="p"; toLatin['р']="r"; |
| 70 | + toLatin['с']="s"; toLatin['т']="t"; toLatin['ћ']="ć"; toLatin['у']="u"; toLatin['ф']="f"; |
| 71 | + toLatin['х']="h"; toLatin['ц']="c"; toLatin['ч']="č"; toLatin['џ']="dž"; toLatin['ш']="š"; |
| 72 | + |
| 73 | + toLatin['А']="A"; toLatin['Б']="B"; toLatin['В']="V"; toLatin['Г']="G"; toLatin['Д']="D"; |
| 74 | + toLatin['Ђ']="Đ"; toLatin['Е']="E"; toLatin['Ж']="Ž"; toLatin['З']="Z"; toLatin['И']="I"; |
| 75 | + toLatin['Ј']="J"; toLatin['К']="K"; toLatin['Л']="L"; toLatin['Љ']="Lj"; toLatin['М']="M"; |
| 76 | + toLatin['Н']="N"; toLatin['Њ']="Nj"; toLatin['О']="O"; toLatin['П']="P"; toLatin['Р']="R"; |
| 77 | + toLatin['С']="S"; toLatin['Т']="T"; toLatin['Ћ']="Ć"; toLatin['У']="U"; toLatin['Ф']="F"; |
| 78 | + toLatin['Х']="H"; toLatin['Ц']="C"; toLatin['Ч']="Č"; toLatin['Џ']="Dž"; toLatin['Ш']="Š"; |
| 79 | + |
| 80 | + toCyrillic['a']="а"; toCyrillic['b']="б"; toCyrillic['c']="ц"; toCyrillic['č']="ч"; toCyrillic['ć']="ћ"; |
| 81 | + toCyrillic['d']="д"; toCyrillic['đ']="ђ"; toCyrillic['e']="е"; toCyrillic['f']="ф"; |
| 82 | + toCyrillic['g']="г"; toCyrillic['h']="х"; toCyrillic['i']="и"; toCyrillic['j']="ј"; toCyrillic['k']="к"; |
| 83 | + toCyrillic['l']="л"; toCyrillic['m']="м"; toCyrillic['n']="н"; |
| 84 | + toCyrillic['o']="о"; toCyrillic['p']="п"; toCyrillic['r']="р"; toCyrillic['s']="с"; toCyrillic['š']="ш"; |
| 85 | + toCyrillic['t']="т"; toCyrillic['u']="у"; toCyrillic['v']="в"; toCyrillic['z']="з"; toCyrillic['ž']="ж"; |
| 86 | + |
| 87 | + toCyrillic['A']="А"; toCyrillic['B']="Б"; toCyrillic['C']="Ц"; toCyrillic['Č']="Ч"; toCyrillic['Ć']="Ћ"; |
| 88 | + toCyrillic['D']="Д"; toCyrillic['Đ']="Ђ"; toCyrillic['E']="Е"; toCyrillic['F']="Ф"; |
| 89 | + toCyrillic['G']="Г"; toCyrillic['H']="Х"; toCyrillic['I']="И"; toCyrillic['J']="Ј"; toCyrillic['K']="К"; |
| 90 | + toCyrillic['L']="Л"; toCyrillic['M']="М"; toCyrillic['N']="Н"; |
| 91 | + toCyrillic['O']="О"; toCyrillic['P']="П"; toCyrillic['R']="Р"; toCyrillic['S']="С"; toCyrillic['Š']="Ш"; |
| 92 | + toCyrillic['T']="Т"; toCyrillic['U']="У"; toCyrillic['V']="В"; toCyrillic['Z']="З"; toCyrillic['Ž']="Ж"; |
| 93 | + |
| 94 | + toCyrillicMap.put("DŽ","Џ"); toCyrillicMap.put("Lj","Љ"); toCyrillicMap.put("Nj","Њ"); |
| 95 | + toCyrillicMap.put("LJ","Љ"); toCyrillicMap.put("Dž","Џ"); toCyrillicMap.put("nj","њ"); |
| 96 | + toCyrillicMap.put("dž","џ"); toCyrillicMap.put("lj","љ"); toCyrillicMap.put("NJ","Њ"); |
| 97 | + |
| 98 | + toCyrillicTwo.set('D'); toCyrillicTwo.set('d'); toCyrillicTwo.set('Ž'); toCyrillicTwo.set('ž'); |
| 99 | + toCyrillicTwo.set('L'); toCyrillicTwo.set('l'); toCyrillicTwo.set('J'); toCyrillicTwo.set('j'); |
| 100 | + toCyrillicTwo.set('N'); toCyrillicTwo.set('n'); |
| 101 | + } |
| 102 | + |
| 103 | + /** Get Latin and Cyrillic variants of the text */ |
| 104 | + public static ArrayList<String> getVariants(String text){ |
| 105 | + if(toLatin == null || toCyrillic==null) |
| 106 | + initVariants(); |
| 107 | + if(text.length() == 0) |
| 108 | + return null; |
| 109 | + else if(text.length() == 1){ |
| 110 | + ArrayList<String> ret = new ArrayList<String>(); |
| 111 | + String l = toLatin[text.charAt(0)]; |
| 112 | + if(l != null) |
| 113 | + ret.add(l); |
| 114 | + String c = toCyrillic[text.charAt(0)]; |
| 115 | + if(c != null) |
| 116 | + ret.add(c); |
| 117 | + return ret; |
| 118 | + } |
| 119 | + StringBuilder lat = new StringBuilder(); |
| 120 | + StringBuilder cyr = new StringBuilder(); |
| 121 | + char c='\0', c1=text.charAt(0); |
| 122 | + for(int i=1;i<text.length()+1;i++){ |
| 123 | + c = c1; |
| 124 | + c1 = i<text.length()? text.charAt(i) : '\0'; |
| 125 | + String l = toLatin[c]; |
| 126 | + if(l != null) |
| 127 | + lat.append(l); |
| 128 | + else |
| 129 | + lat.append(c); |
| 130 | + } |
| 131 | + |
| 132 | + c='\0'; c1=text.charAt(0); |
| 133 | + for(int i=1;i<text.length()+1;i++){ |
| 134 | + c = c1; |
| 135 | + c1 = i<text.length()? text.charAt(i) : '\0'; |
| 136 | + String cl = null; |
| 137 | + // quick check if we should try the two-letter map |
| 138 | + if(toCyrillicTwo.get(c) && toCyrillicTwo.get(c1)) |
| 139 | + cl = toCyrillicMap.get(""+c+c1); |
| 140 | + |
| 141 | + if(cl != null){ |
| 142 | + i++; |
| 143 | + c = c1; |
| 144 | + c1 = i<text.length()? text.charAt(i) : '\0'; |
| 145 | + } else // single letter map |
| 146 | + cl = toCyrillic[c]; |
| 147 | + if(cl != null) |
| 148 | + cyr.append(cl); |
| 149 | + else |
| 150 | + cyr.append(c); |
| 151 | + } |
| 152 | + ArrayList<String> ret = new ArrayList<String>(); |
| 153 | + ret.add(lat.toString()); |
| 154 | + ret.add(cyr.toString()); |
| 155 | + return ret; |
| 156 | + } |
| 157 | + |
| 158 | + /** Convert to ASCII */ |
50 | 159 | public String convert(String text){ |
51 | 160 | length = 0; |
52 | 161 | String cv; |
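A worked example of the two-pass getVariants above: the toCyrillicTwo BitSet is just a cheap pre-check on the two-character window before consulting toCyrillicMap, so the digraphs lj/nj/dž collapse to single Cyrillic letters instead of being transliterated letter by letter.

    // getVariants("Njegoš"), following the tables above:
    //   Latin pass:    no Cyrillic characters, text passes through  -> "Njegoš"
    //   Cyrillic pass: "Nj" hits toCyrillicMap -> "Њ", then
    //                  e->е, g->г, o->о, š->ш                       -> "Његош"
    //   result: ["Njegoš", "Његош"]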
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java |
— | — | @@ -116,6 +116,9 @@ |
117 | 117 | if(isStub()){ |
118 | 118 | try { |
119 | 119 | setTermText(new String(serialized,termTextStart,termTextEnd-termTextStart,"utf-8")); |
| 120 | + // check if this is a cjk token |
| 121 | + if(termText().length()>0 && type==Type.TEXT && CJKFilter.isCJKChar(termText().codePointAt(0))) |
| 122 | + setType("cjk"); |
120 | 123 | unstubOriginal(); |
121 | 124 | } catch (UnsupportedEncodingException e) { |
122 | 125 | e.printStackTrace(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/CJKFilter.java |
— | — | @@ -5,6 +5,7 @@ |
6 | 6 | import org.apache.lucene.analysis.Token; |
7 | 7 | import org.apache.lucene.analysis.TokenFilter; |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
| 9 | +import org.wikimedia.lsearch.analyzers.ExtToken.Type; |
9 | 10 | |
10 | 11 | /** |
11 | 12 | * Simple CJK (Chinese Japanese Korean) token filter. |
— | — | @@ -24,10 +25,16 @@ |
25 | 26 | if(buffer.size()!=0) |
26 | 27 | return buffer.removeFirst(); |
27 | 28 | |
28 | | - Token token = input.next(); |
29 | | - if(token == null) |
30 | | - return null; |
| 29 | + Token token; |
| 30 | + do{ |
| 31 | + token = input.next(); |
| 32 | + if(token == null) |
| 33 | + return null; |
| 34 | + } while(token.getPositionIncrement()==0); // discard aliases |
31 | 35 | |
| 36 | + if(token instanceof ExtToken && ((ExtToken)token).getType()!=Type.TEXT) |
| 37 | + return token; |
| 38 | + |
32 | 39 | String text = token.termText(); |
33 | 40 | |
34 | 41 | int i,offset,c; |
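Note: position increment 0 marks alias tokens (stemmed or transliterated duplicates stacked on the same position), so the new do-while keeps only original tokens for bigramming; non-TEXT ExtTokens (markup, glue) are passed through unchanged. A hypothetical stream, for illustration:

    // position:   1      1 (alias, posIncr = 0)      2
    // tokens:     東京    tokyo                        都
    // after the do-while only 東京 and 都 reach the bigram logic below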
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Utf8Set.java |
— | — | @@ -63,9 +63,13 @@ |
64 | 64 | protected Utf8String str = new Utf8String(); |
65 | 65 | |
66 | 66 | public Utf8Set(Set<String> words){ |
67 | | - for(String w : words){ |
68 | | - lookup[w.charAt(0)&MASK] = true; |
69 | | - set.add(new Utf8String(w)); |
| 67 | + try{ |
| 68 | + for(String w : words){ |
| 69 | + lookup[w.getBytes("utf-8")[0]&MASK] = true; |
| 70 | + set.add(new Utf8String(w)); |
| 71 | + } |
| 72 | + } catch(Exception e){ |
| 73 | + e.printStackTrace(); |
70 | 74 | } |
71 | 75 | } |
72 | 76 | |
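The fix above matters because the lookup table is presumably probed later with the first UTF-8 byte of the candidate (hence Utf8String): seeding it with the UTF-16 char meant any word starting with a non-ASCII character landed in the wrong slot and could never match. Sketch of the difference, with an illustrative word:

    String w = "šuma";
    int utf16Slot = w.charAt(0) & MASK;             // 0x0161 & MASK - old, wrong slot
    int utf8Slot = w.getBytes("utf-8")[0] & MASK;   // 0xC5 & MASK - slot probed at query
                                                    // time (throws checked exception,
                                                    // caught in the patch)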
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -38,6 +38,7 @@ |
39 | 39 | import org.apache.lucene.search.TermQuery; |
40 | 40 | import org.apache.lucene.store.Directory; |
41 | 41 | import org.apache.lucene.store.RAMDirectory; |
| 42 | +import org.wikimedia.lsearch.analyzers.FilterFactory; |
42 | 43 | import org.wikimedia.lsearch.analyzers.PrefixAnalyzer; |
43 | 44 | import org.wikimedia.lsearch.analyzers.SplitAnalyzer; |
44 | 45 | import org.wikimedia.lsearch.beans.Article; |
— | — | @@ -74,6 +75,7 @@ |
75 | 76 | protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly; |
76 | 77 | protected boolean optimized = false; |
77 | 78 | protected boolean autoOptimize = false; |
| 79 | + protected FilterFactory filters = null; |
78 | 80 | |
79 | 81 | private Links(IndexId iid, String path, IndexWriter writer, boolean autoOptimize) throws CorruptIndexException, IOException{ |
80 | 82 | this.writer = writer; |
— | — | @@ -93,6 +95,7 @@ |
94 | 96 | redirectOnly = makeSelector("redirect"); |
95 | 97 | contextOnly = makeSelector("context"); |
96 | 98 | linksOnly = makeSelector("links"); |
| 99 | + filters = new FilterFactory(iid.getDB()); |
97 | 100 | } |
98 | 101 | |
99 | 102 | protected FieldSelector makeSelector(String field){ |
— | — | @@ -259,7 +262,6 @@ |
260 | 263 | if(redirect != null){ |
261 | 264 | redirectsTo = findTargetLink(redirect.getNamespace(),redirect.getTitle(),exactCase); |
262 | 265 | } else { |
263 | | - HashSet<String> contextLinks = new HashSet<String>(); |
264 | 266 | ContextParser.Context curContext = null; |
265 | 267 | while(true){ |
266 | 268 | boolean hasNext = matcher.find(); |
— | — | @@ -275,7 +277,6 @@ |
276 | 278 | curContext = context; |
277 | 279 | else if(curContext!=context){ |
278 | 280 | pagelinks.add(""); |
279 | | - contextLinks.clear(); |
280 | 281 | curContext = context; |
281 | 282 | } |
282 | 283 | } |
— | — | @@ -314,13 +315,10 @@ |
315 | 316 | continue; // skip links from other namespaces into the main namespace |
316 | 317 | String target = findTargetLink(ns,title,exactCase); |
317 | 318 | if(target != null){ |
318 | | - int targetNs = Integer.parseInt(target.substring(0,target.indexOf(':'))); |
| 319 | + ArrayList<String> variants = filters.getVariants(target); |
319 | 320 | pagelinks.add(target); |
320 | | - // register context of this link |
321 | | - if(context != null && nsf.contains(targetNs)){ |
322 | | - contextLinks.add(target); |
323 | | - } |
324 | | - |
| 321 | + if(variants != null) |
| 322 | + pagelinks.addAll(variants); |
325 | 323 | } |
326 | 324 | } |
327 | 325 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java |
— | — | @@ -157,8 +157,8 @@ |
158 | 158 | QueryParser parser = new QueryParser("contents",new CJKAnalyzer()); |
159 | 159 | Query q = parser.parse("プロサッカークラブをつくろう"); |
160 | 160 | System.out.println("Japanese in standard analyzer: "+q); |
161 | | - displayTokens(new CJKAnalyzer(),"『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
162 | | - displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
| 161 | + displayTokens(new CJKAnalyzer(),"は、工学者、大学教授、工学博士。『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
| 162 | + displayTokens(Analyzers.getHighlightAnalyzer(IndexId.get("jawiki"),false),"鈴木 孝治(すずき こうじ、1954年 - )『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
163 | 163 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"『パンツぱんくろう』というタjavaイトルはbalaton"); |
164 | 164 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"パ ン"); |
165 | 165 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | |
26 | 26 | public class FastWikiTokenizerTest { |
27 | 27 | public static void displayTokensForParser(String text) { |
28 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions.Highlight(true)); |
| 28 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions.Highlight(false)); |
29 | 29 | Token[] tokens = parser.parse().toArray(new Token[] {}); |
30 | 30 | for (int i = 0; i < tokens.length; i++) { |
31 | 31 | Token token = tokens[i]; |
— | — | @@ -70,11 +70,13 @@ |
71 | 71 | |
72 | 72 | public static void main(String args[]) throws Exception{ |
73 | 73 | Configuration.open(); |
74 | | - String text = "''italic'' text bre! <nowiki><!-- see--></nowiki> <!-- nosee --> (ant) and some. it's stupid it's something and 5\"6' or more, links abacus"; |
| 74 | + String text = "ATA, [[:link]] [[zh-min-nan:Something]] [[zh-min-nana:Something]] str_replace"; |
75 | 75 | showTokens(text); |
| 76 | + text = "''italic'' text bre! <nowiki><!-- see--></nowiki> <!-- nosee --> (ant) and some. it's stupid it's something and 5\"6' or more, links abacus"; |
| 77 | + showTokens(text); |
76 | 78 | text = ":''This article is about the humorist. For the [[Indo-Europeanist]] see [[Douglas Q. Adams]].''\n{{Infobox writer <!-- for more information see [[:Template:Infobox writer]] -->\n| name = Douglas Adams\n| image = Douglas adams cropped.jpg\n| caption = Douglas Adams signing books at ApacheCon 2000\n| birthdate = {{birth date|1952|3|11|df=yes}}\n| birthplace = [[Cambridge]], [[England]]\n| deathdate = {{Death date and age|2001|5|11|1952|3|11|df=yes}}\n| deathplace = [[Santa Barbara, California]], [[United States|U.S.]]\n| occupation = comedy writer, novelist, dramatist, fantasist\n| genre = [[Science fiction]], [[Comedy]]\n| movement =\n| influences = [[Richard Dawkins]] <ref>[http://www.bbc.co.uk/cult/hitchhikers/metaguide/radio.shtml Interview extract (in RealAudio format)] where Adams states the influences on his work.</ref>, [[Monty Python]], [[Neil Gaiman]], [[Robert Sheckley]], [[Kurt Vonnegut]], <br/>[[P. G. Wodehouse]]\n| influenced =\n| website = http://www.douglasadams.com/\n}} And now text"; |
77 | 79 | showTokens(text); |
78 | | - text = "klarinet3.jpg Also, I think that the syntax could be changed to\n <nowiki>[[category:''category_name''|''sort_key''|''display_text'']]</nowiki>\nwith ''sort_key'' and ''display_text'' defaulting to ''category_name''."; |
| 80 | + text = "メインページ klarinet3.jpg Also, I think that the syntax could be changed to\n <nowiki>[[category:''category_name''|''sort_key''|''display_text'']]</nowiki>\nwith ''sort_key'' and ''display_text'' defaulting to ''category_name''."; |
79 | 81 | showTokens(text); |
80 | 82 | text = "[[meta:jao]] L.A. W. B.M.W and This. is a '''list of [[African]] countries and dependencies by [[population]]'''.\n\n{| border=\"1\" cellpadding=\"2\" cellspacing=\"0\" style=\"border-collapse:collapse; text-align:right;\"\n|- style=\"text-align:center; background:#efefef\"\n!Pos !! Country !! Population\n|-\n| align=\"left\" |-\n| align=\"left\" |'''Africa''' || 934,283,426\n|-\n"; |
81 | 83 | showTokens(text); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java |
— | — | @@ -116,6 +116,7 @@ |
117 | 117 | {"Douglas Adams's Guide to The Hitch-Hiker's Guide to the Galaxy",""}, |
118 | 118 | {"bethlem jesus","bethlehem jesus"}, |
119 | 119 | {"los angles gardens","los angeles gardens"}, |
| 120 | + {"huston we have a problem","houston we have a problem"}, |
120 | 121 | |
121 | 122 | }; |
122 | 123 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -49,6 +49,7 @@ |
50 | 50 | |
51 | 51 | public class Suggest { |
52 | 52 | static Logger log = Logger.getLogger(Suggest.class); |
| 53 | + protected static GlobalConfiguration global=null; |
53 | 54 | protected IndexId iid; |
54 | 55 | protected IndexSearcher searcher; |
55 | 56 | protected IndexReader reader; |
— | — | @@ -58,6 +59,7 @@ |
59 | 60 | protected HashMap<String,Boolean> wordExistCache = new HashMap<String,Boolean>(); |
60 | 61 | protected enum Filtering { STRONG, WEAK }; |
61 | 62 | protected boolean useLogging = true; |
| 63 | + protected int minWordFreq = 0; |
62 | 64 | |
63 | 65 | /** Distance an metaphone metrics */ |
64 | 66 | static public class Metric { |
— | — | @@ -175,10 +177,13 @@ |
176 | 178 | this.iid = iid; |
177 | 179 | if(searcher == null) |
178 | 180 | searcher = cache.getLocalSearcher(iid.getSpell()); |
| 181 | + if(global == null) |
| 182 | + global = GlobalConfiguration.getInstance(); |
179 | 183 | this.searcher = searcher; |
180 | 184 | this.reader = searcher.getIndexReader(); |
181 | 185 | this.defaultNs = iid.getDefaultNamespace(); |
182 | 186 | this.useLogging = useLogging; |
| 187 | + this.minWordFreq = global.getIntDBParam(iid.getDBname(),"spell","wordsMinFreq",3); |
183 | 188 | |
184 | 189 | synchronized(stopWordsIndexes){ |
185 | 190 | if(!stopWordsIndexes.containsKey(searcher)){ |
— | — | @@ -397,7 +402,7 @@ |
398 | 403 | continue; |
399 | 404 | } |
400 | 405 | // words found within context should be spell-checked only if they are not valid words |
401 | | - if(info.foundInContext.contains(w) && wordExists(w,ns)){ |
| 406 | + if(info.foundInContext.contains(w) && wordExists(w,ns) && wordFrequency(w,ns)>minWordFreq*100){ |
402 | 407 | addCorrectWord(w,wordSug,possibleStopWords); |
403 | 408 | continue; |
404 | 409 | } |
— | — | @@ -544,7 +549,7 @@ |
545 | 550 | if(s1.word.equals(w1)) |
546 | 551 | c.preserves.put(i,w1); |
547 | 552 | else if((!good1 && !info.foundInTitles.contains(w1)) |
548 | | - || ((inTitle||inContext) && diff1 <=2 && !info.foundInContext.contains(w1)) ) |
| 553 | + || ((inTitle||inContext) && diff1 <=2 && !info.foundInTitles.contains(w1)) ) |
549 | 554 | c.substitutes.put(i,s1.word); |
550 | 555 | else |
551 | 556 | accept = false; |
— | — | @@ -552,7 +557,7 @@ |
553 | 558 | if(s2.word.equals(w2)) |
554 | 559 | c.preserves.put(i2,w2); |
555 | 560 | else if((!good2 && !info.foundInTitles.contains(w2)) |
556 | | - || ((inTitle||inContext) && diff2 <= 2 && !info.foundInContext.contains(w2)) ) |
| 561 | + || ((inTitle||inContext) && diff2 <= 2 && !info.foundInTitles.contains(w2)) ) |
557 | 562 | c.substitutes.put(i2,s2.word); |
558 | 563 | else |
559 | 564 | accept = false; |
— | — | @@ -1205,6 +1210,7 @@ |
1206 | 1211 | * @return |
1207 | 1212 | */ |
1208 | 1213 | public ArrayList<SuggestResult> suggestWords(String word, int num, Namespaces namespaces, Filtering filter){ |
| 1214 | + log.debug("Suggesting words for "+word); |
1209 | 1215 | if(namespaces == null) // default |
1210 | 1216 | return suggestWordsOnNamespaces(word,word,num,num,null,filter); |
1211 | 1217 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIServer.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.lucene.search.RemoteSearchableMul; |
12 | 12 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
13 | 13 | import org.wikimedia.lsearch.config.IndexId; |
| 14 | +import org.wikimedia.lsearch.config.IndexRegistry; |
14 | 15 | import org.wikimedia.lsearch.search.SearcherCache; |
15 | 16 | |
16 | 17 | /** Starts the RMI registry and binds all RMI objects */ |
— | — | @@ -17,6 +18,7 @@ |
18 | 19 | protected static org.apache.log4j.Logger log = Logger.getLogger(RMIServer.class); |
19 | 20 | |
20 | 21 | protected static SearcherCache cache = null; |
| 22 | + protected static IndexRegistry indexes = null; |
21 | 23 | |
22 | 24 | public static void register(Remote engine, String name){ |
23 | 25 | try { |
— | — | @@ -41,18 +43,24 @@ |
42 | 44 | } |
43 | 45 | |
44 | 46 | /** After updating the local copy of iid, rebind its RMI object */ |
45 | | - public static void rebind(IndexId iid){ |
| 47 | + public static boolean rebind(IndexId iid){ |
46 | 48 | if(cache == null) |
47 | 49 | cache = SearcherCache.getInstance(); |
| 50 | + if(indexes == null) |
| 51 | + indexes = IndexRegistry.getInstance(); |
48 | 52 | String name = "RemoteSearchable<"+iid+">"; |
49 | 53 | try { |
50 | | - RemoteSearchableMul rs = new RemoteSearchableMul(cache.getLocalSearcher(iid)); |
51 | | - register(rs,name); |
| 54 | + if(indexes.getCurrentSearch(iid) != null){ |
| 55 | + RemoteSearchableMul rs = new RemoteSearchableMul(cache.getLocalSearcher(iid)); |
| 56 | + register(rs,name); |
| 57 | + return true; |
| 58 | + } |
52 | 59 | } catch (RemoteException e) { |
53 | 60 | log.warn("Error making remote searchable for "+name); |
54 | 61 | } catch(Exception e){ |
55 | 62 | // do nothing, error is logged by some other class (possible SearchCache) |
56 | 63 | } |
| 64 | + return false; |
57 | 65 | } |
58 | 66 | |
59 | 67 | /** Bind all RMI objects (Messenger, RemoteSeachables and RMIIndexDaemon) */ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java |
— | — | @@ -45,6 +45,8 @@ |
46 | 46 | public class PrefixIndexBuilder { |
47 | 47 | static Logger log = Logger.getLogger(PrefixIndexBuilder.class); |
48 | 48 | |
| 49 | + public static float EXACT_BOOST = 25; |
| 50 | + |
49 | 51 | protected IndexId iid, prefixIid, pre; |
50 | 52 | protected FilterFactory filters; |
51 | 53 | protected Links links=null; |
— | — | @@ -187,7 +189,7 @@ |
188 | 190 | } |
189 | 191 | |
190 | 192 | if(key.equalsIgnoreCase(prefix)) |
191 | | - ref *= 100; // boost for exact match |
| 193 | + ref *= EXACT_BOOST; // boost for exact match |
192 | 194 | refs.put(key,ref); |
193 | 195 | } |
194 | 196 | ArrayList<Entry<String,Double>> sorted = new ArrayList<Entry<String,Double>>(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/BuildAll.java |
— | — | @@ -31,7 +31,7 @@ |
32 | 32 | static org.apache.log4j.Logger log = null; |
33 | 33 | |
34 | 34 | protected static void printHelp(){ |
35 | | - System.out.println("Syntax: BuildAll [-f <file>] [-lt] [-i] [-sc] [dbname] [dump file]"); |
| 35 | + System.out.println("Syntax: BuildAll [-f <file>] [-lt] [-i] [-sc] [dump file] [dbname]"); |
36 | 36 | System.out.println("Options:"); |
37 | 37 | System.out.println(" -f <file> - use a file with a list of pairs <dbname> <dump file>"); |
38 | 38 | System.out.println(" -lt - leave titles - don't delete old titles indexes"); |
— | — | @@ -55,10 +55,14 @@ |
56 | 56 | importOnly = true; |
57 | 57 | else if(args[i].equals("-sc")) |
58 | 58 | noSpellcheck = true; |
| 59 | + else if(args[i].startsWith("-")){ |
| 60 | + System.out.println("Unrecognized option "+args[i]); |
| 61 | + printHelp(); |
| 62 | + return; |
| 63 | + } else if(dump == null) |
| 64 | + dump = args[i]; |
59 | 65 | else if(dbname == null) |
60 | 66 | dbname = args[i]; |
61 | | - else if(dump == null) |
62 | | - dump = args[i]; |
63 | 67 | else if(args[i].equals("--help")){ |
64 | 68 | printHelp(); |
65 | 69 | return; |
— | — | @@ -145,7 +149,7 @@ |
146 | 150 | } |
147 | 151 | } |
148 | 152 | } |
149 | | - System.out.println("Finished building in "+ProgressReport.formatTime(System.currentTimeMillis()-start)); |
| 153 | + System.out.println("Finished build in "+ProgressReport.formatTime(System.currentTimeMillis()-start)); |
150 | 154 | } |
151 | 155 | |
152 | 156 | protected static void copy(String from, String to) throws IOException{ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -218,7 +218,7 @@ |
219 | 219 | IndexThread.makeIndexSnapshot(p,p.getImportPath()); |
220 | 220 | } |
221 | 221 | } |
222 | | - if(makeTitles){ |
| 222 | + if(makeTitles && iid.hasTitlesIndex()){ |
223 | 223 | for(IndexId p : iid.getTitlesIndex().getPhysicalIndexIds()){ |
224 | 224 | if(snapshotDb) |
225 | 225 | IndexThread.optimizeIndex(p,p.getImportPath(),IndexId.Transaction.IMPORT); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -899,5 +899,10 @@ |
900 | 900 | public IndexId getTitleNgram(){ |
901 | 901 | return IndexId.get(dbname+".title_ngram"); |
902 | 902 | } |
| 903 | + |
| 904 | + /** Whether this iid is in a CJK (Chinese/Japanese/Korean) language */ |
| 905 | + public boolean isCJK(){ |
| 906 | + return FilterFactory.isCJKLanguage(getLangCode()); |
| 907 | + } |
903 | 908 | |
904 | 909 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java |
— | — | @@ -254,9 +254,9 @@ |
255 | 255 | // update registry, cache, rmi object |
256 | 256 | registry.refreshUpdates(iid); |
257 | 257 | warmupAndDeploy(pool,li,type); |
| 258 | + registry.refreshCurrent(li); |
258 | 259 | if(type != RebuildType.STANDALONE) |
259 | 260 | RMIServer.rebind(iid); |
260 | | - registry.refreshCurrent(li); |
261 | 261 | |
262 | 262 | // notify all remote searchers of change |
263 | 263 | messenger.notifyIndexUpdated(iid,iid.getDBSearchHosts()); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java |
— | — | @@ -91,6 +91,7 @@ |
92 | 92 | searcher = new IndexSearcherMul(path); |
93 | 93 | searcher.setSimilarity(new WikiSimilarity()); |
94 | 94 | } catch (IOException e) { |
| 95 | + e.printStackTrace(); |
95 | 96 | // tell registry this is not a good index |
96 | 97 | IndexRegistry.getInstance().invalidateCurrent(iid); |
97 | 98 | log.error("I/O Error opening index at path "+iid.getCanonicalSearchPath()+" : "+e.getMessage()); |
— | — | @@ -257,18 +258,21 @@ |
258 | 259 | |
259 | 260 | /** Warmup all local IndexSearcher (create if necessary) */ |
260 | 261 | public void warmupLocalCache(){ |
| 262 | + IndexRegistry registry = IndexRegistry.getInstance(); |
261 | 263 | HashSet<IndexId> mys = global.getMySearch(); |
262 | 264 | for(IndexId iid : mys){ |
263 | 265 | try { |
264 | 266 | if(iid.isLogical()) |
265 | 267 | continue; |
266 | | - IndexSearcherMul[] pool = getSearcherPool(iid); |
267 | | - for(IndexSearcherMul is : pool) |
268 | | - Warmup.warmupIndexSearcher(is,iid,false); |
269 | | - |
270 | | - Warmup.waitForAggregate(pool); |
| 268 | + if(registry.getCurrentSearch(iid) != null){ |
| 269 | + IndexSearcherMul[] pool = getSearcherPool(iid); |
| 270 | + for(IndexSearcherMul is : pool) |
| 271 | + Warmup.warmupIndexSearcher(is,iid,false); |
| 272 | + |
| 273 | + Warmup.waitForAggregate(pool); |
| 274 | + } |
271 | 275 | } catch (IOException e) { |
272 | | - log.warn("I/O error warming index for "+iid); |
| 276 | + log.warn("I/O error warming index for "+iid+" : "+e.getMessage()); |
273 | 277 | } |
274 | 278 | } |
275 | 279 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/ArticleMeta.java |
— | — | @@ -56,6 +56,7 @@ |
57 | 57 | protected SimpleDateFormat isoDate; |
58 | 58 | protected long now = 0; |
59 | 59 | protected NamespaceFilter subpages; |
| 60 | + protected boolean isOptimized; |
60 | 61 | |
61 | 62 | protected class CachingThread extends Thread { |
62 | 63 | public void run(){ |
— | — | @@ -67,12 +68,16 @@ |
68 | 69 | subpage = new boolean[reader.maxDoc()]; |
69 | 70 | daysOld = new float[reader.maxDoc()]; |
70 | 71 | for(int i=0;i<reader.maxDoc();i++){ |
| 72 | + if(!isOptimized && reader.isDeleted(i)) |
| 73 | + continue; |
71 | 74 | try{ |
72 | | - subpage[i] = resolveSubpage(i); |
73 | | - daysOld[i] = resolveDaysOld(i); |
| 75 | + Document d = reader.document(i); |
| 76 | + subpage[i] = resolveSubpage(d); |
| 77 | + daysOld[i] = resolveDaysOld(d); |
74 | 78 | } catch(Exception e2){ |
75 | 79 | e2.printStackTrace(); |
76 | 80 | log.error("Error reading article meta for docid="+i+" : "+e2.getMessage()); |
| 81 | + throw e2; |
77 | 82 | } |
78 | 83 | } |
79 | 84 | log.info("Finished caching article info for "+reader.directory()); |
— | — | @@ -90,10 +95,7 @@ |
91 | 96 | /** See if article is a subpage |
92 | 97 | * @throws IOException |
93 | 98 | * @throws CorruptIndexException */ |
94 | | - protected final boolean resolveSubpage(int docid) throws IOException{ |
95 | | - if(reader.isDeleted(docid)) |
96 | | - return false; |
97 | | - Document d = reader.document(docid); |
| 99 | + protected final boolean resolveSubpage(Document d) throws IOException{ |
98 | 100 | String ns = d.get("namespace"); |
99 | 101 | if(ns == null) |
100 | 102 | return false; |
— | — | @@ -107,10 +109,7 @@ |
108 | 110 | return false; |
109 | 111 | } |
110 | 112 | /** Calculate how old the indexed article is */ |
111 | | - protected final float resolveDaysOld(int docid) throws IOException { |
112 | | - if(reader.isDeleted(docid)) |
113 | | - return 0; |
114 | | - Document d = reader.document(docid); |
| 113 | + protected final float resolveDaysOld(Document d) throws IOException { |
115 | 114 | String dateStr = d.get("date"); |
116 | 115 | if(dateStr == null) |
117 | 116 | return 0; |
— | — | @@ -141,6 +140,7 @@ |
142 | 141 | this.subpages = subpages; |
143 | 142 | isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); |
144 | 143 | isoDate.setTimeZone(TimeZone.getTimeZone("GMT")); |
| 144 | + this.isOptimized = reader.isOptimized(); |
145 | 145 | |
146 | 146 | // run background caching |
147 | 147 | new CachingThread().start(); |
— | — | @@ -148,14 +148,14 @@ |
149 | 149 | |
150 | 150 | public final boolean isSubpage(int docid) throws IOException { |
151 | 151 | if(!finishedCaching) |
152 | | - return resolveSubpage(docid); |
| 152 | + return resolveSubpage(reader.document(docid)); |
153 | 153 | |
154 | 154 | return subpage[docid]; |
155 | 155 | } |
156 | 156 | |
157 | 157 | public float daysOld(int docid) throws IOException { |
158 | 158 | if(!finishedCaching) |
159 | | - return resolveDaysOld(docid); |
| 159 | + return resolveDaysOld(reader.document(docid)); |
160 | 160 | |
161 | 161 | return daysOld[docid]; |
162 | 162 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java |
— | — | @@ -81,6 +81,7 @@ |
82 | 82 | protected IndexReader reader = null; |
83 | 83 | protected String field; |
84 | 84 | protected boolean cachingFinished = false; |
| 85 | + protected boolean isOptimized; |
85 | 86 | |
86 | 87 | protected class CachingThread extends Thread { |
87 | 88 | public void run(){ |
— | — | @@ -105,7 +106,7 @@ |
106 | 107 | for(int i=0;i<maxdoc;i++){ |
107 | 108 | byte[] stored = null; |
108 | 109 | try{ |
109 | | - if(reader.isDeleted(i)) |
| 110 | + if(!isOptimized && reader.isDeleted(i)) |
110 | 111 | continue; |
111 | 112 | Document doc = reader.document(i); |
112 | 113 | stored = doc.getBinaryValue(field); |
— | — | @@ -134,6 +135,7 @@ |
135 | 136 | } catch(Exception e){ |
136 | 137 | log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage()); |
137 | 138 | e.printStackTrace(); |
| 139 | + throw e; |
138 | 140 | } |
139 | 141 | } |
140 | 142 | // compact arrays |
— | — | @@ -178,6 +180,7 @@ |
179 | 181 | protected AggregateMetaFieldSource(IndexReader reader, String fieldBase) throws IOException{ |
180 | 182 | this.reader = reader; |
181 | 183 | this.field = fieldBase+"_meta"; |
| 184 | + this.isOptimized = reader.isOptimized(); |
182 | 185 | Collection fields = reader.getFieldNames(FieldOption.ALL); |
183 | 186 | if(!fields.contains(field)){ |
184 | 187 | cachingFinished = true; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -76,6 +76,7 @@ |
77 | 77 | try{ |
78 | 78 | boolean waitForAggregate = Configuration.open().getString("Search","warmupaggregate","false").equalsIgnoreCase("true"); |
79 | 79 | if(waitForAggregate){ // wait for aggregate fields to be cached |
| 80 | + log.info("Wait for aggregate caches..."); |
80 | 81 | boolean wait; |
81 | 82 | do{ |
82 | 83 | wait = false; |
— | — | @@ -109,35 +110,46 @@ |
110 | 111 | |
111 | 112 | int count = getWarmupCount(iid); |
112 | 113 | |
113 | | - if(iid.isSpell() && count > 0){ |
114 | | - Terms terms = getTermsForLang(iid.getLangCode()); |
115 | | - Suggest sug = new Suggest(iid,is,false); |
116 | | - WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid)); |
117 | | - NamespaceFilter nsf = iid.getDefaultNamespace(); |
118 | | - for(int i=0;i<count;i++){ |
119 | | - String searchterm = terms.next(); |
120 | | - sug.suggest(searchterm,parser.tokenizeForSpellCheck(searchterm),new Suggest.ExtraInfo(),nsf); |
| 114 | + if(iid.isSpell()){ |
| 115 | + if(count > 0){ |
| 116 | + Terms terms = getTermsForLang(iid.getLangCode()); |
| 117 | + Suggest sug = new Suggest(iid,is,false); |
| 118 | + WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid)); |
| 119 | + NamespaceFilter nsf = iid.getDefaultNamespace(); |
| 120 | + for(int i=0;i<count;i++){ |
| 121 | + String searchterm = terms.next(); |
| 122 | + sug.suggest(searchterm,parser.tokenizeForSpellCheck(searchterm),new Suggest.ExtraInfo(),nsf); |
| 123 | + } |
121 | 124 | } |
122 | | - } else if(iid.isTitleNgram() && count > 0){ |
123 | | - Terms terms = getTermsForLang(iid.getLangCode()); |
124 | | - SuggestSimilar sim = new SuggestSimilar(iid,is); |
125 | | - for(int i=0;i<count;i++){ |
126 | | - sim.getSimilarTitles(terms.next(),new NamespaceFilter(),4); |
| 125 | + } else if(iid.isTitleNgram()){ |
| 126 | + if(count > 0){ |
| 127 | + Terms terms = getTermsForLang(iid.getLangCode()); |
| 128 | + SuggestSimilar sim = new SuggestSimilar(iid,is); |
| 129 | + for(int i=0;i<count;i++){ |
| 130 | + sim.getSimilarTitles(terms.next(),new NamespaceFilter(),4); |
| 131 | + } |
127 | 132 | } |
128 | | - } else if(iid.isPrefix() && count > 0){ |
129 | | - Terms terms = getTermsForLang(iid.getLangCode()); |
130 | | - SearchEngine search = new SearchEngine(); |
131 | | - for(int i=0;i<count;i++){ |
132 | | - String searchterm = terms.next(); |
133 | | - searchterm = searchterm.substring(0,(int)Math.min(8*Math.random()+1,searchterm.length())); |
134 | | - search.searchPrefixLocal(iid,searchterm,20,iid.getDefaultNamespace(),is); |
| 133 | + } else if(iid.isPrefix()){ |
| 134 | + if(count > 0){ |
| 135 | + Terms terms = getTermsForLang(iid.getLangCode()); |
| 136 | + SearchEngine search = new SearchEngine(); |
| 137 | + for(int i=0;i<count;i++){ |
| 138 | + String searchterm = terms.next(); |
| 139 | + searchterm = searchterm.substring(0,(int)Math.min(8*Math.random()+1,searchterm.length())); |
| 140 | + search.searchPrefixLocal(iid,searchterm,20,iid.getDefaultNamespace(),is); |
| 141 | + } |
135 | 142 | } |
136 | | - } else if((iid.isHighlight() || iid.isRelated()) && count > 0 && !iid.isTitlesBySuffix()){ |
137 | | - // NOTE: this might not warmup all caches, but should read stuff into memory buffers |
138 | | - for(int i=0;i<count;i++){ |
139 | | - int docid = (int)(Math.random()*is.maxDoc()); |
140 | | - reader.document(docid).get("key"); |
141 | | - } |
| 143 | + } else if((iid.isHighlight() || iid.isRelated()) && !iid.isTitlesBySuffix()){ |
| 144 | + if(count > 0){ |
| 145 | + // NOTE: this might not warmup all caches, but should read stuff into memory buffers |
| 146 | + for(int i=0;i<count;i++){ |
| 147 | + int docid = (int)(Math.random()*is.maxDoc()); |
| 148 | + reader.document(docid).get("key"); |
| 149 | + } |
| 150 | + } |
| 151 | + } else if(iid.isTitlesBySuffix()){ |
| 152 | + // just initiate meta field caching, we want to avoid caching unnecessary filters |
| 153 | + AggregateMetaField.getCachedSource(is.getIndexReader(),"alttitle"); |
142 | 154 | } else{ |
143 | 155 | // normal indexes |
144 | 156 | if(count == 0){ |
— | — | @@ -180,7 +192,7 @@ |
181 | 193 | log.error("Error warming up local IndexSearcherMul for "+iid); |
182 | 194 | } catch (Exception e) { |
183 | 195 | e.printStackTrace(); |
184 | | - log.error("Exception during warmup "+e.getMessage()); |
| 196 | + log.error("Exception during warmup of "+iid+" : "+e.getMessage()); |
185 | 197 | } |
186 | 198 | } |
187 | 199 | |
— | — | @@ -188,14 +200,9 @@ |
189 | 201 | protected static Terms getTermsForLang(String lang) { |
190 | 202 | String lib = Configuration.open().getLibraryPath(); |
191 | 203 | if("en".equals(lang) || "de".equals(lang) || "es".equals(lang) || "fr".equals(lang) || "it".equals(lang) || "pt".equals(lang)) |
192 | | - langTerms.put(lang,new WordTerms(lib+Configuration.PATH_SEP+"dict"+Configuration.PATH_SEP+"terms-"+lang+".txt.gz")); |
193 | | - if(lang.equals("sample")) |
194 | | - return new SampleTerms(); |
195 | | - |
196 | | - if(langTerms.containsKey(lang)) |
197 | | - return langTerms.get(lang); |
| 204 | + return new WordTerms(lib+Configuration.PATH_SEP+"dict"+Configuration.PATH_SEP+"terms-"+lang+".txt.gz"); |
198 | 205 | else |
199 | | - return langTerms.get("en"); |
| 206 | + return new SampleTerms(); |
200 | 207 | } |
201 | 208 | |
202 | 209 | /** Preload all predefined filters */ |
— | — | @@ -218,7 +225,7 @@ |
219 | 226 | try{ |
220 | 227 | FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder(); |
221 | 228 | WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null); |
222 | | - Query q = parser.parse("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons"); |
| 229 | + Query q = parser.parse("wikimedia foundation"); |
223 | 230 | is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0"))); |
224 | 231 | } catch (IOException e) { |
225 | 232 | log.error("Error warming up local IndexSearcherMul for "+iid); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java |
— | — | @@ -177,6 +177,7 @@ |
178 | 178 | return WildcardType.INVALID; |
179 | 179 | } |
180 | 180 | |
| 181 | + /** Get terms from a local searcher if available */ |
181 | 182 | public static ArrayList<String> getLocalTerms(IndexId iid, String wildcard, boolean exactCase) throws IOException { |
182 | 183 | if(searcherCache == null) |
183 | 184 | searcherCache = SearcherCache.getInstance(); |
— | — | @@ -218,6 +219,7 @@ |
219 | 220 | return list; |
220 | 221 | } |
221 | 222 | |
| 223 | + /** Fetch terms matching a wildcard pattern into the target collection */ |
222 | 224 | protected static void addTerms(Collection<String> ret, Term wildcardTerm, IndexReader reader, WildcardType type) throws IOException{ |
223 | 225 | Term t; |
224 | 226 | WildcardTermEnum te = new WildcardTermEnum(reader,wildcardTerm); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -45,6 +45,7 @@ |
46 | 46 | import org.wikimedia.lsearch.highlight.HighlightResult; |
47 | 47 | import org.wikimedia.lsearch.index.MessengerThread; |
48 | 48 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 49 | +import org.wikimedia.lsearch.prefix.PrefixIndexBuilder; |
49 | 50 | import org.wikimedia.lsearch.ranks.StringList; |
50 | 51 | import org.wikimedia.lsearch.related.Related; |
51 | 52 | import org.wikimedia.lsearch.related.RelatedTitle; |
— | — | @@ -371,12 +372,18 @@ |
372 | 373 | ArrayList<String> keys = new ArrayList<String>(); |
373 | 374 | if(prefixKey.startsWith("0:")){ |
374 | 375 | String title = prefixKey.substring(2); |
| 376 | + String alt = null; |
| 377 | + if(title.startsWith("\"") && title.length()>1) |
| 378 | + alt = title.substring(1); |
375 | 379 | for(Integer ns : nsf.getNamespacesOrdered()){ |
376 | 380 | keys.add(ns+":"+title); |
| 381 | + if(alt != null) |
| 382 | + keys.add(ns+":"+alt); |
377 | 383 | } |
| 384 | + |
378 | 385 | } else |
379 | 386 | keys.add(prefixKey); |
380 | | - |
| 387 | + |
381 | 388 | ArrayList<PrefixMatch> results = new ArrayList<PrefixMatch>(); |
382 | 389 | IndexReader reader = searcher.getIndexReader(); |
383 | 390 | |
— | — | @@ -403,7 +410,7 @@ |
404 | 411 | if(td1.next()){ |
405 | 412 | PrefixMatch m = new PrefixMatch(reader.document(td1.doc()).get("article")); |
406 | 413 | if(r.equals(key)) |
407 | | - m.score *= 100; // exact boost |
| 414 | + m.score *= PrefixIndexBuilder.EXACT_BOOST; // exact boost |
408 | 415 | results.add(m); |
409 | 416 | |
410 | 417 | } |
— | — | @@ -996,8 +1003,9 @@ |
997 | 1004 | } |
998 | 1005 | |
999 | 1006 | protected void sendStats(long delta){ |
1000 | | - boolean succ = delta < 10000; // we queries taking more than 10s as bad |
1001 | | - SearchServer.stats.add(succ, delta, SearchDaemon.getOpenCount()); |
| 1007 | + boolean succ = delta < 10000; // we treat queries taking more than 10s as bad |
| 1008 | + if(SearchServer.stats != null) |
| 1009 | + SearchServer.stats.add(succ, delta, SearchDaemon.getOpenCount()); |
1002 | 1010 | } |
1003 | 1011 | |
1004 | 1012 | protected void logRequest(IndexId iid, String what, String searchterm, Query query, int numhits, long start, Searchable searcher) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java |
— | — | @@ -36,6 +36,7 @@ |
37 | 37 | protected Set<String> stopWords; |
38 | 38 | |
39 | 39 | protected boolean highlightAllStop = false; |
| 40 | + protected boolean isCJK = false; |
40 | 41 | |
41 | 42 | // for custom scoring |
42 | 43 | protected int textLength = 0; |
— | — | @@ -236,7 +237,7 @@ |
237 | 238 | // make snippet in range showBegin,showEnd |
238 | 239 | Snippet s = new Snippet(); |
239 | 240 | StringBuilder sb = new StringBuilder(); |
240 | | - int start=0, end=0; // range |
| 241 | + int start=0, end=0, mid=0; // range |
241 | 242 | if(showBegin > 0 && tokens.get(showBegin).getType() == ExtToken.Type.TEXT) |
242 | 243 | showBegin--; // always start with nontext token to catch " and ( |
243 | 244 | if(showEnd == tokens.size()) |
— | — | @@ -275,12 +276,42 @@ |
276 | 277 | continue; |
277 | 278 | } |
278 | 279 | if(t.getPositionIncrement() != 0){ |
| 280 | + if(isCJK && t.getType() == Type.TEXT && t.type().equals("cjk")){ |
| 281 | + boolean lastOnly = false; |
| 282 | + // reconstruct CJK tokens from stream C1C2 C2C3 C3C4 -> C1C2C3C4 |
| 283 | + if(mainToken != null && mainToken.getType()==Type.TEXT && mainToken.type().equals("cjk") && mid!=start){ |
| 284 | + start = mid; // C2C3 token, start of this token is "in the middle of last added token" |
| 285 | + lastOnly = true; |
| 286 | + } else |
| 287 | + start = getLength(sb); // C1C2 token |
| 288 | + |
| 289 | + // add current |
| 290 | + mid = start; |
| 291 | + String tt = t.getText(); |
| 292 | + int len = tt.length(); |
| 293 | + if(len>=2){ |
| 294 | + // not terminal, calculate new midpoint |
| 295 | + int point = len-1; |
| 296 | + if(Character.isSurrogatePair(tt.charAt(len-2),tt.charAt(len-1))) |
| 297 | + point = len-2; |
| 298 | + |
| 299 | + if(!lastOnly) |
| 300 | + sb.append(tt.substring(0,point)); |
| 301 | + mid = getLength(sb); |
| 302 | + sb.append(tt.substring(point)); |
| 303 | + } else |
| 304 | + sb.append(tt); |
| 305 | + |
| 306 | + end = getLength(sb); |
| 307 | + } else{ |
| 308 | + start = getLength(sb); |
| 309 | + sb.append(t.getText()); |
| 310 | + end = getLength(sb); |
| 311 | + } |
279 | 312 | mainToken = t; |
280 | | - start = getLength(sb); |
281 | | - sb.append(t.getText()); |
282 | | - end = getLength(sb); |
283 | 313 | } |
284 | 314 | if(highlight.contains(t.termText()) && !isolatedStopWords(t.termText(),i)){ |
| 315 | + // highlight part of the text |
285 | 316 | if(mainToken != null && mainToken!=t && (mainToken.termText().contains(".") || mainToken.termText().contains("'"))){ |
286 | 317 | Snippet.Range range = findSubRange(mainToken,t,start); |
287 | 318 | if(range != null) |
— | — | @@ -293,6 +324,7 @@ |
294 | 325 | if(alttitle != null) |
295 | 326 | s.setOriginalText(alttitle.getTitle()); |
296 | 327 | |
| 328 | + s.simplifyRanges(); |
297 | 329 | return s; |
298 | 330 | } |
299 | 331 | |
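
The new branch above rebuilds readable CJK text from a bigram token stream: the tokenizer emits overlapping pairs (C1C2, C2C3, C3C4), so when the previous token was also a CJK bigram only the tail of the current token is appended, and the highlight range of the current token starts at the recorded midpoint of the previous one. Below is a minimal sketch of just the string re-assembly, assuming contiguous bigrams and omitting the start/mid/end bookkeeping and position-increment checks of the real code; surrogate pairs are kept intact exactly as in the diff:

    import java.util.Arrays;
    import java.util.List;

    class CjkBigramReassembly {
        /** Rebuild "C1C2C3C4" from the overlapping bigrams "C1C2","C2C3","C3C4". */
        static String rebuild(List<String> bigrams) {
            StringBuilder sb = new StringBuilder();
            boolean continuing = false;
            for (String tt : bigrams) {
                int len = tt.length();
                if (!continuing || len < 2) {
                    sb.append(tt); // first bigram (or a 1-char token) is taken whole
                } else {
                    // later bigrams overlap the previous one, so append only
                    // the last char, or the last two if they form a surrogate pair
                    int point = len - 1;
                    if (Character.isSurrogatePair(tt.charAt(len - 2), tt.charAt(len - 1)))
                        point = len - 2;
                    sb.append(tt.substring(point));
                }
                continuing = true;
            }
            return sb.toString();
        }

        public static void main(String[] args) {
            // prints 日本語版
            System.out.println(rebuild(Arrays.asList("日本", "本語", "語版")));
        }
    }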
— | — | @@ -362,7 +394,9 @@ |
363 | 395 | } |
364 | 396 | } |
365 | 397 | |
366 | | - public RawSnippet(ArrayList<ExtToken> tokens, FragmentScore f, Set<String> highlight, Set<String> newTerms, Set<String> stopWords){ |
| 398 | + public RawSnippet(ArrayList<ExtToken> tokens, FragmentScore f, |
| 399 | + Set<String> highlight, Set<String> newTerms, Set<String> stopWords, |
| 400 | + boolean isCJK){ |
367 | 401 | this.tokens = new ArrayList<ExtToken>(); |
368 | 402 | // include initial nontext token |
369 | 403 | if(f.start > 0 && f.start < tokens.size() && tokens.get(f.start).getType()==ExtToken.Type.TEXT) |
— | — | @@ -385,6 +419,7 @@ |
386 | 420 | this.cur = f; |
387 | 421 | this.sequenceNum = f.sequenceNum; |
388 | 422 | this.stopWords = stopWords; |
| 423 | + this.isCJK = isCJK; |
389 | 424 | this.textLength = noAliasLength(); |
390 | 425 | if(stopWords!=null && stopWords.size()>0){ |
391 | 426 | highlightAllStop = true; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java |
— | — | @@ -100,6 +100,21 @@ |
101 | 101 | return getFormatted(); |
102 | 102 | } |
103 | 103 | |
 | 104 | + /** If consecutive words are being highlighted, merge ranges */ 
| 105 | + public void simplifyRanges(){ |
| 106 | + Range last = null; |
| 107 | + ArrayList<Range> simplified = new ArrayList<Range>(); |
| 108 | + for(Range r : highlighted){ |
| 109 | + if(last != null && last.end >= r.start) |
 | 110 | + last.end = Math.max(last.end, r.end); // don't shrink if r is nested in last 
| 111 | + else{ |
| 112 | + simplified.add(r); |
| 113 | + last = r; |
| 114 | + } |
| 115 | + } |
| 116 | + highlighted = simplified; |
| 117 | + } |
| 118 | + |
104 | 119 | /** Get default formatting with <b> and </b> tags */ |
105 | 120 | public String getFormatted(){ |
106 | 121 | return getFormatted("<b>","</b>"); |
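
simplifyRanges() is a single-pass interval merge over ranges already sorted by start: any range beginning at or before the current end is fused into it. This is what makes the CJK reconstruction above come out as one <b>...</b> span instead of one per overlapping bigram. A standalone sketch with a hypothetical Range pair (the real Range is an inner class of Snippet):

    import java.util.ArrayList;
    import java.util.List;

    class RangeMergeSketch {
        static class Range {
            int start, end;
            Range(int start, int end) { this.start = start; this.end = end; }
        }

        /** Merge overlapping/adjacent ranges; input must be sorted by start. */
        static List<Range> merge(List<Range> ranges) {
            ArrayList<Range> out = new ArrayList<Range>();
            Range last = null;
            for (Range r : ranges) {
                if (last != null && last.end >= r.start)
                    last.end = Math.max(last.end, r.end); // fuse into previous range
                else {
                    out.add(r);
                    last = r;
                }
            }
            return out;
        }
    }

For instance, (0,3) (2,5) (7,9) merges to (0,5) (7,9).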
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java |
— | — | @@ -111,7 +111,11 @@ |
112 | 112 | boolean foundAllInTitle = false, foundAllInAltTitle = false; |
113 | 113 | int firstHitRank = 0; |
114 | 114 | HashSet<String> inTitle = new HashSet<String>(); |
| 115 | + boolean isCJK = iid.getDB().isCJK(); |
115 | 116 | |
| 117 | + //System.out.println("Terms: "+Arrays.toString(terms)); |
| 118 | + //System.out.println("Words: "+words); |
| 119 | + |
116 | 120 | // terms weighted with idf |
117 | 121 | HashMap<String,Double> weightTerm = new HashMap<String,Double>(); |
118 | 122 | for(int i=0;i<terms.length;i++){ |
— | — | @@ -155,12 +159,12 @@ |
156 | 160 | firstHitRank = alttitles.getTitle().getRank(); |
157 | 161 | |
158 | 162 | HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles,wordIndex); |
159 | | - ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine ); |
160 | | - ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,words,wordIndex,1,true,stopWords,false,phrases,inContext,false,false); |
| 163 | + ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine, isCJK ); |
| 164 | + ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,words,wordIndex,1,true,stopWords,false,phrases,inContext,false,false,isCJK); |
161 | 165 | RawSnippet redirectSnippet = null; |
162 | 166 | // don't show redirect if we matched whole title |
163 | 167 | if(! (titleSnippets.size()>0 && titleSnippets.get(0).countPositions()==titleSnippets.get(0).noAliasLength())){ |
164 | | - redirectSnippet = getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext); |
| 168 | + redirectSnippet = getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext,isCJK); |
165 | 169 | } |
166 | 170 | RawSnippet sectionSnippet = null; |
167 | 171 | if(redirectSnippet == null){ |
— | — | @@ -169,7 +173,7 @@ |
170 | 174 | if(notInTitle.containsKey(s)) |
171 | 175 | notInTitle.remove(s); |
172 | 176 | } |
173 | | - sectionSnippet = getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext); |
| 177 | + sectionSnippet = getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext,isCJK); |
174 | 178 | } |
175 | 179 | |
176 | 180 | HighlightResult hr = new HighlightResult(); |
— | — | @@ -182,7 +186,7 @@ |
183 | 187 | boolean addSection = true, added = true; |
184 | 188 | while(added && more(hr.textLength())){ |
185 | 189 | // add more snippets if there is still space |
186 | | - added = extendSnippet(raw,hr,raw.size()-1,tokens,addSection,stopWords); |
| 190 | + added = extendSnippet(raw,hr,raw.size()-1,tokens,addSection,stopWords,isCJK); |
187 | 191 | addSection = false; |
188 | 192 | } |
189 | 193 | } else if(textSnippets.size() >= 2){ |
— | — | @@ -203,13 +207,13 @@ |
204 | 208 | if(more(hr.textLength())){ |
205 | 209 | // first pass of snippet extension, extend shortest first |
206 | 210 | if(s1.length() < s2.length()){ |
207 | | - extendSnippet(raw,hr,0,tokens,true,stopWords); |
| 211 | + extendSnippet(raw,hr,0,tokens,true,stopWords,isCJK); |
208 | 212 | if(more(hr.textLength())) |
209 | | - extendSnippet(raw,hr,raw.size()-1,tokens,true,stopWords); |
| 213 | + extendSnippet(raw,hr,raw.size()-1,tokens,true,stopWords,isCJK); |
210 | 214 | } else { |
211 | | - extendSnippet(raw,hr,1,tokens,true,stopWords); |
| 215 | + extendSnippet(raw,hr,1,tokens,true,stopWords,isCJK); |
212 | 216 | if(more(hr.textLength())) |
213 | | - extendSnippet(raw,hr,0,tokens,true,stopWords); |
| 217 | + extendSnippet(raw,hr,0,tokens,true,stopWords,isCJK); |
214 | 218 | } |
215 | 219 | } |
216 | 220 | boolean added = true; |
— | — | @@ -219,7 +223,7 @@ |
220 | 224 | for(int i=0;i<hr.getText().size() && more(hr.textLength());i++){ |
221 | 225 | boolean addedNow = false; |
222 | 226 | if(hr.getText().get(i).isExtendable()){ |
223 | | - addedNow = extendSnippet(raw,hr,i,tokens,false,stopWords); |
| 227 | + addedNow = extendSnippet(raw,hr,i,tokens,false,stopWords,isCJK); |
224 | 228 | if(addedNow) |
225 | 229 | i++; |
226 | 230 | } |
— | — | @@ -337,14 +341,14 @@ |
338 | 342 | } |
339 | 343 | |
340 | 344 | private static boolean extendSnippet(ArrayList<RawSnippet> raw, HighlightResult hr, int index, |
341 | | - ArrayList<ExtToken> tokens, boolean addSection, HashSet<String> stopWords){ |
| 345 | + ArrayList<ExtToken> tokens, boolean addSection, HashSet<String> stopWords, boolean isCJK){ |
342 | 346 | Snippet curS = hr.getText().get(index); |
343 | 347 | RawSnippet curRs = raw.get(index); |
344 | 348 | int len = hr.textLength(); |
345 | 349 | boolean added = false; |
346 | 350 | // add section |
347 | 351 | if(addSection && more(len)){ |
348 | | - RawSnippet rs = sectionSnippet(curRs,curS,tokens,stopWords); |
| 352 | + RawSnippet rs = sectionSnippet(curRs,curS,tokens,stopWords,isCJK); |
349 | 353 | if(rs != null && !raw.contains(rs)){ |
350 | 354 | Snippet s = rs.makeSnippet(diff(len)); |
351 | 355 | setSuffix(s,rs); |
— | — | @@ -364,7 +368,7 @@ |
365 | 369 | } |
366 | 370 | // add next snippet |
367 | 371 | if(more(len)){ |
368 | | - RawSnippet rs = nextSnippet(curRs,curS,tokens,stopWords); |
| 372 | + RawSnippet rs = nextSnippet(curRs,curS,tokens,stopWords,isCJK); |
369 | 373 | if(rs != null && !raw.contains(rs)){ |
370 | 374 | Snippet s = rs.makeSnippet(diff(len)); |
371 | 375 | setSuffix(curS,curRs); |
— | — | @@ -378,17 +382,17 @@ |
379 | 383 | return added; |
380 | 384 | } |
381 | 385 | |
382 | | - protected static RawSnippet nextSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords){ |
| 386 | + protected static RawSnippet nextSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords, boolean isCJK){ |
383 | 387 | if(rs.next == null) |
384 | 388 | return null; |
385 | | - return new RawSnippet(tokens,rs.next,rs.highlight,new HashSet<String>(),stopWords); |
| 389 | + return new RawSnippet(tokens,rs.next,rs.highlight,new HashSet<String>(),stopWords,isCJK); |
386 | 390 | } |
387 | 391 | |
388 | | - protected static RawSnippet sectionSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords){ |
| 392 | + protected static RawSnippet sectionSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords, boolean isCJK){ |
389 | 393 | if(rs.section == null) |
390 | 394 | return null; |
391 | 395 | if(s.length() < SHORT_SNIPPET) |
392 | | - return new RawSnippet(tokens,rs.section,rs.highlight,new HashSet<String>(),stopWords); |
| 396 | + return new RawSnippet(tokens,rs.section,rs.highlight,new HashSet<String>(),stopWords,isCJK); |
393 | 397 | return null; |
394 | 398 | } |
395 | 399 | |
— | — | @@ -418,7 +422,7 @@ |
419 | 423 | /** Alttitle and sections highlighting */ |
420 | 424 | protected static RawSnippet getBestAltTitle(ArrayList<Alttitles.Info> altInfos, HashMap<String,Double> weightTerm, |
421 | 425 | HashMap<String,Double> notInTitle, HashSet<String> stopWords, ArrayList<String> words, HashMap<String,Integer> wordIndex, |
422 | | - int minAdditional, HashSet<String> phrases, HashSet<String> inContext){ |
| 426 | + int minAdditional, HashSet<String> phrases, HashSet<String> inContext, boolean isCJK){ |
423 | 427 | ArrayList<RawSnippet> res = new ArrayList<RawSnippet>(); |
424 | 428 | for(Alttitles.Info ainf : altInfos){ |
425 | 429 | double matched = 0, additionalScore = 0; |
— | — | @@ -445,7 +449,7 @@ |
446 | 450 | } |
447 | 451 | } |
448 | 452 | if(length == matchedPositions.size() || additional > minAdditional || (additional != 0 && additional == notInTitle.size())){ |
449 | | - ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false); |
| 453 | + ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false, isCJK); |
450 | 454 | if(snippets.size() > 0){ |
451 | 455 | RawSnippet snippet = snippets.get(0); |
452 | 456 | snippet.setAlttitle(ainf); |
— | — | @@ -520,7 +524,8 @@ |
521 | 525 | /** Highlight text */ |
522 | 526 | protected static ArrayList<RawSnippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms, |
523 | 527 | ArrayList<String> words, HashMap<String,Integer> wordIndex, int maxSnippets, boolean ignoreBreaks, HashSet<String> stopWords, |
524 | | - boolean showFirstIfNone, HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases, final boolean alwaysIncludeFirstLine) { |
| 528 | + boolean showFirstIfNone, HashSet<String> phrases, HashSet<String> foundInContext, |
| 529 | + final boolean sortByPhrases, final boolean alwaysIncludeFirstLine, final boolean isCJK) { |
525 | 530 | |
526 | 531 | // pieces of text to be highlighted 
527 | 532 | ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>(); |
— | — | @@ -593,7 +598,7 @@ |
594 | 599 | if(foundAllInFirst && beginLen > 2*MAX_CONTEXT && firstFragment!=null){ |
595 | 600 | // made enough snippets, return the first one |
596 | 601 | ArrayList<RawSnippet> res = new ArrayList<RawSnippet>(); |
597 | | - res.add(new RawSnippet(tokens,firstFragment,weightTerms.keySet(),firstFragment.found,stopWords)); |
| 602 | + res.add(new RawSnippet(tokens,firstFragment,weightTerms.keySet(),firstFragment.found,stopWords,isCJK)); |
598 | 603 | return res; |
599 | 604 | } |
600 | 605 | fs.next = new FragmentScore(fs.end, sequence++); // link into list |
— | — | @@ -759,7 +764,7 @@ |
760 | 765 | if(f.found != null) |
761 | 766 | termsFound.addAll(f.found); |
762 | 767 | adjustBest(f,tokens,weightTerms,words,wordIndex,newTerms); |
763 | | - RawSnippet s = new RawSnippet(tokens,f,wordHighlight,newTerms,stopWords); |
| 768 | + RawSnippet s = new RawSnippet(tokens,f,wordHighlight,newTerms,stopWords,isCJK); |
764 | 769 | res.add(s); |
765 | 770 | } else if(resNoNew.size() < maxSnippets) |
766 | 771 | resNoNew.add(f); |
— | — | @@ -768,7 +773,7 @@ |
769 | 774 | } |
770 | 775 | // if text doesn't match show some body text |
771 | 776 | if(showFirstIfNone && res.size() == 0 && fragmentsBeginning != null){ |
772 | | - res.add(new RawSnippet(tokens,fragmentsBeginning,wordHighlight,wordHighlight,stopWords)); |
| 777 | + res.add(new RawSnippet(tokens,fragmentsBeginning,wordHighlight,wordHighlight,stopWords,isCJK)); |
773 | 778 | } |
774 | 779 | // always show snippet that is before in the text first |
775 | 780 | Collections.sort(res, new Comparator<RawSnippet>() { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -787,6 +787,10 @@ |
788 | 788 | doc.add(new Field("date",isoDate.format(article.getDate()),Store.YES,Index.NO)); |
789 | 789 | |
790 | 790 | float rankBoost = transformRank(article.getRank()); |
| 791 | + |
 | 792 | + // add both title and redirects to the content, so queries matching partly in the title and partly in the body won't fail 
| 793 | + String contents = article.getContents(); |
| 794 | + contents = article.getTitle()+". "+contents+". "+serializeRedirects(article.getRedirectKeywords()); |
791 | 795 | |
792 | 796 | /** Following fields can be optionally case-dependent */ |
793 | 797 | for(FieldBuilder.BuilderSet bs : builder.getBuilders()){ |
— | — | @@ -797,7 +801,7 @@ |
798 | 802 | TokenizerOptions options = new TokenizerOptions(bs.isExactCase()); |
799 | 803 | if(filters.isSpellCheck()) |
800 | 804 | options = new TokenizerOptions.SpellCheck(); |
801 | | - WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,options); |
| 805 | + WikiTokenizer tokenizer = new WikiTokenizer(contents,iid,options); |
802 | 806 | tokenizer.tokenize(); |
803 | 807 | |
804 | 808 | // title |
— | — | @@ -844,6 +848,18 @@ |
845 | 849 | return doc; |
846 | 850 | } |
847 | 851 | |
 | 852 | + /** Serialize redirects that will be appended to the end of the article */ 
| 853 | + private static String serializeRedirects(ArrayList<String> redirectKeywords) { |
| 854 | + if(redirectKeywords.size()==0) |
| 855 | + return ""; |
| 856 | + StringBuilder sb = new StringBuilder(); |
| 857 | + for(String s : redirectKeywords){ |
| 858 | + sb.append(s); |
| 859 | + sb.append(". "); |
| 860 | + } |
| 861 | + return sb.toString(); |
| 862 | + } |
| 863 | + |
848 | 864 | /** Make the document that will be indexed as highlighting data */ |
849 | 865 | public static Document makeHighlightDocument(Article article, FieldBuilder builder, IndexId iid) throws IOException{ |
850 | 866 | WikiIndexModifier.transformArticleForIndexing(article); |
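
The indexing change above folds the title and all redirect keywords into the tokenized contents, so a query whose words are split between the title and the body (or a redirect) still matches the article. A sketch of how the indexed string is assembled, with a stand-in for the Article accessors used in the diff:

    import java.util.ArrayList;

    class IndexedContentsSketch {
        /** Mirror of serializeRedirects() above: ". " keeps phrases from crossing redirects. */
        static String serializeRedirects(ArrayList<String> redirectKeywords) {
            if (redirectKeywords.size() == 0)
                return "";
            StringBuilder sb = new StringBuilder();
            for (String s : redirectKeywords) {
                sb.append(s);
                sb.append(". ");
            }
            return sb.toString();
        }

        /** title + body + redirects, as fed to WikiTokenizer in the diff. */
        static String assemble(String title, String body, ArrayList<String> redirects) {
            return title + ". " + body + ". " + serializeRedirects(redirects);
        }
    }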
Index: branches/lucene-search-2.1/webinterface/lsweb.py |
— | — | @@ -6,7 +6,14 @@ |
7 | 7 | from urllib2 import URLError, HTTPError |
8 | 8 | |
9 | 9 | #search_host = { 'enwiki' : "srv79:8123", '<default>': 'srv79:8123' } |
10 | | -search_host = {'<default>' : 'localhost:8123', 'enwiki' : "srv79:8123", 'srwiki' : "srv79:8123" } |
| 10 | +search_host = {'<default>' : 'srv79:8123', |
| 11 | + 'jawiki' : "localhost:8123", |
| 12 | + 'frwiki' : "localhost:8123", |
| 13 | + 'dewiki' : "localhost:8123", |
| 14 | + 'itwiki' : "localhost:8123", |
| 15 | + 'jawikiquote' : "localhost:8123", |
| 16 | + 'wikilucene' : 'localhost:8123' } |
| 17 | +#search_host = {'<default>' : 'localhost:8123'} |
11 | 18 | |
12 | 19 | canon_namespaces = { 0 : '', 1: 'Talk', 2: 'User', 3: 'User_talk', |
13 | 20 | 4 : 'Project', 5 : 'Project_talk', 6 : 'Image', 7 : 'Image_talk', |
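
The search_host map routes each wiki to a search backend, with '<default>' as the fallback key; the edit above points most traffic at srv79 and the freshly set-up wikis at localhost. The lookup lsweb.py performs amounts to the following (sketched in Java for consistency with the other examples here; the real code is Python, and the exact lookup is an assumption from the dict layout):

    import java.util.HashMap;
    import java.util.Map;

    class HostRouting {
        /** Per-wiki host with '<default>' fallback. */
        static String hostFor(String dbname, Map<String, String> searchHost) {
            String host = searchHost.get(dbname);
            return host != null ? host : searchHost.get("<default>");
        }

        public static void main(String[] args) {
            Map<String, String> searchHost = new HashMap<String, String>();
            searchHost.put("<default>", "srv79:8123");
            searchHost.put("jawiki", "localhost:8123");
            System.out.println(hostFor("jawiki", searchHost)); // localhost:8123
            System.out.println(hostFor("enwiki", searchHost)); // srv79:8123
        }
    }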
Index: branches/lucene-search-2.1/webinterface/searchForm.html |
— | — | @@ -41,7 +41,7 @@ |
42 | 42 | </p> |
43 | 43 | |
44 | 44 | <p> |
45 | | -<strong>Status</strong> only en.wiki updated |
| 45 | +<strong>Status</strong> Up |
46 | 46 | </p> |
47 | 47 | <strong>Search:</strong> |
48 | 48 | <hr> |
— | — | @@ -53,25 +53,19 @@ |
54 | 54 | <option value="enwiki">enwiki</option> |
55 | 55 | <option value="dewiki">dewiki</option> |
56 | 56 | <option value="frwiki">frwiki</option> |
57 | | - <option value="mediawikiwiki">mediawikiwiki</option> |
58 | | - <option value="metawiki">metawiki</option> |
59 | | - <option value="wikilucene">wikilucene</option> |
60 | | - <option value="wikidev">wikidev</option> |
61 | | - <option value="enwiktionary">enwiktionary</option> |
62 | | - <option value="enwiktionary-exact">enwiktionary-exact</option> |
63 | | - <option value="enwikinews">enwikinews</option> |
64 | | - <option value="plwiki">plwiki</option> |
65 | 57 | <option value="jawiki">jawiki</option> |
66 | | - <option value="nlwiki">nlwiki</option> |
67 | 58 | <option value="itwiki">itwiki</option> |
68 | | - <option value="ptwiki">ptwiki</option> |
69 | | - <option value="eswiki">eswiki</option> |
70 | | - <option value="svwiki">svwiki</option> |
71 | | - <option value="ruwiki">ruwiki</option> |
72 | | - <option value="zhwiki">zhwiki</option> |
73 | | - <option value="fiwiki">fiwiki</option> |
74 | | - <option value="nowiki">nowiki</option> |
75 | | - <option value="srwiki">srwiki</option> |
| 59 | + <option value="srwiki">srwiki</option> |
| 60 | + <option value="enwiktionary">enwiktionary</option> |
| 61 | + <option value="enwikinews">enwikinews</option> |
| 62 | + <option value="enwikisource">enwikisource</option> |
| 63 | + <option value="enwikiquote">enwikiquote</option> |
| 64 | + <option value="enwikibooks">enwikibooks</option> |
| 65 | + <option value="enwikiversity">enwikiversity</option> |
| 66 | + <option value="enwiktionary-exact">enwiktionary-exact</option> |
| 67 | + <!--<option value="jawikiquote">jawikiquote</option> |
| 68 | + <option value="wikilucene">wikilucene</option> |
| 69 | + <option value="wikidev">wikidev</option> --> |
76 | 70 | </select> |
77 | 71 | |
78 | 72 | Search for <input type='text' name="query" value="" size="30" id="lsearchbox" /> |
Index: branches/lucene-search-2.1/lsearch-global.conf |
— | — | @@ -21,6 +21,7 @@ |
22 | 22 | wikiwiktionary, wikiwikisource : (single) (language,en) (prefix) |
23 | 23 | enwiki,viwiki,srwiki,eswiki,dewiki,mlwiki,zhwiki,jawiki,itwiki,thwiki : (single) |
24 | 24 | mediawikiwiki, metawiki : (single) (language,en) |
| 25 | +jawikiquote : (single) (prefix) |
25 | 26 | |
26 | 27 | # Titles grouped by interwiki, <all> is the general rule, exceptions can be explicitly set 
27 | 28 | [Database-Group] |
— | — | @@ -32,16 +33,7 @@ |
33 | 34 | # host : db1.part db2.part |
34 | 35 | # Multiple hosts can search multiple dbs (N-N mapping) 
35 | 36 | [Search-Group] |
36 | | -oblak : wikilucene* wikidev* |
37 | | -#oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links |
38 | | -#oblak : wikilucene.nspart1.sub1 wikilucene.nspart1.sub2 |
39 | | -#oblak : wikilucene.nspart1.sub1.hl wikilucene.nspart1.sub2.hl |
40 | | -#oblak : wikilucene.nspart2.hl wikilucene.nspart3.hl |
41 | | -#oblak : wikilucene.nspart2 wikilucene.nspart3 wikilucene.title_ngram |
42 | | -#oblak : wikilucene.prefix wikilucene.spell wikilucene.related wikilucene.links |
43 | | -#oblak : wikiwiktionary wikiwikisource wikiwiktionary.prefix |
44 | | -#oblak : wiki-titles wiki-titles.tspart1 wiki-titles.tspart2 |
45 | | -#oblak : wikidev.prefix wikidev.hl wikidev.spell |
| 37 | +oblak : wikilucene* wikidev* ja* |
46 | 38 | |
47 | 39 | # Index nodes |
48 | 40 | # host: db1.part db2.part |
— | — | @@ -66,7 +58,7 @@ |
67 | 59 | # Global properties 
68 | 60 | [Properties] |
69 | 61 | # suffixes to database name, the rest is assumed to be language code |
70 | | -Database.suffix=wiki wiktionary wikisource |
| 62 | +Database.suffix=wiki wiktionary wikisource wikiquote |
71 | 63 | |
72 | 64 | # use language codes as interwiki prefixes (useful tokenizer heuristics for WMF-style wiki farms) 
73 | 65 | Database.smartInterwiki=false |