r32997 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r32996 | r32997 | r32998 >
Date: 23:43, 8 April 2008
Author: rainman
Status: old
Tags:
Comment:
Various smaller tweaks:
* removed excessive caches & checks during warmup
* fixed explanations when boosts are scaled multiple times
* removed some unwanted error reporting
* fixed CJK highlighting
* proper variant conversion for sr
Modified paths:
  • /branches/lucene-search-2.1/lib/dict/wordnet-en.txt.gz (modified) (history)
  • /branches/lucene-search-2.1/lsearch-global.conf (modified) (history)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalMultiQuery.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/CJKFilter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SerbianFilter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/BuildAll.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIServer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/ArticleMeta.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Utf8Set.java (modified) (history)
  • /branches/lucene-search-2.1/webinterface/lsweb.py (modified) (history)
  • /branches/lucene-search-2.1/webinterface/searchForm.html (modified) (history)

Diff

Index: branches/lucene-search-2.1/lib/dict/wordnet-en.txt.gz
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalMultiQuery.java
@@ -20,6 +20,7 @@
2121 protected PositionalOptions options;
2222 protected int stopWordCount = 0;
2323 protected ArrayList<ArrayList<Float>> boosts = new ArrayList<ArrayList<Float>>();
 24+ protected boolean scaledBoosts = false;
2425
2526 public PositionalMultiQuery(PositionalOptions options){
2627 this.options = options;
@@ -109,18 +110,17 @@
110111 av /= terms.length;
111112 idf += av;
112113
113 - // rescale boosts to reinstall right idfs per term
114 - ArrayList<Float> fb = boosts.get(count);
115 - for(int j=0; j<idfs.length; j++){
116 - fb.set(j,fb.get(j)*(idfs[j]/av));
117 - }
 114+ if(!scaledBoosts){
 115+ // rescale boosts to reinstall right idfs per term
 116+ ArrayList<Float> fb = boosts.get(count);
 117+ for(int j=0; j<idfs.length; j++){
 118+ fb.set(j,fb.get(j)*(idfs[j]/av));
 119+ }
 120+ }
118121 count++;
119122 }
 123+ scaledBoosts = true;
120124 }
121 -
122 - private final float sq(float x){
123 - return x*x;
124 - }
125125
126126 public Scorer scorer(IndexReader reader) throws IOException {
127127 if (termArrays.size() == 0) // optimize zero-term case
@@ -224,7 +224,8 @@
225225 }
226226
227227 public Query rewrite(IndexReader reader) {
228 - if (termArrays.size() == 1) { // optimize one-term case
 228+ // optimize one-term case
 229+ if (termArrays.size() == 1 && (options==null || !options.takeMaxScore)) {
229230 Term[] terms = (Term[])termArrays.get(0);
230231 ArrayList<Float> boost = boosts.get(0);
231232 if(terms.length == 1){
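
The new scaledBoosts flag makes the idf rescaling idempotent: Lucene may walk the query again for scoring explanations, and each pass used to multiply the boosts another time (hence the commit note about boosts being "scaled multiple times"). A minimal standalone sketch of the guard pattern, with hypothetical names:

    import java.util.ArrayList;

    class BoostScaler {
        private boolean scaled = false;

        /** Rescale each boost by idf/av exactly once, however often this is called. */
        void scaleOnce(ArrayList<Float> boosts, float[] idfs, float av) {
            if (scaled)
                return;  // repeated calls (e.g. from explain()) become no-ops
            for (int j = 0; j < idfs.length; j++)
                boosts.set(j, boosts.get(j) * (idfs[j] / av));
            scaled = true;
        }
    }
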
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java
@@ -67,6 +67,7 @@
6868 public Alttitle(){
6969 aggregateMeta = new AggregateInfoImpl();
7070 takeMaxScore = true;
 71+ //exactBoost = 2;
7172 //wholeBoost = 10;
7273 }
7374 }
@@ -96,6 +97,7 @@
9798 public Related(){
9899 aggregateMeta = new AggregateInfoImpl();
99100 takeMaxScore = true;
 101+ //exactBoost = 2;
100102 }
101103 }
102104
@@ -157,6 +159,14 @@
158160 }
159161 }
160162
 163+ /** Near-match phrases, when more than 50% of non-stopwords are matched */
 164+ public static class AlttitleNearMatch extends PositionalOptions {
 165+ public AlttitleNearMatch(){
 166+ aggregateMeta = new AggregateInfoImpl();
 167+ takeMaxScore = true;
 168+ }
 169+ }
 170+
161171 public abstract static class NamespaceBoost implements Serializable {
162172 public abstract float getBoost(int namespace);
163173
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java
@@ -444,7 +444,6 @@
445445 */
446446 protected final float phraseFreq() throws IOException {
447447 int end = initPhrasePositionsBoost();
448 -
449448 float freq = 0.0f;
450449 boolean done = (end<0);
451450 while (!done) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -305,7 +305,7 @@
306306 }
307307 if(templateLevel == 0 && tableLevel == 0)
308308 keywordTokens+=gap; // inc by gap (usually 1, can be more before paragraphs and sections)
309 -
 309+
310310 // add exact token
311311 Token exact;
312312 if(options.exactCase)
@@ -322,6 +322,14 @@
323323 exact.setType("titlecase");
324324 }
325325 addToTokens(exact);
 326+
 327+ // extra uppercase token, to prevent exact-matches for titles
 328+ if(options.extraUpperCaseToken && allUpperCase){
 329+ Token t = makeToken(new String(buffer, 0, length), start, start + length, false);
 330+ t.setPositionIncrement(0);
 331+ t.setType(exact.type());
 332+ addToTokens(t);
 333+ }
326334
327335 if(!options.noAliases){
328336 // add decomposed token to stream
@@ -650,8 +658,7 @@
651659 prefixLen = 0;
652660 semicolonInx = -1;
653661 break;
654 - }
655 - if(Character.isLetter(lc)){
 662+ } else{
656663 prefixBuf[ prefixLen++ ] = Character.toLowerCase(lc);
657664 }
658665 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -1187,7 +1187,7 @@
11881188
11891189 BooleanQuery wrap = new BooleanQuery(true);
11901190 wrap.add(full,Occur.SHOULD);
1191 - wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD);
 1191+ wrap.add(makeComplete(expandedWordsTitle,expandedBoostTitle,expandedTypes),Occur.SHOULD);
11921192 if(forbidden != null)
11931193 wrap.add(forbidden,Occur.MUST_NOT);
11941194
@@ -1197,7 +1197,7 @@
11981198 AgeScaling age = iid.getAgeScaling();
11991199 if(age != AgeScaling.NONE){
12001200 switch(age){
1201 - case STRONG: scale = new ArticleScaling.SqrtScale(0.3f,1); break;
 1201+ case STRONG: scale = new ArticleScaling.StepScale(0.3f,1); break;
12021202 case MEDIUM: scale = new ArticleScaling.StepScale(0.6f,1); break;
12031203 case WEAK: scale = new ArticleScaling.StepScale(0.9f,1); break;
12041204 default: throw new RuntimeException("Unsupported age scaling "+age);
@@ -1231,14 +1231,15 @@
12321232 }
12331233
12341234 /** Make alternate "complete" query that will match redirects not in contents like los angles -> los angeles */
1235 - private Query makeComplete(ArrayList<ArrayList<String>> expanded) {
1236 - PositionalQuery pq = new PositionalQuery(new PositionalOptions.RedirectComplete());
 1235+ private Query makeComplete(ArrayList<ArrayList<String>> expanded, ArrayList<ArrayList<Float>> boosts, ArrayList<ExpandedType> types) {
 1236+ return makePositionalMulti(expanded,boosts,types,fields.alttitle(),new PositionalOptions.RedirectComplete(),0,1);
 1237+ /* PositionalQuery pq = new PositionalQuery(new PositionalOptions.RedirectComplete());
12371238 for(int i=0;i<expanded.size();i++){
12381239 for(String w : expanded.get(i)){
12391240 pq.add(new Term(fields.alttitle(),w),i,stopWords.contains(w));
12401241 }
12411242 }
1242 - return pq;
 1243+ return pq; */
12431244 }
12441245
12451246 private ArrayList<String> cleanupWords(ArrayList<String> words) {
@@ -1475,8 +1476,12 @@
14761477 return query;
14771478 BooleanQuery bq = new BooleanQuery(true);
14781479 bq.add(query,Occur.SHOULD);
1479 - for(Query q : additional)
1480 - bq.add(q,Occur.SHOULD);
 1480+ for(Query q : additional){
 1481+ if(q != null)
 1482+ bq.add(q,Occur.SHOULD);
 1483+ }
 1484+ if(bq.clauses().size()==1)
 1485+ return query;
14811486 return bq;
14821487 }
14831488
@@ -1637,6 +1642,15 @@
16381643 return bq;
16391644 }
16401645
 1646+ private int countNonStopWords(ArrayList<String> words){
 1647+ int count = 0;
 1648+ for(String w : words){
 1649+ if(!stopWords.contains(w))
 1650+ count++;
 1651+ }
 1652+ return count;
 1653+ }
 1654+
16411655 /** Make query with short subphrases anchored in non-stop words */
16421656 protected Query makeAnchoredQueryMulti(ArrayList<ArrayList<String>> words, ArrayList<ArrayList<Float>> boosts, ArrayList<ExpandedType> types,
16431657 String field, PositionalOptions options, PositionalOptions whole, PositionalOptions wholeSloppy,
@@ -1768,9 +1782,7 @@
17691783
17701784 Query q = parseRaw(queryText);
17711785
1772 - ArrayList<String> words = wordsFromParser;
1773 - if(words == null || words.size() == 0)
1774 - return q;
 1786+ ArrayList<String> words = wordsFromParser;
17751787
17761788 this.builder = oldBuilder;
17771789 this.defaultField = oldDefaultField;
@@ -1786,31 +1798,33 @@
17871799 BooleanQuery full = new BooleanQuery(true);
17881800 full.add(q,Occur.MUST);
17891801
1790 - // main relevance
1791 - Query redirects = makeAlttitleForRedirects(words,20,1);
1792 - if(redirects != null)
1793 - full.add(redirects,Occur.SHOULD);
 1802+ /*if(words != null || words.size() > 0){
 1803+ // main relevance
 1804+ Query redirects = makeAlttitleForRedirects(words,20,1);
 1805+ if(redirects != null)
 1806+ full.add(redirects,Occur.SHOULD);
 1807+
 1808+ // singular words
 1809+ ArrayList<String> singularWords = makeSingularWords(words);
 1810+ if(singularWords != null){
 1811+ Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f);
 1812+ if(redirectsSing != null)
 1813+ full.add(redirectsSing,Occur.SHOULD);
 1814+ }
 1815+ } */
17941816
1795 - // singular words
1796 - ArrayList<String> singularWords = makeSingularWords(words);
1797 - if(singularWords != null){
1798 - Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f);
1799 - if(redirectsSing != null)
1800 - full.add(redirectsSing,Occur.SHOULD);
1801 - }
1802 -
18031817 // fuzzy & wildcards
18041818 // NOTE: for these to work parseForTitles needs to be called after parse()
1805 - if(hasWildcards() || hasFuzzy()){
1806 - Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f);
1807 - if(redirectsMulti != null)
1808 - full.add(redirectsMulti,Occur.SHOULD);
1809 - }
 1819+ //if(hasWildcards() || hasFuzzy()){
 1820+ Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f);
 1821+ if(redirectsMulti != null)
 1822+ full.add(redirectsMulti,Occur.SHOULD);
 1823+ //}
18101824
18111825 // add another for complete matches
18121826 BooleanQuery wrap = new BooleanQuery(true);
18131827 wrap.add(full,Occur.SHOULD);
1814 - wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD);
 1828+ wrap.add(makeComplete(expandedWordsTitle,expandedBoostTitle,expandedTypes),Occur.SHOULD);
18151829 if(forbidden != null)
18161830 wrap.add(forbidden,Occur.MUST_NOT);
18171831
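
Two robustness fixes in the last hunk: null entries in additional are now skipped, and a wrapper BooleanQuery that would hold nothing but the original clause is discarded. A compact restatement of that combine logic (mirroring the hunk above with a simplified signature, Lucene 2.x API, not new behavior):

    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.BooleanClause.Occur;

    Query combine(Query query, Query[] additional) {
        BooleanQuery bq = new BooleanQuery(true);  // true = coord disabled
        bq.add(query, Occur.SHOULD);
        for (Query q : additional)
            if (q != null)
                bq.add(q, Occur.SHOULD);
        return (bq.clauses().size() == 1) ? query : bq;
    }
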
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -35,6 +35,7 @@
3636 protected ArrayList<Class> additionalFilters = null;
3737 protected Singular singular = null;
3838 protected boolean hasCanonicalFilter = false;
 39+ protected boolean hasLanguageVariants = false;
3940
4041 protected FilterFactory noStemmerFilterFactory=null;
4142 protected Set<String> stopWords;
@@ -146,6 +147,9 @@
147148 if(lang.equals("sr"))
148149 hasCanonicalFilter = true;
149150
 151+ // variants (TODO: add zh)
 152+ if(lang.equals("sr"))
 153+ hasLanguageVariants = true;
150154 }
151155
152156 public static boolean isCJKLanguage(String lang){
@@ -368,4 +372,14 @@
369373 public boolean isSpellCheck(){
370374 return type == Type.SPELL_CHECK;
371375 }
 376+
 377+ /** Convert word into language variants if any */
 378+ public ArrayList<String> getVariants(String word){
 379+ if(!hasLanguageVariants)
 380+ return null;
 381+ if(lang.equals("sr")){
 382+ return SerbianFilter.getVariants(word);
 383+ } else
 384+ return null;
 385+ }
372386 }
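
Links.java (further down) is the first consumer of the new hook: every resolved link target also gets its script variants added, so sr link data matches both alphabets. As wired up there:

    // iid is an IndexId; getVariants() returns null for languages without variants
    FilterFactory filters = new FilterFactory(iid.getDB());
    ArrayList<String> variants = filters.getVariants(target);
    if (variants != null)
        pagelinks.addAll(variants);
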
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java
@@ -28,6 +28,8 @@
2929 boolean extendedTrailing = false;
3030 /** if to split tokens with apostrophes and points in them */
3131 boolean split = true;
 32+ /** generate extra original token if the word is in upper case */
 33+ boolean extraUpperCaseToken = false;
3234
3335 public TokenizerOptions(boolean exactCase){
3436 this.exactCase = exactCase;
@@ -53,10 +55,11 @@
5456 relocationParsing = false;
5557 noCaseDetection = true;
5658 extendedTrailing = true;
 59+ extraUpperCaseToken = true;
5760 }
5861 }
5962
60 - public static class TitleNoSplit extends Title{
 63+ public static class TitleNoSplit extends Title {
6164 public TitleNoSplit(boolean exactCase){
6265 super(exactCase);
6366 this.split = false;
@@ -111,6 +114,7 @@
112115 super(false);
113116 noAliases = true;
114117 noTrailing = true;
 118+ extraUpperCaseToken = false;
115119 }
116120 }
117121 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SerbianFilter.java
@@ -1,6 +1,8 @@
22 package org.wikimedia.lsearch.analyzers;
33
44 import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.BitSet;
57 import java.util.HashMap;
68
79 import org.apache.lucene.analysis.Token;
@@ -20,10 +22,13 @@
2123 private final char[] buffer = new char[MAX_WORD_LEN+1];
2224 private int length;
2325 protected static String[] conv = null;
 26+ protected static String[] toLatin = null, toCyrillic = null;
 27+ protected static HashMap<String,String> toCyrillicMap = null;
 28+ protected static BitSet toCyrillicTwo = null; // pairs of two chars
2429 protected Token nextToken;
2530 protected boolean aliasDiff;
2631
27 - public void init(){
 32+ public static synchronized void init(){
2833 conv = new String[65536];
2934
3035 for(int i=0;i<65536;i++)
@@ -46,6 +51,110 @@
4752 conv['đ'] = "dj"; conv['Đ']="Dj";
4853 }
4954
 55+ public static synchronized void initVariants(){
 56+ toLatin = new String[65536];
 57+ toCyrillic = new String[65536];
 58+ toCyrillicMap = new HashMap<String,String>();
 59+ toCyrillicTwo = new BitSet();
 60+
 61+ for(int i=0;i<65536;i++){
 62+ toLatin[i] = null;
 63+ toCyrillic[i] = null;
 64+ }
 65+
 66+ toLatin['а']="a"; toLatin['б']="b"; toLatin['в']="v"; toLatin['г']="g"; toLatin['д']="d";
 67+ toLatin['ђ']="đ"; toLatin['е']="e"; toLatin['ж']="ž"; toLatin['з']="z"; toLatin['и']="i";
 68+ toLatin['ј']="j"; toLatin['к']="k"; toLatin['л']="l"; toLatin['љ']="lj"; toLatin['м']="m";
 69+ toLatin['н']="n"; toLatin['њ']="nj"; toLatin['о']="o"; toLatin['п']="p"; toLatin['р']="r";
 70+ toLatin['с']="s"; toLatin['т']="t"; toLatin['ћ']="ć"; toLatin['у']="u"; toLatin['ф']="f";
 71+ toLatin['х']="h"; toLatin['ц']="c"; toLatin['ч']="č"; toLatin['џ']="dž"; toLatin['ш']="š";
 72+
 73+ toLatin['А']="A"; toLatin['Б']="B"; toLatin['В']="V"; toLatin['Г']="G"; toLatin['Д']="D";
 74+ toLatin['Ђ']="Đ"; toLatin['Е']="E"; toLatin['Ж']="Ž"; toLatin['З']="Z"; toLatin['И']="I";
 75+ toLatin['Ј']="J"; toLatin['К']="K"; toLatin['Л']="L"; toLatin['Љ']="Lj"; toLatin['М']="M";
 76+ toLatin['Н']="N"; toLatin['Њ']="Nj"; toLatin['О']="O"; toLatin['П']="P"; toLatin['Р']="R";
 77+ toLatin['С']="S"; toLatin['Т']="T"; toLatin['Ћ']="Ć"; toLatin['У']="U"; toLatin['Ф']="F";
 78+ toLatin['Х']="H"; toLatin['Ц']="C"; toLatin['Ч']="Č"; toLatin['Џ']="Dž"; toLatin['Ш']="Š";
 79+
 80+ toCyrillic['a']="а"; toCyrillic['b']="б"; toCyrillic['c']="ц"; toCyrillic['č']="ч"; toCyrillic['ć']="ћ";
 81+ toCyrillic['d']="д"; toCyrillic['đ']="ђ"; toCyrillic['e']="е"; toCyrillic['f']="ф";
 82+ toCyrillic['g']="г"; toCyrillic['h']="х"; toCyrillic['i']="и"; toCyrillic['j']="ј"; toCyrillic['k']="к";
 83+ toCyrillic['l']="л"; toCyrillic['m']="м"; toCyrillic['n']="н";
 84+ toCyrillic['o']="о"; toCyrillic['p']="п"; toCyrillic['r']="р"; toCyrillic['s']="с"; toCyrillic['š']="ш";
 85+ toCyrillic['t']="т"; toCyrillic['u']="у"; toCyrillic['v']="в"; toCyrillic['z']="з"; toCyrillic['ž']="ж";
 86+
 87+ toCyrillic['A']="А"; toCyrillic['B']="Б"; toCyrillic['C']="Ц"; toCyrillic['Č']="Ч"; toCyrillic['Ć']="Ћ";
 88+ toCyrillic['D']="Д"; toCyrillic['Đ']="Ђ"; toCyrillic['E']="Е"; toCyrillic['F']="Ф";
 89+ toCyrillic['G']="Г"; toCyrillic['H']="Х"; toCyrillic['I']="И"; toCyrillic['J']="Ј"; toCyrillic['K']="К";
 90+ toCyrillic['L']="Л"; toCyrillic['M']="М"; toCyrillic['N']="Н";
 91+ toCyrillic['O']="О"; toCyrillic['P']="П"; toCyrillic['R']="Р"; toCyrillic['S']="С"; toCyrillic['Š']="Ш";
 92+ toCyrillic['T']="Т"; toCyrillic['U']="У"; toCyrillic['V']="В"; toCyrillic['Z']="З"; toCyrillic['Ž']="Ж";
 93+
 94+ toCyrillicMap.put("DŽ","Џ"); toCyrillicMap.put("Lj","Љ"); toCyrillicMap.put("Nj","Њ");
 95+ toCyrillicMap.put("LJ","Љ"); toCyrillicMap.put("Dž","Џ"); toCyrillicMap.put("nj","њ");
 96+ toCyrillicMap.put("dž","џ"); toCyrillicMap.put("lj","љ"); toCyrillicMap.put("NJ","Њ");
 97+
 98+ toCyrillicTwo.set('D'); toCyrillicTwo.set('d'); toCyrillicTwo.set('Ž'); toCyrillicTwo.set('ž');
 99+ toCyrillicTwo.set('L'); toCyrillicTwo.set('l'); toCyrillicTwo.set('J'); toCyrillicTwo.set('j');
 100+ toCyrillicTwo.set('N'); toCyrillicTwo.set('n');
 101+ }
 102+
 103+ /** Get the Latin and Cyrillic variants of the text */
 104+ public static ArrayList<String> getVariants(String text){
 105+ if(toLatin == null || toCyrillic==null)
 106+ initVariants();
 107+ if(text.length() == 0)
 108+ return null;
 109+ else if(text.length() == 1){
 110+ ArrayList<String> ret = new ArrayList<String>();
 111+ String l = toLatin[text.charAt(0)];
 112+ if(l != null)
 113+ ret.add(l);
 114+ String c = toCyrillic[text.charAt(0)];
 115+ if(c != null)
 116+ ret.add(c);
 117+ return ret;
 118+ }
 119+ StringBuilder lat = new StringBuilder();
 120+ StringBuilder cyr = new StringBuilder();
 121+ char c='\0', c1=text.charAt(0);
 122+ for(int i=1;i<text.length()+1;i++){
 123+ c = c1;
 124+ c1 = i<text.length()? text.charAt(i) : '\0';
 125+ String l = toLatin[c];
 126+ if(l != null)
 127+ lat.append(l);
 128+ else
 129+ lat.append(c);
 130+ }
 131+
 132+ c='\0'; c1=text.charAt(0);
 133+ for(int i=1;i<text.length()+1;i++){
 134+ c = c1;
 135+ c1 = i<text.length()? text.charAt(i) : '\0';
 136+ String cl = null;
 137+ // quick check if we should try the two-letter map
 138+ if(toCyrillicTwo.get(c) && toCyrillicTwo.get(c1))
 139+ cl = toCyrillicMap.get(""+c+c1);
 140+
 141+ if(cl != null){
 142+ i++;
 143+ c = c1;
 144+ c1 = i<text.length()? text.charAt(i) : '\0';
 145+ } else // single letter map
 146+ cl = toCyrillic[c];
 147+ if(cl != null)
 148+ cyr.append(cl);
 149+ else
 150+ cyr.append(c);
 151+ }
 152+ ArrayList<String> ret = new ArrayList<String>();
 153+ ret.add(lat.toString());
 154+ ret.add(cyr.toString());
 155+ return ret;
 156+ }
 157+
 158+ /** Convert to ASCII */
50159 public String convert(String text){
51160 length = 0;
52161 String cv;
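
Given the tables above, getVariants returns the Latin and Cyrillic renderings of a word; each pass leaves characters it has no mapping for untouched, so either input script yields both forms. For example:

    ArrayList<String> v = SerbianFilter.getVariants("Beograd");
    // v.get(0) -> "Beograd"  (Latin pass: Latin input passes through unchanged)
    // v.get(1) -> "Београд"  (Cyrillic pass via the single-letter map)
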
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java
@@ -116,6 +116,9 @@
117117 if(isStub()){
118118 try {
119119 setTermText(new String(serialized,termTextStart,termTextEnd-termTextStart,"utf-8"));
 120+ // check if this is a cjk token
 121+ if(termText().length()>0 && type==Type.TEXT && CJKFilter.isCJKChar(termText().codePointAt(0)))
 122+ setType("cjk");
120123 unstubOriginal();
121124 } catch (UnsupportedEncodingException e) {
122125 e.printStackTrace();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
@@ -5,6 +5,7 @@
66 import org.apache.lucene.analysis.Token;
77 import org.apache.lucene.analysis.TokenFilter;
88 import org.apache.lucene.analysis.TokenStream;
 9+import org.wikimedia.lsearch.analyzers.ExtToken.Type;
910
1011 /**
1112 * Simple CJK (Chinese Japanese Korean) token filter.
@@ -24,10 +25,16 @@
2526 if(buffer.size()!=0)
2627 return buffer.removeFirst();
2728
28 - Token token = input.next();
29 - if(token == null)
30 - return null;
 29+ Token token;
 30+ do{
 31+ token = input.next();
 32+ if(token == null)
 33+ return null;
 34+ } while(token.getPositionIncrement()==0); // discard aliases
3135
 36+ if(token instanceof ExtToken && ((ExtToken)token).getType()!=Type.TEXT)
 37+ return token;
 38+
3239 String text = token.termText();
3340
3441 int i,offset,c;
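
CJKFilter splits CJK runs into overlapping character bigrams (the C1C2/C2C3 scheme the highlighter reassembles later); the fix above keeps position-increment-0 aliases and non-TEXT ExtTokens out of that splitting. A standalone sketch of the bigram scheme itself, not the filter's actual buffering code:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    class CJKBigrams {
        /** "東京都" -> ["東京", "京都"]; a lone CJK char passes through whole. */
        static List<String> bigrams(String cjkRun) {
            if (cjkRun.length() < 2)
                return Collections.singletonList(cjkRun);
            List<String> out = new ArrayList<String>();
            for (int i = 0; i + 1 < cjkRun.length(); i++)
                out.add(cjkRun.substring(i, i + 2));
            return out;
        }
    }
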
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Utf8Set.java
@@ -63,9 +63,13 @@
6464 protected Utf8String str = new Utf8String();
6565
6666 public Utf8Set(Set<String> words){
67 - for(String w : words){
68 - lookup[w.charAt(0)&MASK] = true;
69 - set.add(new Utf8String(w));
 67+ try{
 68+ for(String w : words){
 69+ lookup[w.getBytes("utf-8")[0]&MASK] = true;
 70+ set.add(new Utf8String(w));
 71+ }
 72+ } catch(Exception e){
 73+ e.printStackTrace();
7074 }
7175 }
7276
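
The fix keys the first-unit table on the first UTF-8 byte rather than the first UTF-16 char; the two agree only for ASCII, so non-ASCII words could previously miss a lookup that (as the class name suggests) probes with raw UTF-8 bytes. A small demonstration of the divergence (the 0xFF mask here is illustrative, not necessarily Utf8Set's MASK):

    import java.io.UnsupportedEncodingException;

    class FirstUnitDemo {
        public static void main(String[] args) throws UnsupportedEncodingException {
            String w = "école";
            System.out.println(w.charAt(0) & 0xFF);             // 233: UTF-16 char U+00E9
            System.out.println(w.getBytes("utf-8")[0] & 0xFF);  // 195: UTF-8 lead byte 0xC3
        }
    }
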
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -38,6 +38,7 @@
3939 import org.apache.lucene.search.TermQuery;
4040 import org.apache.lucene.store.Directory;
4141 import org.apache.lucene.store.RAMDirectory;
 42+import org.wikimedia.lsearch.analyzers.FilterFactory;
4243 import org.wikimedia.lsearch.analyzers.PrefixAnalyzer;
4344 import org.wikimedia.lsearch.analyzers.SplitAnalyzer;
4445 import org.wikimedia.lsearch.beans.Article;
@@ -74,6 +75,7 @@
7576 protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly;
7677 protected boolean optimized = false;
7778 protected boolean autoOptimize = false;
 79+ protected FilterFactory filters = null;
7880
7981 private Links(IndexId iid, String path, IndexWriter writer, boolean autoOptimize) throws CorruptIndexException, IOException{
8082 this.writer = writer;
@@ -93,6 +95,7 @@
9496 redirectOnly = makeSelector("redirect");
9597 contextOnly = makeSelector("context");
9698 linksOnly = makeSelector("links");
 99+ filters = new FilterFactory(iid.getDB());
97100 }
98101
99102 protected FieldSelector makeSelector(String field){
@@ -259,7 +262,6 @@
260263 if(redirect != null){
261264 redirectsTo = findTargetLink(redirect.getNamespace(),redirect.getTitle(),exactCase);
262265 } else {
263 - HashSet<String> contextLinks = new HashSet<String>();
264266 ContextParser.Context curContext = null;
265267 while(true){
266268 boolean hasNext = matcher.find();
@@ -275,7 +277,6 @@
276278 curContext = context;
277279 else if(curContext!=context){
278280 pagelinks.add("");
279 - contextLinks.clear();
280281 curContext = context;
281282 }
282283 }
@@ -314,13 +315,10 @@
315316 continue; // skip links from other namespaces into the main namespace
316317 String target = findTargetLink(ns,title,exactCase);
317318 if(target != null){
318 - int targetNs = Integer.parseInt(target.substring(0,target.indexOf(':')));
 319+ ArrayList<String> variants = filters.getVariants(target);
319320 pagelinks.add(target);
320 - // register context of this link
321 - if(context != null && nsf.contains(targetNs)){
322 - contextLinks.add(target);
323 - }
324 -
 321+ if(variants != null)
 322+ pagelinks.addAll(variants);
325323 }
326324 }
327325 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java
@@ -157,8 +157,8 @@
158158 QueryParser parser = new QueryParser("contents",new CJKAnalyzer());
159159 Query q = parser.parse("プロサッカークラブをつくろう");
160160 System.out.println("Japanese in standard analyzer: "+q);
161 - displayTokens(new CJKAnalyzer(),"『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
162 - displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
 161+ displayTokens(new CJKAnalyzer(),"は、工学者、大学教授、工学博士。『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
 162+ displayTokens(Analyzers.getHighlightAnalyzer(IndexId.get("jawiki"),false),"鈴木 孝治(すずき こうじ、1954年 - )『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
163163 displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"『パンツぱんくろう』というタjavaイトルはbalaton");
164164 displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"パ ン");
165165
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -24,7 +24,7 @@
2525
2626 public class FastWikiTokenizerTest {
2727 public static void displayTokensForParser(String text) {
28 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions.Highlight(true));
 28+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions.Highlight(false));
2929 Token[] tokens = parser.parse().toArray(new Token[] {});
3030 for (int i = 0; i < tokens.length; i++) {
3131 Token token = tokens[i];
@@ -70,11 +70,13 @@
7171
7272 public static void main(String args[]) throws Exception{
7373 Configuration.open();
74 - String text = "''italic'' text bre! <nowiki><!-- see--></nowiki> <!-- nosee --> (ant) and some. it's stupid it's something and 5\"6' or more, links abacus";
 74+ String text = "ATA, [[:link]] [[zh-min-nan:Something]] [[zh-min-nana:Something]] str_replace";
7575 showTokens(text);
 76+ text = "''italic'' text bre! <nowiki><!-- see--></nowiki> <!-- nosee --> (ant) and some. it's stupid it's something and 5\"6' or more, links abacus";
 77+ showTokens(text);
7678 text = ":''This article is about the humorist. For the [[Indo-Europeanist]] see [[Douglas Q. Adams]].''\n{{Infobox writer <!-- for more information see [[:Template:Infobox writer]] -->\n| name = Douglas Adams\n| image = Douglas adams cropped.jpg\n| caption = Douglas Adams signing books at ApacheCon 2000\n| birthdate = {{birth date|1952|3|11|df=yes}}\n| birthplace = [[Cambridge]], [[England]]\n| deathdate = {{Death date and age|2001|5|11|1952|3|11|df=yes}}\n| deathplace = [[Santa Barbara, California]], [[United States|U.S.]]\n| occupation = comedy writer, novelist, dramatist, fantasist\n| genre = [[Science fiction]], [[Comedy]]\n| movement =\n| influences = [[Richard Dawkins]] <ref>[http://www.bbc.co.uk/cult/hitchhikers/metaguide/radio.shtml Interview extract (in RealAudio format)] where Adams states the influences on his work.</ref>, [[Monty Python]], [[Neil Gaiman]], [[Robert Sheckley]], [[Kurt Vonnegut]], <br/>[[P. G. Wodehouse]]\n| influenced =\n| website = http://www.douglasadams.com/\n}} And now text";
7779 showTokens(text);
78 - text = "klarinet3.jpg Also, I think that the syntax could be changed to\n <nowiki>[[category:''category_name''|''sort_key''|''display_text'']]</nowiki>\nwith ''sort_key'' and ''display_text'' defaulting to ''category_name''.";
 80+ text = "メインページ klarinet3.jpg Also, I think that the syntax could be changed to\n <nowiki>[[category:''category_name''|''sort_key''|''display_text'']]</nowiki>\nwith ''sort_key'' and ''display_text'' defaulting to ''category_name''.";
7981 showTokens(text);
8082 text = "[[meta:jao]] L.A. W. B.M.W and This. is a '''list of [[African]] countries and dependencies by [[population]]'''.\n\n{| border=\"1\" cellpadding=\"2\" cellspacing=\"0\" style=\"border-collapse:collapse; text-align:right;\"\n|- style=\"text-align:center; background:#efefef\"\n!Pos !! Country !! Population\n|-\n| align=\"left\" |-\n| align=\"left\" |'''Africa''' || 934,283,426\n|-\n";
8183 showTokens(text);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java
@@ -116,6 +116,7 @@
117117 {"Douglas Adams's Guide to The Hitch-Hiker's Guide to the Galaxy",""},
118118 {"bethlem jesus","bethlehem jesus"},
119119 {"los angles gardens","los angeles gardens"},
 120+ {"huston we have a problem","houston we have a problem"},
120121
121122 };
122123
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -49,6 +49,7 @@
5050
5151 public class Suggest {
5252 static Logger log = Logger.getLogger(Suggest.class);
 53+ protected static GlobalConfiguration global=null;
5354 protected IndexId iid;
5455 protected IndexSearcher searcher;
5556 protected IndexReader reader;
@@ -58,6 +59,7 @@
5960 protected HashMap<String,Boolean> wordExistCache = new HashMap<String,Boolean>();
6061 protected enum Filtering { STRONG, WEAK };
6162 protected boolean useLogging = true;
 63+ protected int minWordFreq = 0;
6264
6365 /** Distance an metaphone metrics */
6466 static public class Metric {
@@ -175,10 +177,13 @@
176178 this.iid = iid;
177179 if(searcher == null)
178180 searcher = cache.getLocalSearcher(iid.getSpell());
 181+ if(global == null)
 182+ global = GlobalConfiguration.getInstance();
179183 this.searcher = searcher;
180184 this.reader = searcher.getIndexReader();
181185 this.defaultNs = iid.getDefaultNamespace();
182186 this.useLogging = useLogging;
 187+ this.minWordFreq = global.getIntDBParam(iid.getDBname(),"spell","wordsMinFreq",3);
183188
184189 synchronized(stopWordsIndexes){
185190 if(!stopWordsIndexes.containsKey(searcher)){
@@ -397,7 +402,7 @@
398403 continue;
399404 }
400405 // words found within context should be spell-checked only if they are not valid words
401 - if(info.foundInContext.contains(w) && wordExists(w,ns)){
 406+ if(info.foundInContext.contains(w) && wordExists(w,ns) && wordFrequency(w,ns)>minWordFreq*100){
402407 addCorrectWord(w,wordSug,possibleStopWords);
403408 continue;
404409 }
@@ -544,7 +549,7 @@
545550 if(s1.word.equals(w1))
546551 c.preserves.put(i,w1);
547552 else if((!good1 && !info.foundInTitles.contains(w1))
548 - || ((inTitle||inContext) && diff1 <=2 && !info.foundInContext.contains(w1)) )
 553+ || ((inTitle||inContext) && diff1 <=2 && !info.foundInTitles.contains(w1)) )
549554 c.substitutes.put(i,s1.word);
550555 else
551556 accept = false;
@@ -552,7 +557,7 @@
553558 if(s2.word.equals(w2))
554559 c.preserves.put(i2,w2);
555560 else if((!good2 && !info.foundInTitles.contains(w2))
556 - || ((inTitle||inContext) && diff2 <= 2 && !info.foundInContext.contains(w2)) )
 561+ || ((inTitle||inContext) && diff2 <= 2 && !info.foundInTitles.contains(w2)) )
557562 c.substitutes.put(i2,s2.word);
558563 else
559564 accept = false;
@@ -1205,6 +1210,7 @@
12061211 * @return
12071212 */
12081213 public ArrayList<SuggestResult> suggestWords(String word, int num, Namespaces namespaces, Filtering filter){
 1214+ log.debug("Suggesting words for "+word);
12091215 if(namespaces == null) // default
12101216 return suggestWordsOnNamespaces(word,word,num,num,null,filter);
12111217
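
The new minWordFreq knob is read from the per-database configuration (wordsMinFreq, defaulting to 3) and tightens the context rule: a word found in context is left uncorrected only if it is also reasonably frequent. A sketch of the gate with made-up counts, which is what lets the new "huston we have a problem" test case still be corrected:

    // Names mirror Suggest; the frequency value is hypothetical
    static boolean leaveUncorrected(boolean foundInContext, boolean exists,
                                    int wordFreq, int minWordFreq) {
        return foundInContext && exists && wordFreq > minWordFreq * 100;
    }
    // leaveUncorrected(true, true, 120, 3) == false -> "huston" still maps to "houston"
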
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIServer.java
@@ -10,6 +10,7 @@
1111 import org.apache.lucene.search.RemoteSearchableMul;
1212 import org.wikimedia.lsearch.config.GlobalConfiguration;
1313 import org.wikimedia.lsearch.config.IndexId;
 14+import org.wikimedia.lsearch.config.IndexRegistry;
1415 import org.wikimedia.lsearch.search.SearcherCache;
1516
1617 /** Starts the RMI registry and binds all RMI objects */
@@ -17,6 +18,7 @@
1819 protected static org.apache.log4j.Logger log = Logger.getLogger(RMIServer.class);
1920
2021 protected static SearcherCache cache = null;
 22+ protected static IndexRegistry indexes = null;
2123
2224 public static void register(Remote engine, String name){
2325 try {
@@ -41,18 +43,24 @@
4244 }
4345
4446 /** After updating the local copy of iid, rebind its RMI object */
45 - public static void rebind(IndexId iid){
 47+ public static boolean rebind(IndexId iid){
4648 if(cache == null)
4749 cache = SearcherCache.getInstance();
 50+ if(indexes == null)
 51+ indexes = IndexRegistry.getInstance();
4852 String name = "RemoteSearchable<"+iid+">";
4953 try {
50 - RemoteSearchableMul rs = new RemoteSearchableMul(cache.getLocalSearcher(iid));
51 - register(rs,name);
 54+ if(indexes.getCurrentSearch(iid) != null){
 55+ RemoteSearchableMul rs = new RemoteSearchableMul(cache.getLocalSearcher(iid));
 56+ register(rs,name);
 57+ return true;
 58+ }
5259 } catch (RemoteException e) {
5360 log.warn("Error making remote searchable for "+name);
5461 } catch(Exception e){
5562 // do nothing, error is logged by some other class (possible SearchCache)
5663 }
 64+ return false;
5765 }
5866
5967 /** Bind all RMI objects (Messenger, RemoteSeachables and RMIIndexDaemon) */
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -45,6 +45,8 @@
4646 public class PrefixIndexBuilder {
4747 static Logger log = Logger.getLogger(PrefixIndexBuilder.class);
4848
 49+ public static float EXACT_BOOST = 25;
 50+
4951 protected IndexId iid, prefixIid, pre;
5052 protected FilterFactory filters;
5153 protected Links links=null;
@@ -187,7 +189,7 @@
188190 }
189191
190192 if(key.equalsIgnoreCase(prefix))
191 - ref *= 100; // boost for exact match
 193+ ref *= EXACT_BOOST; // boost for exact match
192194 refs.put(key,ref);
193195 }
194196 ArrayList<Entry<String,Double>> sorted = new ArrayList<Entry<String,Double>>();
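
Pulling the magic 100 into a public EXACT_BOOST (now 25) keeps index time and query time in sync; SearchEngine applies the same constant in its hunk further down:

    ref *= PrefixIndexBuilder.EXACT_BOOST;      // index time, PrefixIndexBuilder
    m.score *= PrefixIndexBuilder.EXACT_BOOST;  // query time, SearchEngine
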
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/BuildAll.java
@@ -31,7 +31,7 @@
3232 static org.apache.log4j.Logger log = null;
3333
3434 protected static void printHelp(){
35 - System.out.println("Syntax: BuildAll [-f <file>] [-lt] [-i] [-sc] [dbname] [dump file]");
 35+ System.out.println("Syntax: BuildAll [-f <file>] [-lt] [-i] [-sc] [dump file] [dbname]");
3636 System.out.println("Options:");
3737 System.out.println(" -f <file> - use a file with a list of pairs <dbname> <dump file>");
3838 System.out.println(" -lt - leave titles - don't delete old titles indexes");
@@ -55,10 +55,14 @@
5656 importOnly = true;
5757 else if(args[i].equals("-sc"))
5858 noSpellcheck = true;
 59+ else if(args[i].startsWith("-")){
 60+ System.out.println("Unrecognized option "+args[i]);
 61+ printHelp();
 62+ return;
 63+ } else if(dump == null)
 64+ dump = args[i];
5965 else if(dbname == null)
6066 dbname = args[i];
61 - else if(dump == null)
62 - dump = args[i];
6367 else if(args[i].equals("--help")){
6468 printHelp();
6569 return;
@@ -145,7 +149,7 @@
146150 }
147151 }
148152 }
149 - System.out.println("Finished building in "+ProgressReport.formatTime(System.currentTimeMillis()-start));
 153+ System.out.println("Finished build in "+ProgressReport.formatTime(System.currentTimeMillis()-start));
150154 }
151155
152156 protected static void copy(String from, String to) throws IOException{
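
Note the reordered positionals: the dump file now comes first and the dbname second, and unrecognized dashed options are rejected instead of being silently consumed as a dbname. An illustrative invocation (dump file name hypothetical):

    java org.wikimedia.lsearch.importer.BuildAll -lt enwiki-pages-articles.xml.bz2 enwiki
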
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java
@@ -218,7 +218,7 @@
219219 IndexThread.makeIndexSnapshot(p,p.getImportPath());
220220 }
221221 }
222 - if(makeTitles){
 222+ if(makeTitles && iid.hasTitlesIndex()){
223223 for(IndexId p : iid.getTitlesIndex().getPhysicalIndexIds()){
224224 if(snapshotDb)
225225 IndexThread.optimizeIndex(p,p.getImportPath(),IndexId.Transaction.IMPORT);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -899,5 +899,10 @@
900900 public IndexId getTitleNgram(){
901901 return IndexId.get(dbname+".title_ngram");
902902 }
 903+
 904+ /** If this iid is in Chinese or Japanese */
 905+ public boolean isCJK(){
 906+ return FilterFactory.isCJKLanguage(getLangCode());
 907+ }
903908
904909 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -254,9 +254,9 @@
255255 // update registry, cache, rmi object
256256 registry.refreshUpdates(iid);
257257 warmupAndDeploy(pool,li,type);
 258+ registry.refreshCurrent(li);
258259 if(type != RebuildType.STANDALONE)
259260 RMIServer.rebind(iid);
260 - registry.refreshCurrent(li);
261261
262262 // notify all remote searchers of change
263263 messenger.notifyIndexUpdated(iid,iid.getDBSearchHosts());
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java
@@ -91,6 +91,7 @@
9292 searcher = new IndexSearcherMul(path);
9393 searcher.setSimilarity(new WikiSimilarity());
9494 } catch (IOException e) {
 95+ e.printStackTrace();
9596 // tell registry this is not a good index
9697 IndexRegistry.getInstance().invalidateCurrent(iid);
9798 log.error("I/O Error opening index at path "+iid.getCanonicalSearchPath()+" : "+e.getMessage());
@@ -257,18 +258,21 @@
258259
259260 /** Warmup all local IndexSearcher (create if necessary) */
260261 public void warmupLocalCache(){
 262+ IndexRegistry registry = IndexRegistry.getInstance();
261263 HashSet<IndexId> mys = global.getMySearch();
262264 for(IndexId iid : mys){
263265 try {
264266 if(iid.isLogical())
265267 continue;
266 - IndexSearcherMul[] pool = getSearcherPool(iid);
267 - for(IndexSearcherMul is : pool)
268 - Warmup.warmupIndexSearcher(is,iid,false);
269 -
270 - Warmup.waitForAggregate(pool);
 268+ if(registry.getCurrentSearch(iid) != null){
 269+ IndexSearcherMul[] pool = getSearcherPool(iid);
 270+ for(IndexSearcherMul is : pool)
 271+ Warmup.warmupIndexSearcher(is,iid,false);
 272+
 273+ Warmup.waitForAggregate(pool);
 274+ }
271275 } catch (IOException e) {
272 - log.warn("I/O error warming index for "+iid);
 276+ log.warn("I/O error warming index for "+iid+" : "+e.getMessage());
273277 }
274278 }
275279 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/ArticleMeta.java
@@ -56,6 +56,7 @@
5757 protected SimpleDateFormat isoDate;
5858 protected long now = 0;
5959 protected NamespaceFilter subpages;
 60+ protected boolean isOptimized;
6061
6162 protected class CachingThread extends Thread {
6263 public void run(){
@@ -67,12 +68,16 @@
6869 subpage = new boolean[reader.maxDoc()];
6970 daysOld = new float[reader.maxDoc()];
7071 for(int i=0;i<reader.maxDoc();i++){
 72+ if(!isOptimized && reader.isDeleted(i))
 73+ continue;
7174 try{
72 - subpage[i] = resolveSubpage(i);
73 - daysOld[i] = resolveDaysOld(i);
 75+ Document d = reader.document(i);
 76+ subpage[i] = resolveSubpage(d);
 77+ daysOld[i] = resolveDaysOld(d);
7478 } catch(Exception e2){
7579 e2.printStackTrace();
7680 log.error("Error reading article meta for docid="+i+" : "+e2.getMessage());
 81+ throw e2;
7782 }
7883 }
7984 log.info("Finished caching article info for "+reader.directory());
@@ -90,10 +95,7 @@
9196 /** See if article is a subpage
9297 * @throws IOException
9398 * @throws CorruptIndexException */
94 - protected final boolean resolveSubpage(int docid) throws IOException{
95 - if(reader.isDeleted(docid))
96 - return false;
97 - Document d = reader.document(docid);
 99+ protected final boolean resolveSubpage(Document d) throws IOException{
98100 String ns = d.get("namespace");
99101 if(ns == null)
100102 return false;
@@ -107,10 +109,7 @@
108110 return false;
109111 }
110112 /** Calculate how old the indexed article is */
111 - protected final float resolveDaysOld(int docid) throws IOException {
112 - if(reader.isDeleted(docid))
113 - return 0;
114 - Document d = reader.document(docid);
 113+ protected final float resolveDaysOld(Document d) throws IOException {
115114 String dateStr = d.get("date");
116115 if(dateStr == null)
117116 return 0;
@@ -141,6 +140,7 @@
142141 this.subpages = subpages;
143142 isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
144143 isoDate.setTimeZone(TimeZone.getTimeZone("GMT"));
 144+ this.isOptimized = reader.isOptimized();
145145
146146 // run background caching
147147 new CachingThread().start();
@@ -148,14 +148,14 @@
149149
150150 public final boolean isSubpage(int docid) throws IOException {
151151 if(!finishedCaching)
152 - return resolveSubpage(docid);
 152+ return resolveSubpage(reader.document(docid));
153153
154154 return subpage[docid];
155155 }
156156
157157 public float daysOld(int docid) throws IOException {
158158 if(!finishedCaching)
159 - return resolveDaysOld(docid);
 159+ return resolveDaysOld(reader.document(docid));
160160
161161 return daysOld[docid];
162162 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java
@@ -81,6 +81,7 @@
8282 protected IndexReader reader = null;
8383 protected String field;
8484 protected boolean cachingFinished = false;
 85+ protected boolean isOptimized;
8586
8687 protected class CachingThread extends Thread {
8788 public void run(){
@@ -105,7 +106,7 @@
106107 for(int i=0;i<maxdoc;i++){
107108 byte[] stored = null;
108109 try{
109 - if(reader.isDeleted(i))
 110+ if(!isOptimized && reader.isDeleted(i))
110111 continue;
111112 Document doc = reader.document(i);
112113 stored = doc.getBinaryValue(field);
@@ -134,6 +135,7 @@
135136 } catch(Exception e){
136137 log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage());
137138 e.printStackTrace();
 139+ throw e;
138140 }
139141 }
140142 // compact arrays
@@ -178,6 +180,7 @@
179181 protected AggregateMetaFieldSource(IndexReader reader, String fieldBase) throws IOException{
180182 this.reader = reader;
181183 this.field = fieldBase+"_meta";
 184+ this.isOptimized = reader.isOptimized();
182185 Collection fields = reader.getFieldNames(FieldOption.ALL);
183186 if(!fields.contains(field)){
184187 cachingFinished = true;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -76,6 +76,7 @@
7777 try{
7878 boolean waitForAggregate = Configuration.open().getString("Search","warmupaggregate","false").equalsIgnoreCase("true");
7979 if(waitForAggregate){ // wait for aggregate fields to be cached
 80+ log.info("Wait for aggregate caches...");
8081 boolean wait;
8182 do{
8283 wait = false;
@@ -109,35 +110,46 @@
110111
111112 int count = getWarmupCount(iid);
112113
113 - if(iid.isSpell() && count > 0){
114 - Terms terms = getTermsForLang(iid.getLangCode());
115 - Suggest sug = new Suggest(iid,is,false);
116 - WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid));
117 - NamespaceFilter nsf = iid.getDefaultNamespace();
118 - for(int i=0;i<count;i++){
119 - String searchterm = terms.next();
120 - sug.suggest(searchterm,parser.tokenizeForSpellCheck(searchterm),new Suggest.ExtraInfo(),nsf);
 114+ if(iid.isSpell()){
 115+ if(count > 0){
 116+ Terms terms = getTermsForLang(iid.getLangCode());
 117+ Suggest sug = new Suggest(iid,is,false);
 118+ WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid));
 119+ NamespaceFilter nsf = iid.getDefaultNamespace();
 120+ for(int i=0;i<count;i++){
 121+ String searchterm = terms.next();
 122+ sug.suggest(searchterm,parser.tokenizeForSpellCheck(searchterm),new Suggest.ExtraInfo(),nsf);
 123+ }
121124 }
122 - } else if(iid.isTitleNgram() && count > 0){
123 - Terms terms = getTermsForLang(iid.getLangCode());
124 - SuggestSimilar sim = new SuggestSimilar(iid,is);
125 - for(int i=0;i<count;i++){
126 - sim.getSimilarTitles(terms.next(),new NamespaceFilter(),4);
 125+ } else if(iid.isTitleNgram()){
 126+ if(count > 0){
 127+ Terms terms = getTermsForLang(iid.getLangCode());
 128+ SuggestSimilar sim = new SuggestSimilar(iid,is);
 129+ for(int i=0;i<count;i++){
 130+ sim.getSimilarTitles(terms.next(),new NamespaceFilter(),4);
 131+ }
127132 }
128 - } else if(iid.isPrefix() && count > 0){
129 - Terms terms = getTermsForLang(iid.getLangCode());
130 - SearchEngine search = new SearchEngine();
131 - for(int i=0;i<count;i++){
132 - String searchterm = terms.next();
133 - searchterm = searchterm.substring(0,(int)Math.min(8*Math.random()+1,searchterm.length()));
134 - search.searchPrefixLocal(iid,searchterm,20,iid.getDefaultNamespace(),is);
 133+ } else if(iid.isPrefix()){
 134+ if(count > 0){
 135+ Terms terms = getTermsForLang(iid.getLangCode());
 136+ SearchEngine search = new SearchEngine();
 137+ for(int i=0;i<count;i++){
 138+ String searchterm = terms.next();
 139+ searchterm = searchterm.substring(0,(int)Math.min(8*Math.random()+1,searchterm.length()));
 140+ search.searchPrefixLocal(iid,searchterm,20,iid.getDefaultNamespace(),is);
 141+ }
135142 }
136 - } else if((iid.isHighlight() || iid.isRelated()) && count > 0 && !iid.isTitlesBySuffix()){
137 - // NOTE: this might not warmup all caches, but should read stuff into memory buffers
138 - for(int i=0;i<count;i++){
139 - int docid = (int)(Math.random()*is.maxDoc());
140 - reader.document(docid).get("key");
141 - }
 143+ } else if((iid.isHighlight() || iid.isRelated()) && !iid.isTitlesBySuffix()){
 144+ if(count > 0){
 145+ // NOTE: this might not warmup all caches, but should read stuff into memory buffers
 146+ for(int i=0;i<count;i++){
 147+ int docid = (int)(Math.random()*is.maxDoc());
 148+ reader.document(docid).get("key");
 149+ }
 150+ }
 151+ } else if(iid.isTitlesBySuffix()){
 152+ // just initiate meta field caching, we want to avoid caching unnecessary filters
 153+ AggregateMetaField.getCachedSource(is.getIndexReader(),"alttitle");
142154 } else{
143155 // normal indexes
144156 if(count == 0){
@@ -180,7 +192,7 @@
181193 log.error("Error warming up local IndexSearcherMul for "+iid);
182194 } catch (Exception e) {
183195 e.printStackTrace();
184 - log.error("Exception during warmup "+e.getMessage());
 196+ log.error("Exception during warmup of "+iid+" : "+e.getMessage());
185197 }
186198 }
187199
@@ -188,14 +200,9 @@
189201 protected static Terms getTermsForLang(String lang) {
190202 String lib = Configuration.open().getLibraryPath();
191203 if("en".equals(lang) || "de".equals(lang) || "es".equals(lang) || "fr".equals(lang) || "it".equals(lang) || "pt".equals(lang))
192 - langTerms.put(lang,new WordTerms(lib+Configuration.PATH_SEP+"dict"+Configuration.PATH_SEP+"terms-"+lang+".txt.gz"));
193 - if(lang.equals("sample"))
194 - return new SampleTerms();
195 -
196 - if(langTerms.containsKey(lang))
197 - return langTerms.get(lang);
 204+ return new WordTerms(lib+Configuration.PATH_SEP+"dict"+Configuration.PATH_SEP+"terms-"+lang+".txt.gz");
198205 else
199 - return langTerms.get("en");
 206+ return new SampleTerms();
200207 }
201208
202209 /** Preload all predefined filters */
@@ -218,7 +225,7 @@
219226 try{
220227 FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
221228 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
222 - Query q = parser.parse("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons");
 229+ Query q = parser.parse("wikimedia foundation");
223230 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
224231 } catch (IOException e) {
225232 log.error("Error warming up local IndexSearcherMul for "+iid);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java
@@ -177,6 +177,7 @@
178178 return WildcardType.INVALID;
179179 }
180180
 181+ /** Get terms from a local searcher if available */
181182 public static ArrayList<String> getLocalTerms(IndexId iid, String wildcard, boolean exactCase) throws IOException {
182183 if(searcherCache == null)
183184 searcherCache = SearcherCache.getInstance();
@@ -218,6 +219,7 @@
219220 return list;
220221 }
221222
 223+ /** Fetch terms matching a wildcard pattern into the target collection */
222224 protected static void addTerms(Collection<String> ret, Term wildcardTerm, IndexReader reader, WildcardType type) throws IOException{
223225 Term t;
224226 WildcardTermEnum te = new WildcardTermEnum(reader,wildcardTerm);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -45,6 +45,7 @@
4646 import org.wikimedia.lsearch.highlight.HighlightResult;
4747 import org.wikimedia.lsearch.index.MessengerThread;
4848 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 49+import org.wikimedia.lsearch.prefix.PrefixIndexBuilder;
4950 import org.wikimedia.lsearch.ranks.StringList;
5051 import org.wikimedia.lsearch.related.Related;
5152 import org.wikimedia.lsearch.related.RelatedTitle;
@@ -371,12 +372,18 @@
372373 ArrayList<String> keys = new ArrayList<String>();
373374 if(prefixKey.startsWith("0:")){
374375 String title = prefixKey.substring(2);
 376+ String alt = null;
 377+ if(title.startsWith("\"") && title.length()>1)
 378+ alt = title.substring(1);
375379 for(Integer ns : nsf.getNamespacesOrdered()){
376380 keys.add(ns+":"+title);
 381+ if(alt != null)
 382+ keys.add(ns+":"+alt);
377383 }
 384+
378385 } else
379386 keys.add(prefixKey);
380 -
 387+
381388 ArrayList<PrefixMatch> results = new ArrayList<PrefixMatch>();
382389 IndexReader reader = searcher.getIndexReader();
383390
@@ -403,7 +410,7 @@
404411 if(td1.next()){
405412 PrefixMatch m = new PrefixMatch(reader.document(td1.doc()).get("article"));
406413 if(r.equals(key))
407 - m.score *= 100; // exact boost
 414+ m.score *= PrefixIndexBuilder.EXACT_BOOST; // exact boost
408415 results.add(m);
409416
410417 }
@@ -996,8 +1003,9 @@
9971004 }
9981005
9991006 protected void sendStats(long delta){
1000 - boolean succ = delta < 10000; // we treat queries taking more than 10s as bad
1001 - SearchServer.stats.add(succ, delta, SearchDaemon.getOpenCount());
 1007+ boolean succ = delta < 10000; // we treat queries taking more than 10s as bad
 1008+ if(SearchServer.stats != null)
 1009+ SearchServer.stats.add(succ, delta, SearchDaemon.getOpenCount());
10021010 }
10031011
10041012 protected void logRequest(IndexId iid, String what, String searchterm, Query query, int numhits, long start, Searchable searcher) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java
@@ -36,6 +36,7 @@
3737 protected Set<String> stopWords;
3838
3939 protected boolean highlightAllStop = false;
 40+ protected boolean isCJK = false;
4041
4142 // for custom scoring
4243 protected int textLength = 0;
@@ -236,7 +237,7 @@
237238 // make snippet in range showBegin,showEnd
238239 Snippet s = new Snippet();
239240 StringBuilder sb = new StringBuilder();
240 - int start=0, end=0; // range
 241+ int start=0, end=0, mid=0; // range
241242 if(showBegin > 0 && tokens.get(showBegin).getType() == ExtToken.Type.TEXT)
242243 showBegin--; // always start with nontext token to catch " and (
243244 if(showEnd == tokens.size())
@@ -275,12 +276,42 @@
276277 continue;
277278 }
278279 if(t.getPositionIncrement() != 0){
 280+ if(isCJK && t.getType() == Type.TEXT && t.type().equals("cjk")){
 281+ boolean lastOnly = false;
 282+ // reconstruct CJK tokens from stream C1C2 C2C3 C3C4 -> C1C2C3C4
 283+ if(mainToken != null && mainToken.getType()==Type.TEXT && mainToken.type().equals("cjk") && mid!=start){
 284+ start = mid; // C2C3 token, start of this token is "in the middle of last added token"
 285+ lastOnly = true;
 286+ } else
 287+ start = getLength(sb); // C1C2 token
 288+
 289+ // add current
 290+ mid = start;
 291+ String tt = t.getText();
 292+ int len = tt.length();
 293+ if(len>=2){
 294+ // not terminal, calculate new midpoint
 295+ int point = len-1;
 296+ if(Character.isSurrogatePair(tt.charAt(len-2),tt.charAt(len-1)))
 297+ point = len-2;
 298+
 299+ if(!lastOnly)
 300+ sb.append(tt.substring(0,point));
 301+ mid = getLength(sb);
 302+ sb.append(tt.substring(point));
 303+ } else
 304+ sb.append(tt);
 305+
 306+ end = getLength(sb);
 307+ } else{
 308+ start = getLength(sb);
 309+ sb.append(t.getText());
 310+ end = getLength(sb);
 311+ }
279312 mainToken = t;
280 - start = getLength(sb);
281 - sb.append(t.getText());
282 - end = getLength(sb);
283313 }
284314 if(highlight.contains(t.termText()) && !isolatedStopWords(t.termText(),i)){
 315+ // highlight part of the text
285316 if(mainToken != null && mainToken!=t && (mainToken.termText().contains(".") || mainToken.termText().contains("'"))){
286317 Snippet.Range range = findSubRange(mainToken,t,start);
287318 if(range != null)
@@ -293,6 +324,7 @@
294325 if(alttitle != null)
295326 s.setOriginalText(alttitle.getTitle());
296327
 328+ s.simplifyRanges();
297329 return s;
298330 }
299331
@@ -362,7 +394,9 @@
363395 }
364396 }
365397
366 - public RawSnippet(ArrayList<ExtToken> tokens, FragmentScore f, Set<String> highlight, Set<String> newTerms, Set<String> stopWords){
 398+ public RawSnippet(ArrayList<ExtToken> tokens, FragmentScore f,
 399+ Set<String> highlight, Set<String> newTerms, Set<String> stopWords,
 400+ boolean isCJK){
367401 this.tokens = new ArrayList<ExtToken>();
368402 // include initial nontext token
369403 if(f.start > 0 && f.start < tokens.size() && tokens.get(f.start).getType()==ExtToken.Type.TEXT)
@@ -385,6 +419,7 @@
386420 this.cur = f;
387421 this.sequenceNum = f.sequenceNum;
388422 this.stopWords = stopWords;
 423+ this.isCJK = isCJK;
389424 this.textLength = noAliasLength();
390425 if(stopWords!=null && stopWords.size()>0){
391426 highlightAllStop = true;
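
Because CJK text is indexed as overlapping bigrams, a highlighted run arrives as C1C2, C2C3, C3C4; the new branch appends only the trailing code point of each continuing bigram so the snippet shows C1C2C3C4 once, tracking mid so highlight ranges still line up. A standalone sketch of the reassembly (surrogate pairs ignored here; the real code handles them via Character.isSurrogatePair):

    import java.util.List;

    class BigramJoin {
        static String reassemble(List<String> bigrams) {
            StringBuilder sb = new StringBuilder();
            String prev = null;
            for (String b : bigrams) {
                if (prev != null && b.length() == 2 && prev.endsWith(b.substring(0, 1)))
                    sb.append(b.substring(1));  // overlap: append only the new tail
                else
                    sb.append(b);
                prev = b;
            }
            return sb.toString();
        }
    }
    // reassemble(["東京", "京都"]) -> "東京都"
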
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java
@@ -100,6 +100,21 @@
101101 return getFormatted();
102102 }
103103
 104+ /** If consecutive words are being highlighted, merge ranges */
 105+ public void simplifyRanges(){
 106+ Range last = null;
 107+ ArrayList<Range> simplified = new ArrayList<Range>();
 108+ for(Range r : highlighted){
 109+ if(last != null && last.end >= r.start)
 110+ last.end = r.end;
 111+ else{
 112+ simplified.add(r);
 113+ last = r;
 114+ }
 115+ }
 116+ highlighted = simplified;
 117+ }
 118+
104119 /** Get default formatting with <b> and </b> tags */
105120 public String getFormatted(){
106121 return getFormatted("<b>","</b>");
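
simplifyRanges merges touching or overlapping highlight ranges so consecutive matched words render as one <b>...</b> span instead of several adjacent ones. A standalone restatement of the merge rule (assumes ranges arrive sorted by start, as they do when built left to right):

    import java.util.ArrayList;

    class RangeMerge {
        /** ranges as {start, end} pairs, sorted by start */
        static ArrayList<int[]> merge(ArrayList<int[]> ranges) {
            ArrayList<int[]> out = new ArrayList<int[]>();
            int[] last = null;
            for (int[] r : ranges) {
                if (last != null && last[1] >= r[0])
                    last[1] = r[1];  // extend the previous range
                else {
                    out.add(r);
                    last = r;
                }
            }
            return out;
        }
    }
    // merging [0,3], [3,7], [10,12] yields [0,7], [10,12]
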
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java
@@ -111,7 +111,11 @@
112112 boolean foundAllInTitle = false, foundAllInAltTitle = false;
113113 int firstHitRank = 0;
114114 HashSet<String> inTitle = new HashSet<String>();
 115+ boolean isCJK = iid.getDB().isCJK();
115116
 117+ //System.out.println("Terms: "+Arrays.toString(terms));
 118+ //System.out.println("Words: "+words);
 119+
116120 // terms weighted with idf
117121 HashMap<String,Double> weightTerm = new HashMap<String,Double>();
118122 for(int i=0;i<terms.length;i++){
@@ -155,12 +159,12 @@
156160 firstHitRank = alttitles.getTitle().getRank();
157161
158162 HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles,wordIndex);
159 - ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine );
160 - ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,words,wordIndex,1,true,stopWords,false,phrases,inContext,false,false);
 163+ ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine, isCJK );
 164+ ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,words,wordIndex,1,true,stopWords,false,phrases,inContext,false,false,isCJK);
161165 RawSnippet redirectSnippet = null;
162166 // don't show redirect if we matched whole title
163167 if(! (titleSnippets.size()>0 && titleSnippets.get(0).countPositions()==titleSnippets.get(0).noAliasLength())){
164 - redirectSnippet = getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext);
 168+ redirectSnippet = getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext,isCJK);
165169 }
166170 RawSnippet sectionSnippet = null;
167171 if(redirectSnippet == null){
@@ -169,7 +173,7 @@
170174 if(notInTitle.containsKey(s))
171175 notInTitle.remove(s);
172176 }
173 - sectionSnippet = getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext);
 177+ sectionSnippet = getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext,isCJK);
174178 }
175179
176180 HighlightResult hr = new HighlightResult();
@@ -182,7 +186,7 @@
183187 boolean addSection = true, added = true;
184188 while(added && more(hr.textLength())){
185189 // add more snippets if there is still space
186 - added = extendSnippet(raw,hr,raw.size()-1,tokens,addSection,stopWords);
 190+ added = extendSnippet(raw,hr,raw.size()-1,tokens,addSection,stopWords,isCJK);
187191 addSection = false;
188192 }
189193 } else if(textSnippets.size() >= 2){
@@ -203,13 +207,13 @@
204208 if(more(hr.textLength())){
205209 // first pass of snippet extension, extend shortest first
206210 if(s1.length() < s2.length()){
207 - extendSnippet(raw,hr,0,tokens,true,stopWords);
 211+ extendSnippet(raw,hr,0,tokens,true,stopWords,isCJK);
208212 if(more(hr.textLength()))
209 - extendSnippet(raw,hr,raw.size()-1,tokens,true,stopWords);
 213+ extendSnippet(raw,hr,raw.size()-1,tokens,true,stopWords,isCJK);
210214 } else {
211 - extendSnippet(raw,hr,1,tokens,true,stopWords);
 215+ extendSnippet(raw,hr,1,tokens,true,stopWords,isCJK);
212216 if(more(hr.textLength()))
213 - extendSnippet(raw,hr,0,tokens,true,stopWords);
 217+ extendSnippet(raw,hr,0,tokens,true,stopWords,isCJK);
214218 }
215219 }
216220 boolean added = true;
@@ -219,7 +223,7 @@
220224 for(int i=0;i<hr.getText().size() && more(hr.textLength());i++){
221225 boolean addedNow = false;
222226 if(hr.getText().get(i).isExtendable()){
223 - addedNow = extendSnippet(raw,hr,i,tokens,false,stopWords);
 227+ addedNow = extendSnippet(raw,hr,i,tokens,false,stopWords,isCJK);
224228 if(addedNow)
225229 i++;
226230 }
@@ -337,14 +341,14 @@
338342 }
339343
340344 private static boolean extendSnippet(ArrayList<RawSnippet> raw, HighlightResult hr, int index,
341 - ArrayList<ExtToken> tokens, boolean addSection, HashSet<String> stopWords){
 345+ ArrayList<ExtToken> tokens, boolean addSection, HashSet<String> stopWords, boolean isCJK){
342346 Snippet curS = hr.getText().get(index);
343347 RawSnippet curRs = raw.get(index);
344348 int len = hr.textLength();
345349 boolean added = false;
346350 // add section
347351 if(addSection && more(len)){
348 - RawSnippet rs = sectionSnippet(curRs,curS,tokens,stopWords);
 352+ RawSnippet rs = sectionSnippet(curRs,curS,tokens,stopWords,isCJK);
349353 if(rs != null && !raw.contains(rs)){
350354 Snippet s = rs.makeSnippet(diff(len));
351355 setSuffix(s,rs);
@@ -364,7 +368,7 @@
365369 }
366370 // add next snippet
367371 if(more(len)){
368 - RawSnippet rs = nextSnippet(curRs,curS,tokens,stopWords);
 372+ RawSnippet rs = nextSnippet(curRs,curS,tokens,stopWords,isCJK);
369373 if(rs != null && !raw.contains(rs)){
370374 Snippet s = rs.makeSnippet(diff(len));
371375 setSuffix(curS,curRs);
@@ -378,17 +382,17 @@
379383 return added;
380384 }
381385
382 - protected static RawSnippet nextSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords){
 386+ protected static RawSnippet nextSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords, boolean isCJK){
383387 if(rs.next == null)
384388 return null;
385 - return new RawSnippet(tokens,rs.next,rs.highlight,new HashSet<String>(),stopWords);
 389+ return new RawSnippet(tokens,rs.next,rs.highlight,new HashSet<String>(),stopWords,isCJK);
386390 }
387391
388 - protected static RawSnippet sectionSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords){
 392+ protected static RawSnippet sectionSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords, boolean isCJK){
389393 if(rs.section == null)
390394 return null;
391395 if(s.length() < SHORT_SNIPPET)
392 - return new RawSnippet(tokens,rs.section,rs.highlight,new HashSet<String>(),stopWords);
 396+ return new RawSnippet(tokens,rs.section,rs.highlight,new HashSet<String>(),stopWords,isCJK);
393397 return null;
394398 }
395399
@@ -418,7 +422,7 @@
419423 /** Alttitle and sections highlighting */
420424 protected static RawSnippet getBestAltTitle(ArrayList<Alttitles.Info> altInfos, HashMap<String,Double> weightTerm,
421425 HashMap<String,Double> notInTitle, HashSet<String> stopWords, ArrayList<String> words, HashMap<String,Integer> wordIndex,
422 - int minAdditional, HashSet<String> phrases, HashSet<String> inContext){
 426+ int minAdditional, HashSet<String> phrases, HashSet<String> inContext, boolean isCJK){
423427 ArrayList<RawSnippet> res = new ArrayList<RawSnippet>();
424428 for(Alttitles.Info ainf : altInfos){
425429 double matched = 0, additionalScore = 0;
@@ -445,7 +449,7 @@
446450 }
447451 }
448452 if(length == matchedPositions.size() || additional > minAdditional || (additional != 0 && additional == notInTitle.size())){
449 - ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false);
 453+ ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false, isCJK);
450454 if(snippets.size() > 0){
451455 RawSnippet snippet = snippets.get(0);
452456 snippet.setAlttitle(ainf);
@@ -520,7 +524,8 @@
521525 /** Highlight text */
522526 protected static ArrayList<RawSnippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms,
523527 ArrayList<String> words, HashMap<String,Integer> wordIndex, int maxSnippets, boolean ignoreBreaks, HashSet<String> stopWords,
524 - boolean showFirstIfNone, HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases, final boolean alwaysIncludeFirstLine) {
 528+ boolean showFirstIfNone, HashSet<String> phrases, HashSet<String> foundInContext,
 529+ final boolean sortByPhrases, final boolean alwaysIncludeFirstLine, final boolean isCJK) {
525530
526531	// pieces of text to be highlighted
527532 ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>();
@@ -593,7 +598,7 @@
594599 if(foundAllInFirst && beginLen > 2*MAX_CONTEXT && firstFragment!=null){
595600 // made enough snippets, return the first one
596601 ArrayList<RawSnippet> res = new ArrayList<RawSnippet>();
597 - res.add(new RawSnippet(tokens,firstFragment,weightTerms.keySet(),firstFragment.found,stopWords));
 602+ res.add(new RawSnippet(tokens,firstFragment,weightTerms.keySet(),firstFragment.found,stopWords,isCJK));
598603 return res;
599604 }
600605 fs.next = new FragmentScore(fs.end, sequence++); // link into list
@@ -759,7 +764,7 @@
760765 if(f.found != null)
761766 termsFound.addAll(f.found);
762767 adjustBest(f,tokens,weightTerms,words,wordIndex,newTerms);
763 - RawSnippet s = new RawSnippet(tokens,f,wordHighlight,newTerms,stopWords);
 768+ RawSnippet s = new RawSnippet(tokens,f,wordHighlight,newTerms,stopWords,isCJK);
764769 res.add(s);
765770 } else if(resNoNew.size() < maxSnippets)
766771 resNoNew.add(f);
@@ -768,7 +773,7 @@
769774 }
770775 // if text doesn't match show some body text
771776 if(showFirstIfNone && res.size() == 0 && fragmentsBeginning != null){
772 - res.add(new RawSnippet(tokens,fragmentsBeginning,wordHighlight,wordHighlight,stopWords));
 777+ res.add(new RawSnippet(tokens,fragmentsBeginning,wordHighlight,wordHighlight,stopWords,isCJK));
773778 }
774779 // always show snippet that is before in the text first
775780 Collections.sort(res, new Comparator<RawSnippet>() {
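
The Highlight.java hunks above mostly thread one new boolean, isCJK (read once per database via iid.getDB().isCJK()), through every snippet-building path so RawSnippet can render CJK text correctly, matching the "fixed CJK highlighting" item in the commit comment. The flag here is per-database, not per-character; purely for illustration, a hedged sketch of the per-character test such rendering code typically needs, using only standard java.lang.Character.UnicodeBlock data (this helper is hypothetical and not part of the commit):

    class CJKCharDemo {
        /** Rough per-character CJK test via Unicode blocks; illustrative only. */
        static boolean isCJKChar(char c){
            Character.UnicodeBlock b = Character.UnicodeBlock.of(c);
            return b == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || b == Character.UnicodeBlock.HIRAGANA
                || b == Character.UnicodeBlock.KATAKANA
                || b == Character.UnicodeBlock.HANGUL_SYLLABLES;
        }

        public static void main(String[] args){
            System.out.println(isCJKChar('語')); // true
            System.out.println(isCJKChar('a'));  // false
        }
    }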
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -787,6 +787,10 @@
788788 doc.add(new Field("date",isoDate.format(article.getDate()),Store.YES,Index.NO));
789789
790790 float rankBoost = transformRank(article.getRank());
 791+
 792+ // add both title and redirects to content, so queries that match part of title and content won't fail
 793+ String contents = article.getContents();
 794+ contents = article.getTitle()+". "+contents+". "+serializeRedirects(article.getRedirectKeywords());
791795
792796 /** Following fields can be optionally case-dependent */
793797 for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
@@ -797,7 +801,7 @@
798802 TokenizerOptions options = new TokenizerOptions(bs.isExactCase());
799803 if(filters.isSpellCheck())
800804 options = new TokenizerOptions.SpellCheck();
801 - WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,options);
 805+ WikiTokenizer tokenizer = new WikiTokenizer(contents,iid,options);
802806 tokenizer.tokenize();
803807
804808 // title
@@ -844,6 +848,18 @@
845849 return doc;
846850 }
847851
 852+	/** Serialize redirects that will be added to the end of the article */
 853+ private static String serializeRedirects(ArrayList<String> redirectKeywords) {
 854+ if(redirectKeywords.size()==0)
 855+ return "";
 856+ StringBuilder sb = new StringBuilder();
 857+ for(String s : redirectKeywords){
 858+ sb.append(s);
 859+ sb.append(". ");
 860+ }
 861+ return sb.toString();
 862+ }
 863+
848864 /** Make the document that will be indexed as highlighting data */
849865 public static Document makeHighlightDocument(Article article, FieldBuilder builder, IndexId iid) throws IOException{
850866 WikiIndexModifier.transformArticleForIndexing(article);
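
The indexing change concatenates the title, the article body, and the serialized redirect keywords into a single contents string before tokenizing, so a query whose terms are split between title and body (or a redirect) can still match one document. A self-contained illustration of that assembly with made-up article data; serializeRedirects is copied from the hunk above:

    import java.util.ArrayList;

    class ContentsAssemblyDemo {
        /** Same joiner as the new serializeRedirects(): each keyword becomes its own sentence. */
        static String serializeRedirects(ArrayList<String> redirectKeywords){
            if(redirectKeywords.size() == 0)
                return "";
            StringBuilder sb = new StringBuilder();
            for(String s : redirectKeywords){
                sb.append(s);
                sb.append(". ");
            }
            return sb.toString();
        }

        public static void main(String[] args){
            ArrayList<String> redirects = new ArrayList<String>();
            redirects.add("Beograd");           // hypothetical redirect title
            String title = "Belgrade";          // hypothetical article
            String body  = "capital of Serbia"; // hypothetical body text
            // same assembly as in the makeDocument() hunk above
            String contents = title + ". " + body + ". " + serializeRedirects(redirects);
            System.out.println(contents); // "Belgrade. capital of Serbia. Beograd. "
        }
    }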
Index: branches/lucene-search-2.1/webinterface/lsweb.py
@@ -6,7 +6,14 @@
77 from urllib2 import URLError, HTTPError
88
99 #search_host = { 'enwiki' : "srv79:8123", '<default>': 'srv79:8123' }
10 -search_host = {'<default>' : 'localhost:8123', 'enwiki' : "srv79:8123", 'srwiki' : "srv79:8123" }
 10+search_host = {'<default>' : 'srv79:8123',
 11+ 'jawiki' : "localhost:8123",
 12+ 'frwiki' : "localhost:8123",
 13+ 'dewiki' : "localhost:8123",
 14+ 'itwiki' : "localhost:8123",
 15+ 'jawikiquote' : "localhost:8123",
 16+ 'wikilucene' : 'localhost:8123' }
 17+#search_host = {'<default>' : 'localhost:8123'}
1118
1219 canon_namespaces = { 0 : '', 1: 'Talk', 2: 'User', 3: 'User_talk',
1320 4 : 'Project', 5 : 'Project_talk', 6 : 'Image', 7 : 'Image_talk',
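
The lsweb.py change repoints the demo web UI: the default backend moves to srv79:8123 while jawiki, frwiki, dewiki, itwiki, jawikiquote and wikilucene are routed to a local daemon. The lookup is a plain dict access with a '<default>' fallback; a sketch of the same pattern in Java, with hypothetical names:

    import java.util.HashMap;

    class HostRoutingDemo {
        /** Resolve the search backend for a dbname, falling back to the '<default>' entry. */
        static String hostFor(HashMap<String,String> searchHost, String dbname){
            String host = searchHost.get(dbname);
            return host != null ? host : searchHost.get("<default>");
        }

        public static void main(String[] args){
            HashMap<String,String> searchHost = new HashMap<String,String>();
            searchHost.put("<default>", "srv79:8123");
            searchHost.put("jawiki", "localhost:8123");
            System.out.println(hostFor(searchHost, "jawiki")); // localhost:8123
            System.out.println(hostFor(searchHost, "enwiki")); // srv79:8123
        }
    }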
Index: branches/lucene-search-2.1/webinterface/searchForm.html
@@ -41,7 +41,7 @@
4242 </p>
4343
4444 <p>
45 -<strong>Status</strong> only en.wiki updated
 45+<strong>Status</strong> Up
4646 </p>
4747 <strong>Search:</strong>
4848 <hr>
@@ -53,25 +53,19 @@
5454 <option value="enwiki">enwiki</option>
5555 <option value="dewiki">dewiki</option>
5656 <option value="frwiki">frwiki</option>
57 - <option value="mediawikiwiki">mediawikiwiki</option>
58 - <option value="metawiki">metawiki</option>
59 - <option value="wikilucene">wikilucene</option>
60 - <option value="wikidev">wikidev</option>
61 - <option value="enwiktionary">enwiktionary</option>
62 - <option value="enwiktionary-exact">enwiktionary-exact</option>
63 - <option value="enwikinews">enwikinews</option>
64 - <option value="plwiki">plwiki</option>
6557 <option value="jawiki">jawiki</option>
66 - <option value="nlwiki">nlwiki</option>
6758 <option value="itwiki">itwiki</option>
68 - <option value="ptwiki">ptwiki</option>
69 - <option value="eswiki">eswiki</option>
70 - <option value="svwiki">svwiki</option>
71 - <option value="ruwiki">ruwiki</option>
72 - <option value="zhwiki">zhwiki</option>
73 - <option value="fiwiki">fiwiki</option>
74 - <option value="nowiki">nowiki</option>
75 - <option value="srwiki">srwiki</option>
 59+ <option value="srwiki">srwiki</option>
 60+ <option value="enwiktionary">enwiktionary</option>
 61+ <option value="enwikinews">enwikinews</option>
 62+ <option value="enwikisource">enwikisource</option>
 63+ <option value="enwikiquote">enwikiquote</option>
 64+ <option value="enwikibooks">enwikibooks</option>
 65+ <option value="enwikiversity">enwikiversity</option>
 66+ <option value="enwiktionary-exact">enwiktionary-exact</option>
 67+ <!--<option value="jawikiquote">jawikiquote</option>
 68+ <option value="wikilucene">wikilucene</option>
 69+ <option value="wikidev">wikidev</option> -->
7670 </select>
7771
7872 Search for <input type='text' name="query" value="" size="30" id="lsearchbox" />
Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -21,6 +21,7 @@
2222 wikiwiktionary, wikiwikisource : (single) (language,en) (prefix)
2323 enwiki,viwiki,srwiki,eswiki,dewiki,mlwiki,zhwiki,jawiki,itwiki,thwiki : (single)
2424 mediawikiwiki, metawiki : (single) (language,en)
 25+jawikiquote : (single) (prefix)
2526
2627 # Titles group by interwiki, <all> is the general rule, exceptions can be explicitely set
2728 [Database-Group]
@@ -32,16 +33,7 @@
3334 # host : db1.part db2.part
3435	# Multiple hosts can search multiple dbs (N-N mapping)
3536 [Search-Group]
36 -oblak : wikilucene* wikidev*
37 -#oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links
38 -#oblak : wikilucene.nspart1.sub1 wikilucene.nspart1.sub2
39 -#oblak : wikilucene.nspart1.sub1.hl wikilucene.nspart1.sub2.hl
40 -#oblak : wikilucene.nspart2.hl wikilucene.nspart3.hl
41 -#oblak : wikilucene.nspart2 wikilucene.nspart3 wikilucene.title_ngram
42 -#oblak : wikilucene.prefix wikilucene.spell wikilucene.related wikilucene.links
43 -#oblak : wikiwiktionary wikiwikisource wikiwiktionary.prefix
44 -#oblak : wiki-titles wiki-titles.tspart1 wiki-titles.tspart2
45 -#oblak : wikidev.prefix wikidev.hl wikidev.spell
 37+oblak : wikilucene* wikidev* ja*
4638
4739 # Index nodes
4840 # host: db1.part db2.part
@@ -66,7 +58,7 @@
6759	# Global properties
6860 [Properties]
6961	# suffixes to the database name; the rest is assumed to be the language code
70 -Database.suffix=wiki wiktionary wikisource
 62+Database.suffix=wiki wiktionary wikisource wikiquote
7163
7264	# use language codes as interwiki prefixes (useful tokenizer heuristics for WMF-style wiki farms)
7365 Database.smartInterwiki=false
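
Adding wikiquote to Database.suffix is what lets the new jawikiquote database resolve to language 'ja': the config parser strips a known suffix from the dbname and treats the remainder as the language code. A minimal sketch of that split with a hypothetical helper name (the real parsing lives in org.wikimedia.lsearch.config and is not shown in this diff):

    class SuffixSplitDemo {
        /** Strip the first matching suffix and return the presumed language code. */
        static String languageOf(String dbname, String[] suffixes){
            for(String suf : suffixes){
                if(dbname.endsWith(suf))
                    return dbname.substring(0, dbname.length() - suf.length());
            }
            return dbname; // no suffix matched; leave the name untouched
        }

        public static void main(String[] args){
            String[] suffixes = {"wiki", "wiktionary", "wikisource", "wikiquote"};
            System.out.println(languageOf("jawikiquote", suffixes)); // ja
            System.out.println(languageOf("srwiki", suffixes));      // sr
        }
    }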
