r23926 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r23925‎ | r23926 | r23927 >
Date:21:42, 9 July 2007
Author:rainman
Status:old
Tags:
Comment:
* Improve CJK processing, filter C1C2C3C4 -> C1C2 C2C3 C3C4, use phrase queries
* Don't abort warmup if there are bad queries in warmup files
Modified paths:
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CJKFilter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/StartupManager.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/NamespaceCache.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /trunk/lucene-search-2.0/test-data/mwsearch-global.test (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test
@@ -46,6 +46,7 @@
4747 [OAI]
4848 wiktionary : http://$lang.wiktionary.org/w/index.php
4949 frtest : http://localhost/wiki-lucene/phase3/index.php
 50+rswikimedia : http://rs.wikimedia.org/w/index.php
5051 <default> : http://$lang.wikipedia.org/w/index.php
5152
5253 # Path where indexes are on hosts, after default value put hosts where
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -347,6 +347,16 @@
348348 q = parser.parseTwoPass("welche rolle spielen Mineralstoffe in der Ernährung?",NamespacePolicy.IGNORE);
349349 assertEquals("(+(contents:welche contents:welch^0.5) +(contents:rolle contents:roll^0.5) +(contents:spielen contents:spiel^0.5) +(contents:mineralstoffe contents:mineralstoff^0.5) +contents:in +contents:der +(+(contents:ernahrung contents:ernahr^0.5) (contents:ernaehrung contents:ernaehr^0.5))) (+title:welche^2.0 +title:rolle^2.0 +title:spielen^2.0 +title:mineralstoffe^2.0 +title:in^2.0 +title:der^2.0 +(title:ernahrung^2.0 title:ernaehrung^2.0))",q.toString());
350350
 351+ // CJK
 352+ analyzer = Analyzers.getSearcherAnalyzer("ja");
 353+ bs = new FieldBuilder("ja").getBuilder();
 354+ parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE);
 355+ q = parser.parseFourPass("うろパン",NamespacePolicy.IGNORE,false);
 356+ assertEquals("contents:\"うろ ろハ ハン\" title:\"うろ ろハ ハン\"^2.0 (alttitle1:\"うろ ろハ ハン\"^6.0 alttitle2:\"うろ ろハ ハン\"^6.0 alttitle3:\"うろ ろハ ハン\"^6.0)",q.toString());
 357+
 358+ q = parser.parseFourPass("ナイロン100C other ャポン! ",NamespacePolicy.IGNORE,false);
 359+ assertEquals("(+contents:\"ナイ イロ ロン\" +(+contents:100 +contents:c) +contents:other +contents:\"ャホ ホン\") (+title:\"ナイ イロ ロン\"^2.0 +(+title:100^2.0 +title:c^2.0) +title:other^2.0 +title:\"ャホ ホン\"^2.0) ((+alttitle1:\"ナイ イロ ロン\"^6.0 +(+alttitle1:100^6.0 +alttitle1:c^6.0) +alttitle1:other^6.0 +alttitle1:\"ャホ ホン\"^6.0) (+alttitle2:\"ナイ イロ ロン\"^6.0 +(+alttitle2:100^6.0 +alttitle2:c^6.0) +alttitle2:other^6.0 +alttitle2:\"ャホ ホン\"^6.0) (+alttitle3:\"ナイ イロ ロン\"^6.0 +(+alttitle3:100^6.0 +alttitle3:c^6.0) +alttitle3:other^6.0 +alttitle3:\"ャホ ホン\"^6.0))",q.toString());
 360+
351361 // Test field extraction
352362 HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
353363 assertEquals(3,fs.size());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -183,10 +183,11 @@
184184 assertEquals("http://$lang.wikipedia.org/w/index.php",oairepo.get("<default>"));
185185
186186 assertEquals("http://sr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("srwiki"));
187 - assertEquals("http://fr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("frtest"));
 187+ assertEquals("http://localhost/wiki-lucene/phase3/index.php?title=Special:OAIRepository",testgc.getOAIRepo("frtest"));
188188
189189 // InitialiseSettings test
190190 assertEquals("sr",testgc.getLanguage("rswikimedia"));
 191+ assertEquals("http://rs.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("rswikimedia"));
191192 assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki"));
192193
193194 } catch (MalformedURLException e) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -970,8 +970,12 @@
971971 /** Get OAI-repo url for dbname */
972972 public String getOAIRepo(String dbname){
973973 String repo = null;
 974+ // try non-default values from global settings
 975+ repo = findSuffix(oaiRepo.keySet(),dbname);
 976+ if(repo != null)
 977+ repo = oaiRepo.get(repo);
974978 // try to get from initialise settings
975 - if(wgServer != null){
 979+ if(repo == null && wgServer != null){
976980 String key = findSuffix(wgServer.keySet(),dbname);
977981 if(key == null)
978982 key = "default";
@@ -992,7 +996,7 @@
993997 repo = oaiRepo.get("<default>");
994998 }
995999 if(repo == null)
996 - return ""; // failed, no url
 1000+ return ""; // failed, no url
9971001
9981002 // process $lang
9991003 String lang = getLanguage(dbname);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/StartupManager.java
@@ -6,6 +6,7 @@
77
88 import java.util.HashSet;
99
 10+import org.apache.lucene.search.BooleanQuery;
1011 import org.wikimedia.lsearch.frontend.HTTPIndexServer;
1112 import org.wikimedia.lsearch.frontend.RPCIndexServer;
1213 import org.wikimedia.lsearch.frontend.SearchServer;
@@ -34,7 +35,7 @@
3536 Configuration.setConfigFile(args[++i]);
3637 }
3738 }
38 -
 39+ BooleanQuery.setMaxClauseCount(10000);
3940 Configuration config = Configuration.open();
4041 GlobalConfiguration global = GlobalConfiguration.getInstance();
4142 // preload localizations
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/NamespaceCache.java
@@ -36,12 +36,13 @@
3737
3838 /** Returns true if the filter can be composed from filters in cache */
3939 public static boolean isComposable(NamespaceFilter key){
40 - ArrayList<NamespaceFilter> dec = key.decompose();
 40+ return true;
 41+ /* ArrayList<NamespaceFilter> dec = key.decompose();
4142 for(NamespaceFilter nsf : dec){
4243 if(!cache.containsKey(nsf))
4344 return false;
4445 }
45 - return true;
 46+ return true; */
4647 }
4748
4849 /**
@@ -63,26 +64,22 @@
6465 if(key.cardinality() > 1){
6566 ArrayList<NamespaceFilter> dec = key.decompose();
6667 ArrayList<Filter> filters = new ArrayList<Filter>();
67 - boolean succ = true;
6868 for(NamespaceFilter nsf : dec){
6969 if(cache.containsKey(nsf))
7070 filters.add(cache.get(nsf));
71 - else{ // didn't find the apropriate filter in cache :(
72 - succ = false;
73 - break;
 71+ else{ // didn't find the apropriate filter, make it
 72+ log.info("Making filter for "+nsf);
 73+ CachingWrapperFilter cwf = makeFilter(nsf);
 74+ cache.put(nsf,cwf);
 75+ filters.add(cwf);
7476 }
7577 }
76 - if(succ){
77 - log.debug("Made composite filter for "+key);
78 - // never cache composite filters
79 - return new NamespaceCompositeFilter(filters).bits(reader);
80 - } else {
81 - log.info("Cannot compose filter "+key+" from cache. This should happen only in warmup phase.");
82 - }
 78+ log.debug("Made composite filter for "+key);
 79+ // never cache composite filters
 80+ return new NamespaceCompositeFilter(filters).bits(reader);
8381 }
8482 // build new filter from query
85 - Query q = WikiQueryParser.generateRewrite(key);
86 - CachingWrapperFilter cwf = new CachingWrapperFilter(new QueryFilter(q));
 83+ CachingWrapperFilter cwf = makeFilter(key);
8784 // cache only if defined as a textual prefix in global conf, or filters one namespace
8885 if(GlobalConfiguration.getInstance().getNamespacePrefixes().containsValue(key) || key.cardinality()==1)
8986 cache.put(key,cwf);
@@ -90,4 +87,9 @@
9188 return cwf.bits(reader);
9289 }
9390 }
 91+
 92+ protected static CachingWrapperFilter makeFilter(NamespaceFilter key){
 93+ Query q = WikiQueryParser.generateRewrite(key);
 94+ return new CachingWrapperFilter(new QueryFilter(q));
 95+ }
9496 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java
@@ -85,7 +85,8 @@
8686 log.error("Error warming up local IndexSearcherMul for "+iid);
8787 } catch (ParseException e) {
8888 log.error("Error parsing query in warmup of IndexSearcherMul for "+iid);
89 - } catch (InterruptedException e) {
 89+ } catch (Exception e) {
 90+ log.error("Exception during warmup "+e.getMessage());
9091 }
9192 }
9293
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -297,7 +297,8 @@
298298
299299 // pluses and minuses, underscores can be within words, *,? are for wildcard queries
300300 if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){
301 - buffer[length++] = ch;
 301+ if(length<buffer.length)
 302+ buffer[length++] = ch;
302303 } else{
303304 cur--; // position before the nonletter character
304305 break;
@@ -421,7 +422,7 @@
422423 // end of phrase query
423424 if(text[cur] == '"')
424425 break;
425 - else
 426+ else if(length < buffer.length)
426427 buffer[length++] = text[cur];
427428 }
428429 if(length != 0){
@@ -1128,6 +1129,46 @@
11291130 return bq;
11301131 }
11311132
 1133+ /** Quote CJK chars to avoid frequency-based analysis */
 1134+ protected String quoteCJK(String queryText){
 1135+ if(!builder.filters.isUsingCJK())
 1136+ return queryText;
 1137+
 1138+ StringBuilder sb = new StringBuilder();
 1139+ int c;
 1140+ boolean prevCJK = false;
 1141+ int offset = 0;
 1142+ boolean closeQuote = false;
 1143+ for(int i=0;i<queryText.length();i++){
 1144+ c = queryText.codePointAt(i);
 1145+ if(CJKFilter.isCJKChar(c)){
 1146+ if(!prevCJK){ // begin of CJK stream
 1147+ if(i!=0)
 1148+ sb.append(queryText.substring(offset,i));
 1149+ offset = i;
 1150+ sb.append('"');
 1151+ closeQuote = true;
 1152+ prevCJK = true;
 1153+ }
 1154+ } else if(prevCJK){
 1155+ // end of CJK stream
 1156+ sb.append(queryText.substring(offset,i));
 1157+ offset = i;
 1158+ sb.append('"');
 1159+ closeQuote = true;
 1160+ prevCJK = false;
 1161+ }
 1162+ }
 1163+ if(offset == 0 && !closeQuote)
 1164+ return queryText;
 1165+ else{
 1166+ sb.append(queryText.substring(offset,queryText.length()));
 1167+ if(closeQuote)
 1168+ sb.append('"');
 1169+ return sb.toString();
 1170+ }
 1171+ }
 1172+
11321173 /**
11331174 * Main function for multi-pass parsing.
11341175 *
@@ -1137,6 +1178,7 @@
11381179 * @return
11391180 */
11401181 protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){
 1182+ queryText = quoteCJK(queryText);
11411183 if(policy != null)
11421184 this.namespacePolicy = policy;
11431185 defaultBoost = 1;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
@@ -7,7 +7,9 @@
88 import org.apache.lucene.analysis.TokenStream;
99
1010 /**
11 - * Simple CJK (Chinese Japanese Korean) token filter. One CJK symbol per token.
 11+ * Simple CJK (Chinese Japanese Korean) token filter.
 12+ * Filter: C1C2C3C4 -> C1C2 C2C3 C3C4.
 13+ * Ordinary word breaks are handled by lower level tokenizer.
1214 */
1315
1416 public final class CJKFilter extends TokenFilter {
@@ -28,20 +30,29 @@
2931
3032 String text = token.termText();
3133
32 - int i,offset,len,c;
 34+ int i,offset,c;
 35+ int len; // length of single token (if it's non-cjk word)
 36+ char last=0,cur; // last/cur cjk char
 37+ // split the token into cjk chars
3338 for(i=0,offset=0,len=0;i<text.length();i++){
3439 c = text.codePointAt(i);
3540 if(isCJKChar(c)){
3641 if(len != 0)
37 - buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len));
 42+ buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1));
3843 offset = i+1;
3944 len = 0;
40 - buffer.add(new Token(text.substring(i,i+1),token.startOffset()+i,token.startOffset()+i+1));
 45+ cur = text.charAt(i);
 46+ if(last != 0)
 47+ buffer.add(new Token(""+last+cur,token.startOffset()+i-1,token.startOffset()+i+1));
 48+ last = cur;
 49+ } else if(last != 0){
 50+ buffer.add(new Token(""+last,token.startOffset()+i,token.startOffset()+i+1));
 51+ last = 0;
4152 } else
4253 len++;
4354 }
4455 if(len != 0 && len != text.length())
45 - buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len));
 56+ buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1));
4657
4758 if(buffer.size() == 0)
4859 return token;
@@ -49,7 +60,7 @@
5061 return buffer.removeFirst();
5162 }
5263
53 - public final boolean isCJKChar(int c){
 64+ public static final boolean isCJKChar(int c){
5465 return (c >= 0x3040 && c <= 0x318f) ||
5566 (c >= 0x3300 && c <= 0x337f) ||
5667 (c >= 0x3400 && c <= 0x3d2d) ||
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -19,6 +19,7 @@
2020 protected boolean useStemmer,useCustomFilter;
2121 protected Class stemmer = null;
2222 protected Class customFilter = null;
 23+ protected boolean usingCJK = false;
2324
2425 protected FilterFactory noStemmerFilterFactory=null;
2526
@@ -87,9 +88,10 @@
8889 else if(lang.equals("vi"))
8990 customFilter = VietnameseFilter.class;
9091 else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") ||
91 - lang.equals("ko") || lang.equals("zh-classical") || lang.equals("zh-yue"))
 92+ lang.equals("zh-classical") || lang.equals("zh-yue")){
9293 customFilter = CJKFilter.class;
93 - else
 94+ usingCJK = true;
 95+ } else
9496 useCustomFilter = false;
9597
9698 }
@@ -128,6 +130,10 @@
129131 return useStemmer;
130132 }
131133
 134+ public boolean isUsingCJK() {
 135+ return usingCJK;
 136+ }
 137+
132138 public boolean hasCustomFilter(){
133139 return useCustomFilter;
134140 }
@@ -135,4 +141,6 @@
136142 public String getLanguage(){
137143 return lang;
138144 }
 145+
 146+
139147 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java
@@ -30,6 +30,7 @@
3131 public OAIHarvester(IndexId iid, String url, Authenticator auth){
3232 this.urlbase = url;
3333 this.iid = iid;
 34+ log.info(iid+" using base url: "+url);
3435 Authenticator.setDefault(auth);
3536 }
3637

Status & tagging log