Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test |
— | — | @@ -46,6 +46,7 @@ |
47 | 47 | [OAI] |
48 | 48 | wiktionary : http://$lang.wiktionary.org/w/index.php |
49 | 49 | frtest : http://localhost/wiki-lucene/phase3/index.php |
| 50 | +rswikimedia : http://rs.wikimedia.org/w/index.php |
50 | 51 | <default> : http://$lang.wikipedia.org/w/index.php |
51 | 52 | |
52 | 53 | # Path where indexes are on hosts, after default value put hosts where |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -347,6 +347,16 @@ |
348 | 348 | q = parser.parseTwoPass("welche rolle spielen Mineralstoffe in der Ernährung?",NamespacePolicy.IGNORE); |
349 | 349 | assertEquals("(+(contents:welche contents:welch^0.5) +(contents:rolle contents:roll^0.5) +(contents:spielen contents:spiel^0.5) +(contents:mineralstoffe contents:mineralstoff^0.5) +contents:in +contents:der +(+(contents:ernahrung contents:ernahr^0.5) (contents:ernaehrung contents:ernaehr^0.5))) (+title:welche^2.0 +title:rolle^2.0 +title:spielen^2.0 +title:mineralstoffe^2.0 +title:in^2.0 +title:der^2.0 +(title:ernahrung^2.0 title:ernaehrung^2.0))",q.toString()); |
350 | 350 | |
| 351 | + // CJK |
| 352 | + analyzer = Analyzers.getSearcherAnalyzer("ja"); |
| 353 | + bs = new FieldBuilder("ja").getBuilder(); |
| 354 | + parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.IGNORE); |
| 355 | + q = parser.parseFourPass("うろパン",NamespacePolicy.IGNORE,false); |
| 356 | + assertEquals("contents:\"うろ ろハ ハン\" title:\"うろ ろハ ハン\"^2.0 (alttitle1:\"うろ ろハ ハン\"^6.0 alttitle2:\"うろ ろハ ハン\"^6.0 alttitle3:\"うろ ろハ ハン\"^6.0)",q.toString()); |
| 357 | + |
| 358 | + q = parser.parseFourPass("ナイロン100C other ャポン! ",NamespacePolicy.IGNORE,false); |
| 359 | + assertEquals("(+contents:\"ナイ イロ ロン\" +(+contents:100 +contents:c) +contents:other +contents:\"ャホ ホン\") (+title:\"ナイ イロ ロン\"^2.0 +(+title:100^2.0 +title:c^2.0) +title:other^2.0 +title:\"ャホ ホン\"^2.0) ((+alttitle1:\"ナイ イロ ロン\"^6.0 +(+alttitle1:100^6.0 +alttitle1:c^6.0) +alttitle1:other^6.0 +alttitle1:\"ャホ ホン\"^6.0) (+alttitle2:\"ナイ イロ ロン\"^6.0 +(+alttitle2:100^6.0 +alttitle2:c^6.0) +alttitle2:other^6.0 +alttitle2:\"ャホ ホン\"^6.0) (+alttitle3:\"ナイ イロ ロン\"^6.0 +(+alttitle3:100^6.0 +alttitle3:c^6.0) +alttitle3:other^6.0 +alttitle3:\"ャホ ホン\"^6.0))",q.toString()); |
| 360 | + |
351 | 361 | // Test field extraction |
352 | 362 | HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja"); |
353 | 363 | assertEquals(3,fs.size()); |
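The new CJK assertions are easier to read with the bigram scheme spelled out: each contiguous CJK run is analyzed into overlapping character bigrams and searched as a phrase, while latin fragments such as 100C keep the ordinary per-term treatment. (The expected terms also reflect the analyzer's character folding, apparently the same mechanism that produces the ernahrung/ernaehrung variants above, which is why パ/ポ show up as ハ/ホ.) A standalone sketch of the expansion, not part of the patch:

```java
import java.util.ArrayList;
import java.util.List;

public class BigramSketch {
    // Overlapping bigrams, the scheme described in the CJKFilter javadoc below:
    // C1C2C3C4 -> C1C2, C2C3, C3C4.
    static List<String> bigrams(String cjkRun) {
        List<String> out = new ArrayList<String>();
        for (int i = 0; i + 1 < cjkRun.length(); i++)
            out.add(cjkRun.substring(i, i + 2));
        return out;
    }

    public static void main(String[] args) {
        System.out.println(bigrams("ナイロン")); // [ナイ, イロ, ロン]
        System.out.println(bigrams("うろパン")); // [うろ, ろパ, パン]
    }
}
```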
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -183,10 +183,11 @@ |
184 | 184 | assertEquals("http://$lang.wikipedia.org/w/index.php",oairepo.get("<default>")); |
185 | 185 | |
186 | 186 | assertEquals("http://sr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("srwiki")); |
187 | | - assertEquals("http://fr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("frtest")); |
| 187 | + assertEquals("http://localhost/wiki-lucene/phase3/index.php?title=Special:OAIRepository",testgc.getOAIRepo("frtest")); |
188 | 188 | |
189 | 189 | // InitialiseSettings test |
190 | 190 | assertEquals("sr",testgc.getLanguage("rswikimedia")); |
| 191 | + assertEquals("http://rs.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("rswikimedia")); |
191 | 192 | assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki")); |
192 | 193 | |
193 | 194 | } catch (MalformedURLException e) { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -970,8 +970,12 @@ |
971 | 971 | /** Get OAI-repo url for dbname */ |
972 | 972 | public String getOAIRepo(String dbname){ |
973 | 973 | String repo = null; |
| 974 | + // try non-default values from global settings |
| 975 | + repo = findSuffix(oaiRepo.keySet(),dbname); |
| 976 | + if(repo != null) |
| 977 | + repo = oaiRepo.get(repo); |
974 | 978 | // try to get from initialise settings |
975 | | - if(wgServer != null){ |
| 979 | + if(repo == null && wgServer != null){ |
976 | 980 | String key = findSuffix(wgServer.keySet(),dbname); |
977 | 981 | if(key == null) |
978 | 982 | key = "default"; |
— | — | @@ -992,7 +996,7 @@ |
993 | 997 | repo = oaiRepo.get("<default>"); |
994 | 998 | } |
995 | 999 | if(repo == null) |
996 | | - return ""; // failed, no url |
| 1000 | + return ""; // failed, no url |
997 | 1001 | |
998 | 1002 | // process $lang |
999 | 1003 | String lang = getLanguage(dbname); |
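Taken together with the test changes above, getOAIRepo now resolves a dbname in three steps. The following is a condensed paraphrase of the method after this change, not the literal body; findSuffix, getLanguage, oaiRepo and wgServer are the names visible in the surrounding code, everything else is simplified:

```java
// Condensed paraphrase of getOAIRepo(dbname) after this change.
String repo = null;
String key = findSuffix(oaiRepo.keySet(), dbname);     // 1. explicit entry in the [OAI] section
if (key != null)
    repo = oaiRepo.get(key);                           //    e.g. rswikimedia, frtest
if (repo == null && wgServer != null) {                // 2. $wgServer from InitialiseSettings
    String skey = findSuffix(wgServer.keySet(), dbname);
    if (skey == null)
        skey = "default";
    repo = wgServer.get(skey);                         //    (URL assembly omitted in this sketch)
}
if (repo == null)
    repo = oaiRepo.get("<default>");                   // 3. the <default> [OAI] pattern
if (repo == null)
    return "";                                         // failed, no url
// $lang is then substituted via getLanguage(dbname) and the
// ?title=Special:OAIRepository query string is appended.
```

Because explicit [OAI] entries are now consulted first, getOAIRepo("frtest") returns the localhost URL from the [OAI] section instead of the fr.wikipedia.org address, and the new rswikimedia entry resolves directly, exactly as the updated assertions expect.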
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/StartupManager.java |
— | — | @@ -6,6 +6,7 @@ |
7 | 7 | |
8 | 8 | import java.util.HashSet; |
9 | 9 | |
| 10 | +import org.apache.lucene.search.BooleanQuery; |
10 | 11 | import org.wikimedia.lsearch.frontend.HTTPIndexServer; |
11 | 12 | import org.wikimedia.lsearch.frontend.RPCIndexServer; |
12 | 13 | import org.wikimedia.lsearch.frontend.SearchServer; |
— | — | @@ -34,7 +35,7 @@ |
35 | 36 | Configuration.setConfigFile(args[++i]); |
36 | 37 | } |
37 | 38 | } |
38 | | - |
| 39 | + BooleanQuery.setMaxClauseCount(10000); |
39 | 40 | Configuration config = Configuration.open(); |
40 | 41 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
41 | 42 | // preload localizations |
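The new BooleanQuery.setMaxClauseCount(10000) call lifts Lucene's default limit of 1024 clauses per boolean query at daemon startup; presumably the namespace-filter rewrites and the wider multi-field queries touched elsewhere in this change can exceed the default, which would otherwise throw BooleanQuery.TooManyClauses. A minimal illustration of the knob, using the same value as above:

```java
import org.apache.lucene.search.BooleanQuery;

public class ClauseLimitDemo {
    public static void main(String[] args) {
        // Lucene's built-in default; exceeding it throws BooleanQuery.TooManyClauses.
        System.out.println("default: " + BooleanQuery.getMaxClauseCount()); // 1024
        BooleanQuery.setMaxClauseCount(10000); // same call StartupManager now makes
        System.out.println("raised:  " + BooleanQuery.getMaxClauseCount()); // 10000
    }
}
```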
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/NamespaceCache.java |
— | — | @@ -36,12 +36,13 @@ |
37 | 37 | |
38 | 38 | /** Returns true if the filter can be composed from filters in cache */ |
39 | 39 | public static boolean isComposable(NamespaceFilter key){ |
40 | | - ArrayList<NamespaceFilter> dec = key.decompose(); |
| 40 | + return true; |
| 41 | + /* ArrayList<NamespaceFilter> dec = key.decompose(); |
41 | 42 | for(NamespaceFilter nsf : dec){ |
42 | 43 | if(!cache.containsKey(nsf)) |
43 | 44 | return false; |
44 | 45 | } |
45 | | - return true; |
| 46 | + return true; */ |
46 | 47 | } |
47 | 48 | |
48 | 49 | /** |
— | — | @@ -63,26 +64,22 @@ |
64 | 65 | if(key.cardinality() > 1){ |
65 | 66 | ArrayList<NamespaceFilter> dec = key.decompose(); |
66 | 67 | ArrayList<Filter> filters = new ArrayList<Filter>(); |
67 | | - boolean succ = true; |
68 | 68 | for(NamespaceFilter nsf : dec){ |
69 | 69 | if(cache.containsKey(nsf)) |
70 | 70 | filters.add(cache.get(nsf)); |
71 | | - else{ // didn't find the apropriate filter in cache :( |
72 | | - succ = false; |
73 | | - break; |
| 71 | + else{ // didn't find the appropriate filter, make it |
| 72 | + log.info("Making filter for "+nsf); |
| 73 | + CachingWrapperFilter cwf = makeFilter(nsf); |
| 74 | + cache.put(nsf,cwf); |
| 75 | + filters.add(cwf); |
74 | 76 | } |
75 | 77 | } |
76 | | - if(succ){ |
77 | | - log.debug("Made composite filter for "+key); |
78 | | - // never cache composite filters |
79 | | - return new NamespaceCompositeFilter(filters).bits(reader); |
80 | | - } else { |
81 | | - log.info("Cannot compose filter "+key+" from cache. This should happen only in warmup phase."); |
82 | | - } |
| 78 | + log.debug("Made composite filter for "+key); |
| 79 | + // never cache composite filters |
| 80 | + return new NamespaceCompositeFilter(filters).bits(reader); |
83 | 81 | } |
84 | 82 | // build new filter from query |
85 | | - Query q = WikiQueryParser.generateRewrite(key); |
86 | | - CachingWrapperFilter cwf = new CachingWrapperFilter(new QueryFilter(q)); |
| 83 | + CachingWrapperFilter cwf = makeFilter(key); |
87 | 84 | // cache only if defined as a textual prefix in global conf, or filters one namespace |
88 | 85 | if(GlobalConfiguration.getInstance().getNamespacePrefixes().containsValue(key) || key.cardinality()==1) |
89 | 86 | cache.put(key,cwf); |
— | — | @@ -90,4 +87,9 @@ |
91 | 88 | return cwf.bits(reader); |
92 | 89 | } |
93 | 90 | } |
| 91 | + |
| 92 | + protected static CachingWrapperFilter makeFilter(NamespaceFilter key){ |
| 93 | + Query q = WikiQueryParser.generateRewrite(key); |
| 94 | + return new CachingWrapperFilter(new QueryFilter(q)); |
| 95 | + } |
94 | 96 | } |
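The behavioural change here is that a missing per-namespace filter is no longer a reason to give up on composing: it is built on the spot with the new makeFilter helper and cached, which is also why isComposable can now simply return true. Stripped of logging, the composite branch amounts to the following condensed paraphrase (same classes as in the diff above):

```java
// Condensed paraphrase of the composite branch after this change.
ArrayList<Filter> filters = new ArrayList<Filter>();
for (NamespaceFilter nsf : key.decompose()) {
    CachingWrapperFilter cwf = cache.get(nsf);
    if (cwf == null) {                // build the missing piece instead of bailing out
        cwf = makeFilter(nsf);        // QueryFilter over WikiQueryParser.generateRewrite(nsf)
        cache.put(nsf, cwf);
    }
    filters.add(cwf);
}
// composite filters themselves are still never cached
return new NamespaceCompositeFilter(filters).bits(reader);
```

Only single-namespace filters and the prefixes defined in the global configuration end up in the cache, so the cache stays bounded while arbitrary namespace combinations can still be served.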
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -85,7 +85,8 @@ |
86 | 86 | log.error("Error warming up local IndexSearcherMul for "+iid); |
87 | 87 | } catch (ParseException e) { |
88 | 88 | log.error("Error parsing query in warmup of IndexSearcherMul for "+iid); |
89 | | - } catch (InterruptedException e) { |
| 89 | + } catch (Exception e) { |
| 90 | + log.error("Exception during warmup "+e.getMessage()); |
90 | 91 | } |
91 | 92 | } |
92 | 93 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -297,7 +297,8 @@ |
298 | 298 | |
299 | 299 | // pluses and minuses, underscores can be within words, *,? are for wildcard queries |
300 | 300 | if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*'){ |
301 | | - buffer[length++] = ch; |
| 301 | + if(length<buffer.length) |
| 302 | + buffer[length++] = ch; |
302 | 303 | } else{ |
303 | 304 | cur--; // position before the nonletter character |
304 | 305 | break; |
— | — | @@ -421,7 +422,7 @@ |
422 | 423 | // end of phrase query |
423 | 424 | if(text[cur] == '"') |
424 | 425 | break; |
425 | | - else |
| 426 | + else if(length < buffer.length) |
426 | 427 | buffer[length++] = text[cur]; |
427 | 428 | } |
428 | 429 | if(length != 0){ |
— | — | @@ -1128,6 +1129,46 @@ |
1129 | 1130 | return bq; |
1130 | 1131 | } |
1131 | 1132 | |
| 1133 | + /** Quote CJK chars to avoid frequency-based analysis */ |
| 1134 | + protected String quoteCJK(String queryText){ |
| 1135 | + if(!builder.filters.isUsingCJK()) |
| 1136 | + return queryText; |
| 1137 | + |
| 1138 | + StringBuilder sb = new StringBuilder(); |
| 1139 | + int c; |
| 1140 | + boolean prevCJK = false; |
| 1141 | + int offset = 0; |
| 1142 | + boolean closeQuote = false; |
| 1143 | + for(int i=0;i<queryText.length();i++){ |
| 1144 | + c = queryText.codePointAt(i); |
| 1145 | + if(CJKFilter.isCJKChar(c)){ |
| 1146 | + if(!prevCJK){ // begin of CJK stream |
| 1147 | + if(i!=0) |
| 1148 | + sb.append(queryText.substring(offset,i)); |
| 1149 | + offset = i; |
| 1150 | + sb.append('"'); |
| 1151 | + closeQuote = true; |
| 1152 | + prevCJK = true; |
| 1153 | + } |
| 1154 | + } else if(prevCJK){ |
| 1155 | + // end of CJK stream |
| 1156 | + sb.append(queryText.substring(offset,i)); |
| 1157 | + offset = i; |
| 1158 | + sb.append('"'); |
| 1159 | + closeQuote = false; |
| 1160 | + prevCJK = false; |
| 1161 | + } |
| 1162 | + } |
| 1163 | + if(offset == 0 && !closeQuote) |
| 1164 | + return queryText; |
| 1165 | + else{ |
| 1166 | + sb.append(queryText.substring(offset,queryText.length())); |
| 1167 | + if(closeQuote) |
| 1168 | + sb.append('"'); |
| 1169 | + return sb.toString(); |
| 1170 | + } |
| 1171 | + } |
| 1172 | + |
1132 | 1173 | /** |
1133 | 1174 | * Main function for multi-pass parsing. |
1134 | 1175 | * |
— | — | @@ -1137,6 +1178,7 @@ |
1138 | 1179 | * @return |
1139 | 1180 | */ |
1140 | 1181 | protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){ |
| 1182 | + queryText = quoteCJK(queryText); |
1141 | 1183 | if(policy != null) |
1142 | 1184 | this.namespacePolicy = policy; |
1143 | 1185 | defaultBoost = 1; |
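quoteCJK runs once at the start of every multi-pass parse and only when the language's FilterFactory reports isUsingCJK(); it wraps each contiguous run of CJK characters in double quotes so the run is parsed as a phrase and then bigrammed by CJKFilter, instead of being split into independent terms. A standalone paraphrase with a simplified character test, not the class itself:

```java
public class QuoteCJKSketch {
    // Simplified stand-in for CJKFilter.isCJKChar: kana, common ideographs, hangul.
    static boolean isCJK(int c) {
        return (c >= 0x3040 && c <= 0x318f)
            || (c >= 0x4e00 && c <= 0x9fff)
            || (c >= 0xac00 && c <= 0xd7a3);
    }

    // Wrap every contiguous CJK run in double quotes.
    static String quoteCJK(String q) {
        StringBuilder sb = new StringBuilder();
        boolean inRun = false;
        for (int i = 0; i < q.length(); i++) {
            boolean cjk = isCJK(q.codePointAt(i));
            if (cjk && !inRun) { sb.append('"'); inRun = true; }
            if (!cjk && inRun) { sb.append('"'); inRun = false; }
            sb.append(q.charAt(i));
        }
        if (inRun) sb.append('"');
        return sb.toString();
    }

    public static void main(String[] args) {
        System.out.println(quoteCJK("うろパン"));         // "うろパン"
        System.out.println(quoteCJK("tokyo 東京 guide")); // tokyo "東京" guide
    }
}
```

With the ja analyzer this is what turns うろパン into the phrase query seen in the test above, while plain latin and numeric fragments pass through unchanged.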
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CJKFilter.java |
— | — | @@ -7,7 +7,9 @@ |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
9 | 9 | |
10 | 10 | /** |
11 | | - * Simple CJK (Chinese Japanese Korean) token filter. One CJK symbol per token. |
| 11 | + * Simple CJK (Chinese Japanese Korean) token filter. |
| 12 | + * Filter: C1C2C3C4 -> C1C2 C2C3 C3C4. |
| 13 | + * Ordinary word breaks are handled by the lower-level tokenizer. |
12 | 14 | */ |
13 | 15 | |
14 | 16 | public final class CJKFilter extends TokenFilter { |
— | — | @@ -28,20 +30,29 @@ |
29 | 31 | |
30 | 32 | String text = token.termText(); |
31 | 33 | |
32 | | - int i,offset,len,c; |
| 34 | + int i,offset,c; |
| 35 | + int len; // length of single token (if it's non-cjk word) |
| 36 | + char last=0,cur; // last/cur cjk char |
| 37 | + // split the token into cjk chars |
33 | 38 | for(i=0,offset=0,len=0;i<text.length();i++){ |
34 | 39 | c = text.codePointAt(i); |
35 | 40 | if(isCJKChar(c)){ |
36 | 41 | if(len != 0) |
37 | | - buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len)); |
| 42 | + buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1)); |
38 | 43 | offset = i+1; |
39 | 44 | len = 0; |
40 | | - buffer.add(new Token(text.substring(i,i+1),token.startOffset()+i,token.startOffset()+i+1)); |
| 45 | + cur = text.charAt(i); |
| 46 | + if(last != 0) |
| 47 | + buffer.add(new Token(""+last+cur,token.startOffset()+i-1,token.startOffset()+i+1)); |
| 48 | + last = cur; |
| 49 | + } else if(last != 0){ |
| 50 | + buffer.add(new Token(""+last,token.startOffset()+i,token.startOffset()+i+1)); |
| 51 | + last = 0; |
41 | 52 | } else |
42 | 53 | len++; |
43 | 54 | } |
44 | 55 | if(len != 0 && len != text.length()) |
45 | | - buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len)); |
| 56 | + buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1)); |
46 | 57 | |
47 | 58 | if(buffer.size() == 0) |
48 | 59 | return token; |
— | — | @@ -49,7 +60,7 @@ |
50 | 61 | return buffer.removeFirst(); |
51 | 62 | } |
52 | 63 | |
53 | | - public final boolean isCJKChar(int c){ |
| 64 | + public static final boolean isCJKChar(int c){ |
54 | 65 | return (c >= 0x3040 && c <= 0x318f) || |
55 | 66 | (c >= 0x3300 && c <= 0x337f) || |
56 | 67 | (c >= 0x3400 && c <= 0x3d2d) || |
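Making isCJKChar static looks like the enabling change for quoteCJK above: the query parser can classify code points without constructing a filter instance. The call pattern, as it appears in the parser, is simply:

```java
// As in WikiQueryParser.quoteCJK above: static classification of a code point.
int c = queryText.codePointAt(i);
if (CJKFilter.isCJKChar(c)) {
    // the character belongs to a CJK run and will be wrapped in quotes
}
```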
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | protected boolean useStemmer,useCustomFilter; |
21 | 21 | protected Class stemmer = null; |
22 | 22 | protected Class customFilter = null; |
| 23 | + protected boolean usingCJK = false; |
23 | 24 | |
24 | 25 | protected FilterFactory noStemmerFilterFactory=null; |
25 | 26 | |
— | — | @@ -87,9 +88,10 @@ |
88 | 89 | else if(lang.equals("vi")) |
89 | 90 | customFilter = VietnameseFilter.class; |
90 | 91 | else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") || |
91 | | - lang.equals("ko") || lang.equals("zh-classical") || lang.equals("zh-yue")) |
| 92 | + lang.equals("zh-classical") || lang.equals("zh-yue")){ |
92 | 93 | customFilter = CJKFilter.class; |
93 | | - else |
| 94 | + usingCJK = true; |
| 95 | + } else |
94 | 96 | useCustomFilter = false; |
95 | 97 | |
96 | 98 | } |
— | — | @@ -128,6 +130,10 @@ |
129 | 131 | return useStemmer; |
130 | 132 | } |
131 | 133 | |
| 134 | + public boolean isUsingCJK() { |
| 135 | + return usingCJK; |
| 136 | + } |
| 137 | + |
132 | 138 | public boolean hasCustomFilter(){ |
133 | 139 | return useCustomFilter; |
134 | 140 | } |
— | — | @@ -135,4 +141,6 @@ |
136 | 142 | public String getLanguage(){ |
137 | 143 | return lang; |
138 | 144 | } |
| 145 | + |
| 146 | + |
139 | 147 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java |
— | — | @@ -30,6 +30,7 @@ |
31 | 31 | public OAIHarvester(IndexId iid, String url, Authenticator auth){ |
32 | 32 | this.urlbase = url; |
33 | 33 | this.iid = iid; |
| 34 | + log.info(iid+" using base url: "+url); |
34 | 35 | Authenticator.setDefault(auth); |
35 | 36 | } |
36 | 37 | |