Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -315,10 +315,10 @@ |
316 | 316 | |
317 | 317 | // alternative transliterations |
318 | 318 | q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true); |
319 | | - assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) +(contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString()); |
| 319 | + assertEquals("(+(contents:something contents:someth^0.5) +contents:for +((contents:gödels contents:gödel^0.5) (contents:godels contents:godel^0.5) (contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +((title:gödels^2.0 title:godels^2.0 title:goedels^2.0))) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +((alttitle1:gödels^6.0 alttitle1:godels^6.0 alttitle1:goedels^6.0))) (+alttitle2:something^6.0 +alttitle2:for^6.0 +((alttitle2:gödels^6.0 alttitle2:godels^6.0 alttitle2:goedels^6.0))) (+alttitle3:something^6.0 +alttitle3:for^6.0 +((alttitle3:gödels^6.0 alttitle3:godels^6.0 alttitle3:goedels^6.0))))",q.toString()); |
320 | 320 | |
321 | 321 | q = parser.parseFourPass("Something for Gödel",NamespacePolicy.IGNORE,true); |
322 | | - assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString()); |
| 322 | + assertEquals("(+(contents:something contents:someth^0.5) +contents:for +((contents:gödel contents:godel contents:goedel))) (+title:something^2.0 +title:for^2.0 +((title:gödel^2.0 title:godel^2.0 title:goedel^2.0))) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +((alttitle1:gödel^6.0 alttitle1:godel^6.0 alttitle1:goedel^6.0))) (+alttitle2:something^6.0 +alttitle2:for^6.0 +((alttitle2:gödel^6.0 alttitle2:godel^6.0 alttitle2:goedel^6.0))) (+alttitle3:something^6.0 +alttitle3:for^6.0 +((alttitle3:gödel^6.0 alttitle3:godel^6.0 alttitle3:goedel^6.0))))",q.toString()); |
323 | 323 | |
324 | 324 | // Test field extraction |
325 | 325 | HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja"); |
— | — | @@ -335,7 +335,7 @@ |
336 | 336 | assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli^0.4) +(title:на^2.0 title:na^0.4) +(title:википедију^2.0 title:vikipediju^0.4))",q.toString()); |
337 | 337 | |
338 | 338 | q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE); |
339 | | - assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString()); |
| 339 | + assertEquals("(+(contents:dobrodošli contents:dobrodosli) +contents:na +(+contents:šdjčćž +contents:sdjccz)) (+(title:dobrodošli^2.0 title:dobrodosli^2.0) +title:na^2.0 +(+title:šdjčćž^2.0 +title:sdjccz^2.0))",q.toString()); |
340 | 340 | |
341 | 341 | analyzer = Analyzers.getSearcherAnalyzer("th"); |
342 | 342 | parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE); |
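The updated expectations above keep the original diacritic spelling (gödel) alongside both ASCII transliterations (godel, goedel), grouped as optional clauses so that a hit on any one variant satisfies the word. A minimal sketch of one such variant group, built with the stock Lucene 2.x query API; the parser plumbing around it is omitted and the terms are taken from the expected string:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.TermQuery;

    public class VariantGroupSketch {
        public static void main(String[] args) {
            // One variant group from the expected string:
            // (contents:gödel contents:godel contents:goedel)
            BooleanQuery variants = new BooleanQuery();
            variants.add(new TermQuery(new Term("contents", "gödel")), BooleanClause.Occur.SHOULD);
            variants.add(new TermQuery(new Term("contents", "godel")), BooleanClause.Occur.SHOULD);
            variants.add(new TermQuery(new Term("contents", "goedel")), BooleanClause.Occur.SHOULD);
            System.out.println(variants.toString("contents"));
        }
    }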
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -64,6 +64,8 @@ |
65 | 65 | showTokens(text); |
66 | 66 | text = " ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire"; |
67 | 67 | showTokens(text); |
| 68 | + text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d"; |
| 69 | + showTokens(text); |
68 | 70 | text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]"; |
69 | 71 | showTokens(text); |
70 | 72 | text = "{{IPstack}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages."; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/SampleTerms.java |
— | — | @@ -4,7 +4,7 @@ |
5 | 5 | public class SampleTerms implements Terms { |
6 | 6 | protected int pos; |
7 | 7 | |
8 | | - SampleTerms(){ |
| 8 | + public SampleTerms(){ |
9 | 9 | pos = 0; |
10 | 10 | } |
11 | 11 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | import org.wikimedia.lsearch.analyzers.Analyzers; |
14 | 14 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
15 | 15 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
| 16 | +import org.wikimedia.lsearch.benchmark.SampleTerms; |
16 | 17 | import org.wikimedia.lsearch.benchmark.Terms; |
17 | 18 | import org.wikimedia.lsearch.benchmark.WordTerms; |
18 | 19 | import org.wikimedia.lsearch.config.Configuration; |
— | — | @@ -89,8 +90,8 @@ |
90 | 91 | /** Get database of example search terms for language */ |
91 | 92 | protected static Terms getTermsForLang(String language) { |
92 | 93 | String lib = Configuration.open().getString("MWConfig","lib","./lib"); |
93 | | - if(language.equals("en") && langTerms.get("en")==null) |
94 | | - langTerms.put("en",new WordTerms(lib+"/dict/english.txt.gz")); |
| 94 | + if(language.equals("en")) |
| 95 | + return new SampleTerms(); |
95 | 96 | if(language.equals("fr") && langTerms.get("fr")==null) |
96 | 97 | langTerms.put("fr",new WordTerms(lib+"/dict/french.txt.gz")); |
97 | 98 | if(language.equals("de") && langTerms.get("de")==null) |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -157,6 +157,7 @@ |
158 | 158 | decompLength = 0; |
159 | 159 | aliasLength = 0; |
160 | 160 | boolean addToAlias; |
| 161 | + boolean addDecomposed = false; |
161 | 162 | for(int i=0;i<length;i++){ |
162 | 163 | addToAlias = true; |
163 | 164 | if( ! exactCase ) |
— | — | @@ -179,6 +180,9 @@ |
180 | 181 | } else if(cl == 'Å'){ |
181 | 182 | addToTokenAlias("Aa"); |
182 | 183 | addToAlias = false; |
| 184 | + } else if(cl == 'Ø'){ |
| 185 | + addToTokenAlias("O"); |
| 186 | + addToAlias = false; |
183 | 187 | } |
184 | 188 | } |
185 | 189 | // special alias transliterations ä -> ae, etc ... |
— | — | @@ -200,6 +204,9 @@ |
201 | 205 | } else if(cl == 'å'){ |
202 | 206 | addToTokenAlias("aa"); |
203 | 207 | addToAlias = false; |
| 208 | + } else if(cl == 'ø'){ |
| 209 | + addToTokenAlias("o"); |
| 210 | + addToAlias = false; |
204 | 211 | } |
205 | 212 | |
206 | 213 | decomp = decompose(cl); |
— | — | @@ -210,6 +217,7 @@ |
211 | 218 | if(addToAlias && aliasLength!=0 && aliasLength<aliasBuffer.length) |
212 | 219 | aliasBuffer[aliasLength++] = cl; |
213 | 220 | } else{ |
 | 221 | + addDecomposed = true; // the decomposed form differs from the original token |
214 | 222 | for(decompi = 0; decompi < decomp.length; decompi++){ |
215 | 223 | if(decompLength<decompBuffer.length) |
216 | 224 | decompBuffer[decompLength++] = decomp[decompi]; |
— | — | @@ -218,10 +226,25 @@ |
219 | 227 | } |
220 | 228 | } |
221 | 229 | } |
 | 230 | + // emit the original (exact) token from the buffer |
| 231 | + Token exact; |
| 232 | + if(exactCase) |
| 233 | + exact = new Token( |
| 234 | + new String(buffer, 0, length), start, start + length); |
| 235 | + else |
| 236 | + exact = new Token( |
| 237 | + new String(buffer, 0, length).toLowerCase(), start, start + length); |
| 238 | + if(addDecomposed && decompLength!=0) |
| 239 | + exact.setType("unicode"); |
| 240 | + tokens.add(exact); |
222 | 241 | // add decomposed token to stream |
223 | | - if(decompLength!=0) |
224 | | - tokens.add(new Token( |
225 | | - new String(decompBuffer, 0, decompLength), start, start + length)); |
| 242 | + if(addDecomposed && decompLength!=0){ |
| 243 | + Token t = new Token( |
| 244 | + new String(decompBuffer, 0, decompLength), start, start + length); |
| 245 | + t.setPositionIncrement(0); |
| 246 | + t.setType("transliteration"); |
| 247 | + tokens.add(t); |
| 248 | + } |
226 | 249 | // add alias (if any) token to stream |
227 | 250 | if(aliasLength!=0){ |
228 | 251 | Token t = new Token( |
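With this change the tokenizer always emits the exact surface form first, typing it "unicode" when an ASCII decomposition exists, and then emits the decomposed form at the same position (increment 0, type "transliteration"). A minimal sketch of the resulting token pair, using the same Lucene 2.x Token calls as the hunk above; the word is chosen for illustration:

    import org.apache.lucene.analysis.Token;

    public class TokenPairSketch {
        public static void main(String[] args) {
            // Exact token first; typed "unicode" because a decomposed form exists.
            Token exact = new Token("gödel", 0, 5);
            exact.setType("unicode");
            // The folded form shares the position (increment 0) and is typed so
            // that downstream filters and the query parser can group it.
            Token folded = new Token("godel", 0, 5);
            folded.setPositionIncrement(0);
            folded.setType("transliteration");
            System.out.println(exact.termText() + " + " + folded.termText());
        }
    }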
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java |
— | — | @@ -0,0 +1,70 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | + |
| 6 | +import org.apache.lucene.analysis.Token; |
| 7 | +import org.apache.lucene.analysis.TokenFilter; |
| 8 | +import org.apache.lucene.analysis.TokenStream; |
| 9 | + |
| 10 | +/** |
 | 11 | + * Standard Vietnamese transliterations to ASCII. Most of the work is done by the unicode decomposer; |
 | 12 | + * we additionally convert Đ/đ -> D/d |
| 13 | + * |
| 14 | + * @author rainman |
| 15 | + * |
| 16 | + */ |
| 17 | +public class VietnameseFilter extends TokenFilter { |
| 18 | + private static final int MAX_WORD_LEN = 255; |
| 19 | + private final char[] buffer = new char[MAX_WORD_LEN]; |
| 20 | + private int len; |
| 21 | + private Token next, afterNext; |
| 22 | + |
| 23 | + public VietnameseFilter(TokenStream input) { |
| 24 | + super(input); |
| 25 | + next = null; |
| 26 | + afterNext = null; |
| 27 | + } |
| 28 | + |
| 29 | + @Override |
| 30 | + public Token next() throws IOException { |
| 31 | + Token t; |
| 32 | + if(next!=null){ |
| 33 | + t = next; |
| 34 | + next = afterNext; |
| 35 | + afterNext = null; |
| 36 | + } else |
| 37 | + t = input.next(); |
| 38 | + if(t == null) |
| 39 | + return null; |
| 40 | + |
| 41 | + len = 0; |
| 42 | + boolean replace = false; |
| 43 | + for(char c : t.termText().toCharArray()){ |
| 44 | + if(c == 'Đ'){ |
| 45 | + buffer[len++] = 'D'; |
| 46 | + replace = true; |
| 47 | + } else if(c == 'đ'){ |
| 48 | + buffer[len++] = 'd'; |
| 49 | + replace = true; |
| 50 | + } else |
| 51 | + buffer[len++] = c; |
| 52 | + } |
| 53 | + if(replace){ |
| 54 | + Token tt = new Token(new String(buffer,0,len),t.startOffset(),t.endOffset(),t.type()); |
| 55 | + tt.setPositionIncrement(0); |
| 56 | + next = input.next(); |
| 57 | + if(next!=null && next.type().equals("transliteration")) |
| 58 | + return t; // we'll replace d's in next token |
| 59 | + else if(t.type().equals("transliteration")){ |
| 60 | + return tt; // replace the transliterated token with one with d's |
| 61 | + } else{ |
| 62 | + afterNext = next; |
| 63 | + next = tt; |
| 64 | + return t; |
| 65 | + } |
| 66 | + } else |
| 67 | + return t; |
| 68 | + |
| 69 | + } |
| 70 | + |
| 71 | +} |
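A minimal usage sketch for the new filter, assuming the Lucene 2.x TokenStream.next() API used throughout this patch; in the real analyzer chain the input comes from the wiki tokenizer via FilterFactory rather than a plain WhitespaceTokenizer, and the sample text is illustrative:

    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.wikimedia.lsearch.analyzers.VietnameseFilter;

    public class VietnameseFilterSketch {
        public static void main(String[] args) throws Exception {
            TokenStream ts = new VietnameseFilter(
                    new WhitespaceTokenizer(new StringReader("Đông đá")));
            Token t;
            // For each token containing Đ/đ the filter emits an extra variant
            // with D/d at the same position (increment 0).
            while ((t = ts.next()) != null)
                System.out.println(t.termText() + " type=" + t.type()
                        + " posIncr=" + t.getPositionIncrement());
        }
    }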
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -703,13 +703,16 @@ |
704 | 704 | t.setBoost(defaultBoost); |
705 | 705 | cur = new BooleanQuery(); |
706 | 706 | cur.add(t,BooleanClause.Occur.SHOULD); |
707 | | - bq.add(cur,boolDefault); |
| 707 | + bq.add(cur,BooleanClause.Occur.SHOULD); |
708 | 708 | continue; |
709 | 709 | } else{ |
710 | 710 | // alternative transliteration |
711 | 711 | t = new TermQuery(makeTerm(token)); |
712 | 712 | t.setBoost(defaultBoost); |
713 | 713 | cur.add(t,aliasOccur); |
 | 714 | + // pull the next token into the same query if it is a transliteration |
| 715 | + if((i+1) < tokens.size() && tokens.get(i+1).getPositionIncrement()==0 && tokens.get(i+1).type().equals("transliteration")) |
| 716 | + continue; |
714 | 717 | } |
715 | 718 | } |
716 | 719 | if( cur != bq) // returned from nested query |
— | — | @@ -723,7 +726,10 @@ |
724 | 727 | // e.g. anti-hero => anti hero |
725 | 728 | cur = new BooleanQuery(); |
726 | 729 | cur.add(t,BooleanClause.Occur.SHOULD); |
727 | | - bq.add(cur,boolDefault); |
| 730 | + if(token.type().equals("unicode")) |
| 731 | + bq.add(cur,BooleanClause.Occur.SHOULD); |
| 732 | + else |
| 733 | + bq.add(cur,boolDefault); |
728 | 734 | } else if((i+1) >= tokens.size() || tokens.get(i+1).getPositionIncrement()!=0) |
729 | 735 | cur.add(t,boolDefault); |
730 | 736 | else |
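Taken together, these two hunks make the parser treat tokens that share a position (increment 0) and are typed "transliteration" as spelling variants of the preceding word, collected into one nested BooleanQuery, while a token typed "unicode" opens that group as an optional clause instead of a required one. A simplified, self-contained illustration of the grouping rule; this is not the parser's full control flow, and the field name is illustrative:

    import java.util.List;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.TermQuery;

    public class GroupingSketch {
        /** Group zero-increment variants under the token that introduced them. */
        static BooleanQuery group(List<Token> tokens) {
            BooleanQuery bq = new BooleanQuery();
            BooleanQuery cur = null;
            for (Token tok : tokens) {
                TermQuery tq = new TermQuery(new Term("contents", tok.termText()));
                if (cur == null || tok.getPositionIncrement() != 0) {
                    // new word: open a new variant group
                    cur = new BooleanQuery();
                    cur.add(tq, BooleanClause.Occur.SHOULD);
                    bq.add(cur, BooleanClause.Occur.MUST);
                } else {
                    // zero-increment variant: join the current group
                    cur.add(tq, BooleanClause.Occur.SHOULD);
                }
            }
            return bq;
        }
    }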
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -84,6 +84,8 @@ |
85 | 85 | customFilter = ThaiWordFilter.class; |
86 | 86 | else if(lang.equals("sr")) |
87 | 87 | customFilter = SerbianFilter.class; |
| 88 | + else if(lang.equals("vi")) |
| 89 | + customFilter = VietnameseFilter.class; |
88 | 90 | else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") || |
89 | 91 | lang.equals("ko") || lang.equals("zh-classical") || lang.equals("zh-yue")) |
90 | 92 | customFilter = CJKFilter.class; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java |
— | — | @@ -19,6 +19,19 @@ |
20 | 20 | * |
21 | 21 | */ |
22 | 22 | public class UnicodeDecomposer { |
| 23 | + class Buffer { |
| 24 | + char[] buffer; |
| 25 | + int len; |
| 26 | + public Buffer(char[] buffer, int len) { |
| 27 | + this.buffer = buffer; |
| 28 | + this.len = len; |
| 29 | + } |
| 30 | + public void add(char ch){ |
| 31 | + if(len<buffer.length) |
| 32 | + buffer[len++] = ch; |
| 33 | + } |
| 34 | + |
| 35 | + } |
23 | 36 | static org.apache.log4j.Logger log = Logger.getLogger(UnicodeDecomposer.class); |
24 | 37 | final protected static char[][] decomposition = new char[65536][]; |
25 | 38 | protected static UnicodeDecomposer instance = null; |
— | — | @@ -50,12 +63,7 @@ |
51 | 64 | |
52 | 65 | return instance; |
53 | 66 | } |
54 | | - |
55 | | - protected final void nodecomp(char ch){ |
56 | | - //decomposition[ch] = new char[] { ch }; |
57 | | - decomposition[ch] = null; |
58 | | - } |
59 | | - |
| 67 | + |
60 | 68 | /** |
61 | 69 | * Read unicode data from the UnicodeData.txt file |
62 | 70 | * @param path |
— | — | @@ -80,11 +88,16 @@ |
81 | 89 | } |
82 | 90 | in.close(); |
83 | 91 | |
| 92 | + // decomposition table |
| 93 | + char[][] table = new char[65536][]; |
| 94 | + |
84 | 95 | // default for all chars: no decomposition |
85 | | - for(int ich = 0; ich <= 0xFFFF; ich++) |
86 | | - nodecomp((char)ich); |
| 96 | + for(int ich = 0; ich <= 0xFFFF; ich++){ |
| 97 | + decomposition[ich]=null; |
| 98 | + table[ich]=null; |
| 99 | + } |
87 | 100 | |
88 | | - // second pass, make the decomposition mapping |
| 101 | + // second pass, make the decomposition table |
89 | 102 | in = new BufferedReader(new FileReader(path)); |
90 | 103 | while((line = in.readLine()) != null){ |
91 | 104 | String[] parts = line.split(";"); |
— | — | @@ -106,12 +119,24 @@ |
107 | 120 | buf[len++] = chd; |
108 | 121 | } |
109 | 122 | if( len != 0 ){ |
110 | | - decomposition[ch]= new char[len]; |
| 123 | + table[ch]= new char[len]; |
111 | 124 | for(i=0;i<len;i++) |
112 | | - decomposition[ch][i] = buf[i]; |
| 125 | + table[ch][i] = buf[i]; |
113 | 126 | } |
114 | 127 | } |
115 | 128 | } |
 | 129 | + // using the decomposition table, recursively decompose characters |
| 130 | + for(int ich = 0; ich <= 0xFFFF; ich++){ |
| 131 | + if(table[ich]==null) |
| 132 | + continue; |
| 133 | + Buffer buffer = new Buffer(buf,0); |
| 134 | + recursiveDecompose(buffer,table,letters,(char)ich); |
| 135 | + if(buffer.len != 0){ |
| 136 | + decomposition[ich]= new char[buffer.len]; |
| 137 | + for(i=0;i<len;i++) |
| 138 | + decomposition[ich][i] = buffer.buffer[i]; |
| 139 | + } |
| 140 | + } |
116 | 141 | in.close(); |
117 | 142 | } catch (FileNotFoundException e) { |
118 | 143 | e.printStackTrace(); |
— | — | @@ -124,4 +149,24 @@ |
125 | 150 | log.error("Error in unicode data file at "+path+" : "+e.getMessage()); |
126 | 151 | } |
127 | 152 | } |
| 153 | + |
| 154 | + /** |
 | 155 | + * Depth-first recursion that gradually decomposes a character (e.g. one carrying multiple diacritics) |
| 156 | + * |
 | 157 | + * @param buf - buffer to which the resulting decomposition is written |
 | 158 | + * @param table - mapping from a char to the chars it decomposes into |
| 159 | + * @param letters - bitset of letter characters |
| 160 | + * @param c - char to decompose |
| 161 | + */ |
| 162 | + protected void recursiveDecompose(Buffer buf, char[][] table, BitSet letters, char c) { |
| 163 | + // terminal |
| 164 | + if(table[c]==null && letters.get(c)){ |
| 165 | + buf.add(c); |
| 166 | + } else if(table[c]!=null && letters.get(c)){ |
| 167 | + // depth-first recursion |
| 168 | + for(char ch : table[c]){ |
| 169 | + recursiveDecompose(buf,table,letters,ch); |
| 170 | + } |
| 171 | + } |
| 172 | + } |
128 | 173 | } |
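The recursion matters for characters whose canonical decomposition is itself decomposable: for instance ǻ (U+01FB) decomposes to å plus a combining acute, and å in turn to a plus a combining ring, so the depth-first pass flattens ǻ all the way down to a, while the letters bitset drops the combining marks. A self-contained toy version of the same logic, with a hand-built two-level table standing in for UnicodeData.txt:

    import java.util.BitSet;
    import java.util.HashMap;
    import java.util.Map;

    public class DecomposeSketch {
        static void decompose(StringBuilder out, Map<Character, char[]> table,
                              BitSet letters, char c) {
            if (table.get(c) == null && letters.get(c))
                out.append(c);                          // terminal base letter
            else if (table.get(c) != null && letters.get(c))
                for (char ch : table.get(c))
                    decompose(out, table, letters, ch); // depth-first descent
            // combining marks (not in 'letters') are silently dropped
        }

        public static void main(String[] args) {
            Map<Character, char[]> table = new HashMap<Character, char[]>();
            table.put('\u01FB', new char[]{ '\u00E5', '\u0301' }); // ǻ -> å + acute
            table.put('\u00E5', new char[]{ 'a', '\u030A' });      // å -> a + ring
            BitSet letters = new BitSet();
            letters.set('a'); letters.set('\u00E5'); letters.set('\u01FB');
            StringBuilder out = new StringBuilder();
            decompose(out, table, letters, '\u01FB');
            System.out.println(out);                    // prints: a
        }
    }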
Index: trunk/lucene-search-2.0/lsearch-global.conf |
— | — | @@ -5,6 +5,10 @@ |
6 | 6 | # mainsplit <noparams> |
7 | 7 | # single, mainpart, restpart, partN (where N is number 1..segments): |
8 | 8 | # <optimize> (true/false), <mergeFactor>, <maxBufferedDocs> |
| 9 | +# split, nssplit <numOfSubindexes> |
| 10 | +# nspartN: |
| 11 | +# <specification>, <optimize>, <mergeFactor>, <maxBufferedDocs> |
 | 12 | +# where the specification is in the format [ns1,ns2], or [] for the rest |
9 | 13 | # language <languageCode> |
10 | 14 | # warmup <numberOfQueries> |
11 | 15 | # databases can be written as {url}, where url contains a list of dbs
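A hypothetical example of a namespace-split entry in the syntax these comments describe; the database name and namespace ids are placeholders, and [] catches all remaining namespaces:

    # wikidb : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12]) (nspart3,[])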