Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -15,7 +15,7 @@ |
16 | 16 | |
17 | 17 | public class FastWikiTokenizerTest { |
18 | 18 | public static void displayTokensForParser(String text) { |
19 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"sr",false); |
| 19 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false); |
20 | 20 | Token[] tokens = parser.parse().toArray(new Token[] {}); |
21 | 21 | for (int i = 0; i < tokens.length; i++) { |
22 | 22 | Token token = tokens[i]; |
— | — | @@ -62,10 +62,12 @@ |
63 | 63 | public static void main(String args[]) throws IOException{ |
64 | 64 | String text = "(ant) and some"; |
65 | 65 | showTokens(text); |
66 | | - text = " ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire"; |
| 66 | + text = "Æ (ď), l' (ľ), תּפר ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; ij čakšire תפר "; |
67 | 67 | showTokens(text); |
68 | | - text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d"; |
| 68 | + text = "Dž (Dž), dž (dž), d' (ď), l' (ľ), t' (ť), IJ (IJ), ij (ij), LJ (LJ), Lj (Lj), lj (lj). NJ (NJ), Nj (Nj), nj (nj). All characters in parentheses are the single-unicode form; those not in parentheses are component character forms. There's also the issue of searching for AE (Æ), ae (æ), OE (Œ), & oe (œ)."; |
69 | 69 | showTokens(text); |
| 70 | + text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d contains רוּחַ should be treated as though it contained "; |
| 71 | + showTokens(text); |
70 | 72 | text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]"; |
71 | 73 | showTokens(text); |
72 | 74 | text = "{{IPstack}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages."; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java |
— | — | @@ -175,7 +175,7 @@ |
176 | 176 | String verb = "search"; |
177 | 177 | String namespace = ""; |
178 | 178 | String namespaceFilter= "0"; |
179 | | - String lang = "en-b"; |
| 179 | + String lang = "en"; |
180 | 180 | int runs = 5000; |
181 | 181 | int threads = 10; |
182 | 182 | int words = 1; |
— | — | @@ -230,8 +230,7 @@ |
231 | 231 | } |
232 | 232 | } |
233 | 233 | if("en".equals(lang) || "de".equals(lang) || "es".equals(lang) || "fr".equals(lang) || "it".equals(lang) || "pt".equals(lang)) |
234 | | - terms = new WordTerms("./lib/dict/terms-"+lang+".txt.gz"); |
235 | | - |
| 234 | + terms = new WordTerms("./lib/dict/terms-"+lang+".txt.gz"); |
236 | 235 | else if(lang.equals("sample")) |
237 | 236 | terms = new SampleTerms(); |
238 | 237 | else |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -179,6 +179,12 @@ |
180 | 180 | } else if(cl == 'Ø'){ |
181 | 181 | addToTokenAlias("O"); |
182 | 182 | addToAlias = false; |
| 183 | + } else if(cl == 'Æ'){ |
| 184 | + addToTokenAlias("AE"); |
| 185 | + addToAlias = false; |
| 186 | + } else if(cl == 'Œ'){ |
| 187 | + addToTokenAlias("OE"); |
| 188 | + addToAlias = false; |
183 | 189 | } |
184 | 190 | } |
185 | 191 | // special alias transliterations ä -> ae, etc ... |
— | — | @@ -203,6 +209,12 @@ |
204 | 210 | } else if(cl == 'ø'){ |
205 | 211 | addToTokenAlias("o"); |
206 | 212 | addToAlias = false; |
| 213 | + } else if(cl == 'æ'){ |
| 214 | + addToTokenAlias("ae"); |
| 215 | + addToAlias = false; |
| 216 | + } else if(cl == 'œ'){ |
| 217 | + addToTokenAlias("oe"); |
| 218 | + addToAlias = false; |
207 | 219 | } |
208 | 220 | |
209 | 221 | decomp = decompose(cl); |
— | — | @@ -266,9 +278,9 @@ |
267 | 279 | * buffer, if it's not a letter, new token is created |
268 | 280 | */ |
269 | 281 | private final void addLetter(){ |
270 | | - try{ |
| 282 | + try{ |
271 | 283 | // add new character to buffer |
272 | | - if(Character.isLetter(c)){ |
| 284 | + if(Character.isLetter(c)){ |
273 | 285 | if(numberToken) // we were fetching a number |
274 | 286 | addToken(); |
275 | 287 | |
— | — | @@ -285,11 +297,8 @@ |
286 | 298 | |
287 | 299 | if(length<buffer.length) |
288 | 300 | buffer[length++] = c; |
289 | | - // add dot and comma to digits if they are not at the beginning |
290 | | - /* } else if(numberToken && (c == '.' || c == ',')){ |
291 | | - if(length<buffer.length) |
292 | | - buffer[length++] = c; */ |
293 | | - } else{ |
| 301 | + } else if(decomposer.isCombiningChar(c)); // ignore |
| 302 | + else{ |
294 | 303 | addToken(); |
295 | 304 | } |
296 | 305 | } catch(Exception e){ |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java |
— | — | @@ -34,6 +34,7 @@ |
35 | 35 | } |
36 | 36 | static org.apache.log4j.Logger log = Logger.getLogger(UnicodeDecomposer.class); |
37 | 37 | final protected static char[][] decomposition = new char[65536][]; |
| 38 | + final protected static boolean[] combining = new boolean[65536]; |
38 | 39 | protected static UnicodeDecomposer instance = null; |
39 | 40 | |
40 | 41 | /** |
— | — | @@ -50,6 +51,10 @@ |
51 | 52 | log.info("Loaded unicode decomposer"); |
52 | 53 | } |
53 | 54 | |
| 55 | + /** Returns the flag stored for {@code ch} in the static {@code combining} table (declared as boolean[65536], so every char value is a valid index). */ public boolean isCombiningChar(char ch){ |
| 56 | + return combining[ch]; /* NOTE(review): entries look to be set in the UnicodeData loader hunk when the general-category field starts with 'M'; confirm against the full file */ |
| 57 | + } |
| 58 | + |
54 | 59 | /** |
55 | 60 | * Get singleton instance of the Unicode decomposer class. |
56 | 61 | * Loads lib/UnicodeData.txt on first call |
— | — | @@ -85,6 +90,11 @@ |
86 | 91 | continue; // ignore any additional chars |
87 | 92 | if(parts[2].charAt(0) == 'L') |
88 | 93 | letters.set(chVal); |
| 94 | + |
| 95 | + if(parts[2].charAt(0) == 'M') |
| 96 | + combining[chVal] = true; |
| 97 | + else |
| 98 | + combining[chVal] = false; |
89 | 99 | } |
90 | 100 | in.close(); |
91 | 101 | |
— | — | @@ -133,7 +143,7 @@ |
134 | 144 | recursiveDecompose(buffer,table,letters,(char)ich); |
135 | 145 | if(buffer.len != 0){ |
136 | 146 | decomposition[ich]= new char[buffer.len]; |
137 | | - for(i=0;i<len;i++) |
| 147 | + for(i=0;i<buffer.len;i++) |
138 | 148 | decomposition[ich][i] = buffer.buffer[i]; |
139 | 149 | } |
140 | 150 | } |