r23211 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r23210 | r23211 | r23212 >
Date: 11:36, 22 June 2007
Author: rainman
Status: old
Tags:
Comment:
Handle Hebrew pointing better and add some more common transliterations.
Thanks to Connel and enwiktionary people.
Modified paths:
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -15,7 +15,7 @@
1616
1717 public class FastWikiTokenizerTest {
1818 public static void displayTokensForParser(String text) {
19 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"sr",false);
 19+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false);
2020 Token[] tokens = parser.parse().toArray(new Token[] {});
2121 for (int i = 0; i < tokens.length; i++) {
2222 Token token = tokens[i];
@@ -62,10 +62,12 @@
6363 public static void main(String args[]) throws IOException{
6464 String text = "(ant) and some";
6565 showTokens(text);
66 - text = " ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire";
 66+ text = "Æ (ď), l' (ľ), תּפר ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; ij čakšire תפר ";
6767 showTokens(text);
68 - text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d";
 68+ text = "Dž (Dž), dž (dž), d' (ď), l' (ľ), t' (ť), IJ (IJ), ij (ij), LJ (LJ), Lj (Lj), lj (lj). NJ (NJ), Nj (Nj), nj (nj). All characters in parentheses are the single-unicode form; those not in parentheses are component character forms. There's also the issue of searching for AE (Æ), ae (æ), OE (Œ), & oe (œ).";
6969 showTokens(text);
 70+ text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d contains רוּחַ should be treated as though it contained ";
 71+ showTokens(text);
7072 text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]";
7173 showTokens(text);
7274 text = "{{IPstack}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages.";
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java
@@ -175,7 +175,7 @@
176176 String verb = "search";
177177 String namespace = "";
178178 String namespaceFilter= "0";
179 - String lang = "en-b";
 179+ String lang = "en";
180180 int runs = 5000;
181181 int threads = 10;
182182 int words = 1;
@@ -230,8 +230,7 @@
231231 }
232232 }
233233 if("en".equals(lang) || "de".equals(lang) || "es".equals(lang) || "fr".equals(lang) || "it".equals(lang) || "pt".equals(lang))
234 - terms = new WordTerms("./lib/dict/terms-"+lang+".txt.gz");
235 -
 234+ terms = new WordTerms("./lib/dict/terms-"+lang+".txt.gz");
236235 else if(lang.equals("sample"))
237236 terms = new SampleTerms();
238237 else
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -179,6 +179,12 @@
180180 } else if(cl == 'Ø'){
181181 addToTokenAlias("O");
182182 addToAlias = false;
 183+ } else if(cl == 'Æ'){
 184+ addToTokenAlias("AE");
 185+ addToAlias = false;
 186+ } else if(cl == 'Œ'){
 187+ addToTokenAlias("OE");
 188+ addToAlias = false;
183189 }
184190 }
185191 // special alias transliterations ä -> ae, etc ...
@@ -203,6 +209,12 @@
204210 } else if(cl == 'ø'){
205211 addToTokenAlias("o");
206212 addToAlias = false;
 213+ } else if(cl == 'æ'){
 214+ addToTokenAlias("ae");
 215+ addToAlias = false;
 216+ } else if(cl == 'œ'){
 217+ addToTokenAlias("oe");
 218+ addToAlias = false;
207219 }
208220
209221 decomp = decompose(cl);
@@ -266,9 +278,9 @@
267279 * buffer, if it's not a letter, new token is created
268280 */
269281 private final void addLetter(){
270 - try{
 282+ try{
271283 // add new character to buffer
272 - if(Character.isLetter(c)){
 284+ if(Character.isLetter(c)){
273285 if(numberToken) // we were fetching a number
274286 addToken();
275287
@@ -285,11 +297,8 @@
286298
287299 if(length<buffer.length)
288300 buffer[length++] = c;
289 - // add dot and comma to digits if they are not at the beginning
290 - /* } else if(numberToken && (c == '.' || c == ',')){
291 - if(length<buffer.length)
292 - buffer[length++] = c; */
293 - } else{
 301+ } else if(decomposer.isCombiningChar(c)); // ignore
 302+ else{
294303 addToken();
295304 }
296305 } catch(Exception e){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java
@@ -34,6 +34,7 @@
3535 }
3636 static org.apache.log4j.Logger log = Logger.getLogger(UnicodeDecomposer.class);
3737 final protected static char[][] decomposition = new char[65536][];
 38+ final protected static boolean[] combining = new boolean[65536];
3839 protected static UnicodeDecomposer instance = null;
3940
4041 /**
@@ -50,6 +51,10 @@
5152 log.info("Loaded unicode decomposer");
5253 }
5354
 55+ public boolean isCombiningChar(char ch){
 56+ return combining[ch];
 57+ }
 58+
5459 /**
5560 * Get singleton instance of the Unicode decomposer class.
5661 * Loads lib/UnicodeData.txt on first call
@@ -85,6 +90,11 @@
8691 continue; // ignore any additional chars
8792 if(parts[2].charAt(0) == 'L')
8893 letters.set(chVal);
 94+
 95+ if(parts[2].charAt(0) == 'M')
 96+ combining[chVal] = true;
 97+ else
 98+ combining[chVal] = false;
8999 }
90100 in.close();
91101
@@ -133,7 +143,7 @@
134144 recursiveDecompose(buffer,table,letters,(char)ich);
135145 if(buffer.len != 0){
136146 decomposition[ich]= new char[buffer.len];
137 - for(i=0;i<len;i++)
 147+ for(i=0;i<buffer.len;i++)
138148 decomposition[ich][i] = buffer.buffer[i];
139149 }
140150 }

Status & tagging log