r23020 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r23019 | r23020 | r23021 >
Date: 23:39, 15 June 2007
Author: rainman
Status: old
Tags:
Comment:
I18n:
* make UnicodeDecomposer remove multiple diacritics properly
* tweaks for Vietnamese, convert some more chars to ascii
Modified paths:
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/SampleTerms.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java (modified) (history)

Diff

Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -315,10 +315,10 @@
 
 	// alternative transliterations
 	q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true);
-	assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) +(contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString());
+	assertEquals("(+(contents:something contents:someth^0.5) +contents:for +((contents:gödels contents:gödel^0.5) (contents:godels contents:godel^0.5) (contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +((title:gödels^2.0 title:godels^2.0 title:goedels^2.0))) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +((alttitle1:gödels^6.0 alttitle1:godels^6.0 alttitle1:goedels^6.0))) (+alttitle2:something^6.0 +alttitle2:for^6.0 +((alttitle2:gödels^6.0 alttitle2:godels^6.0 alttitle2:goedels^6.0))) (+alttitle3:something^6.0 +alttitle3:for^6.0 +((alttitle3:gödels^6.0 alttitle3:godels^6.0 alttitle3:goedels^6.0))))",q.toString());
 
 	q = parser.parseFourPass("Something for Gödel",NamespacePolicy.IGNORE,true);
-	assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString());
+	assertEquals("(+(contents:something contents:someth^0.5) +contents:for +((contents:gödel contents:godel contents:goedel))) (+title:something^2.0 +title:for^2.0 +((title:gödel^2.0 title:godel^2.0 title:goedel^2.0))) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +((alttitle1:gödel^6.0 alttitle1:godel^6.0 alttitle1:goedel^6.0))) (+alttitle2:something^6.0 +alttitle2:for^6.0 +((alttitle2:gödel^6.0 alttitle2:godel^6.0 alttitle2:goedel^6.0))) (+alttitle3:something^6.0 +alttitle3:for^6.0 +((alttitle3:gödel^6.0 alttitle3:godel^6.0 alttitle3:goedel^6.0))))",q.toString());
 
 	// Test field extraction
 	HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
@@ -335,7 +335,7 @@
 	assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli^0.4) +(title:на^2.0 title:na^0.4) +(title:википедију^2.0 title:vikipediju^0.4))",q.toString());
 
 	q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE);
-	assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString());
+	assertEquals("(+(contents:dobrodošli contents:dobrodosli) +contents:na +(+contents:šdjčćž +contents:sdjccz)) (+(title:dobrodošli^2.0 title:dobrodosli^2.0) +title:na^2.0 +(+title:šdjčćž^2.0 +title:sdjccz^2.0))",q.toString());
 
 	analyzer = Analyzers.getSearcherAnalyzer("th");
 	parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -64,6 +64,8 @@
 	showTokens(text);
 	text = " ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire";
 	showTokens(text);
+	text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d";
+	showTokens(text);
 	text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]";
 	showTokens(text);
 	text = "{{IPstack}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages.";
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/SampleTerms.java
@@ -4,7 +4,7 @@
 public class SampleTerms implements Terms {
 	protected int pos;
 
-	SampleTerms(){
+	public SampleTerms(){
 		pos = 0;
 	}
 
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java
@@ -12,6 +12,7 @@
 import org.wikimedia.lsearch.analyzers.Analyzers;
 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
+import org.wikimedia.lsearch.benchmark.SampleTerms;
 import org.wikimedia.lsearch.benchmark.Terms;
 import org.wikimedia.lsearch.benchmark.WordTerms;
 import org.wikimedia.lsearch.config.Configuration;
@@ -89,8 +90,8 @@
 	/** Get database of example search terms for language */
 	protected static Terms getTermsForLang(String language) {
 		String lib = Configuration.open().getString("MWConfig","lib","./lib");
-		if(language.equals("en") && langTerms.get("en")==null)
-			langTerms.put("en",new WordTerms(lib+"/dict/english.txt.gz"));
+		if(language.equals("en"))
+			return new SampleTerms();
 		if(language.equals("fr") && langTerms.get("fr")==null)
 			langTerms.put("fr",new WordTerms(lib+"/dict/french.txt.gz"));
 		if(language.equals("de") && langTerms.get("de")==null)
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -157,6 +157,7 @@
 		decompLength = 0;
 		aliasLength = 0;
 		boolean addToAlias;
+		boolean addDecomposed = false;
 		for(int i=0;i<length;i++){
 			addToAlias = true;
 			if( ! exactCase )
@@ -179,6 +180,9 @@
 			} else if(cl == 'Å'){
 				addToTokenAlias("Aa");
 				addToAlias = false;
+			} else if(cl == 'Ø'){
+				addToTokenAlias("O");
+				addToAlias = false;
 			}
 		}
 		// special alias transliterations ä -> ae, etc ...
@@ -200,6 +204,9 @@
 			} else if(cl == 'å'){
 				addToTokenAlias("aa");
 				addToAlias = false;
+			} else if(cl == 'ø'){
+				addToTokenAlias("o");
+				addToAlias = false;
 			}
 
 			decomp = decompose(cl);
@@ -210,6 +217,7 @@
 			if(addToAlias && aliasLength!=0 && aliasLength<aliasBuffer.length)
 				aliasBuffer[aliasLength++] = cl;
 		} else{
+			addDecomposed = true; // the decomposed form differs from the original
 			for(decompi = 0; decompi < decomp.length; decompi++){
 				if(decompLength<decompBuffer.length)
 					decompBuffer[decompLength++] = decomp[decompi];
@@ -218,10 +226,25 @@
 			}
 		}
 	}
+	// make a token from the original buffered text
+	Token exact;
+	if(exactCase)
+		exact = new Token(
+				new String(buffer, 0, length), start, start + length);
+	else
+		exact = new Token(
+				new String(buffer, 0, length).toLowerCase(), start, start + length);
+	if(addDecomposed && decompLength!=0)
+		exact.setType("unicode");
+	tokens.add(exact);
 	// add decomposed token to stream
-	if(decompLength!=0)
-		tokens.add(new Token(
-				new String(decompBuffer, 0, decompLength), start, start + length));
+	if(addDecomposed && decompLength!=0){
+		Token t = new Token(
+				new String(decompBuffer, 0, decompLength), start, start + length);
+		t.setPositionIncrement(0);
+		t.setType("transliteration");
+		tokens.add(t);
+	}
 	// add alias (if any) token to stream
 	if(aliasLength!=0){
 		Token t = new Token(
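For illustration, here is a hand-built sketch (not part of the commit) of the token sequence the engine now emits for the word "Gödel" in lowercasing mode, using the Lucene 2.x Token API: the exact form typed "unicode", a position-0 "transliteration", and the ö -> oe alias. The alias token's position increment of 0 is an assumption carried over from the pre-existing alias handling.

    import java.util.ArrayList;
    import org.apache.lucene.analysis.Token;

    public class TokenShapeDemo {
        public static void main(String[] args) {
            ArrayList<Token> tokens = new ArrayList<Token>();
            Token exact = new Token("gödel", 0, 5);   // original text, lowercased
            exact.setType("unicode");                 // marked: it has a decomposition
            tokens.add(exact);
            Token decomposed = new Token("godel", 0, 5);
            decomposed.setPositionIncrement(0);       // stacked on the same position
            decomposed.setType("transliteration");
            tokens.add(decomposed);
            Token alias = new Token("goedel", 0, 5);  // ö -> oe alias
            alias.setPositionIncrement(0);            // assumed, as in earlier revisions
            tokens.add(alias);
            for (Token t : tokens)
                System.out.println(t.termText() + " type=" + t.type()
                        + " posIncr=" + t.getPositionIncrement());
        }
    }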
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java
@@ -0,0 +1,70 @@
+package org.wikimedia.lsearch.analyzers;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Vietnamese standard transliterations to ASCII. Most of the work is done by the
+ * Unicode decomposer; here we additionally convert Đ/đ -> D/d.
+ *
+ * @author rainman
+ *
+ */
+public class VietnameseFilter extends TokenFilter {
+	private static final int MAX_WORD_LEN = 255;
+	private final char[] buffer = new char[MAX_WORD_LEN];
+	private int len;
+	private Token next, afterNext;
+
+	public VietnameseFilter(TokenStream input) {
+		super(input);
+		next = null;
+		afterNext = null;
+	}
+
+	@Override
+	public Token next() throws IOException {
+		Token t;
+		if(next!=null){
+			t = next;
+			next = afterNext;
+			afterNext = null;
+		} else
+			t = input.next();
+		if(t == null)
+			return null;
+
+		len = 0;
+		boolean replace = false;
+		for(char c : t.termText().toCharArray()){
+			if(c == 'Đ'){
+				buffer[len++] = 'D';
+				replace = true;
+			} else if(c == 'đ'){
+				buffer[len++] = 'd';
+				replace = true;
+			} else
+				buffer[len++] = c;
+		}
+		if(replace){
+			Token tt = new Token(new String(buffer,0,len),t.startOffset(),t.endOffset(),t.type());
+			tt.setPositionIncrement(0);
+			next = input.next();
+			if(next!=null && next.type().equals("transliteration"))
+				return t; // we'll replace the đ's in the next (transliteration) token
+			else if(t.type().equals("transliteration")){
+				return tt; // replace the transliterated token with one with d's
+			} else{
+				afterNext = next;
+				next = tt;
+				return t;
+			}
+		} else
+			return t;
+
+	}
+
+}
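A minimal usage sketch (not from the repository) of the filter above, assuming the Lucene 2.x WhitespaceTokenizer and Token API: with plain tokens the filter emits the original term first and then a position-0 copy with Đ/đ mapped to D/d, leaving other diacritics to the Unicode decomposer.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.wikimedia.lsearch.analyzers.VietnameseFilter;

    public class VietnameseFilterDemo {
        public static void main(String[] args) throws Exception {
            TokenStream ts = new VietnameseFilter(
                    new WhitespaceTokenizer(new StringReader("đi đâu")));
            for (Token t = ts.next(); t != null; t = ts.next())
                System.out.println(t.termText() + " posIncr=" + t.getPositionIncrement());
            // expected: đi (1), di (0), đâu (1), dâu (0) -- note the â in "dâu"
            // is left for the decomposer to strip
        }
    }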
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -703,13 +703,16 @@
 					t.setBoost(defaultBoost);
 					cur = new BooleanQuery();
 					cur.add(t,BooleanClause.Occur.SHOULD);
-					bq.add(cur,boolDefault);
+					bq.add(cur,BooleanClause.Occur.SHOULD);
 					continue;
 				} else{
 					// alternative transliteration
 					t = new TermQuery(makeTerm(token));
 					t.setBoost(defaultBoost);
 					cur.add(t,aliasOccur);
+					// pull the next token into the same query if it is a transliteration
+					if((i+1) < tokens.size() && tokens.get(i+1).getPositionIncrement()==0 && tokens.get(i+1).type().equals("transliteration"))
+						continue;
 				}
 			}
 			if( cur != bq) // returned from nested query
@@ -723,7 +726,10 @@
 				// e.g. anti-hero => anti hero
 				cur = new BooleanQuery();
 				cur.add(t,BooleanClause.Occur.SHOULD);
-				bq.add(cur,boolDefault);
+				if(token.type().equals("unicode"))
+					bq.add(cur,BooleanClause.Occur.SHOULD);
+				else
+					bq.add(cur,boolDefault);
 			} else if((i+1) >= tokens.size() || tokens.get(i+1).getPositionIncrement()!=0)
 				cur.add(t,boolDefault);
 			else
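For reference, a hand-built sketch (not part of the commit) of the query shape these branches aim for when a "unicode" token is followed by position-0 "transliteration" tokens, mirroring the expected strings in WikiQueryParserTest:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.TermQuery;

    public class QueryShapeDemo {
        public static void main(String[] args) {
            // one group holds the exact term plus its variants, all optional ...
            BooleanQuery group = new BooleanQuery();
            group.add(new TermQuery(new Term("contents", "gödel")), BooleanClause.Occur.SHOULD);
            group.add(new TermQuery(new Term("contents", "godel")), BooleanClause.Occur.SHOULD);
            group.add(new TermQuery(new Term("contents", "goedel")), BooleanClause.Occur.SHOULD);
            // ... and the group itself is now added as SHOULD rather than boolDefault
            BooleanQuery bq = new BooleanQuery();
            bq.add(group, BooleanClause.Occur.SHOULD);
            System.out.println(bq.toString()); // (contents:gödel contents:godel contents:goedel)
        }
    }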
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -84,6 +84,8 @@
 			customFilter = ThaiWordFilter.class;
 		else if(lang.equals("sr"))
 			customFilter = SerbianFilter.class;
+		else if(lang.equals("vi"))
+			customFilter = VietnameseFilter.class;
 		else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") ||
 				lang.equals("ko") || lang.equals("zh-classical") || lang.equals("zh-yue"))
 			customFilter = CJKFilter.class;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java
@@ -19,6 +19,19 @@
  *
  */
 public class UnicodeDecomposer {
+	class Buffer {
+		char[] buffer;
+		int len;
+		public Buffer(char[] buffer, int len) {
+			this.buffer = buffer;
+			this.len = len;
+		}
+		public void add(char ch){
+			if(len<buffer.length)
+				buffer[len++] = ch;
+		}
+
+	}
 	static org.apache.log4j.Logger log = Logger.getLogger(UnicodeDecomposer.class);
 	final protected static char[][] decomposition = new char[65536][];
 	protected static UnicodeDecomposer instance = null;
@@ -50,12 +63,7 @@
 
 		return instance;
 	}
-
-	protected final void nodecomp(char ch){
-		//decomposition[ch] = new char[] { ch };
-		decomposition[ch] = null;
-	}
-
+
 	/**
 	 * Read unicode data from the UnicodeData.txt file
 	 * @param path
@@ -80,11 +88,16 @@
 			}
 			in.close();
 
+			// decomposition table
+			char[][] table = new char[65536][];
+
 			// default for all chars: no decomposition
-			for(int ich = 0; ich <= 0xFFFF; ich++)
-				nodecomp((char)ich);
+			for(int ich = 0; ich <= 0xFFFF; ich++){
+				decomposition[ich]=null;
+				table[ich]=null;
+			}
 
-			// second pass, make the decomposition mapping
+			// second pass, make the decomposition table
 			in = new BufferedReader(new FileReader(path));
 			while((line = in.readLine()) != null){
 				String[] parts = line.split(";");
@@ -106,12 +119,24 @@
 					buf[len++] = chd;
 				}
 				if( len != 0 ){
-					decomposition[ch]= new char[len];
+					table[ch]= new char[len];
 					for(i=0;i<len;i++)
-						decomposition[ch][i] = buf[i];
+						table[ch][i] = buf[i];
 				}
 			}
 			}
+			// using the decomposition table, recursively decompose characters
+			for(int ich = 0; ich <= 0xFFFF; ich++){
+				if(table[ich]==null)
+					continue;
+				Buffer buffer = new Buffer(buf,0);
+				recursiveDecompose(buffer,table,letters,(char)ich);
+				if(buffer.len != 0){
+					decomposition[ich]= new char[buffer.len];
+					for(i=0;i<buffer.len;i++)
+						decomposition[ich][i] = buffer.buffer[i];
+				}
+			}
 			in.close();
 		} catch (FileNotFoundException e) {
 			e.printStackTrace();
@@ -124,4 +149,24 @@
 			log.error("Error in unicode data file at "+path+" : "+e.getMessage());
 		}
 	}
+
+	/**
+	 * Depth-first recursion that gradually decomposes a character (when it
+	 * carries multiple diacritics)
+	 *
+	 * @param buf - buffer where the resulting decomposition is written
+	 * @param table - mapping char -> decomposed letters
+	 * @param letters - bitset of letter characters
+	 * @param c - char to decompose
+	 */
+	protected void recursiveDecompose(Buffer buf, char[][] table, BitSet letters, char c) {
+		// terminal
+		if(table[c]==null && letters.get(c)){
+			buf.add(c);
+		} else if(table[c]!=null && letters.get(c)){
+			// depth-first recursion
+			for(char ch : table[c]){
+				recursiveDecompose(buf,table,letters,ch);
+			}
+		}
+	}
 }
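Why the recursive pass matters: UnicodeData.txt decompositions can chain, e.g. U+1EAD (ậ) decomposes to U+1EA1 (ạ) plus a combining circumflex, and U+1EA1 in turn decomposes to 'a' plus a combining dot below. A single table lookup used to stop after one step, leaving a diacritic behind; the recursion follows the chain down to plain letters and drops the combining marks. A standalone JDK illustration of the same effect (not lsearch code; assumes Java 6's java.text.Normalizer):

    import java.text.Normalizer;

    public class DecomposeDemo {
        public static void main(String[] args) {
            // full canonical decomposition of 'ậ' gives 'a' + dot below + circumflex;
            // stripping the combining marks (category M) leaves the bare letter
            String nfd = Normalizer.normalize("ậ", Normalizer.Form.NFD);
            System.out.println(nfd.replaceAll("\\p{M}", "")); // prints: a
        }
    }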
Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -5,6 +5,10 @@
 # mainsplit <noparams>
 # single, mainpart, restpart, partN (where N is number 1..segments):
 #     <optimize> (true/false), <mergeFactor>, <maxBufferedDocs>
+# split, nssplit <numOfSubindexes>
+# nspartN:
+#     <specification>, <optimize>, <mergeFactor>, <maxBufferedDocs>
+#     where specification is in format: [ns1,ns2] or [] for rest
 # language <languageCode>
 # warmup <numberOfQueries>
 # databases can be written as {url}, where url contains a list of dbs
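A hypothetical entry using the newly documented nssplit/nspartN parameters might look like the following (the database name and namespace numbers are invented for illustration, and the exact punctuation should be checked against the rest of lsearch-global.conf):

    # hypothetical example, not from the repository:
    # somewiki : (nssplit,2) (nspart1,[0,14],true,10,10) (nspart2,[],true,10,10)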
