r23211 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r23210 | r23211 | r23212 >
Date:	11:36, 22 June 2007
Author:	rainman
Status:	old
Tags:
Comment:	Handle Hebrew pointing better and add some more common transliterations. Thanks to Connel and enwiktionary people.
Modified paths:	/trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
—	—	@@ -15,7 +15,7 @@
16	16
17	17	public class FastWikiTokenizerTest {
18	18	public static void displayTokensForParser(String text) {
19		~~- FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"sr",false);~~
	19	+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false);
20	20	Token[] tokens = parser.parse().toArray(new Token[] {});
21	21	for (int i = 0; i < tokens.length; i++) {
22	22	Token token = tokens[i];
—	—	@@ -62,10 +62,12 @@
63	63	public static void main(String args[]) throws IOException{
64	64	String text = "(ant) and some";
65	65	showTokens(text);
66		~~- text = " ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire";~~
	66	+ text = "Æ (ď), l' (ľ), תּפר ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; ĳ čakšire תפר ";
67	67	showTokens(text);
68		~~- text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d";~~
	68	+ text = "Dž (Dž), dž (dž), d' (ď), l' (ľ), t' (ť), IJ (Ĳ), ij (ĳ), LJ (Ǉ), Lj (ǈ), lj (ǉ). NJ (Ǌ), Nj (ǋ), nj (ǌ). All characters in parentheses are the single-unicode form; those not in parentheses are component character forms. There's also the issue of searching for AE (Æ), ae (æ), OE (Œ), & oe (œ).";
69	69	showTokens(text);
	70	+ text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d contains רוּחַ should be treated as though it contained ";
	71	+ showTokens(text);
70	72	text = "[[Category:Blah Blah?!\|Caption]], and [[:Category:Link to category]]";
71	73	showTokens(text);
72	74	text = "{{IPstack}} '''[[Hypertext]] Transfer [[communications protocol\|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages.";
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java
—	—	@@ -175,7 +175,7 @@
176	176	String verb = "search";
177	177	String namespace = "";
178	178	String namespaceFilter= "0";
179		~~- String lang = "en-b";~~
	179	+ String lang = "en";
180	180	int runs = 5000;
181	181	int threads = 10;
182	182	int words = 1;
—	—	@@ -230,8 +230,7 @@
231	231	}
232	232	}
233	233	if("en".equals(lang) \|\| "de".equals(lang) \|\| "es".equals(lang) \|\| "fr".equals(lang) \|\| "it".equals(lang) \|\| "pt".equals(lang))
234		~~- terms = new WordTerms("./lib/dict/terms-"+lang+".txt.gz");~~
235		-
	234	+ terms = new WordTerms("./lib/dict/terms-"+lang+".txt.gz");
236	235	else if(lang.equals("sample"))
237	236	terms = new SampleTerms();
238	237	else
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
—	—	@@ -179,6 +179,12 @@
180	180	} else if(cl == 'Ø'){
181	181	addToTokenAlias("O");
182	182	addToAlias = false;
	183	+ } else if(cl == 'Æ'){
	184	+ addToTokenAlias("AE");
	185	+ addToAlias = false;
	186	+ } else if(cl == 'Œ'){
	187	+ addToTokenAlias("OE");
	188	+ addToAlias = false;
183	189	}
184	190	}
185	191	// special alias transliterations ä -> ae, etc ...
—	—	@@ -203,6 +209,12 @@
204	210	} else if(cl == 'ø'){
205	211	addToTokenAlias("o");
206	212	addToAlias = false;
	213	+ } else if(cl == 'æ'){
	214	+ addToTokenAlias("ae");
	215	+ addToAlias = false;
	216	+ } else if(cl == 'œ'){
	217	+ addToTokenAlias("oe");
	218	+ addToAlias = false;
207	219	}
208	220
209	221	decomp = decompose(cl);
—	—	@@ -266,9 +278,9 @@
267	279	* buffer, if it's not a letter, new token is created
268	280	*/
269	281	private final void addLetter(){
270		~~- try{~~
	282	+ try{
271	283	// add new character to buffer
272		~~- if(Character.isLetter(c)){~~
	284	+ if(Character.isLetter(c)){
273	285	if(numberToken) // we were fetching a number
274	286	addToken();
275	287
—	—	@@ -285,11 +297,8 @@
286	298
287	299	if(length<buffer.length)
288	300	buffer[length++] = c;
289		~~- // add dot and comma to digits if they are not at the beginning~~
290		~~- /* } else if(numberToken && (c == '.' \|\| c == ',')){~~
291		~~- if(length<buffer.length)~~
292		~~- buffer[length++] = c; */~~
293		~~- } else{~~
	301	+ } else if(decomposer.isCombiningChar(c)); // ignore
	302	+ else{
294	303	addToken();
295	304	}
296	305	} catch(Exception e){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/UnicodeDecomposer.java
—	—	@@ -34,6 +34,7 @@
35	35	}
36	36	static org.apache.log4j.Logger log = Logger.getLogger(UnicodeDecomposer.class);
37	37	final protected static char[][] decomposition = new char[65536][];
	38	+ final protected static boolean[] combining = new boolean[65536];
38	39	protected static UnicodeDecomposer instance = null;
39	40
40	41	/**
—	—	@@ -50,6 +51,10 @@
51	52	log.info("Loaded unicode decomposer");
52	53	}
53	54
	55	+ public boolean isCombiningChar(char ch){
	56	+ return combining[ch];
	57	+ }
	58	+
54	59	/**
55	60	* Get singleton instance of the Unicode decomposer class.
56	61	* Loads lib/UnicodeData.txt on first call
—	—	@@ -85,6 +90,11 @@
86	91	continue; // ignore any additional chars
87	92	if(parts[2].charAt(0) == 'L')
88	93	letters.set(chVal);
	94	+
	95	+ if(parts[2].charAt(0) == 'M')
	96	+ combining[chVal] = true;
	97	+ else
	98	+ combining[chVal] = false;
89	99	}
90	100	in.close();
91	101
—	—	@@ -133,7 +143,7 @@
134	144	recursiveDecompose(buffer,table,letters,(char)ich);
135	145	if(buffer.len != 0){
136	146	decomposition[ich]= new char[buffer.len];
137		~~- for(i=0;i<len;i++)~~
	147	+ for(i=0;i<buffer.len;i++)
138	148	decomposition[ich][i] = buffer.buffer[i];
139	149	}
140	150	}

Status & tagging log

15:19, 12 September 2011 Meno25 (talk | contribs) changed the status of r23211 [removed: ok added: old]