r45728 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r45727 | r45728 | r45729 >
Date:	16:03, 14 January 2009
Author:	rainman
Status:	deferred
Tags:
Comment:	Minor patches: * test cases for CJK, cannot reproduce jawiki bug, needs more investigation * network thread tuning and reporting * add mwdumper into the repository so people don't need to build their own
Modified paths:	/branches/lucene-search-2.1/lib/mwdumper.jar (added) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NetworkStatusThread.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java (modified) (history) /branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/AnalysisTest.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/AnalysisTest.java
—	—	@@ -49,6 +49,11 @@
50	50	GlobalConfiguration.getInstance();
51	51	}
52	52	}
	53	+
	54	+ public void testCJKAnalyzer(){
	55	+ a = new CJKAnalyzer();
	56	+ assertEquals("[(いわ,0,2,type=double), (わさ,1,3,type=double), (さき,2,4,type=double), (ic,4,6,type=single), (カー,6,8,type=double), (ード,7,9,type=double)]",tokens("いわさきicカード"));
	57	+ }
53	58
54	59	/** Common test for indexer and searcher analyzers */
55	60	public void commonEnglish(){
—	—	@@ -225,7 +230,7 @@
226	231	printCodePoints("“കൊറിയ”");
227	232
228	233	QueryParser parser = new QueryParser("contents",new CJKAnalyzer());
229		~~- Query q = parser.parse("プロサッカークラブをつくろう");~~
	234	+ Query q = parser.parse("いわさきicカードプロサッカークラブをつくろう");
230	235	System.out.println("Japanese in standard analyzer: "+q);
231	236	displayTokens(new CJKAnalyzer(),"は、工学者、大学教授、工学博士。『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
232	237	displayTokens(Analyzers.getHighlightAnalyzer(IndexId.get("jawiki"),false),"鈴木孝治（すずきこうじ、1954年 - ）『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java
—	—	@@ -9,6 +9,7 @@
10	10	import org.mediawiki.dumper.ProgressFilter;
11	11	import org.mediawiki.dumper.Tools;
12	12	import org.mediawiki.importer.XmlDumpReader;
	13	+import org.wikimedia.lsearch.beans.LocalIndex;
13	14	import org.wikimedia.lsearch.config.Configuration;
14	15	import org.wikimedia.lsearch.config.GlobalConfiguration;
15	16	import org.wikimedia.lsearch.config.IndexId;
—	—	@@ -31,12 +32,12 @@
32	33	boolean useSnapshot = false;
33	34	boolean local = false;
34	35	ArrayList<String> dbnames = new ArrayList<String>();
35		-
	36	+
36	37	System.out.println("MediaWiki lucene-search indexer - build spelling suggestion index.\n");
37		-
	38	+
38	39	Configuration.open();
39	40	GlobalConfiguration global = GlobalConfiguration.getInstance();
40		-
	41	+
41	42	for(int i=0;i<args.length;i++){
42	43	if(args[i].equals("-s"))
43	44	useSnapshot = true;
—	—	@@ -50,7 +51,7 @@
51	52	else if(inputfile == null)
52	53	inputfile = args[i];
53	54	}
54		-
	55	+
55	56	if(dbnames.size() == 0 && !local){
56	57	System.out.println("Syntax: java SuggestBuilder [-l] [-s] [<dbname>] [<dumpfile>]");
57	58	System.out.println("Options:");
—	—	@@ -58,67 +59,80 @@
59	60	System.out.println(" -l rebuild all local indexes from snapshots");
60	61	return;
61	62	}
62		-
	63	+
63	64	UnicodeDecomposer.getInstance();
64	65	Localization.loadInterwiki();
65	66	Collections.sort(dbnames);
66	67	long start = System.currentTimeMillis();
67	68	for(String dbname : dbnames){
68		~~- // preload~~
69		~~- Localization.readLocalization(global.getLanguage(dbname));~~
	69	+ try{
	70	+ log.info("Building spell-check for "+dbname);
	71	+ // preload
	72	+ Localization.readLocalization(global.getLanguage(dbname));
70	73
71		~~- IndexId iid = IndexId.get(dbname);~~
72		~~- IndexId spell = iid.getSpell();~~
73		~~- IndexId pre = spell.getPrecursor();~~
74		~~- if(spell == null){~~
75		~~- log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable them in global configuration.");~~
76		~~- continue;~~
77		~~- }~~
78		-
79		~~- if(inputfile != null){~~
80		~~- log.info("Rebuilding precursor index...");~~
81		~~- // open~~
82		~~- InputStream input = null;~~
83		~~- try {~~
84		~~- input = Tools.openInputFile(inputfile);~~
85		~~- } catch (IOException e) {~~
86		~~- log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());~~
87		~~- return;~~
	74	+ IndexId iid = IndexId.get(dbname);
	75	+ if( !iid.hasSpell() )
	76	+ continue;
	77	+ IndexId spell = iid.getSpell();
	78	+ IndexId pre = spell.getPrecursor();
	79	+ if(spell == null){
	80	+ log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable them in global configuration.");
	81	+ continue;
88	82	}
89	83
90		~~- // make fresh clean index~~
91		~~- try {~~
92		~~- CleanIndexImporter importer = new CleanIndexImporter(pre,iid.getLangCode());~~
93		~~- XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000));~~
94		~~- reader.readDump();~~
95		~~- importer.closeIndex();~~
96		~~- IndexThread.makeIndexSnapshot(pre,pre.getImportPath());~~
97		~~- } catch (IOException e) {~~
98		~~- if(!e.getMessage().equals("stopped")){~~
99		~~- e.printStackTrace();~~
100		~~- log.fatal("I/O error reading dump for "+dbname+" from "+inputfile+" : "+e.getMessage());~~
	84	+ if(inputfile != null){
	85	+ log.info("Rebuilding precursor index...");
	86	+ // open
	87	+ InputStream input = null;
	88	+ try {
	89	+ input = Tools.openInputFile(inputfile);
	90	+ } catch (IOException e) {
	91	+ log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());
101	92	return;
102	93	}
103		~~- }~~
104		~~- }~~
105	94
106		~~- log.info("Making spell-check index");~~
107		~~- // make phrase index~~
	95	+ // make fresh clean index
	96	+ try {
	97	+ CleanIndexImporter importer = new CleanIndexImporter(pre,iid.getLangCode());
	98	+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000));
	99	+ reader.readDump();
	100	+ importer.closeIndex();
	101	+ IndexThread.makeIndexSnapshot(pre,pre.getImportPath());
	102	+ } catch (IOException e) {
	103	+ if(!e.getMessage().equals("stopped")){
	104	+ e.printStackTrace();
	105	+ log.fatal("I/O error reading dump for "+dbname+" from "+inputfile+" : "+e.getMessage());
	106	+ return;
	107	+ }
	108	+ }
	109	+ }
108	110
109		~~- SpellCheckIndexer tInx = new SpellCheckIndexer(spell);~~
110		~~- String path = pre.getImportPath();~~
111		~~- if(useSnapshot)~~
112		~~- path = IndexRegistry.getInstance().getLatestSnapshot(pre).getPath();~~
113		~~- tInx.createFromPrecursor(path);~~
	111	+ log.info("Making spell-check index");
	112	+ // make phrase index
114	113
115		~~- // make snapshots~~
116		~~- IndexThread.makeIndexSnapshot(spell,spell.getImportPath());~~
	114	+ SpellCheckIndexer tInx = new SpellCheckIndexer(spell);
	115	+ String path = pre.getImportPath();
	116	+ if(useSnapshot){
	117	+ LocalIndex li = IndexRegistry.getInstance().getLatestSnapshot(pre);
	118	+ if(li == null){
	119	+ log.info("Snapshot for "+pre+" not available.");
	120	+ continue;
	121	+ }
	122	+ path = li.getPath();
	123	+ }
	124	+ tInx.createFromPrecursor(path);
	125	+
	126	+ // make snapshots
	127	+ IndexThread.makeIndexSnapshot(spell,spell.getImportPath());
	128	+ } catch(Exception e){
	129	+ log.error("Exception building spellcheck index for "+dbname+" "+e.getMessage(),e);
	130	+ }
117	131	}
118	132	long end = System.currentTimeMillis();
119	133
120	134	System.out.println("Finished making spell-check index in "+formatTime(end-start));
121	135	}
122		-
	136	+
123	137	private static String formatTime(long l) {
124	138	l /= 1000;
125	139	if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s";
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
—	—	@@ -27,6 +27,7 @@
28	28	import org.wikimedia.lsearch.beans.Article;
29	29	import org.wikimedia.lsearch.beans.Title;
30	30	import org.wikimedia.lsearch.config.Configuration;
	31	+import org.wikimedia.lsearch.config.GlobalConfiguration;
31	32	import org.wikimedia.lsearch.config.IndexId;
32	33	import org.wikimedia.lsearch.config.IndexRegistry;
33	34	import org.wikimedia.lsearch.index.IndexThread;
—	—	@@ -97,18 +98,20 @@
98	99	public static void main(String[] args) throws IOException{
99	100	int perPrefix = 15;
100	101	boolean usetemp = false;
101		~~- String dbname = null;~~
102	102	boolean useSnapshot = false;
	103	+ ArrayList<String> dbnames = new ArrayList<String>();
103	104
104	105	System.out.println("MediaWiki lucene-search indexer - rebuild prefix index used for ajax suggestions.");
105	106
106	107	Configuration.open();
	108	+ GlobalConfiguration global = GlobalConfiguration.getInstance();
107	109	if(args.length == 0){
108		~~- System.out.println("Syntax: java PrefixIndexBuilder [-t] [-p <num>] <dbname>");~~
	110	+ System.out.println("Syntax: java PrefixIndexBuilder [-t] [-l] [-p <num>] <dbname>");
109	111	System.out.println("Options:");
110	112	System.out.println(" -p - reuse temporary precursor index (import path)");
111	113	System.out.println(" -s - reuse latest temporary precursor snapshot");
112	114	System.out.println(" -t <num> - titles per prefix (default: "+perPrefix+")");
	115	+ System.out.println(" -l rebuild all local indexes from snapshots");
113	116	return;
114	117	}
115	118	for(int i=0;i<args.length;i++){
—	—	@@ -116,24 +119,36 @@
117	120	usetemp = true;
118	121	else if(args[i].equals("-t"))
119	122	perPrefix = Integer.parseInt(args[++i]);
120		~~- else if(args[i].equals("-s")){~~
	123	+ else if(args[i].equals("-l")){
	124	+ useSnapshot = true;
	125	+ dbnames.addAll(global.getMyIndexDBnames());
	126	+ } else if(args[i].equals("-s")){
121	127	usetemp = true;
122	128	useSnapshot = true;
123	129	} else if(args[i].startsWith("-")){
124	130	System.out.println("Unrecognized option "+args[i]);
125	131	return;
126	132	} else
127		~~- dbname = args[i];~~
	133	+ dbnames.add(args[i]);
128	134	}
129	135
130		~~- IndexId iid = IndexId.get(dbname);~~
131		~~- PrefixIndexBuilder builder = usetemp? newForPrefixOnly(iid) : newFromStandalone(iid);~~
132		~~- IndexId pre = iid.getPrefix().getPrecursor();~~
133		~~- String precursorPath = pre.getImportPath();~~
134		~~- if(useSnapshot)~~
135		~~- precursorPath = IndexRegistry.getInstance().getLatestSnapshot(pre).path;~~
136		-
137		~~- builder.createNewFromLinks(perPrefix,usetemp,precursorPath);~~
	136	+ for(String dbname : dbnames){
	137	+ try{
	138	+ log.info("Building prefix index for "+dbname);
	139	+ IndexId iid = IndexId.get(dbname);
	140	+ if( !iid.hasPrefix() )
	141	+ continue;
	142	+ PrefixIndexBuilder builder = usetemp? newForPrefixOnly(iid) : newFromStandalone(iid);
	143	+ IndexId pre = iid.getPrefix().getPrecursor();
	144	+ String precursorPath = pre.getImportPath();
	145	+ if(useSnapshot)
	146	+ precursorPath = IndexRegistry.getInstance().getLatestSnapshot(pre).path;
	147	+
	148	+ builder.createNewFromLinks(perPrefix,usetemp,precursorPath);
	149	+ } catch(Exception e){
	150	+ log.error("Exception building prefix index for "+dbname+" "+e.getMessage(),e);
	151	+ }
	152	+ }
138	153	}
139	154
140	155	/**
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NetworkStatusThread.java
—	—	@@ -84,8 +84,8 @@
85	85	cache.reInitializeRemote(sh.iid,sh.host);
86	86	}
87	87	} catch (RemoteException e) {
88		~~- log.warn("Host "+sh.host+" still down.");~~
89		~~- noRetryHosts.add(sh.host);~~
	88	+ log.warn("Host "+sh.host+" for "+sh.iid+" still down.",e);
	89	+ //noRetryHosts.add(sh.host);
90	90	} catch (Exception e) {
91	91	e.printStackTrace();
92	92	}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java
—	—	@@ -127,11 +127,14 @@
128	128	for(Range r : highlighted){
129	129	sb.append(text.substring(last,r.start));
130	130	sb.append(beginTag);
131		~~- sb.append(text.substring(r.start,r.end));~~
	131	+ if(r.end > text.length())
	132	+ sb.append(text.substring(r.start));
	133	+ else
	134	+ sb.append(text.substring(r.start,r.end));
132	135	sb.append(endTag);
133	136	last = r.end;
134	137	}
135		~~- if(last != text.length())~~
	138	+ if(last < text.length())
136	139	sb.append(text.substring(last));
137	140	return sb.toString();
138	141	}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java
—	—	@@ -7,6 +7,7 @@
8	8
9	9	import org.apache.lucene.analysis.Token;
10	10	import org.wikimedia.lsearch.analyzers.Alttitles;
	11	+import org.wikimedia.lsearch.analyzers.CJKFilter;
11	12	import org.wikimedia.lsearch.analyzers.ExtToken;
12	13	import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
13	14	import org.wikimedia.lsearch.analyzers.ExtToken.Position;
—	—	@@ -289,7 +290,7 @@
290	291	mid = start;
291	292	String tt = t.getText();
292	293	int len = tt.length();
293		~~- if(len>=2){~~
	294	+ if(len>=2 && CJKFilter.isCJKChar(tt.codePointAt(0))){
294	295	// not terminal, calculate new midpoint
295	296	int point = len-1;
296	297	if(Character.isSurrogatePair(tt.charAt(len-2),tt.charAt(len-1)))
Index: branches/lucene-search-2.1/lib/mwdumper.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: branches/lucene-search-2.1/lib/mwdumper.jar
___________________________________________________________________
Name: svn:mime-type
297	298	+ application/octet-stream

Status & tagging log

01:13, 15 January 2009 Brion VIBBER (talk | contribs) changed the status of r45728 [removed: new added: deferred]