r45728 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r45727 | r45728 | r45729 >
Date:16:03, 14 January 2009
Author:rainman
Status:deferred
Tags:
Comment:
Minor patches:
* test cases for CJK, cannot reproduce jawiki bug, needs more investigation
* network thread tuning and reporting
* add mwdumper into the repository so people don't need to build their own
Modified paths:
  • /branches/lucene-search-2.1/lib/mwdumper.jar (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NetworkStatusThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/AnalysisTest.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/AnalysisTest.java
@@ -49,6 +49,11 @@
5050 GlobalConfiguration.getInstance();
5151 }
5252 }
 53+
 54+ public void testCJKAnalyzer(){
 55+ a = new CJKAnalyzer();
 56+ assertEquals("[(いわ,0,2,type=double), (わさ,1,3,type=double), (さき,2,4,type=double), (ic,4,6,type=single), (カー,6,8,type=double), (ード,7,9,type=double)]",tokens("いわさきicカード"));
 57+ }
5358
5459 /** Common test for indexer and searcher analyzers */
5560 public void commonEnglish(){
@@ -225,7 +230,7 @@
226231 printCodePoints("“കൊറിയ”");
227232
228233 QueryParser parser = new QueryParser("contents",new CJKAnalyzer());
229 - Query q = parser.parse("プロサッカークラブをつくろう");
 234+ Query q = parser.parse("いわさきicカード プロサッカークラブをつくろう");
230235 System.out.println("Japanese in standard analyzer: "+q);
231236 displayTokens(new CJKAnalyzer(),"は、工学者、大学教授、工学博士。『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
232237 displayTokens(Analyzers.getHighlightAnalyzer(IndexId.get("jawiki"),false),"鈴木 孝治(すずき こうじ、1954年 - )『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java
@@ -9,6 +9,7 @@
1010 import org.mediawiki.dumper.ProgressFilter;
1111 import org.mediawiki.dumper.Tools;
1212 import org.mediawiki.importer.XmlDumpReader;
 13+import org.wikimedia.lsearch.beans.LocalIndex;
1314 import org.wikimedia.lsearch.config.Configuration;
1415 import org.wikimedia.lsearch.config.GlobalConfiguration;
1516 import org.wikimedia.lsearch.config.IndexId;
@@ -31,12 +32,12 @@
3233 boolean useSnapshot = false;
3334 boolean local = false;
3435 ArrayList<String> dbnames = new ArrayList<String>();
35 -
 36+
3637 System.out.println("MediaWiki lucene-search indexer - build spelling suggestion index.\n");
37 -
 38+
3839 Configuration.open();
3940 GlobalConfiguration global = GlobalConfiguration.getInstance();
40 -
 41+
4142 for(int i=0;i<args.length;i++){
4243 if(args[i].equals("-s"))
4344 useSnapshot = true;
@@ -50,7 +51,7 @@
5152 else if(inputfile == null)
5253 inputfile = args[i];
5354 }
54 -
 55+
5556 if(dbnames.size() == 0 && !local){
5657 System.out.println("Syntax: java SuggestBuilder [-l] [-s] [<dbname>] [<dumpfile>]");
5758 System.out.println("Options:");
@@ -58,67 +59,80 @@
5960 System.out.println(" -l rebuild all local indexes from snapshots");
6061 return;
6162 }
62 -
 63+
6364 UnicodeDecomposer.getInstance();
6465 Localization.loadInterwiki();
6566 Collections.sort(dbnames);
6667 long start = System.currentTimeMillis();
6768 for(String dbname : dbnames){
68 - // preload
69 - Localization.readLocalization(global.getLanguage(dbname));
 69+ try{
 70+ log.info("Building spell-check for "+dbname);
 71+ // preload
 72+ Localization.readLocalization(global.getLanguage(dbname));
7073
71 - IndexId iid = IndexId.get(dbname);
72 - IndexId spell = iid.getSpell();
73 - IndexId pre = spell.getPrecursor();
74 - if(spell == null){
75 - log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable them in global configuration.");
76 - continue;
77 - }
78 -
79 - if(inputfile != null){
80 - log.info("Rebuilding precursor index...");
81 - // open
82 - InputStream input = null;
83 - try {
84 - input = Tools.openInputFile(inputfile);
85 - } catch (IOException e) {
86 - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());
87 - return;
 74+ IndexId iid = IndexId.get(dbname);
 75+ if( !iid.hasSpell() )
 76+ continue;
 77+ IndexId spell = iid.getSpell();
 78+ IndexId pre = spell.getPrecursor();
 79+ if(spell == null){
 80+ log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable them in global configuration.");
 81+ continue;
8882 }
8983
90 - // make fresh clean index
91 - try {
92 - CleanIndexImporter importer = new CleanIndexImporter(pre,iid.getLangCode());
93 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000));
94 - reader.readDump();
95 - importer.closeIndex();
96 - IndexThread.makeIndexSnapshot(pre,pre.getImportPath());
97 - } catch (IOException e) {
98 - if(!e.getMessage().equals("stopped")){
99 - e.printStackTrace();
100 - log.fatal("I/O error reading dump for "+dbname+" from "+inputfile+" : "+e.getMessage());
 84+ if(inputfile != null){
 85+ log.info("Rebuilding precursor index...");
 86+ // open
 87+ InputStream input = null;
 88+ try {
 89+ input = Tools.openInputFile(inputfile);
 90+ } catch (IOException e) {
 91+ log.fatal("I/O error opening "+inputfile+" : "+e.getMessage());
10192 return;
10293 }
103 - }
104 - }
10594
106 - log.info("Making spell-check index");
107 - // make phrase index
 95+ // make fresh clean index
 96+ try {
 97+ CleanIndexImporter importer = new CleanIndexImporter(pre,iid.getLangCode());
 98+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000));
 99+ reader.readDump();
 100+ importer.closeIndex();
 101+ IndexThread.makeIndexSnapshot(pre,pre.getImportPath());
 102+ } catch (IOException e) {
 103+ if(!e.getMessage().equals("stopped")){
 104+ e.printStackTrace();
 105+ log.fatal("I/O error reading dump for "+dbname+" from "+inputfile+" : "+e.getMessage());
 106+ return;
 107+ }
 108+ }
 109+ }
108110
109 - SpellCheckIndexer tInx = new SpellCheckIndexer(spell);
110 - String path = pre.getImportPath();
111 - if(useSnapshot)
112 - path = IndexRegistry.getInstance().getLatestSnapshot(pre).getPath();
113 - tInx.createFromPrecursor(path);
 111+ log.info("Making spell-check index");
 112+ // make phrase index
114113
115 - // make snapshots
116 - IndexThread.makeIndexSnapshot(spell,spell.getImportPath());
 114+ SpellCheckIndexer tInx = new SpellCheckIndexer(spell);
 115+ String path = pre.getImportPath();
 116+ if(useSnapshot){
 117+ LocalIndex li = IndexRegistry.getInstance().getLatestSnapshot(pre);
 118+ if(li == null){
 119+ log.info("Snapshot for "+pre+" not available.");
 120+ continue;
 121+ }
 122+ path = li.getPath();
 123+ }
 124+ tInx.createFromPrecursor(path);
 125+
 126+ // make snapshots
 127+ IndexThread.makeIndexSnapshot(spell,spell.getImportPath());
 128+ } catch(Exception e){
 129+ log.error("Exception building spellcheck index for "+dbname+" "+e.getMessage(),e);
 130+ }
117131 }
118132 long end = System.currentTimeMillis();
119133
120134 System.out.println("Finished making spell-check index in "+formatTime(end-start));
121135 }
122 -
 136+
123137 private static String formatTime(long l) {
124138 l /= 1000;
125139 if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s";
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -27,6 +27,7 @@
2828 import org.wikimedia.lsearch.beans.Article;
2929 import org.wikimedia.lsearch.beans.Title;
3030 import org.wikimedia.lsearch.config.Configuration;
 31+import org.wikimedia.lsearch.config.GlobalConfiguration;
3132 import org.wikimedia.lsearch.config.IndexId;
3233 import org.wikimedia.lsearch.config.IndexRegistry;
3334 import org.wikimedia.lsearch.index.IndexThread;
@@ -97,18 +98,20 @@
9899 public static void main(String[] args) throws IOException{
99100 int perPrefix = 15;
100101 boolean usetemp = false;
101 - String dbname = null;
102102 boolean useSnapshot = false;
 103+ ArrayList<String> dbnames = new ArrayList<String>();
103104
104105 System.out.println("MediaWiki lucene-search indexer - rebuild prefix index used for ajax suggestions.");
105106
106107 Configuration.open();
 108+ GlobalConfiguration global = GlobalConfiguration.getInstance();
107109 if(args.length == 0){
108 - System.out.println("Syntax: java PrefixIndexBuilder [-t] [-p <num>] <dbname>");
 110+ System.out.println("Syntax: java PrefixIndexBuilder [-t] [-l] [-p <num>] <dbname>");
109111 System.out.println("Options:");
110112 System.out.println(" -p - reuse temporary precursor index (import path)");
111113 System.out.println(" -s - reuse latest temporary precursor snapshot");
112114 System.out.println(" -t <num> - titles per prefix (default: "+perPrefix+")");
 115+ System.out.println(" -l rebuild all local indexes from snapshots");
113116 return;
114117 }
115118 for(int i=0;i<args.length;i++){
@@ -116,24 +119,36 @@
117120 usetemp = true;
118121 else if(args[i].equals("-t"))
119122 perPrefix = Integer.parseInt(args[++i]);
120 - else if(args[i].equals("-s")){
 123+ else if(args[i].equals("-l")){
 124+ useSnapshot = true;
 125+ dbnames.addAll(global.getMyIndexDBnames());
 126+ } else if(args[i].equals("-s")){
121127 usetemp = true;
122128 useSnapshot = true;
123129 } else if(args[i].startsWith("-")){
124130 System.out.println("Unrecognized option "+args[i]);
125131 return;
126132 } else
127 - dbname = args[i];
 133+ dbnames.add(args[i]);
128134 }
129135
130 - IndexId iid = IndexId.get(dbname);
131 - PrefixIndexBuilder builder = usetemp? newForPrefixOnly(iid) : newFromStandalone(iid);
132 - IndexId pre = iid.getPrefix().getPrecursor();
133 - String precursorPath = pre.getImportPath();
134 - if(useSnapshot)
135 - precursorPath = IndexRegistry.getInstance().getLatestSnapshot(pre).path;
136 -
137 - builder.createNewFromLinks(perPrefix,usetemp,precursorPath);
 136+ for(String dbname : dbnames){
 137+ try{
 138+ log.info("Building prefix index for "+dbname);
 139+ IndexId iid = IndexId.get(dbname);
 140+ if( !iid.hasPrefix() )
 141+ continue;
 142+ PrefixIndexBuilder builder = usetemp? newForPrefixOnly(iid) : newFromStandalone(iid);
 143+ IndexId pre = iid.getPrefix().getPrecursor();
 144+ String precursorPath = pre.getImportPath();
 145+ if(useSnapshot)
 146+ precursorPath = IndexRegistry.getInstance().getLatestSnapshot(pre).path;
 147+
 148+ builder.createNewFromLinks(perPrefix,usetemp,precursorPath);
 149+ } catch(Exception e){
 150+ log.error("Exception building prefix index for "+dbname+" "+e.getMessage(),e);
 151+ }
 152+ }
138153 }
139154
140155 /**
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NetworkStatusThread.java
@@ -84,8 +84,8 @@
8585 cache.reInitializeRemote(sh.iid,sh.host);
8686 }
8787 } catch (RemoteException e) {
88 - log.warn("Host "+sh.host+" still down.");
89 - noRetryHosts.add(sh.host);
 88+ log.warn("Host "+sh.host+" for "+sh.iid+" still down.",e);
 89+ //noRetryHosts.add(sh.host);
9090 } catch (Exception e) {
9191 e.printStackTrace();
9292 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java
@@ -127,11 +127,14 @@
128128 for(Range r : highlighted){
129129 sb.append(text.substring(last,r.start));
130130 sb.append(beginTag);
131 - sb.append(text.substring(r.start,r.end));
 131+ if(r.end > text.length())
 132+ sb.append(text.substring(r.start));
 133+ else
 134+ sb.append(text.substring(r.start,r.end));
132135 sb.append(endTag);
133136 last = r.end;
134137 }
135 - if(last != text.length())
 138+ if(last < text.length())
136139 sb.append(text.substring(last));
137140 return sb.toString();
138141 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java
@@ -7,6 +7,7 @@
88
99 import org.apache.lucene.analysis.Token;
1010 import org.wikimedia.lsearch.analyzers.Alttitles;
 11+import org.wikimedia.lsearch.analyzers.CJKFilter;
1112 import org.wikimedia.lsearch.analyzers.ExtToken;
1213 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
1314 import org.wikimedia.lsearch.analyzers.ExtToken.Position;
@@ -289,7 +290,7 @@
290291 mid = start;
291292 String tt = t.getText();
292293 int len = tt.length();
293 - if(len>=2){
 294+ if(len>=2 && CJKFilter.isCJKChar(tt.codePointAt(0))){
294295 // not terminal, calculate new midpoint
295296 int point = len-1;
296297 if(Character.isSurrogatePair(tt.charAt(len-2),tt.charAt(len-1)))
Index: branches/lucene-search-2.1/lib/mwdumper.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: branches/lucene-search-2.1/lib/mwdumper.jar
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Status & tagging log