Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/AnalysisTest.java |
— | — | @@ -49,6 +49,11 @@ |
50 | 50 | GlobalConfiguration.getInstance(); |
51 | 51 | } |
52 | 52 | } |
| 53 | + |
| 54 | + public void testCJKAnalyzer(){ /* Pins the token stream CJKAnalyzer yields for a string mixing kana with Latin letters. */ |
| 55 | + a = new CJKAnalyzer(); /* a is declared outside this hunk; presumably the analyzer that tokens() reads - TODO confirm */ |
| 56 | + assertEquals("[(いわ,0,2,type=double), (わさ,1,3,type=double), (さき,2,4,type=double), (ic,4,6,type=single), (カー,6,8,type=double), (ード,7,9,type=double)]",tokens("いわさきicカード")); /* expected output: overlapping two-character tokens of type double over each kana run, and the Latin run ic as one token of type single */ |
| 57 | + } |
53 | 58 | |
54 | 59 | /** Common test for indexer and searcher analyzers */ |
55 | 60 | public void commonEnglish(){ |
— | — | @@ -225,7 +230,7 @@ |
226 | 231 | printCodePoints("“കൊറിയ”"); |
227 | 232 | |
228 | 233 | QueryParser parser = new QueryParser("contents",new CJKAnalyzer()); |
229 | | - Query q = parser.parse("プロサッカークラブをつくろう"); |
| 234 | + Query q = parser.parse("いわさきicカード プロサッカークラブをつくろう"); |
230 | 235 | System.out.println("Japanese in standard analyzer: "+q); |
231 | 236 | displayTokens(new CJKAnalyzer(),"は、工学者、大学教授、工学博士。『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
232 | 237 | displayTokens(Analyzers.getHighlightAnalyzer(IndexId.get("jawiki"),false),"鈴木 孝治(すずき こうじ、1954年 - )『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java |
— | — | @@ -9,6 +9,7 @@ |
10 | 10 | import org.mediawiki.dumper.ProgressFilter; |
11 | 11 | import org.mediawiki.dumper.Tools; |
12 | 12 | import org.mediawiki.importer.XmlDumpReader; |
| 13 | +import org.wikimedia.lsearch.beans.LocalIndex; |
13 | 14 | import org.wikimedia.lsearch.config.Configuration; |
14 | 15 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
15 | 16 | import org.wikimedia.lsearch.config.IndexId; |
— | — | @@ -31,12 +32,12 @@ |
32 | 33 | boolean useSnapshot = false; |
33 | 34 | boolean local = false; |
34 | 35 | ArrayList<String> dbnames = new ArrayList<String>(); |
35 | | - |
| 36 | + |
36 | 37 | System.out.println("MediaWiki lucene-search indexer - build spelling suggestion index.\n"); |
37 | | - |
| 38 | + |
38 | 39 | Configuration.open(); |
39 | 40 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
40 | | - |
| 41 | + |
41 | 42 | for(int i=0;i<args.length;i++){ |
42 | 43 | if(args[i].equals("-s")) |
43 | 44 | useSnapshot = true; |
— | — | @@ -50,7 +51,7 @@ |
51 | 52 | else if(inputfile == null) |
52 | 53 | inputfile = args[i]; |
53 | 54 | } |
54 | | - |
| 55 | + |
55 | 56 | if(dbnames.size() == 0 && !local){ |
56 | 57 | System.out.println("Syntax: java SuggestBuilder [-l] [-s] [<dbname>] [<dumpfile>]"); |
57 | 58 | System.out.println("Options:"); |
— | — | @@ -58,67 +59,80 @@ |
59 | 60 | System.out.println(" -l rebuild all local indexes from snapshots"); |
60 | 61 | return; |
61 | 62 | } |
62 | | - |
| 63 | + |
63 | 64 | UnicodeDecomposer.getInstance(); |
64 | 65 | Localization.loadInterwiki(); |
65 | 66 | Collections.sort(dbnames); |
66 | 67 | long start = System.currentTimeMillis(); |
67 | 68 | for(String dbname : dbnames){ |
68 | | - // preload |
69 | | - Localization.readLocalization(global.getLanguage(dbname)); |
| 69 | + try{ |
| 70 | + log.info("Building spell-check for "+dbname); |
| 71 | + // preload |
| 72 | + Localization.readLocalization(global.getLanguage(dbname)); |
70 | 73 | |
71 | | - IndexId iid = IndexId.get(dbname); |
72 | | - IndexId spell = iid.getSpell(); |
73 | | - IndexId pre = spell.getPrecursor(); |
74 | | - if(spell == null){ |
75 | | - log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable them in global configuration."); |
76 | | - continue; |
77 | | - } |
78 | | - |
79 | | - if(inputfile != null){ |
80 | | - log.info("Rebuilding precursor index..."); |
81 | | - // open |
82 | | - InputStream input = null; |
83 | | - try { |
84 | | - input = Tools.openInputFile(inputfile); |
85 | | - } catch (IOException e) { |
86 | | - log.fatal("I/O error opening "+inputfile+" : "+e.getMessage()); |
87 | | - return; |
| 74 | + IndexId iid = IndexId.get(dbname); |
| 75 | + if( !iid.hasSpell() ) |
| 76 | + continue; |
| 77 | + IndexId spell = iid.getSpell(); |
| 78 | + IndexId pre = spell.getPrecursor(); |
| 79 | + if(spell == null){ |
| 80 | + log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable them in global configuration."); |
| 81 | + continue; |
88 | 82 | } |
89 | 83 | |
90 | | - // make fresh clean index |
91 | | - try { |
92 | | - CleanIndexImporter importer = new CleanIndexImporter(pre,iid.getLangCode()); |
93 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000)); |
94 | | - reader.readDump(); |
95 | | - importer.closeIndex(); |
96 | | - IndexThread.makeIndexSnapshot(pre,pre.getImportPath()); |
97 | | - } catch (IOException e) { |
98 | | - if(!e.getMessage().equals("stopped")){ |
99 | | - e.printStackTrace(); |
100 | | - log.fatal("I/O error reading dump for "+dbname+" from "+inputfile+" : "+e.getMessage()); |
| 84 | + if(inputfile != null){ |
| 85 | + log.info("Rebuilding precursor index..."); |
| 86 | + // open |
| 87 | + InputStream input = null; |
| 88 | + try { |
| 89 | + input = Tools.openInputFile(inputfile); |
| 90 | + } catch (IOException e) { |
| 91 | + log.fatal("I/O error opening "+inputfile+" : "+e.getMessage()); |
101 | 92 | return; |
102 | 93 | } |
103 | | - } |
104 | | - } |
105 | 94 | |
106 | | - log.info("Making spell-check index"); |
107 | | - // make phrase index |
| 95 | + // make fresh clean index |
| 96 | + try { |
| 97 | + CleanIndexImporter importer = new CleanIndexImporter(pre,iid.getLangCode()); |
| 98 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000)); |
| 99 | + reader.readDump(); |
| 100 | + importer.closeIndex(); |
| 101 | + IndexThread.makeIndexSnapshot(pre,pre.getImportPath()); |
| 102 | + } catch (IOException e) { |
| 103 | + if(!e.getMessage().equals("stopped")){ |
| 104 | + e.printStackTrace(); |
| 105 | + log.fatal("I/O error reading dump for "+dbname+" from "+inputfile+" : "+e.getMessage()); |
| 106 | + return; |
| 107 | + } |
| 108 | + } |
| 109 | + } |
108 | 110 | |
109 | | - SpellCheckIndexer tInx = new SpellCheckIndexer(spell); |
110 | | - String path = pre.getImportPath(); |
111 | | - if(useSnapshot) |
112 | | - path = IndexRegistry.getInstance().getLatestSnapshot(pre).getPath(); |
113 | | - tInx.createFromPrecursor(path); |
| 111 | + log.info("Making spell-check index"); |
| 112 | + // make phrase index |
114 | 113 | |
115 | | - // make snapshots |
116 | | - IndexThread.makeIndexSnapshot(spell,spell.getImportPath()); |
| 114 | + SpellCheckIndexer tInx = new SpellCheckIndexer(spell); |
| 115 | + String path = pre.getImportPath(); |
| 116 | + if(useSnapshot){ |
| 117 | + LocalIndex li = IndexRegistry.getInstance().getLatestSnapshot(pre); |
| 118 | + if(li == null){ |
| 119 | + log.info("Snapshot for "+pre+" not available."); |
| 120 | + continue; |
| 121 | + } |
| 122 | + path = li.getPath(); |
| 123 | + } |
| 124 | + tInx.createFromPrecursor(path); |
| 125 | + |
| 126 | + // make snapshots |
| 127 | + IndexThread.makeIndexSnapshot(spell,spell.getImportPath()); |
| 128 | + } catch(Exception e){ |
| 129 | + log.error("Exception building spellcheck index for "+dbname+" "+e.getMessage(),e); |
| 130 | + } |
117 | 131 | } |
118 | 132 | long end = System.currentTimeMillis(); |
119 | 133 | |
120 | 134 | System.out.println("Finished making spell-check index in "+formatTime(end-start)); |
121 | 135 | } |
122 | | - |
| 136 | + |
123 | 137 | private static String formatTime(long l) { |
124 | 138 | l /= 1000; |
125 | 139 | if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s"; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java |
— | — | @@ -27,6 +27,7 @@ |
28 | 28 | import org.wikimedia.lsearch.beans.Article; |
29 | 29 | import org.wikimedia.lsearch.beans.Title; |
30 | 30 | import org.wikimedia.lsearch.config.Configuration; |
| 31 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
31 | 32 | import org.wikimedia.lsearch.config.IndexId; |
32 | 33 | import org.wikimedia.lsearch.config.IndexRegistry; |
33 | 34 | import org.wikimedia.lsearch.index.IndexThread; |
— | — | @@ -97,18 +98,20 @@ |
98 | 99 | public static void main(String[] args) throws IOException{ |
99 | 100 | int perPrefix = 15; |
100 | 101 | boolean usetemp = false; |
101 | | - String dbname = null; |
102 | 102 | boolean useSnapshot = false; |
| 103 | + ArrayList<String> dbnames = new ArrayList<String>(); |
103 | 104 | |
104 | 105 | System.out.println("MediaWiki lucene-search indexer - rebuild prefix index used for ajax suggestions."); |
105 | 106 | |
106 | 107 | Configuration.open(); |
| 108 | + GlobalConfiguration global = GlobalConfiguration.getInstance(); |
107 | 109 | if(args.length == 0){ |
108 | | - System.out.println("Syntax: java PrefixIndexBuilder [-t] [-p <num>] <dbname>"); |
| 110 | + System.out.println("Syntax: java PrefixIndexBuilder [-t] [-l] [-p <num>] <dbname>"); |
109 | 111 | System.out.println("Options:"); |
110 | 112 | System.out.println(" -p - reuse temporary precursor index (import path)"); |
111 | 113 | System.out.println(" -s - reuse latest temporary precursor snapshot"); |
112 | 114 | System.out.println(" -t <num> - titles per prefix (default: "+perPrefix+")"); |
| 115 | + System.out.println(" -l rebuild all local indexes from snapshots"); |
113 | 116 | return; |
114 | 117 | } |
115 | 118 | for(int i=0;i<args.length;i++){ |
— | — | @@ -116,24 +119,36 @@ |
117 | 120 | usetemp = true; |
118 | 121 | else if(args[i].equals("-t")) |
119 | 122 | perPrefix = Integer.parseInt(args[++i]); |
120 | | - else if(args[i].equals("-s")){ |
| 123 | + else if(args[i].equals("-l")){ |
| 124 | + useSnapshot = true; |
| 125 | + dbnames.addAll(global.getMyIndexDBnames()); |
| 126 | + } else if(args[i].equals("-s")){ |
121 | 127 | usetemp = true; |
122 | 128 | useSnapshot = true; |
123 | 129 | } else if(args[i].startsWith("-")){ |
124 | 130 | System.out.println("Unrecognized option "+args[i]); |
125 | 131 | return; |
126 | 132 | } else |
127 | | - dbname = args[i]; |
| 133 | + dbnames.add(args[i]); |
128 | 134 | } |
129 | 135 | |
130 | | - IndexId iid = IndexId.get(dbname); |
131 | | - PrefixIndexBuilder builder = usetemp? newForPrefixOnly(iid) : newFromStandalone(iid); |
132 | | - IndexId pre = iid.getPrefix().getPrecursor(); |
133 | | - String precursorPath = pre.getImportPath(); |
134 | | - if(useSnapshot) |
135 | | - precursorPath = IndexRegistry.getInstance().getLatestSnapshot(pre).path; |
136 | | - |
137 | | - builder.createNewFromLinks(perPrefix,usetemp,precursorPath); |
| 136 | + for(String dbname : dbnames){ |
| 137 | + try{ |
| 138 | + log.info("Building prefix index for "+dbname); |
| 139 | + IndexId iid = IndexId.get(dbname); |
| 140 | + if( !iid.hasPrefix() ) |
| 141 | + continue; |
| 142 | + PrefixIndexBuilder builder = usetemp? newForPrefixOnly(iid) : newFromStandalone(iid); |
| 143 | + IndexId pre = iid.getPrefix().getPrecursor(); |
| 144 | + String precursorPath = pre.getImportPath(); |
| 145 | + if(useSnapshot) |
| 146 | + precursorPath = IndexRegistry.getInstance().getLatestSnapshot(pre).path; |
| 147 | + |
| 148 | + builder.createNewFromLinks(perPrefix,usetemp,precursorPath); |
| 149 | + } catch(Exception e){ |
| 150 | + log.error("Exception building prefix index for "+dbname+" "+e.getMessage(),e); |
| 151 | + } |
| 152 | + } |
138 | 153 | } |
139 | 154 | |
140 | 155 | /** |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NetworkStatusThread.java |
— | — | @@ -84,8 +84,8 @@ |
85 | 85 | cache.reInitializeRemote(sh.iid,sh.host); |
86 | 86 | } |
87 | 87 | } catch (RemoteException e) { |
88 | | - log.warn("Host "+sh.host+" still down."); |
89 | | - noRetryHosts.add(sh.host); |
| 88 | + log.warn("Host "+sh.host+" for "+sh.iid+" still down.",e); |
| 89 | + //noRetryHosts.add(sh.host); |
90 | 90 | } catch (Exception e) { |
91 | 91 | e.printStackTrace(); |
92 | 92 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java |
— | — | @@ -127,11 +127,14 @@ |
128 | 128 | for(Range r : highlighted){ |
129 | 129 | sb.append(text.substring(last,r.start)); |
130 | 130 | sb.append(beginTag); |
131 | | - sb.append(text.substring(r.start,r.end)); |
| 131 | + if(r.end > text.length()) |
| 132 | + sb.append(text.substring(r.start)); |
| 133 | + else |
| 134 | + sb.append(text.substring(r.start,r.end)); |
132 | 135 | sb.append(endTag); |
133 | 136 | last = r.end; |
134 | 137 | } |
135 | | - if(last != text.length()) |
| 138 | + if(last < text.length()) |
136 | 139 | sb.append(text.substring(last)); |
137 | 140 | return sb.toString(); |
138 | 141 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java |
— | — | @@ -7,6 +7,7 @@ |
8 | 8 | |
9 | 9 | import org.apache.lucene.analysis.Token; |
10 | 10 | import org.wikimedia.lsearch.analyzers.Alttitles; |
| 11 | +import org.wikimedia.lsearch.analyzers.CJKFilter; |
11 | 12 | import org.wikimedia.lsearch.analyzers.ExtToken; |
12 | 13 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
13 | 14 | import org.wikimedia.lsearch.analyzers.ExtToken.Position; |
— | — | @@ -289,7 +290,7 @@ |
290 | 291 | mid = start; |
291 | 292 | String tt = t.getText(); |
292 | 293 | int len = tt.length(); |
293 | | - if(len>=2){ |
| 294 | + if(len>=2 && CJKFilter.isCJKChar(tt.codePointAt(0))){ |
294 | 295 | // not terminal, calculate new midpoint |
295 | 296 | int point = len-1; |
296 | 297 | if(Character.isSurrogatePair(tt.charAt(len-2),tt.charAt(len-1))) |
Index: branches/lucene-search-2.1/lib/mwdumper.jar |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: branches/lucene-search-2.1/lib/mwdumper.jar |
___________________________________________________________________ |
Name: svn:mime-type |
297 | 298 | + application/octet-stream |