Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test |
— | — | @@ -19,14 +19,16 @@ |
20 | 20 | # host : db1.role, db2.role |
21 | 21 | # Multiple hosts can search multiple dbs (N-N mapping) |
22 | 22 | [Search-Group] |
23 | | -192.168.0.2 : entest, entest.mainpart |
| 23 | +192.168.0.2 : entest.mainpart |
24 | 24 | 192.168.0.5 : entest.mainpart, entest.restpart |
25 | 25 | [Search-Group] |
26 | 26 | 192.168.0.4 : frtest.part1, frtest.part2 |
27 | 27 | 192.168.0.6 : frtest.part3, detest |
28 | 28 | [Search-Group] |
29 | | -192.168.0.10 : entest, entest.mainpart |
| 29 | +192.168.0.10 : entest.mainpart |
30 | 30 | 192.168.0.2 : entest.restpart, rutest |
| 31 | +[Search-Group] |
| 32 | +192.168.0.1 : njawiki |
31 | 33 | |
32 | 34 | # Index nodes |
33 | 35 | # host: db1.role, db2.role |
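With the logical-name expansion added to GlobalConfiguration further down in this change, a searcher host can also list just the logical database name and have its physical parts filled in automatically. An illustrative entry (not part of this change set), assuming entest is defined as a mainsplit database:

```
[Search-Group]
192.168.0.2 : entest    # expanded at load time to entest, entest.mainpart, entest.restpart
```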
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -87,7 +87,7 @@ |
88 | 88 | // nop |
89 | 89 | } |
90 | 90 | |
91 | | - public void closeIndex(){ |
| 91 | + public void closeIndex() throws IOException { |
92 | 92 | writer.close(); |
93 | 93 | } |
94 | 94 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -77,7 +77,7 @@ |
78 | 78 | } |
79 | 79 | } |
80 | 80 | writer.setSimilarity(new WikiSimilarity()); |
81 | | - int glMergeFactor = iid.getIntParam("mergeFactor",2); |
| 81 | + int glMergeFactor = iid.getIntParam("mergeFactor",10); |
82 | 82 | int glMaxBufDocs = iid.getIntParam("maxBufDocs",10); |
83 | 83 | if(mergeFactor!=null) |
84 | 84 | writer.setMergeFactor(mergeFactor); |
— | — | @@ -122,8 +122,9 @@ |
123 | 123 | } |
124 | 124 | } |
125 | 125 | |
126 | | - /** Close and (if specified in global config) optimize indexes */ |
127 | | - public void close(){ |
| 126 | + /** Close and (if specified in global config) optimize indexes |
| 127 | + * @throws IOException */ |
| 128 | + public void close() throws IOException{ |
128 | 129 | for(Entry<String,IndexWriter> en : indexes.entrySet()){ |
129 | 130 | IndexId iid = IndexId.get(en.getKey()); |
130 | 131 | IndexWriter writer = en.getValue(); |
— | — | @@ -137,6 +138,7 @@ |
138 | 139 | writer.close(); |
139 | 140 | } catch(IOException e){ |
140 | 141 | log.warn("I/O error optimizing/closing index at "+iid.getImportPath()); |
| 142 | + throw e; |
141 | 143 | } |
142 | 144 | } |
143 | 145 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -139,7 +139,13 @@ |
140 | 140 | long end = System.currentTimeMillis(); |
141 | 141 | |
142 | 142 | log.info("Closing/optimizing index..."); |
143 | | - dp.closeIndex(); |
| 143 | + try{ |
| 144 | + dp.closeIndex(); |
| 145 | + } catch(IOException e){ |
| 146 | + e.printStackTrace(); |
| 147 | + log.fatal("Cannot close/optimize index : "+e.getMessage()); |
| 148 | + System.exit(1); |
| 149 | + } |
144 | 150 | |
145 | 151 | long finalEnd = System.currentTimeMillis(); |
146 | 152 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -145,6 +145,27 @@ |
146 | 146 | System.out.println("ERROR in GlobalConfiguration: Default path for index absent. Check section [Index-Path]."); |
147 | 147 | return false; |
148 | 148 | } |
| 149 | + // expand logical index names on searchers |
| 150 | + for(String host : search.keySet()){ |
| 151 | + ArrayList<String> hostsearch = search.get(host); |
| 152 | + for(String dbname : hostsearch.toArray(new String[]{})){ |
| 153 | + Hashtable<String, Hashtable<String,String>> types = database.get(dbname); |
| 154 | + if(types != null){ // non-null means the entry is a plain dbname, not a db.part role |
| 155 | + if(types.containsKey("mainsplit")){ |
| 156 | + hostsearch.add(dbname+".mainpart"); |
| 157 | + hostsearch.add(dbname+".restpart"); |
| 158 | + } else if(types.containsKey("split")){ |
| 159 | + int factor = Integer.parseInt(database.get(dbname).get("split").get("number")); |
| 160 | + for(int i=1;i<=factor;i++) |
| 161 | + hostsearch.add(dbname+".part"+i); |
| 162 | + } else if(types.containsKey("nssplit")){ |
| 163 | + int factor = Integer.parseInt(database.get(dbname).get("nssplit").get("number")); |
| 164 | + for(int i=1;i<=factor;i++) |
| 165 | + hostsearch.add(dbname+".nspart"+i); |
| 166 | + } |
| 167 | + } |
| 168 | + } |
| 169 | + } |
149 | 170 | // for each DB check if the corresponding parts are defined |
150 | 171 | // if not, put them in with default values |
151 | 172 | for(String dbname : database.keySet()){ |
— | — | @@ -161,6 +182,13 @@ |
162 | 183 | if(!types.contains(dbpart)) |
163 | 184 | database.get(dbname).put(dbpart,new Hashtable<String,String>()); |
164 | 185 | } |
| 186 | + } else if(types.contains("nssplit")){ |
| 187 | + int factor = Integer.parseInt(database.get(dbname).get("nssplit").get("number")); |
| 188 | + for(int i=1;i<factor+1;i++){ |
| 189 | + String dbpart = "nspart"+i; |
| 190 | + if(!types.contains(dbpart)) |
| 191 | + database.get(dbname).put(dbpart,new Hashtable<String,String>()); |
| 192 | + } |
165 | 193 | } |
166 | 194 | } |
167 | 195 | // check if every db.type has an indexer and searcher |
— | — | @@ -196,7 +224,7 @@ |
197 | 225 | } |
198 | 226 | } |
199 | 227 | boolean searched = (getSearchHosts(dbrole).size() != 0); |
200 | | - if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split"))){ |
| 228 | + if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit"))){ |
201 | 229 | System.out.println("WARNING: in Global Configuration: index "+dbrole+" is not searched by any host."); |
202 | 230 | } |
203 | 231 | } |
— | — | @@ -663,8 +691,15 @@ |
664 | 692 | type.equals("restpart") || type.matches("part[1-9][0-9]*")){ |
665 | 693 | |
666 | 694 | // all params are optional, if absent default will be used |
667 | | - if(tokens.length>1) |
668 | | - params.put("optimize",tokens[1].trim().toLowerCase()); |
| 695 | + if(tokens.length>1){ |
| 696 | + String token = tokens[1].trim().toLowerCase(); |
| 697 | + if(token.equals("true") || token.equals("false")) |
| 698 | + params.put("optimize",token); |
| 699 | + else{ |
| 700 | + System.err.println("Expecting true/false as second paramter of type "+type+" in database def: "+role); |
| 701 | + System.exit(1); |
| 702 | + } |
| 703 | + } |
669 | 704 | if(tokens.length>2) |
670 | 705 | params.put("mergeFactor",tokens[2]); |
671 | 706 | if(tokens.length>3) |
— | — | @@ -718,8 +753,15 @@ |
719 | 754 | params.put("namespaces",ns); |
720 | 755 | |
721 | 756 | // all params are optional, if absent default will be used |
722 | | - if(tokens.length>1) |
723 | | - params.put("optimize",tokens[1].trim().toLowerCase()); |
| 757 | + if(tokens.length>1){ |
| 758 | + String token = tokens[1].trim().toLowerCase(); |
| 759 | + if(token.equals("true") || token.equals("false")) |
| 760 | + params.put("optimize",token); |
| 761 | + else{ |
| 762 | + System.err.println("Expecting true/false as third paramter of type "+type+" in database def: "+role); |
| 763 | + System.exit(1); |
| 764 | + } |
| 765 | + } |
724 | 766 | if(tokens.length>2) |
725 | 767 | params.put("mergeFactor",tokens[2]); |
726 | 768 | if(tokens.length>3) |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/StartupManager.java |
— | — | @@ -63,12 +63,12 @@ |
64 | 64 | } |
65 | 65 | } |
66 | 66 | if(global.isSearcher()){ |
| 67 | + // startup |
| 68 | + (new SearchServer()).start(); |
67 | 69 | // warmup local indexes |
68 | 70 | SearcherCache.getInstance().warmupLocalCache(); |
69 | | - // startup |
70 | | - (new SearchServer()).start(); |
71 | 71 | UpdateThread.getInstance().start(); // updater for local indexes |
72 | | - NetworkStatusThread.getInstance().start(); // network monitor |
| 72 | + NetworkStatusThread.getInstance().start(); // network monitor |
73 | 73 | } |
74 | 74 | |
75 | 75 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -484,5 +484,12 @@ |
485 | 485 | } else |
486 | 486 | return null; |
487 | 487 | } |
| 488 | + |
| 489 | + /** Return the set of namespaces which are searched by this nssplit part */ |
| 490 | + public HashSet<String> getNamespaceSet() { |
| 491 | + return namespaceSet; |
| 492 | + } |
| 493 | + |
| 494 | + |
488 | 495 | |
489 | 496 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearcherCache.java |
— | — | @@ -165,6 +165,8 @@ |
166 | 166 | HashSet<IndexId> mys = global.getMySearch(); |
167 | 167 | for(IndexId iid : mys){ |
168 | 168 | try { |
| 169 | + if(iid.isLogical()) |
| 170 | + continue; |
169 | 171 | IndexSearcherMul is = getLocalSearcher(iid); |
170 | 172 | Warmup.warmupIndexSearcher(is,iid,false); |
171 | 173 | } catch (IOException e) { |
— | — | @@ -262,6 +264,8 @@ |
263 | 265 | log.debug("Openning local index for "+iid); |
264 | 266 | if(!iid.isMySearch()) |
265 | 267 | throw new IOException(iid+" is not searched by this host."); |
| 268 | + if(iid.isLogical()) |
| 269 | + throw new IOException(iid+" will not open logical index."); |
266 | 270 | try { |
267 | 271 | searcher = new IndexSearcherMul(iid.getCanonicalSearchPath()); |
268 | 272 | searcher.setSimilarity(new WikiSimilarity()); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java |
— | — | @@ -57,9 +57,14 @@ |
58 | 58 | } |
59 | 59 | // get the new snapshots via rsync, might be lengthy |
60 | 60 | for(LocalIndex li : forUpdate){ |
61 | | - log.debug("Syncing "+li.iid); |
62 | | - rebuild(li); // rsync, update registry, cache |
63 | | - pending.remove(li.iid.toString()); |
| 61 | + try{ |
| 62 | + log.debug("Syncing "+li.iid); |
| 63 | + rebuild(li); // rsync, update registry, cache |
| 64 | + pending.remove(li.iid.toString()); |
| 65 | + } catch(Exception e){ |
| 66 | + e.printStackTrace(); |
| 67 | + log.error("Error syncing "+li+" : "+e.getMessage()); |
| 68 | + } |
64 | 69 | } |
65 | 70 | } |
66 | 71 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | import org.apache.lucene.search.Searcher; |
21 | 21 | import org.apache.lucene.search.TopDocs; |
22 | 22 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 23 | +import org.wikimedia.lsearch.analyzers.FieldBuilder; |
23 | 24 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
24 | 25 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
25 | 26 | import org.wikimedia.lsearch.beans.ResultSet; |
— | — | @@ -92,18 +93,24 @@ |
93 | 94 | } |
94 | 95 | |
95 | 96 | /** Search mainpart or restpart of the split index */ |
96 | | - public SearchResults searchPart(IndexId iid, Query q, NamespaceFilterWrapper filter, int offset, int limit, boolean explain){ |
| 97 | + public SearchResults searchPart(IndexId iid, String searchterm, Query q, NamespaceFilterWrapper filter, int offset, int limit, boolean explain){ |
97 | 98 | if( ! (iid.isMainsplit() || iid.isNssplit())) |
98 | 99 | return null; |
99 | | - try { |
| 100 | + try { |
100 | 101 | SearcherCache cache = SearcherCache.getInstance(); |
101 | 102 | IndexSearcherMul searcher; |
102 | 103 | long searchStart = System.currentTimeMillis(); |
103 | 104 | |
104 | 105 | searcher = cache.getLocalSearcher(iid); |
105 | | - |
106 | | - Hits hits = searcher.search(q,filter); |
107 | | - return makeSearchResults(searcher,hits,offset,limit,iid,q.toString(),q,searchStart,explain); |
| 106 | + NamespaceFilterWrapper localfilter = filter; |
| 107 | + if(iid.isMainsplit() && iid.isMainPart()) |
| 108 | + localfilter = null; |
| 109 | + else if(iid.isNssplit() && !iid.isLogical() && iid.getNamespaceSet().size()==1) |
| 110 | + localfilter = null; |
| 111 | + if(localfilter != null) |
| 112 | + log.info("Using local filter: "+localfilter); |
| 113 | + Hits hits = searcher.search(q,localfilter); |
| 114 | + return makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
108 | 115 | } catch (IOException e) { |
109 | 116 | SearchResults res = new SearchResults(); |
110 | 117 | res.setErrorMsg("Internal error in SearchEngine: "+e.getMessage()); |
— | — | @@ -121,8 +128,8 @@ |
122 | 129 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase); |
123 | 130 | if(nsDefault == null || nsDefault.cardinality() == 0) |
124 | 131 | nsDefault = new NamespaceFilter("0"); // default to main namespace |
125 | | - FieldNameFactory ff = new FieldNameFactory(exactCase); |
126 | | - WikiQueryParser parser = new WikiQueryParser(ff.contents(),nsDefault,analyzer,ff,WikiQueryParser.NamespacePolicy.IGNORE); |
| 132 | + FieldBuilder.BuilderSet bs = new FieldBuilder(global.getLanguage(iid.getDBname()),exactCase).getBuilder(exactCase); |
| 133 | + WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE); |
127 | 134 | HashSet<NamespaceFilter> fields = parser.getFieldNamespaces(searchterm); |
128 | 135 | NamespaceFilterWrapper nsfw = null; |
129 | 136 | Query q = null; |
— | — | @@ -183,7 +190,7 @@ |
184 | 191 | return res; |
185 | 192 | } |
186 | 193 | RMIMessengerClient messenger = new RMIMessengerClient(); |
187 | | - return messenger.searchPart(piid,q,nsfw,offset,limit,explain,host); |
| 194 | + return messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host); |
188 | 195 | } |
189 | 196 | } |
190 | 197 | // normal search |
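The localfilter logic above drops the namespace filter whenever the physical part already guarantees the namespace restriction, so the remote searcher can avoid filtering entirely. A condensed sketch of that rule using the IndexId accessors seen in this patch (a fragment, not a verbatim extract):

```java
// Sketch: a split part can be searched without a namespace filter when the part
// itself already restricts the namespaces (mainpart = main namespace only, or an
// nssplit part covering exactly one namespace).
static boolean canSkipNamespaceFilter(IndexId iid) {
    if (iid.isMainsplit() && iid.isMainPart())
        return true;
    if (iid.isNssplit() && !iid.isLogical() && iid.getNamespaceSet().size() == 1)
        return true;
    return false;
}
```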
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.lucene.search.Query; |
12 | 12 | import org.apache.lucene.search.TermQuery; |
13 | 13 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 14 | +import org.wikimedia.lsearch.analyzers.FieldBuilder; |
14 | 15 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
15 | 16 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
16 | 17 | import org.wikimedia.lsearch.benchmark.SampleTerms; |
— | — | @@ -62,9 +63,10 @@ |
63 | 64 | |
64 | 65 | /** Warmup index using some number of simple searches */ |
65 | 66 | protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) { |
66 | | - FieldNameFactory fields = new FieldNameFactory(); |
67 | | - WikiQueryParser parser = new WikiQueryParser(fields.contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),fields,WikiQueryParser.NamespacePolicy.IGNORE); |
68 | | - Terms terms = getTermsForLang(global.getLanguage(iid.getDBname())); |
| 67 | + String lang = global.getLanguage(iid.getDBname()); |
| 68 | + FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder(); |
| 69 | + WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE); |
| 70 | + Terms terms = getTermsForLang(lang); |
69 | 71 | |
70 | 72 | try{ |
71 | 73 | for(int i=0; i < count ; i++){ |
— | — | @@ -88,17 +90,15 @@ |
89 | 91 | } |
90 | 92 | |
91 | 93 | /** Get database of example search terms for language */ |
92 | | - protected static Terms getTermsForLang(String language) { |
| 94 | + protected static Terms getTermsForLang(String lang) { |
93 | 95 | String lib = Configuration.open().getString("MWConfig","lib","./lib"); |
94 | | - if(language.equals("en")) |
| 96 | + if("en".equals(lang) || "de".equals(lang) || "es".equals(lang) || "fr".equals(lang) || "it".equals(lang) || "pt".equals(lang)) |
| 97 | + langTerms.put(lang,new WordTerms(lib+"/dict/terms-"+lang+".txt.gz")); |
| 98 | + if(lang.equals("sample")) |
95 | 99 | return new SampleTerms(); |
96 | | - if(language.equals("fr") && langTerms.get("fr")==null) |
97 | | - langTerms.put("fr",new WordTerms(lib+"/dict/french.txt.gz")); |
98 | | - if(language.equals("de") && langTerms.get("de")==null) |
99 | | - langTerms.put("de",new WordTerms(lib+"/dict/german.txt.gz")); |
100 | 100 | |
101 | | - if(langTerms.containsKey(language)) |
102 | | - return langTerms.get(language); |
| 101 | + if(langTerms.containsKey(lang)) |
| 102 | + return langTerms.get(lang); |
103 | 103 | else |
104 | 104 | return langTerms.get("en"); |
105 | 105 | } |
— | — | @@ -119,8 +119,9 @@ |
120 | 120 | /** Just run one complex query and rebuild the main namespace filter */ |
121 | 121 | public static void simpleWarmup(IndexSearcherMul is, IndexId iid){ |
122 | 122 | try{ |
123 | | - FieldNameFactory fields = new FieldNameFactory(); |
124 | | - WikiQueryParser parser = new WikiQueryParser(fields.contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),fields,WikiQueryParser.NamespacePolicy.IGNORE); |
| 123 | + String lang = global.getLanguage(iid.getDBname()); |
| 124 | + FieldBuilder.BuilderSet b = new FieldBuilder(lang).getBuilder(); |
| 125 | + WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE); |
125 | 126 | Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
126 | 127 | is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0"))); |
127 | 128 | } catch (IOException e) { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java |
— | — | @@ -23,6 +23,7 @@ |
24 | 24 | import org.apache.lucene.search.highlight.TextFragment; |
25 | 25 | import org.wikimedia.lsearch.analyzers.Analyzers; |
26 | 26 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 27 | +import org.wikimedia.lsearch.analyzers.FieldBuilder; |
27 | 28 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
28 | 29 | import org.wikimedia.lsearch.analyzers.FilterFactory; |
29 | 30 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
— | — | @@ -126,9 +127,9 @@ |
127 | 128 | boolean exactCase = global.exactCaseIndex(iid.getDBname()); |
128 | 129 | String lang = global.getLanguage(dbname); |
129 | 130 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase); |
130 | | - FieldNameFactory fields = new FieldNameFactory(exactCase); |
131 | | - WikiQueryParser parser = new WikiQueryParser(fields.contents(), |
132 | | - new NamespaceFilter("0"),analyzer,fields,WikiQueryParser.NamespacePolicy.IGNORE); |
| 131 | + FieldBuilder.BuilderSet bs = new FieldBuilder(lang,exactCase).getBuilder(exactCase); |
| 132 | + WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(), |
| 133 | + new NamespaceFilter("0"),analyzer,bs,WikiQueryParser.NamespacePolicy.IGNORE); |
133 | 134 | Query q = parser.parseFourPass(query,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
134 | 135 | Scorer scorer = new QueryScorer(q); |
135 | 136 | SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"searchmatch\">","</span>"); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | } |
174 | 174 | } |
175 | 175 | writer.setSimilarity(new WikiSimilarity()); |
176 | | - int mergeFactor = iid.getIntParam("mergeFactor",2); |
| 176 | + int mergeFactor = iid.getIntParam("mergeFactor",10); |
177 | 177 | int maxBufDocs = iid.getIntParam("maxBufDocs",10); |
178 | 178 | writer.setMergeFactor(mergeFactor); |
179 | 179 | writer.setMaxBufferedDocs(maxBufDocs); |
— | — | @@ -429,21 +429,20 @@ |
430 | 430 | title.setBoost(rankBoost); |
431 | 431 | doc.add(title); |
432 | 432 | |
433 | | - Field stemtitle = new Field(fields.stemtitle(), article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED); |
434 | | - //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
435 | | - stemtitle.setBoost(rankBoost); |
436 | | - doc.add(stemtitle); |
| 433 | + if(bs.getFilters().hasStemmer()){ |
| 434 | + Field stemtitle = new Field(fields.stemtitle(), article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED); |
| 435 | + //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
| 436 | + stemtitle.setBoost(rankBoost); |
| 437 | + doc.add(stemtitle); |
| 438 | + } |
437 | 439 | |
438 | 440 | // put the best redirects as alternative titles |
439 | 441 | makeAltTitles(doc,fields.alttitle(),article); |
440 | 442 | |
441 | | - // add titles of redirects, generated from analyzer |
442 | | - makeKeywordField(doc,fields.redirect(),rankBoost); |
| 443 | + bs.setAddKeywords(checkKeywordPreconditions(article,iid)); |
| 444 | + // most significant words in the text get extra score; generated by the analyzer |
| 445 | + makeKeywordField(doc,fields.keyword(),rankBoost); |
443 | 446 | |
444 | | - if(checkKeywordPreconditions(article,iid)) |
445 | | - // most significat words in the text, gets extra score, from analyzer |
446 | | - makeKeywordField(doc,fields.keyword(),rankBoost); |
447 | | - |
448 | 447 | // the next fields are generated using wikitokenizer |
449 | 448 | doc.add(new Field(fields.contents(), "", |
450 | 449 | Field.Store.NO, Field.Index.TOKENIZED)); |
— | — | @@ -453,10 +452,6 @@ |
454 | 453 | // keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size())); |
455 | 454 | } |
456 | 455 | // make analyzer |
457 | | - if(article.getTitle().equalsIgnoreCase("wiki")){ |
458 | | - int b =10; |
459 | | - b++; |
460 | | - } |
461 | 456 | String text = article.getContents(); |
462 | 457 | Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords()); |
463 | 458 | perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0]; |
— | — | @@ -487,7 +482,7 @@ |
488 | 483 | if(ranks.get(i) == 0) |
489 | 484 | break; // we don't want redirects with zero links |
490 | 485 | //log.info("For "+article+" alttitle"+(i+1)+" "+redirects.get(i)+" = "+ranks.get(i)); |
491 | | - Field alttitle = new Field("alttitle"+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED); |
| 486 | + Field alttitle = new Field(prefix+(i+1), redirects.get(i),Field.Store.NO, Field.Index.TOKENIZED); |
492 | 487 | alttitle.setBoost(calculateArticleRank(ranks.get(i))); |
493 | 488 | doc.add(alttitle); |
494 | 489 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java |
— | — | @@ -12,9 +12,12 @@ |
13 | 13 | public class BuilderSet{ |
14 | 14 | FilterFactory filters; |
15 | 15 | FieldNameFactory fields; |
| 16 | + boolean addKeywords; // whether to add keywords from the beginning of the article |
| 17 | + |
16 | 18 | public BuilderSet(FilterFactory filters, FieldNameFactory fields) { |
17 | 19 | this.filters = filters; |
18 | 20 | this.fields = fields; |
| 21 | + this.addKeywords = false; |
19 | 22 | } |
20 | 23 | public FieldNameFactory getFields() { |
21 | 24 | return fields; |
— | — | @@ -24,11 +27,23 @@ |
25 | 28 | } |
26 | 29 | public boolean isExactCase() { |
27 | 30 | return fields.isExactCase(); |
28 | | - } |
| 31 | + } |
| 32 | + public boolean isAddKeywords() { |
| 33 | + return addKeywords; |
| 34 | + } |
| 35 | + public void setAddKeywords(boolean addKeywords) { |
| 36 | + this.addKeywords = addKeywords; |
| 37 | + } |
| 38 | + |
29 | 39 | } |
30 | 40 | |
31 | 41 | protected BuilderSet[] builders = new BuilderSet[2]; |
32 | 42 | |
| 43 | + /** Construct case-insensitive field builder */ |
| 44 | + public FieldBuilder(String lang){ |
| 45 | + this(lang,false); |
| 46 | + } |
| 47 | + |
33 | 48 | public FieldBuilder(String lang, boolean exactCase){ |
34 | 49 | if(exactCase){ |
35 | 50 | builders = new BuilderSet[2]; |
— | — | @@ -49,5 +64,18 @@ |
50 | 65 | return builders; |
51 | 66 | } |
52 | 67 | |
| 68 | + /** Get the case-insensitive builder */ |
| 69 | + public BuilderSet getBuilder(){ |
| 70 | + return getBuilder(false); |
| 71 | + } |
53 | 72 | |
| 73 | + /** Get BuilderSet for exactCase value */ |
| 74 | + public BuilderSet getBuilder(boolean exactCase){ |
| 75 | + if(exactCase && builders.length > 1) |
| 76 | + return builders[1]; |
| 77 | + else |
| 78 | + return builders[0]; |
| 79 | + } |
| 80 | + |
| 81 | + |
54 | 82 | } |
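The no-argument constructor and the getBuilder() overloads exist so that call sites (SearchEngine, Warmup and HighlightDaemon in this patch) can obtain a parser without indexing into the builder array themselves. A minimal usage sketch assembled from those call sites; the surrounding variables (global, iid, analyzer) are assumed to be in scope:

```java
// Fragment: how the changed call sites construct a query parser now.
String lang = global.getLanguage(iid.getDBname());              // e.g. "en"
boolean exactCase = global.exactCaseIndex(iid.getDBname());
FieldBuilder.BuilderSet bs = new FieldBuilder(lang, exactCase).getBuilder(exactCase);
WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),
        new NamespaceFilter("0"), analyzer, bs, WikiQueryParser.NamespacePolicy.IGNORE);
```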
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java |
— | — | @@ -35,9 +35,6 @@ |
36 | 36 | * |
37 | 37 | * @param str |
38 | 38 | */ |
39 | | - public WikiTokenizer(String str, boolean exactCase){ |
40 | | - this(str,null,exactCase); |
41 | | - } |
42 | 39 | |
43 | 40 | public WikiTokenizer(String str, String lang, boolean exactCase){ |
44 | 41 | parser = new FastWikiTokenizerEngine(str,lang,exactCase); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -51,7 +51,7 @@ |
52 | 52 | private char cl; // lowercased character |
53 | 53 | private boolean numberToken; // if the buffer holds a number token |
54 | 54 | private int headings = 0; // how many headings did we see |
55 | | - private int templateLevel = 0; // level of nestedness of templates |
| 55 | + private int templateLevel = 0; // level of nestedness of templates |
56 | 56 | |
57 | 57 | private int prefixLen = 0; |
58 | 58 | private final char[] prefixBuf = new char[MAX_WORD_LEN]; |
— | — | @@ -78,7 +78,7 @@ |
79 | 79 | enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD, |
80 | 80 | LINK_FETCH, IGNORE, EXTERNAL_URL, EXTERNAL_WORDS, |
81 | 81 | TEMPLATE_BEGIN, TEMPLATE_WORDS, TEMPLATE_END, |
82 | | - TABLE_BEGIN}; |
| 82 | + TABLE_BEGIN, CATEGORY_WORDS }; |
83 | 83 | |
84 | 84 | enum FetchState { WORD, CATEGORY, INTERWIKI, KEYWORD }; |
85 | 85 | |
— | — | @@ -109,10 +109,6 @@ |
110 | 110 | } |
111 | 111 | } |
112 | 112 | |
113 | | - public FastWikiTokenizerEngine(String text, boolean exactCase){ |
114 | | - this(text,null,exactCase); |
115 | | - } |
116 | | - |
117 | 113 | public FastWikiTokenizerEngine(String text, String lang, boolean exactCase){ |
118 | 114 | this.text = text.toCharArray(); |
119 | 115 | this.textString = text; |
— | — | @@ -227,22 +223,27 @@ |
228 | 224 | } |
229 | 225 | } |
230 | 226 | // make the original buffered version |
231 | | - Token exact; |
232 | | - if(exactCase) |
233 | | - exact = new Token( |
234 | | - new String(buffer, 0, length), start, start + length); |
235 | | - else |
236 | | - exact = new Token( |
237 | | - new String(buffer, 0, length).toLowerCase(), start, start + length); |
238 | | - if(addDecomposed && decompLength!=0) |
239 | | - exact.setType("unicode"); |
240 | | - tokens.add(exact); |
| 227 | + // TODO: maybe do this optionally for some languages |
| 228 | + /* if(!("de".equals(language) && aliasLength!=0)){ |
| 229 | + Token exact; |
| 230 | + if(exactCase) |
| 231 | + exact = new Token( |
| 232 | + new String(buffer, 0, length), start, start + length); |
| 233 | + else |
| 234 | + exact = new Token( |
| 235 | + new String(buffer, 0, length).toLowerCase(), start, start + length); |
| 236 | + if(addDecomposed && decompLength!=0) |
| 237 | + exact.setType("unicode"); |
| 238 | + tokens.add(exact); |
| 239 | + } */ |
241 | 240 | // add decomposed token to stream |
242 | | - if(addDecomposed && decompLength!=0){ |
| 241 | + if(decompLength!=0){ |
243 | 242 | Token t = new Token( |
244 | 243 | new String(decompBuffer, 0, decompLength), start, start + length); |
245 | | - t.setPositionIncrement(0); |
246 | | - t.setType("transliteration"); |
| 244 | + /*if(!"de".equals(language)){ |
| 245 | + t.setPositionIncrement(0); |
| 246 | + t.setType("transliteration"); |
| 247 | + } */ |
247 | 248 | tokens.add(t); |
248 | 249 | } |
249 | 250 | // add alias (if any) token to stream |
— | — | @@ -434,6 +435,7 @@ |
435 | 436 | String prefix = ""; |
436 | 437 | char ignoreEnd = ' '; // end of ignore block |
437 | 438 | int pipeInx = 0; |
| 439 | + int fetchStart = -1; // start index of link fetching |
438 | 440 | |
439 | 441 | if(tokens == null) |
440 | 442 | tokens = new ArrayList<Token>(); |
— | — | @@ -448,7 +450,7 @@ |
449 | 451 | c = text[cur]; |
450 | 452 | |
451 | 453 | // actions for various parser states |
452 | | - switch(state){ |
| 454 | + switch(state){ |
453 | 455 | case WORD: |
454 | 456 | switch(c){ |
455 | 457 | case '=': |
— | — | @@ -548,6 +550,7 @@ |
549 | 551 | cur = semicolonInx; |
550 | 552 | fetch = FetchState.CATEGORY; |
551 | 553 | state = ParserState.LINK_FETCH; |
| 554 | + fetchStart = cur; |
552 | 555 | continue; |
553 | 556 | } else if(isInterwiki(prefix)){ |
554 | 557 | cur = semicolonInx; |
— | — | @@ -615,7 +618,7 @@ |
616 | 619 | |
617 | 620 | if(length<buffer.length) |
618 | 621 | buffer[length++] = c; |
619 | | - continue; |
| 622 | + continue; |
620 | 623 | case LINK_END: |
621 | 624 | if(c == ']'){ // good link ending |
622 | 625 | state = ParserState.WORD; |
— | — | @@ -628,6 +631,13 @@ |
629 | 632 | categories.add(new String(buffer,0,length)); |
630 | 633 | length = 0; |
631 | 634 | fetch = FetchState.WORD; |
| 635 | + // index category words |
| 636 | + if(fetchStart != -1){ |
| 637 | + cur = fetchStart; |
| 638 | + state = ParserState.CATEGORY_WORDS; |
| 639 | + } else |
| 640 | + System.err.print("ERROR: Inconsistent parser state, attepmted category backtrace for uninitalized fetchStart."); |
| 641 | + fetchStart = -1; |
632 | 642 | continue; |
633 | 643 | case INTERWIKI: |
634 | 644 | interwikis.put(prefix, |
— | — | @@ -648,6 +658,22 @@ |
649 | 659 | continue; |
650 | 660 | } |
651 | 661 | continue; |
| 662 | + case CATEGORY_WORDS: |
| 663 | + if(c == ']'){ |
| 664 | + state = ParserState.WORD; // end of category |
| 665 | + continue; |
| 666 | + } else if(c == '|'){ // ignore everything up to ] |
| 667 | + for( lookup = cur + 1 ; lookup < textLength ; lookup++ ){ |
| 668 | + if(text[lookup] == ']'){ // we know the syntax is correct since we checked it in LINK_FETCH |
| 669 | + state = ParserState.WORD; |
| 670 | + cur = lookup; |
| 671 | + break; |
| 672 | + } |
| 673 | + } |
| 674 | + continue; |
| 675 | + } |
| 676 | + addLetter(); |
| 677 | + continue; |
652 | 678 | case TABLE_BEGIN: |
653 | 679 | // ignore everything up to the newline, since they are table display params |
654 | 680 | while(cur < textLength && (text[cur]!='\r' && text[cur]!='\n')) |
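The new CATEGORY_WORDS state backtracks to the start of a category link (fetchStart) so the words of the category name are also emitted as ordinary text tokens instead of only being collected into the categories list. A rough illustration of the intended effect; this is not captured tokenizer output, which also depends on the downstream filters:

```java
// Illustrative only:
// input:   [[Category:Mathematics|Name]]
// before:  "Mathematics" is recorded in the categories list but yields no text tokens
// after:   "Mathematics" is still recorded as a category AND re-parsed as ordinary
//          tokens; the display text after '|' is skipped up to the closing ']'
```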
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -105,6 +105,7 @@ |
106 | 106 | private NamespacePolicy namespacePolicy; |
107 | 107 | protected NamespaceFilter defaultNamespaceFilter; |
108 | 108 | protected static GlobalConfiguration global=null; |
| 109 | + protected FieldBuilder.BuilderSet builder; |
109 | 110 | protected FieldNameFactory fields; |
110 | 111 | |
111 | 112 | /** default value for boolean queries */ |
— | — | @@ -130,8 +131,8 @@ |
131 | 132 | * @param field default field name |
132 | 133 | * @param analyzer |
133 | 134 | */ |
134 | | - public WikiQueryParser(String field, Analyzer analyzer, FieldNameFactory fields){ |
135 | | - this(field,(NamespaceFilter)null,analyzer,fields,NamespacePolicy.LEAVE); |
| 135 | + public WikiQueryParser(String field, Analyzer analyzer, FieldBuilder.BuilderSet builder){ |
| 136 | + this(field,(NamespaceFilter)null,analyzer,builder,NamespacePolicy.LEAVE); |
136 | 137 | } |
137 | 138 | |
138 | 139 | /** |
— | — | @@ -142,14 +143,15 @@ |
143 | 144 | * @param analyzer |
144 | 145 | * @param nsPolicy |
145 | 146 | */ |
146 | | - public WikiQueryParser(String field, String namespace, Analyzer analyzer, FieldNameFactory fields, NamespacePolicy nsPolicy){ |
147 | | - this(field,new NamespaceFilter(namespace),analyzer,fields,nsPolicy); |
| 147 | + public WikiQueryParser(String field, String namespace, Analyzer analyzer, FieldBuilder.BuilderSet builder, NamespacePolicy nsPolicy){ |
| 148 | + this(field,new NamespaceFilter(namespace),analyzer,builder,nsPolicy); |
148 | 149 | } |
149 | 150 | |
150 | | - public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, FieldNameFactory fields, NamespacePolicy nsPolicy){ |
| 151 | + public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, FieldBuilder.BuilderSet builder, NamespacePolicy nsPolicy){ |
151 | 152 | defaultField = field; |
152 | 153 | this.analyzer = analyzer; |
153 | | - this.fields = fields; |
| 154 | + this.builder = builder; |
| 155 | + this.fields = builder.getFields(); |
154 | 156 | tokens = new ArrayList<Token>(); |
155 | 157 | this.namespacePolicy = nsPolicy; |
156 | 158 | disableTitleAliases = true; |
— | — | @@ -999,6 +1001,8 @@ |
1000 | 1002 | } else if(q instanceof PhraseQuery){ // -> SpanNearQuery(slop=0,inOrder=true) |
1001 | 1003 | PhraseQuery pq = (PhraseQuery)q; |
1002 | 1004 | Term[] terms = pq.getTerms(); |
| 1005 | + if(terms == null || terms.length==0) |
| 1006 | + continue; |
1003 | 1007 | if(terms[0].field().equals("category")){ |
1004 | 1008 | categories.add(q); |
1005 | 1009 | } else{ |
— | — | @@ -1081,12 +1085,6 @@ |
1082 | 1086 | defaultBoost = olfDefaultBoost; |
1083 | 1087 | defaultAliasBoost = ALIAS_BOOST; |
1084 | 1088 | |
1085 | | - BooleanQuery qs = multiplySpans(qt,0,fields.redirect(),REDIRECT_BOOST); |
1086 | | - // merge queries |
1087 | | - if(qs != null){ |
1088 | | - for(BooleanClause bc : qs.getClauses()) |
1089 | | - bq.add(bc); |
1090 | | - } |
1091 | 1089 | if(bq.getClauses() == null || bq.getClauses().length==0) |
1092 | 1090 | return null; |
1093 | 1091 | else |
— | — | @@ -1099,12 +1097,15 @@ |
1100 | 1098 | String contentField = defaultField; |
1101 | 1099 | float olfDefaultBoost = defaultBoost; |
1102 | 1100 | defaultField = fields.title(); // now parse the title part |
1103 | | - defaultBoost = TITLE_BOOST; |
| 1101 | + if(ADD_STEM_TITLE && builder.getFilters().hasStemmer()) |
| 1102 | + defaultBoost = TITLE_BOOST; // we have stem titles |
| 1103 | + else |
| 1104 | + defaultBoost = TITLE_BOOST+STEM_TITLE_BOOST; // no stem titles, add-up boosts |
1104 | 1105 | defaultAliasBoost = TITLE_ALIAS_BOOST; |
1105 | 1106 | Query qt = parseRaw(queryText); |
1106 | 1107 | Query qs = null; |
1107 | 1108 | // stemmed title |
1108 | | - if(ADD_STEM_TITLE){ |
| 1109 | + if(ADD_STEM_TITLE && builder.getFilters().hasStemmer()){ |
1109 | 1110 | defaultField = fields.stemtitle(); |
1110 | 1111 | defaultBoost = STEM_TITLE_BOOST; |
1111 | 1112 | defaultAliasBoost = STEM_TITLE_ALIAS_BOOST; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -55,7 +55,7 @@ |
56 | 56 | PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer()); |
57 | 57 | WikiTokenizer tokenizer = null; |
58 | 58 | for(FieldBuilder.BuilderSet bs : builder.getBuilders()){ |
59 | | - tokenizer = addFieldsForIndexing(perFieldAnalyzer,text,bs.getFilters(),bs.getFields(),redirects,bs.isExactCase()); |
| 59 | + tokenizer = addFieldsForIndexing(perFieldAnalyzer,text,bs.getFilters(),bs.getFields(),redirects,bs.isExactCase(),bs.isAddKeywords()); |
60 | 60 | } |
61 | 61 | return new Object[] {perFieldAnalyzer,tokenizer}; |
62 | 62 | } |
— | — | @@ -64,26 +64,30 @@ |
65 | 65 | * Add some fields to indexer's analyzer. |
66 | 66 | * |
67 | 67 | */ |
68 | | - public static WikiTokenizer addFieldsForIndexing(PerFieldAnalyzerWrapper perFieldAnalyzer, String text, FilterFactory filters, FieldNameFactory fields, ArrayList<String> redirects, boolean exactCase) { |
| 68 | + public static WikiTokenizer addFieldsForIndexing(PerFieldAnalyzerWrapper perFieldAnalyzer, String text, FilterFactory filters, FieldNameFactory fields, ArrayList<String> redirects, boolean exactCase, boolean addKeywords) { |
69 | 69 | // parse wiki-text to get categories |
70 | 70 | WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase); |
71 | 71 | tokenizer.tokenize(); |
72 | 72 | ArrayList<String> categories = tokenizer.getCategories(); |
73 | 73 | |
| 74 | + ArrayList<String> allKeywords = new ArrayList<String>(); |
| 75 | + if(addKeywords && tokenizer.getKeywords()!=null) |
| 76 | + allKeywords.addAll(tokenizer.getKeywords()); |
| 77 | + if(redirects!=null) |
| 78 | + allKeywords.addAll(redirects); |
| 79 | + |
74 | 80 | perFieldAnalyzer.addAnalyzer(fields.contents(), |
75 | 81 | new LanguageAnalyzer(filters,tokenizer)); |
76 | 82 | perFieldAnalyzer.addAnalyzer("category", |
77 | | - new CategoryAnalyzer(categories)); |
| 83 | + new CategoryAnalyzer(categories,exactCase)); |
78 | 84 | perFieldAnalyzer.addAnalyzer(fields.title(), |
79 | 85 | getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
80 | 86 | perFieldAnalyzer.addAnalyzer(fields.stemtitle(), |
81 | 87 | getTitleAnalyzer(filters,exactCase)); |
82 | 88 | setAltTitleAnalyzer(perFieldAnalyzer,fields.alttitle(), |
83 | 89 | getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
84 | | - setKeywordAnalyzer(perFieldAnalyzer,fields.redirect(), |
85 | | - new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),fields.redirect(),exactCase)); |
86 | 90 | setKeywordAnalyzer(perFieldAnalyzer,fields.keyword(), |
87 | | - new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),fields.keyword(),exactCase)); |
| 91 | + new KeywordsAnalyzer(allKeywords,filters.getNoStemmerFilterFactory(),fields.keyword(),exactCase)); |
88 | 92 | return tokenizer; |
89 | 93 | } |
90 | 94 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java |
— | — | @@ -46,13 +46,6 @@ |
47 | 47 | return "alttitle"; |
48 | 48 | } |
49 | 49 | |
50 | | - public String redirect(){ |
51 | | - if(exactCase) |
52 | | - return "redirect_exact"; |
53 | | - else |
54 | | - return "redirect"; |
55 | | - } |
56 | | - |
57 | 50 | public String keyword(){ |
58 | 51 | if(exactCase) |
59 | 52 | return "keyword_exact"; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -61,7 +61,7 @@ |
62 | 62 | keywordsBySize.add(new ArrayList<String>()); |
63 | 63 | // arange keywords into a list by token number |
64 | 64 | for(String k : keywords){ |
65 | | - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,exactCase).parse(); |
| 65 | + ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,filters.getLanguage(),exactCase).parse(); |
66 | 66 | if(parsed.size() == 0) |
67 | 67 | continue; |
68 | 68 | else if(parsed.size() < KEYWORD_LEVELS) |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java |
— | — | @@ -25,7 +25,7 @@ |
26 | 26 | */ |
27 | 27 | @Override |
28 | 28 | public TokenStream tokenStream(String fieldName, String text) { |
29 | | - wikitokenizer = new WikiTokenizer(text,exactCase); |
| 29 | + wikitokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase); |
30 | 30 | return super.tokenStream(fieldName,(Reader)null); |
31 | 31 | } |
32 | 32 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | protected Iterator<String> tokensIt; |
21 | 21 | protected int start; |
22 | 22 | |
| 23 | + |
23 | 24 | ArrayTokenStream(ArrayList<String> tokens){ |
24 | 25 | this.tokens = tokens; |
25 | 26 | tokensIt = tokens.iterator(); |
— | — | @@ -28,7 +29,12 @@ |
29 | 30 | @Override |
30 | 31 | public Token next() throws IOException { |
31 | 32 | if(tokensIt.hasNext()){ |
32 | | - String text = tokensIt.next(); |
| 33 | + String text; |
| 34 | + if(!exactCase) |
| 35 | + text = tokensIt.next().toLowerCase(); |
| 36 | + else |
| 37 | + text = tokensIt.next(); |
| 38 | + |
33 | 39 | Token token = new Token(text,start,start+text.length()); |
34 | 40 | start += text.length()+1; |
35 | 41 | return token; |
— | — | @@ -39,9 +45,11 @@ |
40 | 46 | } |
41 | 47 | |
42 | 48 | ArrayList<String> categories; |
| 49 | + protected boolean exactCase; |
43 | 50 | |
44 | | - public CategoryAnalyzer(ArrayList<String> categories) { |
| 51 | + public CategoryAnalyzer(ArrayList<String> categories, boolean exactCase) { |
45 | 52 | this.categories = categories; |
| 53 | + this.exactCase = exactCase; |
46 | 54 | } |
47 | 55 | |
48 | 56 | @Override |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java |
— | — | @@ -60,14 +60,15 @@ |
61 | 61 | |
62 | 62 | /** |
63 | 63 | * Syntax: |
64 | | - * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-e dbname] [-n] dbname1 dbname2 ... |
| 64 | + * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-e dbname] [-n] [--no-ranks] dbname1 dbname2 ... |
65 | 65 | * Options: |
66 | 66 | * -d - daemonize, otherwise runs only one round of updates to dbs |
67 | | - * -s - sleep time after one cycle (default: 30000ms) |
| 67 | + * -s - sleep time after one cycle (default: 30s) |
68 | 68 | * -t - default timestamp if status file is missing (default: 2001-01-01) |
69 | 69 | * -f - file to read databases from |
70 | 70 | * -n - wait for notification of flush after done updating one db (default: true) |
71 | 71 | * -e - exclude dbname from incremental updates (overrides -f) |
| 72 | + * --no-ranks - don't fetch ranks |
72 | 73 | * |
73 | 74 | * @param args |
74 | 75 | */ |
— | — | @@ -81,12 +82,13 @@ |
82 | 83 | boolean notification = true; |
83 | 84 | HashSet<String> excludeList = new HashSet<String>(); |
84 | 85 | HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass |
| 86 | + boolean fetchReferences = true; |
85 | 87 | // args |
86 | 88 | for(int i=0; i<args.length; i++){ |
87 | 89 | if(args[i].equals("-d")) |
88 | 90 | daemon = true; |
89 | 91 | else if(args[i].equals("-s")) |
90 | | - sleepTime = Long.parseLong(args[++i]); |
| 92 | + sleepTime = Long.parseLong(args[++i])*1000; |
91 | 93 | else if(args[i].equals("-t")) |
92 | 94 | timestamp = args[++i]; |
93 | 95 | else if(args[i].equals("-f")) |
— | — | @@ -95,6 +97,8 @@ |
96 | 98 | excludeList.add(args[++i]); |
97 | 99 | else if(args[i].equals("-n")) |
98 | 100 | notification = true; |
| 101 | + else if(args[i].equals("--no-ranks")) |
| 102 | + fetchReferences = false; |
99 | 103 | else if(args[i].equals("--help")) |
100 | 104 | break; |
101 | 105 | else if(args[i].startsWith("-")){ |
— | — | @@ -119,14 +123,15 @@ |
120 | 124 | } |
121 | 125 | } |
122 | 126 | if(dbnames.size() == 0){ |
123 | | - System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-e dbname] [-f dblist] dbname1 dbname2 ..."); |
| 127 | + System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-e dbname] [-f dblist] [-n] [--no-ranks] dbname1 dbname2 ..."); |
124 | 128 | System.out.println("Options:"); |
125 | 129 | System.out.println(" -d - daemonize, otherwise runs only one round of updates to dbs"); |
126 | | - System.out.println(" -s - sleep time after one cycle (default: "+sleepTime+"ms)"); |
| 130 | + System.out.println(" -s - sleep time in seconds after one cycle (default: "+sleepTime+"ms)"); |
127 | 131 | System.out.println(" -t - timestamp to start from (if status is missing default: "+timestamp+")"); |
128 | 132 | System.out.println(" -f - dblist file, one dbname per line"); |
129 | 133 | System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")"); |
130 | 134 | System.out.println(" -e - exclude dbname from incremental updates (overrides -f)"); |
| 135 | + System.out.println(" --no-ranks - don't try to fetch any article rank data"); |
131 | 136 | return; |
132 | 137 | } |
133 | 138 | // config |
— | — | @@ -173,8 +178,10 @@ |
174 | 179 | continue; |
175 | 180 | boolean hasMore = false; |
176 | 181 | do{ |
177 | | - // fetch references for records |
178 | | - fetchReferences(records,dbname); |
| 182 | + if(fetchReferences){ |
| 183 | + // fetch references for records |
| 184 | + fetchReferences(records,dbname); |
| 185 | + } |
179 | 186 | for(IndexUpdateRecord rec : records){ |
180 | 187 | Article ar = rec.getArticle(); |
181 | 188 | log.info("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects()); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -123,10 +123,12 @@ |
124 | 124 | log.warn("Property Localization.url not set in config file. Localization disabled."); |
125 | 125 | return false; |
126 | 126 | } |
| 127 | + if(!loc.endsWith("/")) |
| 128 | + loc += "/"; |
127 | 129 | log.info("Reading localization for "+langCode); |
128 | 130 | URL url; |
129 | 131 | try { |
130 | | - url = new URL(MessageFormat.format(loc,langCode)); |
| 132 | + url = new URL(MessageFormat.format(loc+"Messages{0}.php",langCode)); |
131 | 133 | |
132 | 134 | PHPParser parser = new PHPParser(); |
133 | 135 | String text = parser.readURL(url); |
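Localization.url is now treated as a base directory (a trailing slash is appended if missing) rather than as a complete MessageFormat pattern, and the Messages{0}.php file name is appended in code. A small sketch of the resulting URL, using a hypothetical base URL:

```java
import java.text.MessageFormat;

// Sketch of the URL construction after this change; the base URL is hypothetical.
public class LocalizationUrlExample {
    public static void main(String[] args) {
        String loc = "http://example.org/phase3/languages/messages"; // hypothetical Localization.url
        if (!loc.endsWith("/"))
            loc += "/";
        String url = MessageFormat.format(loc + "Messages{0}.php", "En");
        System.out.println(url); // http://example.org/phase3/languages/messages/MessagesEn.php
    }
}
```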
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -86,7 +86,7 @@ |
87 | 87 | showTokens(text); |
88 | 88 | text = "This are [[bean]]s and more [[bla]]njah also Großmann"; |
89 | 89 | showTokens(text); |
90 | | - text = "[[Category:Blah Blah?!]], and [[:Category:Link to something]]"; |
| 90 | + text = "[[Category:Blah Blah?!]], and [[:Category:Link to something]] [[Category:Mathematics|Name]]"; |
91 | 91 | showTokens(text); |
92 | 92 | text = "[[sr:Glavna stranica]], and [[:Category:Link to category]]"; |
93 | 93 | showTokens(text); |
— | — | @@ -114,7 +114,7 @@ |
115 | 115 | for(int i=0;i<2000;i++){ |
116 | 116 | for(TestArticle article : articles){ |
117 | 117 | String text = article.content; |
118 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,false); |
| 118 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"en",false); |
119 | 119 | parser.parse(); |
120 | 120 | } |
121 | 121 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java |
— | — | @@ -58,6 +58,6 @@ |
59 | 59 | if(streams.get(fieldName) != null) |
60 | 60 | return streams.get(fieldName); |
61 | 61 | |
62 | | - return new AliasPorterStemFilter(new WikiTokenizer(text,false)); |
| 62 | + return new AliasPorterStemFilter(new WikiTokenizer(text,"en",false)); |
63 | 63 | } |
64 | 64 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.lucene.search.BooleanQuery; |
12 | 12 | import org.apache.lucene.search.Query; |
13 | 13 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 14 | +import org.wikimedia.lsearch.analyzers.FieldBuilder; |
14 | 15 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
15 | 16 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
16 | 17 | import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy; |
— | — | @@ -37,10 +38,10 @@ |
38 | 39 | WikiQueryParser.ALT_TITLE_BOOST = 6; |
39 | 40 | WikiQueryParser.KEYWORD_BOOST = 0.05f; |
40 | 41 | WikiIndexModifier.ALT_TITLES = 3; |
41 | | - WikiQueryParser.ADD_STEM_TITLE=false; |
| 42 | + FieldBuilder.BuilderSet bs = new FieldBuilder("").getBuilder(); |
42 | 43 | FieldNameFactory ff = new FieldNameFactory(); |
43 | 44 | try{ |
44 | | - WikiQueryParser parser = new WikiQueryParser(ff.contents(),new SimpleAnalyzer(),ff); |
| 45 | + WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),new SimpleAnalyzer(),bs); |
45 | 46 | Query q; |
46 | 47 | HashSet<String> fields; |
47 | 48 | |
— | — | @@ -115,11 +116,11 @@ |
116 | 117 | assertTrue(fields.contains("contents")); |
117 | 118 | |
118 | 119 | // namespace policies |
119 | | - parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), ff, WikiQueryParser.NamespacePolicy.IGNORE); |
| 120 | + parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), bs, WikiQueryParser.NamespacePolicy.IGNORE); |
120 | 121 | q = parser.parseRaw("help:making breakfast incategory:food"); |
121 | 122 | assertEquals("+contents:making +contents:breakfast +category:food",q.toString()); |
122 | 123 | |
123 | | - parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), ff, WikiQueryParser.NamespacePolicy.REWRITE); |
| 124 | + parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), bs, WikiQueryParser.NamespacePolicy.REWRITE); |
124 | 125 | q = parser.parseRaw("help:making breakfast incategory:food"); |
125 | 126 | assertEquals("+namespace:12 +(+contents:making +contents:breakfast +category:food)",q.toString()); |
126 | 127 | |
— | — | @@ -141,7 +142,7 @@ |
142 | 143 | |
143 | 144 | // ====== English Analyzer ======== |
144 | 145 | |
145 | | - parser = new WikiQueryParser(ff.contents(),"0",new EnglishAnalyzer(), ff, WikiQueryParser.NamespacePolicy.REWRITE); |
| 146 | + parser = new WikiQueryParser(ff.contents(),"0",new EnglishAnalyzer(), bs, WikiQueryParser.NamespacePolicy.REWRITE); |
146 | 147 | q = parser.parseRaw("main_talk:laziness"); |
147 | 148 | assertEquals("+namespace:1 +(contents:laziness contents:lazi^0.5)",q.toString()); |
148 | 149 | |
— | — | @@ -157,7 +158,7 @@ |
158 | 159 | q = parser.parse("(help:making something incategory:blah) OR (rest incategory:crest)"); |
159 | 160 | assertEquals("(+namespace:12 +(+(+(contents:making contents:make^0.5) title:making^2.0) +(+(contents:something contents:someth^0.5) title:something^2.0) +category:blah)) (+namespace:0 +(+(+contents:rest +category:crest) title:rest^2.0))",q.toString()); |
160 | 161 | |
161 | | - parser = new WikiQueryParser(ff.contents(),new EnglishAnalyzer(),ff); |
| 162 | + parser = new WikiQueryParser(ff.contents(),new EnglishAnalyzer(),bs); |
162 | 163 | |
163 | 164 | q = parser.parseRaw("laziness"); |
164 | 165 | assertEquals("contents:laziness contents:lazi^0.5",q.toString()); |
— | — | @@ -207,7 +208,10 @@ |
208 | 209 | // Tests with actual params :) |
209 | 210 | // ================================== |
210 | 211 | Analyzer analyzer = Analyzers.getSearcherAnalyzer("en"); |
211 | | - parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE); |
| 212 | + bs = new FieldBuilder("en").getBuilder(); |
| 213 | + parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE); |
| 214 | + WikiQueryParser.ADD_STEM_TITLE = false; |
| 215 | + WikiQueryParser.STEM_TITLE_BOOST = 0; |
212 | 216 | q = parser.parseTwoPass("beans everyone",null); |
213 | 217 | assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0)",q.toString()); |
214 | 218 | |
— | — | @@ -289,14 +293,14 @@ |
290 | 294 | |
291 | 295 | // Redirect third/forth pass tests |
292 | 296 | q = parser.parseFourPass("beans",NamespacePolicy.IGNORE,true); |
293 | | - assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 (alttitle1:beans^6.0 alttitle2:beans^6.0 alttitle3:beans^6.0 redirect1:beans^0.2 redirect2:beans^0.1 redirect3:beans^0.06666667 redirect4:beans^0.05 redirect5:beans^0.04) (keyword1:beans^0.05 keyword2:beans^0.025 keyword3:beans^0.016666668 keyword4:beans^0.0125 keyword5:beans^0.01)",q.toString()); |
| 297 | + assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 (alttitle1:beans^6.0 alttitle2:beans^6.0 alttitle3:beans^6.0) (keyword1:beans^0.05 keyword2:beans^0.025 keyword3:beans^0.016666668 keyword4:beans^0.0125 keyword5:beans^0.01)",q.toString()); |
294 | 298 | |
295 | 299 | q = parser.parseFourPass("beans everyone",NamespacePolicy.IGNORE,true); |
296 | | - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0) spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04) (spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01)",q.toString()); |
| 300 | + assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01)",q.toString()); |
297 | 301 | |
298 | 302 | // TODO: check if this query will be optimized by lucene (categories) |
299 | 303 | q = parser.parseFourPass("beans everyone incategory:mouse",NamespacePolicy.IGNORE,true); |
300 | | - assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0 +category:mouse) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0 +category:mouse) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0 +category:mouse) (+spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2 +category:mouse) (+spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1 +category:mouse) (+spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667 +category:mouse) (+spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05 +category:mouse) (+spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04 +category:mouse)) ((+spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 +category:mouse) (+spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 +category:mouse) (+spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 +category:mouse) (+spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 +category:mouse) (+spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01 +category:mouse))",q.toString()); |
| 304 | + assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) ((+alttitle1:beans^6.0 +alttitle1:everyone^6.0 +category:mouse) (+alttitle2:beans^6.0 +alttitle2:everyone^6.0 +category:mouse) (+alttitle3:beans^6.0 +alttitle3:everyone^6.0 +category:mouse)) ((+spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05 +category:mouse) (+spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025 +category:mouse) (+spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668 +category:mouse) (+spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125 +category:mouse) (+spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01 +category:mouse))",q.toString()); |
301 | 305 | |
302 | 306 | q = parser.parseFourPass("beans OR everyone",NamespacePolicy.IGNORE,true); |
303 | 307 | assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0) ((alttitle1:beans^6.0 alttitle1:everyone^6.0) (alttitle2:beans^6.0 alttitle2:everyone^6.0) (alttitle3:beans^6.0 alttitle3:everyone^6.0))",q.toString()); |
— | — | @@ -305,7 +309,7 @@ |
306 | 310 | assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0) ((+alttitle1:beans^6.0 -alttitle1:everyone^6.0) (+alttitle2:beans^6.0 -alttitle2:everyone^6.0) (+alttitle3:beans^6.0 -alttitle3:everyone^6.0))",q.toString()); |
307 | 311 | |
308 | 312 | q = parser.parseFourPass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE,true); |
309 | | - assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect1:beans, redirect1:everyone], 100, false)^0.2) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect2:beans, redirect2:everyone], 100, false)^0.1) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect3:beans, redirect3:everyone], 100, false)^0.06666667) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect4:beans, redirect4:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect5:beans, redirect5:everyone], 100, false)^0.04)) ((+(namespace:0 namespace:1 namespace:2) +spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01))",q.toString()); |
| 313 | + assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0))) ((+(namespace:0 namespace:1 namespace:2) +spanNear([keyword1:beans, keyword1:everyone], 100, false)^0.05) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword2:beans, keyword2:everyone], 100, false)^0.025) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword3:beans, keyword3:everyone], 100, false)^0.016666668) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword4:beans, keyword4:everyone], 100, false)^0.0125) (+(namespace:0 namespace:1 namespace:2) +spanNear([keyword5:beans, keyword5:everyone], 100, false)^0.01))",q.toString()); |
310 | 314 | |
311 | 315 | q = parser.parseFourPass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE,true); |
312 | 316 | assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0)) (((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+namespace:0 +alttitle1:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+namespace:0 +alttitle2:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+namespace:0 +alttitle3:mainly^6.0)))",q.toString()); |
— | — | @@ -315,53 +319,61 @@ |
316 | 320 | |
317 | 321 | // alternative transliterations |
318 | 322 | q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true); |
319 | | - assertEquals("(+(contents:something contents:someth^0.5) +contents:for +((contents:gödels contents:gödel^0.5) (contents:godels contents:godel^0.5) (contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +((title:gödels^2.0 title:godels^2.0 title:goedels^2.0))) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +((alttitle1:gödels^6.0 alttitle1:godels^6.0 alttitle1:goedels^6.0))) (+alttitle2:something^6.0 +alttitle2:for^6.0 +((alttitle2:gödels^6.0 alttitle2:godels^6.0 alttitle2:goedels^6.0))) (+alttitle3:something^6.0 +alttitle3:for^6.0 +((alttitle3:gödels^6.0 alttitle3:godels^6.0 alttitle3:goedels^6.0))))",q.toString()); |
| 323 | + assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) (contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString()); |
320 | 324 | |
321 | 325 | q = parser.parseFourPass("Something for Gödel",NamespacePolicy.IGNORE,true); |
322 | | - assertEquals("(+(contents:something contents:someth^0.5) +contents:for +((contents:gödel contents:godel contents:goedel))) (+title:something^2.0 +title:for^2.0 +((title:gödel^2.0 title:godel^2.0 title:goedel^2.0))) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +((alttitle1:gödel^6.0 alttitle1:godel^6.0 alttitle1:goedel^6.0))) (+alttitle2:something^6.0 +alttitle2:for^6.0 +((alttitle2:gödel^6.0 alttitle2:godel^6.0 alttitle2:goedel^6.0))) (+alttitle3:something^6.0 +alttitle3:for^6.0 +((alttitle3:gödel^6.0 alttitle3:godel^6.0 alttitle3:goedel^6.0))))",q.toString()); |
| 326 | + assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString()); |
323 | 327 | |
| 328 | + 		// Backward compatibility for complex filters 
| 329 | + analyzer = Analyzers.getSearcherAnalyzer("en"); |
| 330 | + bs = new FieldBuilder("en").getBuilder(); |
| 331 | + parser = new WikiQueryParser(bs.getFields().contents(),"0,1,4,12",analyzer,bs,NamespacePolicy.IGNORE); |
| 332 | + |
| 333 | + q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE); |
| 334 | + assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0))",q.toString()); |
| 335 | + |
| 336 | + q = parser.parseTwoPass("beans main:everyone",NamespacePolicy.REWRITE); |
| 337 | + assertEquals("((+(namespace:0 namespace:1 namespace:4 namespace:12) +(contents:beans contents:bean^0.5)) (+namespace:0 +(contents:everyone contents:everyon^0.5))) ((+(namespace:0 namespace:1 namespace:4 namespace:12) +title:beans^2.0) (+namespace:0 +title:everyone^2.0))",q.toString()); |
| 338 | + |
| 339 | + q = parser.parseTwoPass("beans everyone incategory:cheeses",NamespacePolicy.REWRITE); |
| 340 | + assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:cheeses)) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0 +category:cheeses))",q.toString()); |
| 341 | + |
| 342 | + q = parser.parseTwoPass("all_talk: beans everyone",NamespacePolicy.REWRITE); |
| 343 | + assertEquals("(+(namespace:1 namespace:3 namespace:5 namespace:7 namespace:9 namespace:11 namespace:13 namespace:15) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:1 namespace:3 namespace:5 namespace:7 namespace:9 namespace:11 namespace:13 namespace:15) +(+title:beans^2.0 +title:everyone^2.0))",q.toString()); |
| 344 | + |
| 345 | + |
324 | 346 | // Test field extraction |
325 | 347 | HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja"); |
326 | 348 | assertEquals(3,fs.size()); |
327 | 349 | assertTrue(fs.contains(new NamespaceFilter("0"))); |
328 | 350 | assertTrue(fs.contains(new NamespaceFilter("1"))); |
329 | 351 | assertTrue(fs.contains(new NamespaceFilter())); |
| 352 | + |
| 353 | + WikiQueryParser.ADD_STEM_TITLE = true; |
| 354 | + WikiQueryParser.STEM_TITLE_BOOST = 1; |
330 | 355 | |
331 | 356 | // Localization tests |
332 | 357 | analyzer = Analyzers.getSearcherAnalyzer("sr"); |
333 | | - parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE); |
| 358 | + bs = new FieldBuilder("sr").getBuilder(); |
| 359 | + parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE); |
334 | 360 | |
335 | 361 | q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE); |
336 | | - assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli^0.4) +(title:на^2.0 title:na^0.4) +(title:википедију^2.0 title:vikipediju^0.4))",q.toString()); |
| 362 | + assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^3.0 title:dobrodosli^0.6) +(title:на^3.0 title:na^0.6) +(title:википедију^3.0 title:vikipediju^0.6))",q.toString()); |
337 | 363 | |
338 | 364 | q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE); |
339 | | - assertEquals("(+(contents:dobrodošli contents:dobrodosli) +contents:na +(+contents:šdjčćž +contents:sdjccz)) (+(title:dobrodošli^2.0 title:dobrodosli^2.0) +title:na^2.0 +(+title:šdjčćž^2.0 +title:sdjccz^2.0))",q.toString()); |
| 365 | + assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^3.0 +title:na^3.0 +title:sdjccz^3.0)",q.toString()); |
340 | 366 | |
341 | 367 | analyzer = Analyzers.getSearcherAnalyzer("th"); |
342 | | - parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE); |
| 368 | + bs = new FieldBuilder("th").getBuilder(); |
| 369 | + parser = new WikiQueryParser(bs.getFields().contents(),"0",analyzer,bs,NamespacePolicy.LEAVE); |
343 | 370 | |
344 | 371 | q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE); |
345 | | - assertEquals("(+contents:ภาษา +contents:ไทย) (+title:ภาษา^2.0 +title:ไทย^2.0)",q.toString()); |
| 372 | + assertEquals("(+contents:ภาษา +contents:ไทย) (+title:ภาษา^3.0 +title:ไทย^3.0)",q.toString()); |
346 | 373 | |
347 | 374 | q = parser.parseTwoPass("help:ภาษาไทย",NamespacePolicy.REWRITE); |
348 | | - assertEquals("(+namespace:12 +(+contents:ภาษา +contents:ไทย)) (+namespace:12 +(+title:ภาษา^2.0 +title:ไทย^2.0))",q.toString()); |
| 375 | + assertEquals("(+namespace:12 +(+contents:ภาษา +contents:ไทย)) (+namespace:12 +(+title:ภาษา^3.0 +title:ไทย^3.0))",q.toString()); |
349 | 376 | |
350 | | - // Backward compatiblity for complex filters |
351 | | - analyzer = Analyzers.getSearcherAnalyzer("en"); |
352 | | - parser = new WikiQueryParser(ff.contents(),"0,1,4,12",analyzer,ff,NamespacePolicy.IGNORE); |
353 | 377 | |
354 | | - q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE); |
355 | | - assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0))",q.toString()); |
356 | | - |
357 | | - q = parser.parseTwoPass("beans main:everyone",NamespacePolicy.REWRITE); |
358 | | - assertEquals("((+(namespace:0 namespace:1 namespace:4 namespace:12) +(contents:beans contents:bean^0.5)) (+namespace:0 +(contents:everyone contents:everyon^0.5))) ((+(namespace:0 namespace:1 namespace:4 namespace:12) +title:beans^2.0) (+namespace:0 +title:everyone^2.0))",q.toString()); |
359 | | - |
360 | | - q = parser.parseTwoPass("beans everyone incategory:cheeses",NamespacePolicy.REWRITE); |
361 | | - assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:cheeses)) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0 +category:cheeses))",q.toString()); |
362 | | - |
363 | | - q = parser.parseTwoPass("all_talk: beans everyone",NamespacePolicy.REWRITE); |
364 | | - assertEquals("(+(namespace:1 namespace:3 namespace:5 namespace:7 namespace:9 namespace:11 namespace:13 namespace:15) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:1 namespace:3 namespace:5 namespace:7 namespace:9 namespace:11 namespace:13 namespace:15) +(+title:beans^2.0 +title:everyone^2.0))",q.toString()); |
365 | | - |
366 | 378 | } catch(Exception e){ |
367 | 379 | e.printStackTrace(); |
368 | 380 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -119,20 +119,20 @@ |
120 | 120 | |
121 | 121 | String[] ssr = (String[]) sr.toArray(new String [] {} ); |
122 | 122 | |
123 | | - assertEquals("entest",ssr[0]); |
124 | | - assertEquals("entest.mainpart",ssr[1]); |
125 | | - assertEquals("entest.restpart",ssr[2]); |
126 | | - assertEquals("rutest",ssr[3]); |
127 | | - assertEquals(4,ssr.length); |
| 123 | + assertEquals("entest.mainpart",ssr[0]); |
| 124 | + assertEquals("entest.restpart",ssr[1]); |
| 125 | + assertEquals("rutest",ssr[2]); |
| 126 | + assertEquals(3,ssr.length); |
128 | 127 | |
129 | 128 | // search groups |
130 | 129 | Hashtable<Integer,Hashtable<String,ArrayList<String>>> sg = testgc.getSearchGroups(); |
131 | 130 | |
132 | 131 | Hashtable<String,ArrayList<String>> g0 = sg.get(new Integer(0)); |
133 | | - assertEquals("{192.168.0.5=[entest.mainpart, entest.restpart], 192.168.0.2=[entest, entest.mainpart]}",g0.toString()); |
| 132 | + assertEquals("{192.168.0.5=[entest.mainpart, entest.restpart], 192.168.0.2=[entest.mainpart]}",g0.toString()); |
134 | 133 | Hashtable<String,ArrayList<String>> g1 = sg.get(new Integer(1)); |
135 | | - assertEquals("{192.168.0.6=[frtest.part3, detest], 192.168.0.4=[frtest.part1, frtest.part2]}",g1.toString()); |
| 134 | + assertEquals("{192.168.0.6=[frtest.part3, detest], 192.168.0.4=[frtest.part1, frtest.part2]}",g1.toString()); |
136 | 135 | |
| 136 | + |
137 | 137 | // index |
138 | 138 | Hashtable index = testgc.getIndex(); |
139 | 139 | ArrayList ir = (ArrayList) index.get("192.168.0.5"); |
— | — | @@ -251,6 +251,7 @@ |
252 | 252 | assertEquals("njawiki.nspart3",njawiki.getPartByNamespace("4").toString()); |
253 | 253 | assertEquals("njawiki.nspart1",njawiki.getPartByNamespace("0").toString()); |
254 | 254 | assertEquals("njawiki.nspart2",njawiki.getPartByNamespace("12").toString()); |
| 255 | + assertEquals("[192.168.0.1]",njawiki.getSearchHosts().toString()); |
255 | 256 | |
256 | 257 | IndexId njawiki2 = IndexId.get("njawiki.nspart2"); |
257 | 258 | assertFalse(njawiki2.isLogical()); |
— | — | @@ -258,6 +259,7 @@ |
259 | 260 | assertTrue(njawiki2.isNssplit()); |
260 | 261 | assertEquals(3,njawiki2.getSplitFactor()); |
261 | 262 | assertEquals(2,njawiki2.getPartNum()); |
| 263 | + assertEquals("[192.168.0.1]",njawiki2.getSearchHosts().toString()); |
262 | 264 | |
263 | 265 | } |
264 | 266 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java |
— | — | @@ -163,11 +163,11 @@ |
164 | 164 | } |
165 | 165 | } |
166 | 166 | |
167 | | - public SearchResults searchPart(IndexId iid, Query query, NamespaceFilterWrapper filter, int offset, int limit, boolean explain, String host){ |
| 167 | + public SearchResults searchPart(IndexId iid, String searchterm, Query query, NamespaceFilterWrapper filter, int offset, int limit, boolean explain, String host){ |
168 | 168 | try { |
169 | 169 | RMIMessenger r = messengerFromCache(host); |
170 | 170 | log.debug("Calling searchPart("+iid+",("+query+"),"+offset+","+limit+") on "+host); |
171 | | - SearchResults res = r.searchPart(iid.toString(),query,filter,offset,limit,explain); |
| 171 | + SearchResults res = r.searchPart(iid.toString(),searchterm,query,filter,offset,limit,explain); |
172 | 172 | log.debug(" \\-> got: "+res); |
173 | 173 | return res; |
174 | 174 | } catch (Exception e) { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java |
— | — | @@ -63,7 +63,7 @@ |
64 | 64 | * @param limit |
65 | 65 | * @throws RemoteException |
66 | 66 | */ |
67 | | - public SearchResults searchPart(String dbrole, Query query, NamespaceFilterWrapper filter, int offset, int limit, boolean explain) throws RemoteException; |
| 67 | + public SearchResults searchPart(String dbrole, String searchterm, Query query, NamespaceFilterWrapper filter, int offset, int limit, boolean explain) throws RemoteException; |
68 | 68 | |
69 | 69 | /** |
70 | 70 | * Returns index queue size. Needed for incremental updater, so it doesn't overload the indexer. |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java |
— | — | @@ -81,9 +81,9 @@ |
82 | 82 | } |
83 | 83 | |
84 | 84 | // inherit javadoc |
85 | | - public SearchResults searchPart(String dbrole, Query query, NamespaceFilterWrapper filter, int offset, int limit, boolean explain) throws RemoteException { |
| 85 | + public SearchResults searchPart(String dbrole, String searchterm, Query query, NamespaceFilterWrapper filter, int offset, int limit, boolean explain) throws RemoteException { |
86 | 86 | log.debug("Received request searchMainPart("+dbrole+","+query+","+offset+","+limit+")"); |
87 | | - return new SearchEngine().searchPart(IndexId.get(dbrole),query,filter,offset,limit,explain); |
| 87 | + return new SearchEngine().searchPart(IndexId.get(dbrole),searchterm,query,filter,offset,limit,explain); |
88 | 88 | } |
89 | 89 | |
90 | 90 | // inherit javadoc |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/StreamTerms.java |
— | — | @@ -0,0 +1,52 @@ |
| 2 | +package org.wikimedia.lsearch.benchmark; |
| 3 | + |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.FileInputStream; |
| 6 | +import java.io.IOException; |
| 7 | +import java.io.InputStreamReader; |
| 8 | +import java.util.zip.GZIPInputStream; |
| 9 | + |
| 10 | +/** Reads terms from an endless stream of terms */ |
| 11 | +public class StreamTerms implements Terms { |
| 12 | + BufferedReader in = null; |
| 13 | + String path; |
| 14 | + |
| 15 | + public StreamTerms(String path){ |
| 16 | + this.path = path; |
| 17 | + open(); |
| 18 | + } |
| 19 | + |
| 20 | + protected void open(){ |
| 21 | + try{ |
| 22 | + if(in != null) |
| 23 | + in.close(); |
| 24 | + if(path.endsWith(".gz")) |
| 25 | + in = new BufferedReader( |
| 26 | + new InputStreamReader( |
| 27 | + new GZIPInputStream( |
| 28 | + new FileInputStream(path)))); |
| 29 | + else |
| 30 | + in = new BufferedReader( |
| 31 | + new InputStreamReader( |
| 32 | + new FileInputStream(path))); |
| 33 | + } catch(IOException e){ |
| 34 | + e.printStackTrace(); |
| 35 | + } |
| 36 | + } |
| 37 | + |
| 38 | + public String next() { |
| 39 | + try { |
| 40 | + return in.readLine(); |
| 41 | + } catch (IOException e) { |
| 42 | + // try reopening the stream |
| 43 | + open(); |
| 44 | + try { |
| 45 | + return in.readLine(); |
| 46 | + } catch (IOException e1) { |
| 47 | + e1.printStackTrace(); |
| 48 | + return null; |
| 49 | + } |
| 50 | + } |
| 51 | + } |
| 52 | + |
| 53 | +} |
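
The new StreamTerms class above is a Terms source that reads one search term per line from a plain or gzipped file and reopens the stream on I/O errors. A minimal usage sketch (the class and path names are only illustrative of how the new source could be driven; the file path is an example):

    import org.wikimedia.lsearch.benchmark.StreamTerms;
    import org.wikimedia.lsearch.benchmark.Terms;

    public class StreamTermsDemo {
        public static void main(String[] args) {
            // path is an example; any plain or .gz file with one term per line works
            Terms terms = new StreamTerms("./lib/dict/terms-en.txt.gz");
            String term;
            // next() returns the following line-delimited term, or null once the file is exhausted
            while((term = terms.next()) != null){
                System.out.println(term);
            }
        }
    }
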
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/WordTerms.java |
— | — | @@ -7,16 +7,19 @@ |
8 | 8 | import java.util.ArrayList; |
9 | 9 | import java.util.zip.GZIPInputStream; |
10 | 10 | |
| 11 | +import org.apache.log4j.Logger; |
| 12 | + |
11 | 13 | /** Benchmark terms from a dictionary of words (word : frequency) */ |
12 | 14 | public class WordTerms implements Terms { |
| 15 | + Logger log = Logger.getLogger(WordTerms.class); |
13 | 16 | /** load words from file, e.g. ./test-data/words-wikilucene.ngram.gz */ |
14 | 17 | public static ArrayList<String> loadWordFreq(String path) throws IOException { |
15 | 18 | BufferedReader in; |
16 | 19 | if(path.endsWith(".gz")) |
17 | 20 | in = new BufferedReader( |
18 | | - new InputStreamReader( |
19 | | - new GZIPInputStream( |
20 | | - new FileInputStream(path)))); |
| 21 | + new InputStreamReader( |
| 22 | + new GZIPInputStream( |
| 23 | + new FileInputStream(path)))); |
21 | 24 | else |
22 | 25 | in = new BufferedReader( |
23 | 26 | new InputStreamReader( |
— | — | @@ -27,13 +30,17 @@ |
28 | 31 | int freqSum = 0; |
29 | 32 | int freq,count=0; |
30 | 33 | while((line = in.readLine())!=null){ |
31 | | - String[] parts = line.split(" : "); |
32 | | - if(parts.length > 1){ |
33 | | - freq = Integer.parseInt(parts[1]); |
34 | | - freqSum += freq; |
| 34 | + try{ |
| 35 | + String[] parts = line.split(" : "); |
| 36 | + if(parts.length > 1){ |
| 37 | + freq = Integer.parseInt(parts[1]); |
| 38 | + freqSum += freq; |
| 39 | + } |
| 40 | + words.add(parts[0].trim()); |
| 41 | + } catch(NumberFormatException e){ |
| 42 | + words.add(line.trim()); |
35 | 43 | } |
36 | 44 | count++; |
37 | | - words.add(parts[0].trim()); |
38 | 45 | } |
39 | 46 | //System.out.println("Loaded "+count+" words with frequency sum of "+freqSum); |
40 | 47 | return words; |
— | — | @@ -45,6 +52,7 @@ |
46 | 53 | try { |
47 | 54 | words = loadWordFreq(path); |
48 | 55 | } catch (IOException e) { |
| 56 | + log.error("Cannot open dictionary of search terms in "+path); |
49 | 57 | e.printStackTrace(); |
50 | 58 | } |
51 | 59 | } |
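
For reference, loadWordFreq() expects one entry per line in "word : frequency" form; with the added try/catch, a line whose frequency field cannot be parsed no longer aborts the whole load and is kept as a plain term instead. An illustrative fragment of such a dictionary (values are made up):

    the : 2157634
    search : 48211
    lucene : 1754
    some phrase with no frequency
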
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java |
— | — | @@ -171,15 +171,16 @@ |
172 | 172 | public static void main(String[] args) { |
173 | 173 | String host = "127.0.0.1"; |
174 | 174 | int port = 8123; |
175 | | - String database = "wikilucene"; |
| 175 | + String database = "enwiki"; |
176 | 176 | String verb = "search"; |
177 | | - String namespace = "main"; |
| 177 | + String namespace = ""; |
178 | 178 | String namespaceFilter= "0"; |
179 | 179 | String lang = "en-b"; |
180 | 180 | int runs = 5000; |
181 | 181 | int threads = 10; |
182 | | - int words = 2; |
| 182 | + int words = 1; |
183 | 183 | sample = true; |
| 184 | + String wordfile = null; |
184 | 185 | Terms terms; |
185 | 186 | |
186 | 187 | for(int i = 0; i < args.length; i++) { |
— | — | @@ -195,6 +196,8 @@ |
196 | 197 | runs = Integer.parseInt(args[++i]); |
197 | 198 | } else if (args[i].equals("-v")) { |
198 | 199 | database = args[++i]; |
| 200 | + } else if (args[i].equals("-wf")) { |
| 201 | + wordfile = args[++i]; |
199 | 202 | } else if (args[i].equals("-n") || args[i].equals("-ns")) { |
200 | 203 | namespace = args[++i]; |
201 | 204 | } else if (args[i].equals("-f") ) { |
— | — | @@ -218,19 +221,17 @@ |
219 | 222 | " -n namespace (default: "+namespace+")\n"+ |
220 | 223 | " -f namespace filter (default: "+namespaceFilter+")\n"+ |
221 | 224 | " -l language (default: "+lang+")\n"+ |
222 | | - " -s show sample url (default: "+sample+")\n"); |
| 225 | + " -s show sample url (default: "+sample+")\n"+ |
| 226 | + " -wf <file> use file with search terms (default: none)\n"); |
223 | 227 | return; |
224 | 228 | } else{ |
225 | 229 | System.out.println("Unrecognized switch: "+args[i]); |
226 | 230 | return; |
227 | 231 | } |
228 | 232 | } |
229 | | - if(lang.equals("en")) |
230 | | - terms = new WordTerms("./lib/dict/english.txt.gz"); |
231 | | - else if(lang.equals("de")) |
232 | | - terms = new WordTerms("./lib/dict/german.txt.gz"); |
233 | | - else if(lang.equals("fr")) |
234 | | - terms = new WordTerms("./lib/dict/french.txt.gz"); |
| 233 | + if("en".equals(lang) || "de".equals(lang) || "es".equals(lang) || "fr".equals(lang) || "it".equals(lang) || "pt".equals(lang)) |
| 234 | + terms = new WordTerms("./lib/dict/terms-"+lang+".txt.gz"); |
| 235 | + |
235 | 236 | else if(lang.equals("sample")) |
236 | 237 | terms = new SampleTerms(); |
237 | 238 | else |
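
With the new -wf switch the benchmark can be pointed at an arbitrary term file instead of the bundled per-language dictionaries. A hedged invocation sketch (the jar name and classpath are illustrative; the main class, flag, and dictionary path come from this diff):

    java -cp lucene-search.jar org.wikimedia.lsearch.benchmark.Benchmark -wf ./lib/dict/terms-en.txt.gz
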
Index: trunk/lucene-search-2.0/lsearch-global.conf |
— | — | @@ -16,13 +16,14 @@ |
17 | 17 | #wikilucene : (single) (language,en) (warmup,0) |
18 | 18 | wikidev : (single) (language,sr) |
19 | 19 | wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[]) |
| 20 | +wikilucene : (language,en) (warmup,10) |
20 | 21 | |
21 | 22 | # Search groups |
22 | 23 | # Index parts of a split index are always taken from the node's group |
23 | 24 | # host : db1.part db2.part |
24 | 25 | # Mulitple hosts can search multiple dbs (N-N mapping) |
25 | 26 | [Search-Group] |
26 | | -oblak : wikilucene wikidev wikilucene.nspart1 wikilucene.nspart2 wikilucene.nspart3 |
| 27 | +oblak : wikilucene wikidev |
27 | 28 | |
28 | 29 | # Index nodes |
29 | 30 | # host: db1.part db2.part |
Index: trunk/lucene-search-2.0/lsearch.conf |
— | — | @@ -82,12 +82,9 @@ |
83 | 83 | # Log, ganglia, localization |
84 | 84 | ################################################ |
85 | 85 | |
86 | | -# URL to message files, {0} is replaced with language code, i.e. En |
87 | | -Localization.url=file:///var/www/html/wiki-lucene/phase3/languages/messages/Messages{0}.php |
| 86 | +# URL to MediaWiki message files |
| 87 | +Localization.url=file:///var/www/html/wiki-lucene/phase3/languages/messages |
88 | 88 | |
89 | | -# Pattern for OAI repo. {0} is replaced with dbname, {1} with language |
90 | | -OAI.repo=http://localhost/wiki-lucene/phase3/index.php/Special:OAIRepository |
91 | | - |
92 | 89 | # Username/password for password authenticated OAI repo |
93 | 90 | OAI.username=user |
94 | 91 | OAI.password=pass |
Index: trunk/lucene-search-2.0/lib/dict/french.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/english.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/german.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/terms-en.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/dict/terms-en.txt.gz |
___________________________________________________________________ |
Added: svn:mime-type |
95 | 92 | + application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/terms-pt.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/dict/terms-pt.txt.gz |
___________________________________________________________________ |
Added: svn:mime-type |
96 | 93 | + application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/terms-es.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/dict/terms-es.txt.gz |
___________________________________________________________________ |
Added: svn:mime-type |
97 | 94 | + application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/terms-fr.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/dict/terms-fr.txt.gz |
___________________________________________________________________ |
Added: svn:mime-type |
98 | 95 | + application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/terms-de.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/dict/terms-de.txt.gz |
___________________________________________________________________ |
Added: svn:mime-type |
99 | 96 | + application/octet-stream |
Index: trunk/lucene-search-2.0/lib/dict/terms-it.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/dict/terms-it.txt.gz |
___________________________________________________________________ |
Added: svn:mime-type |
100 | 97 | + application/octet-stream |
Index: trunk/lucene-search-2.0/README.txt |
— | — | @@ -21,8 +21,9 @@ |
22 | 22 | * edit mwsearch.conf: |
23 | 23 | + MWConfig.global to point to URL of mwsearch-global.conf |
24 | 24 | + MWConfig.lib to point to local library path (ie with unicode-data etc) |
25 | | - + Localization.url to point to URL pattern of latest |
26 | | - message files from MediaWiki |
| 25 | + + Localization.url to point to URL of latest message files from MediaWiki |
| 26 | + + Indexes.path - base path where you want the daemon to store the indexes 
| 27 | + + Logging.logconfig - local path to log4j configuration file, e.g. /etc/lsearch.log4j (the lsearch package has a sample log4j file you can use) |
27 | 28 | * setup rsync daemon (see rsyncd.conf-example) |
28 | 29 | * setup log4j logging subsystem (see mwsearch.log4j-example) |
29 | 30 | |
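
For orientation, a sketch of the mwsearch.conf entries mentioned above (all values are illustrative examples for a local setup, not shipped defaults; Localization.url now points at the messages directory rather than a Messages{0}.php pattern):

    MWConfig.global=file:///etc/mwsearch-global.conf
    MWConfig.lib=/usr/local/search/lib
    Localization.url=file:///var/www/html/phase3/languages/messages
    Indexes.path=/var/lib/mwsearch/indexes
    Logging.logconfig=/etc/lsearch.log4j
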