Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java |
— | — | @@ -37,6 +37,8 @@ |
38 | 38 | protected float completeBoost = 1; |
39 | 39 | /** use complete number of tokens (with completeBoost) only for scoring */ |
40 | 40 | protected boolean useCompleteOnly = false; |
| 41 | + /** act exactly as a phrase query, without any positional or similar optimizations */ |
| 42 | + protected boolean phraseQueryFallback = false; |
41 | 43 | |
42 | 44 | |
43 | 45 | /** Options specific for phrases in contents */ |
— | — | @@ -148,6 +150,12 @@ |
149 | 151 | //wholeBoost = 8; |
150 | 152 | } |
151 | 153 | } |
| 154 | + /** Fall back to phrase-query-type behaviour, no positional info */ |
| 155 | + public static class PhraseQueryFallback extends PositionalOptions { |
| 156 | + public PhraseQueryFallback(){ |
| 157 | + phraseQueryFallback = true; |
| 158 | + } |
| 159 | + } |
152 | 160 | |
153 | 161 | public abstract static class NamespaceBoost implements Serializable { |
154 | 162 | public abstract float getBoost(int namespace); |
— | — | @@ -162,6 +170,7 @@ |
163 | 171 | } |
164 | 172 | } |
165 | 173 | } |
| 174 | + |
166 | 175 | |
167 | 176 | |
168 | 177 | @Override |
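The new phraseQueryFallback flag gives callers a way to opt out of lsearch's positional scoring entirely. A minimal usage sketch based only on the classes added above (illustrative, not part of the commit):

    // Selecting the fallback via the new subclass: scorers that see
    // phraseQueryFallback == true skip positional/in-title optimizations
    // and behave like a plain Lucene PhraseQuery.
    PositionalOptions options = new PositionalOptions.PhraseQueryFallback();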
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java |
— | — | @@ -195,6 +195,8 @@ |
196 | 196 | * @throws IOException |
197 | 197 | */ |
198 | 198 | public float freqScore(int start, int distance) throws IOException{ |
| 199 | + if(options.phraseQueryFallback) |
| 200 | + return getSimilarity().sloppyFreq(distance); |
199 | 201 | //System.out.println("freqScore at start="+start+", dist="+distance); |
200 | 202 | int offset = start + distance; |
201 | 203 | float begin = 1; |
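With the flag set, freqScore() short-circuits to the standard Lucene sloppy-phrase frequency instead of the position-dependent scoring below it. Assuming DefaultSimilarity (Lucene 2.x), sloppyFreq(distance) returns 1.0f / (distance + 1), so:

    // Hedged illustration of what the fallback path returns under
    // DefaultSimilarity: an exact phrase match scores 1.0, slop 1 scores 0.5.
    Similarity sim = new DefaultSimilarity();
    float exact = sim.sloppyFreq(0);   // 1.0f
    float slop1 = sim.sloppyFreq(1);   // 0.5f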
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -156,7 +156,7 @@ |
157 | 157 | * |
158 | 158 | * @return Returns unique id. |
159 | 159 | */ |
160 | | - public String getKey() { |
| 160 | + public String getIndexKey() { |
161 | 161 | return Long.toString(pageId); |
162 | 162 | } |
163 | 163 | |
— | — | @@ -270,9 +270,15 @@ |
271 | 271 | |
272 | 272 | public void setDate(Date date) { |
273 | 273 | this.date = date; |
| 274 | + } |
| 275 | + |
| 276 | + public void setRedirectTo(String redirectTo) { |
| 277 | + this.redirectTo = redirectTo; |
274 | 278 | } |
275 | 279 | |
276 | 280 | |
277 | 281 | |
278 | 282 | |
| 283 | + |
| 284 | + |
279 | 285 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/SearchResults.java |
— | — | @@ -32,6 +32,8 @@ |
33 | 33 | protected boolean foundAllInTitle = false; |
34 | 34 | /** threshold for filtering suggestions */ |
35 | 35 | protected int firstHitRank = 0; |
| 36 | + /** Words found in titles */ |
| 37 | + protected HashSet<String> foundInTitles = new HashSet<String>(); |
36 | 38 | |
37 | 39 | public SearchResults(){ |
38 | 40 | success = false; |
— | — | @@ -130,6 +132,12 @@ |
131 | 133 | public void addToFirstHitRank(int rank){ |
132 | 134 | firstHitRank += rank; |
133 | 135 | } |
| 136 | + public HashSet<String> getFoundInTitles() { |
| 137 | + return foundInTitles; |
| 138 | + } |
| 139 | + public void setFoundInTitles(HashSet<String> foundInTitles) { |
| 140 | + this.foundInTitles = foundInTitles; |
| 141 | + } |
134 | 142 | |
135 | 143 | @Override |
136 | 144 | public String toString() { |
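The new foundInTitles set carries the query words that the highlighter located in titles or redirects (populated in the Highlight.java and SearchEngine.java hunks below) back to the suggest phase, presumably so that words already confirmed by a title are not treated as misspellings. A sketch of the flow through the new accessors:

    // Hypothetical flow: the highlight phase reports title hits,
    // the suggest phase reads them back via the new getter.
    SearchResults res = new SearchResults();
    res.getFoundInTitles().add("pokemon");               // set during highlighting
    HashSet<String> confirmed = res.getFoundInTitles();  // read when suggesting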
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java |
— | — | @@ -52,11 +52,12 @@ |
53 | 53 | return new PrefixIndexBuilder(iid,Links.openStandalone(iid),null); |
54 | 54 | } |
55 | 55 | /** Builder for incremental updates to precursor index */ |
56 | | - static public PrefixIndexBuilder forPrecursorModification(IndexId iid, Links links) throws IOException{ |
| 56 | + static public PrefixIndexBuilder forPrecursorModification(IndexId iid) throws IOException{ |
| 57 | + iid = iid.getPrefix(); |
57 | 58 | IndexWriter writer = WikiIndexModifier.openForWrite(iid.getPrecursor().getIndexPath(),false,new PrefixAnalyzer()); |
58 | 59 | writer.setMergeFactor(20); |
59 | 60 | writer.setMaxBufferedDocs(500); |
60 | | - return new PrefixIndexBuilder(iid,links,writer); |
| 61 | + return new PrefixIndexBuilder(iid,null,writer); |
61 | 62 | } |
62 | 63 | |
63 | 64 | private PrefixIndexBuilder(IndexId iid, Links links, IndexWriter writer) throws IOException { |
— | — | @@ -177,15 +178,17 @@ |
178 | 179 | else return -1; |
179 | 180 | } |
180 | 181 | }); |
181 | | - HashSet<String> selectedRedirects = new HashSet<String>(); |
| 182 | + // hash set of selected articles and the pages they redirect to |
| 183 | + HashSet<String> selectedWithRedirects = new HashSet<String>(); |
182 | 184 | ArrayList<String> selected = new ArrayList<String>(); |
183 | 185 | for(int i=0;i<perPrefix && i<sorted.size();i++){ |
184 | 186 | String key = sorted.get(i).getKey(); |
185 | 187 | String redirect = redirects.get(key); |
186 | | - if(redirect == null || !selectedRedirects.contains(redirect)){ |
| 188 | + if((redirect == null || !selectedWithRedirects.contains(redirect)) |
| 189 | + && !selectedWithRedirects.contains(key)){ |
187 | 190 | selected.add(key); |
188 | | - selectedRedirects.add(redirect); |
189 | | - selectedRedirects.add(key); |
| 191 | + selectedWithRedirects.add(key); |
| 192 | + selectedWithRedirects.add(redirect); |
190 | 193 | } |
191 | 194 | } |
192 | 195 | Document d = new Document(); |
— | — | @@ -213,7 +216,7 @@ |
214 | 217 | writer.optimize(); |
215 | 218 | writer.close(); |
216 | 219 | |
217 | | - IndexThread.makeIndexSnapshot(prefixIid,path); |
| 220 | + IndexThread.makeIndexSnapshot(prefixIid,prefixIid.getImportPath()); |
218 | 221 | } |
219 | 222 | |
220 | 223 | public static String strip(String s){ |
— | — | @@ -230,15 +233,23 @@ |
231 | 234 | private static double lengthCoeff(String key, String prefix) { |
232 | 235 | return 1; |
233 | 236 | } |
234 | | - /** Modify a precursor index entry */ |
235 | | - protected void modifyPrecursor(String key) throws IOException{ |
236 | | - writer.deleteDocuments(new Term("key",key)); |
237 | | - addToPrecursor(key); |
238 | | - } |
| 237 | + |
| 238 | + |
239 | 239 | /** Add a new precursor index entry */ |
240 | 240 | protected void addToPrecursor(String key) throws IOException{ |
241 | 241 | int ref = links.getNumInLinks(key); |
242 | 242 | String redirect = links.getRedirectTarget(key); |
| 243 | + String pageid = links.getPageId(key); |
| 244 | + addToPrecursor(key,ref,redirect,pageid); |
| 245 | + } |
| 246 | + |
| 247 | + /** Delete a precursor index entry */ |
| 248 | + public void deleteFromPrecursor(String pageId) throws IOException{ |
| 249 | + writer.deleteDocuments(new Term("pageid",pageId)); |
| 250 | + } |
| 251 | + |
| 252 | + /** Add a new precursor index entry with explicit inlink count, redirect target and page id */ |
| 253 | + public void addToPrecursor(String key, int ref, String redirect, String pageId) throws IOException{ |
243 | 254 | String strippedKey = strip(key); |
244 | 255 | String strippedTarget = redirect==null? null : strip(redirect); |
245 | 256 | if(redirect == null); |
— | — | @@ -248,6 +259,7 @@ |
249 | 260 | return; // ignore redirects like byzantine -> byzantine empire |
250 | 261 | // add to index |
251 | 262 | Document d = new Document(); |
| 263 | + d.add(new Field("pageid",pageId,Field.Store.NO,Field.Index.UN_TOKENIZED)); |
252 | 264 | d.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
253 | 265 | ArrayList<Token> canonized = canonize(key,iid,filters); |
254 | 266 | for(Token t : canonized){ |
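The selection-loop fix closes a gap in the old de-duplication: previously only an entry's redirect target was checked, so a key could still be selected after the article it redirects to (or that redirects to it) had already taken a slot for the same prefix. The rewritten loop tracks both keys and redirect targets in one set. A condensed sketch of the corrected logic (names from the diff; the iteration source is hypothetical):

    HashSet<String> selectedWithRedirects = new HashSet<String>();
    ArrayList<String> selected = new ArrayList<String>();
    for (String key : sortedKeys) {                  // hypothetical: keys sorted by rank
        String redirect = redirects.get(key);        // null when the page is not a redirect
        boolean dupRedirect = redirect != null && selectedWithRedirects.contains(redirect);
        if (!dupRedirect && !selectedWithRedirects.contains(key)) {
            selected.add(key);
            selectedWithRedirects.add(key);
            selectedWithRedirects.add(redirect);     // the diff also adds null here; harmless in a HashSet
        }
    }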
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -62,7 +62,7 @@ |
63 | 63 | if(original != null) |
64 | 64 | this.suffix = original.getTitlesSuffix(); |
65 | 65 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
66 | | - langCode = global.getLanguage(iid.getDBname()); |
| 66 | + langCode = iid.getLangCode(); |
67 | 67 | FieldBuilder.Case dCase = (global.exactCaseIndex(iid.getDBname()))? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE; |
68 | 68 | builder = new FieldBuilder(iid,dCase); |
69 | 69 | indexes = new HashMap<String,IndexWriter>(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -143,7 +143,7 @@ |
144 | 144 | if(makeIndex){ |
145 | 145 | if(!useOldRelated){ |
146 | 146 | try { |
147 | | - RelatedBuilder.rebuildFromLinksNew(iid); |
| 147 | + RelatedBuilder.rebuildFromLinks(iid); |
148 | 148 | } catch (IOException e) { |
149 | 149 | log.fatal("Cannot make related mapping: "+e.getMessage()); |
150 | 150 | return; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -731,6 +731,10 @@ |
732 | 732 | return GlobalConfiguration.getIndexId(dbname+".spell") != null; |
733 | 733 | } |
734 | 734 | |
| 735 | + public boolean hasPrefix(){ |
| 736 | + return GlobalConfiguration.getIndexId(dbname+".prefix") != null; |
| 737 | + } |
| 738 | + |
735 | 739 | /** Get the corresponding spell words iid */ |
736 | 740 | public IndexId getSpell() { |
737 | 741 | return get(dbname+".spell"); |
— | — | @@ -787,7 +791,7 @@ |
788 | 792 | } |
789 | 793 | |
790 | 794 | /** Check if this is an index that doesn't capitalize first letters of articles */ |
791 | | - public boolean getExactCase(){ |
| 795 | + public boolean isExactCase(){ |
792 | 796 | if(exactCase == null) |
793 | 797 | exactCase = GlobalConfiguration.getInstance().exactCaseIndex(dbname); |
794 | 798 | return exactCase; |
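hasPrefix() mirrors the existing hasSpell() check shown in context above, and getExactCase() is renamed to the more conventional isExactCase(). A hedged usage sketch (the dbname is hypothetical):

    IndexId iid = IndexId.get("enwiki");       // hypothetical dbname
    boolean exact = iid.isExactCase();         // renamed from getExactCase()
    if (iid.hasPrefix()) {                     // new: is enwiki.prefix configured?
        IndexId prefix = iid.getPrefix();
        // ... safe to build or update the prefix index ...
    }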
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -1158,10 +1158,6 @@ |
1159 | 1159 | public boolean isMyHost(String host) { |
1160 | 1160 | return host.equalsIgnoreCase(hostAddr) || host.equalsIgnoreCase(hostName); |
1161 | 1161 | } |
1162 | | - |
1163 | | - public String getLanguage(IndexId iid){ |
1164 | | - return getLanguage(iid.getDBname()); |
1165 | | - } |
1166 | 1162 | |
1167 | 1163 | /** Get language for a dbname */ |
1168 | 1164 | public String getLanguage(String dbname) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/StartupManager.java |
— | — | @@ -41,9 +41,9 @@ |
42 | 42 | // preload localizations |
43 | 43 | HashSet<String> langCodes = new HashSet<String>(); |
44 | 44 | for(IndexId iid : global.getMyIndex()) |
45 | | - langCodes.add(global.getLanguage(iid.getDBname())); |
| 45 | + langCodes.add(iid.getLangCode()); |
46 | 46 | for(IndexId iid : global.getMySearch()) |
47 | | - langCodes.add(global.getLanguage(iid.getDBname())); |
| 47 | + langCodes.add(iid.getLangCode()); |
48 | 48 | Localization.readLocalizations(langCodes); |
49 | 49 | Localization.loadInterwiki(); |
50 | 50 | // preload the unicode decomposer |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java |
— | — | @@ -250,7 +250,12 @@ |
251 | 251 | protected void updateCache(SearcherCache.SearcherPool pool, LocalIndex li){ |
252 | 252 | // do some typical queries to preload some lucene caches, pages into memory, etc.. |
253 | 253 | for(IndexSearcherMul is : pool.searchers){ |
254 | | - Warmup.warmupIndexSearcher(is,li.iid,true); |
| 254 | + try{ |
| 255 | + Warmup.warmupIndexSearcher(is,li.iid,true); |
| 256 | + } catch(IOException e){ |
| 257 | + e.printStackTrace(); |
| 258 | + log.warn("Error warmup up "+li+" : "+e.getMessage()); |
| 259 | + } |
255 | 260 | } |
256 | 261 | // add to cache |
257 | 262 | cache.invalidateLocalSearcher(li.iid,pool); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -3,9 +3,12 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.Collection; |
| 7 | +import java.util.HashSet; |
7 | 8 | import java.util.Hashtable; |
8 | 9 | |
9 | 10 | import org.apache.log4j.Logger; |
| 11 | +import org.apache.lucene.analysis.SimpleAnalyzer; |
| 12 | +import org.apache.lucene.index.IndexReader; |
10 | 13 | import org.apache.lucene.index.Term; |
11 | 14 | import org.apache.lucene.search.Hits; |
12 | 15 | import org.apache.lucene.search.Query; |
— | — | @@ -13,6 +16,7 @@ |
14 | 17 | import org.wikimedia.lsearch.analyzers.Analyzers; |
15 | 18 | import org.wikimedia.lsearch.analyzers.FieldBuilder; |
16 | 19 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
| 20 | +import org.wikimedia.lsearch.analyzers.StopWords; |
17 | 21 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
18 | 22 | import org.wikimedia.lsearch.benchmark.SampleTerms; |
19 | 23 | import org.wikimedia.lsearch.benchmark.Terms; |
— | — | @@ -20,6 +24,7 @@ |
21 | 25 | import org.wikimedia.lsearch.config.Configuration; |
22 | 26 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
23 | 27 | import org.wikimedia.lsearch.config.IndexId; |
| 28 | +import org.wikimedia.lsearch.spell.Suggest; |
24 | 29 | |
25 | 30 | /** |
26 | 31 | * Methods to warm up index and preload caches. |
— | — | @@ -33,41 +38,57 @@ |
34 | 39 | protected static Hashtable<String,Terms> langTerms = new Hashtable<String,Terms>(); |
35 | 40 | |
36 | 41 | /** Runs some typical queries on a local index searcher to preload caches, pages into memory, etc .. */ |
37 | | - public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay){ |
| 42 | + public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay) throws IOException { |
38 | 43 | if(iid.isLinks() || iid.isPrecursor()) |
39 | 44 | return; // no warmup for these |
40 | 45 | log.info("Warming up index "+iid+" ..."); |
41 | 46 | long start = System.currentTimeMillis(); |
| 47 | + IndexReader reader = is.getIndexReader(); |
42 | 48 | |
43 | 49 | if(global == null) |
44 | 50 | global = GlobalConfiguration.getInstance(); |
45 | 51 | |
46 | 52 | Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup"); |
47 | | - if(iid.isSpell() || iid.isPrefix()); // no warmup for spell-chekers and prefixes (for now) |
48 | | - else if(warmup == null){ |
49 | | - makeNamespaceFilters(is,iid); |
50 | | - simpleWarmup(is,iid); |
51 | | - log.info("Warmed up "+iid); |
52 | | - } |
53 | | - else{ |
54 | | - int count; |
55 | | - try{ |
56 | | - count = Integer.parseInt(warmup.get("count")); |
57 | | - } catch(Exception e){ |
58 | | - log.warn("Wrong parameters for warmup of database "+iid+" in global settings"); |
59 | | - simpleWarmup(is,iid); |
60 | | - return; |
| 53 | + int count = warmup!=null? Integer.parseInt(warmup.get("count")) : 0; |
| 54 | + if(iid.isSpell() && count > 0){ |
| 55 | + Terms terms = getTermsForLang(iid.getLangCode()); |
| 56 | + Suggest sug = new Suggest(iid,is,false); |
| 57 | + WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid)); |
| 58 | + for(int i=0;i<count;i++){ |
| 59 | + String searchterm = terms.next(); |
| 60 | + sug.suggest(searchterm,parser.tokenizeBareText(searchterm),new Suggest.ExtraInfo(),new NamespaceFilter()); |
61 | 61 | } |
62 | | - makeNamespaceFilters(is,iid); |
63 | | - warmupSearchTerms(is,iid,count,useDelay); |
64 | | - long delta = System.currentTimeMillis() - start; |
65 | | - log.info("Warmed up "+iid+" in "+delta+" ms"); |
66 | | - } |
| 62 | + } else if((iid.isPrefix() || iid.isHighlight() || iid.isRelated()) && count > 0 && !iid.isTitlesBySuffix()){ |
| 63 | + // NOTE: this might not warm up all caches, but should read data into memory buffers |
| 64 | + for(int i=0;i<count;i++){ |
| 65 | + int docid = (int)(Math.random()*is.maxDoc()); |
| 66 | + reader.document(docid).get("key"); |
| 67 | + } |
| 68 | + } else{ |
| 69 | + // normal indexes |
| 70 | + if(count == 0){ |
| 71 | + makeNamespaceFilters(is,iid); |
| 72 | + simpleWarmup(is,iid); |
| 73 | + } else{ |
| 74 | + makeNamespaceFilters(is,iid); |
| 75 | + warmupWithSearchTerms(is,iid,count,useDelay); |
| 76 | + } |
| 77 | + // wait for aggregate fields to be cached |
| 78 | + while(AggregateMetaField.isBeingCached(reader)){ |
| 79 | + try { |
| 80 | + Thread.sleep(100); |
| 81 | + } catch (InterruptedException e) { |
| 82 | + e.printStackTrace(); |
| 83 | + } |
| 84 | + } |
| 85 | + } |
| 86 | + long delta = System.currentTimeMillis() - start; |
| 87 | + log.info("Warmed up "+iid+" in "+delta+" ms"); |
67 | 88 | } |
68 | 89 | |
69 | 90 | /** Warmup index using some number of simple searches */ |
70 | | - protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) { |
71 | | - String lang = global.getLanguage(iid.getDBname()); |
| 91 | + protected static void warmupWithSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) { |
| 92 | + String lang = iid.getLangCode(); |
72 | 93 | FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder(); |
73 | 94 | WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null); |
74 | 95 | Terms terms = getTermsForLang(lang); |
— | — | @@ -79,7 +100,7 @@ |
80 | 101 | for(int j =0; j<20 && j<hits.length(); j++) |
81 | 102 | hits.doc(j); // retrieve some documents |
82 | 103 | if(useDelay){ |
83 | | - if(i<1000) |
| 104 | + if(i<1000) |
84 | 105 | Thread.sleep(100); |
85 | 106 | else |
86 | 107 | Thread.sleep(50); |
— | — | @@ -126,7 +147,6 @@ |
127 | 148 | /** Just run one complex query and rebuild the main namespace filter */ |
128 | 149 | public static void simpleWarmup(IndexSearcherMul is, IndexId iid){ |
129 | 150 | try{ |
130 | | - String lang = global.getLanguage(iid.getDBname()); |
131 | 151 | FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder(); |
132 | 152 | WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null); |
133 | 153 | Query q = parser.parse("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons"); |
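Warmup is now routed by index type instead of bailing out for everything that is not a content index: spell indexes are exercised with real suggest queries, prefix/highlight/related indexes are touched by fetching random documents, and content indexes keep the old term-query warmup followed by a wait for AggregateMetaField's background caching. A compact sketch of the dispatch, using only the branches shown above:

    // Hedged summary of the new warmupIndexSearcher() control flow.
    if (iid.isSpell() && count > 0) {
        // run `count` suggest queries through Suggest.suggest(...)
    } else if ((iid.isPrefix() || iid.isHighlight() || iid.isRelated())
            && count > 0 && !iid.isTitlesBySuffix()) {
        // read `count` random docs: reader.document(random).get("key")
    } else {
        // term-query warmup, then poll AggregateMetaField.isBeingCached(reader)
    }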
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java |
— | — | @@ -73,6 +73,21 @@ |
74 | 74 | return makeQueryFromTerms(terms,field); |
75 | 75 | } |
76 | 76 | |
| 77 | + /** Make a terms array for phrase queries */ |
| 78 | + public Term[] makeTerms(String wildcard, String field){ |
| 79 | + HashSet<String> terms = getCached(wildcard); |
| 80 | + if(terms.size() == 0) |
| 81 | + return null; // no match or error |
| 82 | + |
| 83 | + trimTerms(terms); |
| 84 | + Term[] ret = new Term[terms.size()]; |
| 85 | + int i = 0; |
| 86 | + for(String w : terms) |
| 87 | + ret[i++] = new Term(field,w); |
| 88 | + return ret; |
| 89 | + |
| 90 | + } |
| 91 | + |
77 | 92 | protected HashSet<String> getCached(String wildcard){ |
78 | 93 | if(client == null) |
79 | 94 | client = new RMIMessengerClient(); |
— | — | @@ -99,6 +114,16 @@ |
100 | 115 | |
101 | 116 | /** Construct DisjunctionMaxQuery from terms */ |
102 | 117 | protected Query makeQueryFromTerms(HashSet<String> terms, String field){ |
| 118 | + trimTerms(terms); |
| 119 | + |
| 120 | + DisjunctionMaxQuery q = new DisjunctionMaxQuery(0); |
| 121 | + for(String t : terms){ |
| 122 | + q.add(new TermQuery(new Term(field,t))); |
| 123 | + } |
| 124 | + return q; |
| 125 | + } |
| 126 | + |
| 127 | + private void trimTerms(HashSet<String> terms) { |
103 | 128 | if(terms.size() > MAX_TERMS){ |
104 | 129 | HashSet<String> temp = new HashSet<String>(); |
105 | 130 | int count = 0; |
— | — | @@ -110,13 +135,8 @@ |
111 | 136 | } |
112 | 137 | terms = temp; |
113 | 138 | } |
114 | | - DisjunctionMaxQuery q = new DisjunctionMaxQuery(0); |
115 | | - for(String t : terms){ |
116 | | - q.add(new TermQuery(new Term(field,t))); |
117 | | - } |
118 | | - return q; |
119 | 139 | } |
120 | | - |
| 140 | + |
121 | 141 | public boolean hasWildcards(){ |
122 | 142 | return wildcardCache.size() > 0; |
123 | 143 | } |
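The trimming logic is factored out so that both makeQueryFromTerms and the new makeTerms share it. One pitfall worth noting in the extracted helper: `terms = temp;` reassigns the method parameter, which in Java does not replace the caller's set, so with more than MAX_TERMS entries the callers still see the untrimmed set. A version that actually mutates the shared set would look like this (a sketch, not what the commit does):

    private void trimTerms(HashSet<String> terms) {
        if (terms.size() > MAX_TERMS) {
            HashSet<String> keep = new HashSet<String>();
            int count = 0;
            for (String t : terms) {
                keep.add(t);
                if (++count >= MAX_TERMS)
                    break;
            }
            terms.retainAll(keep);  // trim in place so the caller sees the result
        }
    }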
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Fuzzy.java |
— | — | @@ -54,6 +54,14 @@ |
55 | 55 | |
56 | 56 | } |
57 | 57 | |
| 58 | + public ArrayList<Float> getBoosts(String word, NamespaceFilter nsf, Term[] tt){ |
| 59 | + ArrayList<Float> boost = new ArrayList<Float>(); |
| 60 | + HashMap<String,Float> terms = getCached(word,nsf); |
| 61 | + for(Term t : tt) |
| 62 | + boost.add(terms.get(t.text())); |
| 63 | + return boost; |
| 64 | + } |
| 65 | + |
58 | 66 | public ArrayList<Float> getBoosts(String word, NamespaceFilter nsf, ArrayList<String> words){ |
59 | 67 | ArrayList<Float> boost = new ArrayList<Float>(); |
60 | 68 | HashMap<String,Float> terms = getCached(word,nsf); |
— | — | @@ -76,6 +84,20 @@ |
77 | 85 | // actually make query |
78 | 86 | return makeQueryFromTerms(terms, field); |
79 | 87 | } |
| 88 | + /** Make a term array without boost */ |
| 89 | + public Term[] makeTerms(String word, String field, NamespaceFilter nsf){ |
| 90 | + if(client == null) |
| 91 | + client = new RMIMessengerClient(); |
| 92 | + HashMap<String,Float> terms = getCached(word,nsf); |
| 93 | + if(terms.size() == 0) |
| 94 | + return null; |
| 95 | + |
| 96 | + Term[] ret = new Term[terms.size()]; |
| 97 | + int i=0; |
| 98 | + for(String w : terms.keySet()) |
| 99 | + ret[i++] = new Term(field,w); |
| 100 | + return ret; |
| 101 | + } |
80 | 102 | |
81 | 103 | protected HashMap<String,Float> getCached(String word, NamespaceFilter nsf){ |
82 | 104 | String key = cacheKey(word,nsf); |
— | — | @@ -99,7 +121,10 @@ |
100 | 122 | /** Calculate boost factor for suggest result - larger edit distance = smaller boost */ |
101 | 123 | protected float getBoost(SuggestResult r){ |
102 | 124 | int dist = r.getDist()+r.getDistMetaphone(); |
103 | | - return (float)(1.0/Math.pow(2,dist)); |
| 125 | + double d = r.getDist(); |
| 126 | + double l = r.getWord().length(); |
| 127 | + // boost = 2^(-dist) * len_proportion * 2^E(dist), with the constant 4 standing in for 2^E(dist) |
| 128 | + return (float)((1.0/Math.pow(2,dist))*((l-d)/l)*4); |
104 | 129 | } |
105 | 130 | |
106 | 131 | private Query makeQueryFromTerms(HashMap<String,Float> terms, String field) { |
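The new boost rewards near-complete matches on longer words instead of penalizing purely by combined edit distance: dist = getDist() + getDistMetaphone() drives the exponent, while d and l (plain edit distance and word length) form a length proportion. A worked example:

    // Suggestion "example" (l = 7) at edit distance 1 with metaphone distance 1:
    int dist = 1 + 1;                // combined distance, used in the exponent
    double d = 1, l = 7;             // plain edit distance and word length
    float boost = (float) ((1.0 / Math.pow(2, dist)) * ((l - d) / l) * 4);
    // = 0.25 * (6/7) * 4 ~= 0.857, versus 0.25 under the old 2^(-dist) formula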
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -41,10 +41,12 @@ |
42 | 42 | import org.wikimedia.lsearch.frontend.SearchServer; |
43 | 43 | import org.wikimedia.lsearch.highlight.Highlight; |
44 | 44 | import org.wikimedia.lsearch.highlight.HighlightResult; |
| 45 | +import org.wikimedia.lsearch.index.MessengerThread; |
45 | 46 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
46 | 47 | import org.wikimedia.lsearch.ranks.StringList; |
47 | 48 | import org.wikimedia.lsearch.related.Related; |
48 | 49 | import org.wikimedia.lsearch.related.RelatedTitle; |
| 50 | +import org.wikimedia.lsearch.spell.Suggest; |
49 | 51 | import org.wikimedia.lsearch.spell.SuggestQuery; |
50 | 52 | import org.wikimedia.lsearch.util.Localization; |
51 | 53 | |
— | — | @@ -79,24 +81,28 @@ |
80 | 82 | } |
81 | 83 | |
82 | 84 | /** Main search method, call this from the search frontend */ |
83 | | - public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) { |
| 85 | + public SearchResults search(IndexId iid, String what, String searchterm, HashMap query, double version) { |
84 | 86 | |
85 | 87 | if (what.equals("search") || what.equals("explain")) { |
86 | 88 | int offset = 0, limit = 100; boolean exactCase = false; |
87 | | - int iwlimit = 10; |
| 89 | + int iwlimit = 10; int iwoffset = 0; |
88 | 90 | boolean searchOnly = false; |
89 | 91 | if (query.containsKey("offset")) |
90 | 92 | offset = Math.max(Integer.parseInt((String)query.get("offset")), 0); |
91 | 93 | if (query.containsKey("limit")) |
92 | 94 | limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES); |
| 95 | + if (query.containsKey("iwoffset")) |
| 96 | + iwoffset = Math.max(Integer.parseInt((String)query.get("iwoffset")), 0); |
93 | 97 | if (query.containsKey("iwlimit")) |
94 | 98 | iwlimit = Math.min(Integer.parseInt((String)query.get("iwlimit")), MAXLINES); |
95 | 99 | if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact")) |
96 | 100 | exactCase = true; |
97 | 101 | if(query.containsKey("searchonly")) |
98 | 102 | searchOnly = Boolean.parseBoolean((String)query.get("searchonly")); |
| 103 | + if(version <= 2) |
| 104 | + searchOnly = true; |
99 | 105 | NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces")); |
100 | | - SearchResults res = search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly); |
| 106 | + SearchResults res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly); |
101 | 107 | if(res!=null && res.isRetry()){ |
102 | 108 | int retries = 0; |
103 | 109 | if(iid.isSplit() || iid.isNssplit()){ |
— | — | @@ -105,7 +111,7 @@ |
106 | 112 | retries = 1; |
107 | 113 | |
108 | 114 | while(retries > 0 && res.isRetry()){ |
109 | | - res = search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly); |
| 115 | + res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly); |
110 | 116 | retries--; |
111 | 117 | } |
112 | 118 | if(res.isRetry()) |
— | — | @@ -114,24 +120,26 @@ |
115 | 121 | return res; |
116 | 122 | } else if (what.equals("raw") || what.equals("rawexplain")) { |
117 | 123 | int offset = 0, limit = 100; boolean exactCase = false; |
118 | | - int iwlimit = 10; |
| 124 | + int iwlimit = 10; int iwoffset = 0; |
119 | 125 | if (query.containsKey("offset")) |
120 | 126 | offset = Math.max(Integer.parseInt((String)query.get("offset")), 0); |
121 | 127 | if (query.containsKey("limit")) |
122 | 128 | limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES); |
| 129 | + if (query.containsKey("iwoffset")) |
| 130 | + iwoffset = Math.max(Integer.parseInt((String)query.get("iwoffset")), 0); |
123 | 131 | if (query.containsKey("iwlimit")) |
124 | 132 | iwlimit = Math.min(Integer.parseInt((String)query.get("iwlimit")), MAXLINES); |
125 | 133 | if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact")) |
126 | 134 | exactCase = true; |
127 | 135 | NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces")); |
128 | | - return search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("rawexplain"), exactCase, true, true); |
| 136 | + return search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("rawexplain"), exactCase, true, true); |
129 | 137 | } else if (what.equals("titlematch")) { |
130 | 138 | // TODO: return searchTitles(searchterm); |
131 | 139 | } else if (what.equals("prefix")){ |
132 | 140 | int limit = MAXPREFIX; |
133 | 141 | if (query.containsKey("limit")) |
134 | 142 | limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXPREFIX); |
135 | | - SearchResults res = prefixSearch(iid, searchterm, limit); |
| 143 | + SearchResults res = searchPrefix(iid, searchterm, limit); |
136 | 144 | if(query.containsKey("format")){ |
137 | 145 | String format = (String)query.get("format"); |
138 | 146 | if(format.equalsIgnoreCase("json")) |
— | — | @@ -146,7 +154,7 @@ |
147 | 155 | offset = Math.max(Integer.parseInt((String)query.get("offset")), 0); |
148 | 156 | if (query.containsKey("limit")) |
149 | 157 | limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES); |
150 | | - return relatedSearch(iid, searchterm, offset, limit); |
| 158 | + return searchRelated(iid, searchterm, offset, limit); |
151 | 159 | } else { |
152 | 160 | SearchResults res = new SearchResults(); |
153 | 161 | res.setErrorMsg("Unrecognized search type. Try one of: " + |
— | — | @@ -184,43 +192,47 @@ |
185 | 193 | return ""; |
186 | 194 | } |
187 | 195 | |
188 | | - protected SearchResults relatedSearch(IndexId iid, String searchterm, int offset, int limit) { |
| 196 | + protected SearchResults searchRelated(IndexId iid, String searchterm, int offset, int limit) { |
| 197 | + RMIMessengerClient messenger = new RMIMessengerClient(); |
| 198 | + String host = cache.getRandomHost(iid.getRelated()); |
| 199 | + return messenger.searchRelated(host,iid.toString(),searchterm,offset,limit); |
| 200 | + |
| 201 | + } |
| 202 | + |
| 203 | + /** Search on a local related index (called via RMI) */ |
| 204 | + public SearchResults searchRelatedLocal(IndexId iid, String searchterm, int offset, int limit) throws IOException { |
189 | 205 | readLocalization(iid); |
190 | 206 | IndexId rel = iid.getRelated(); |
191 | 207 | SearcherCache cache = SearcherCache.getInstance(); |
192 | 208 | SearchResults res = new SearchResults(); |
193 | | - try { |
194 | | - IndexSearcherMul searcher = cache.getLocalSearcher(rel); |
195 | | - IndexReader reader = searcher.getIndexReader(); |
196 | | - String key = getKey(searchterm,iid); |
197 | | - TermDocs td = reader.termDocs(new Term("key",key)); |
198 | | - if(td.next()){ |
199 | | - ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection()); |
200 | | - res.setNumHits(col.size()); |
201 | | - res.setSuccess(true); |
202 | | - for(int i=offset;i<offset+limit && i<col.size();i++){ |
203 | | - RelatedTitle rt = col.get(i); |
204 | | - Title t = rt.getRelated(); |
205 | | - ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle()); |
206 | | - res.addResult(rs); |
207 | | - } |
208 | | - // highlight stuff |
209 | | - Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid); |
210 | | - NamespaceFilter nsDefault = new NamespaceFilter(key.substring(0,key.indexOf(':'))); |
211 | | - FieldBuilder.BuilderSet bs = new FieldBuilder(iid).getBuilder(); |
212 | | - HashSet<String> stopWords = StopWords.getPredefinedSet(iid); |
213 | | - WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,NamespacePolicy.IGNORE,stopWords); |
214 | | - Query q = parser.parse(key.substring(key.indexOf(':')+1),new WikiQueryParser.ParsingOptions(true)); |
215 | | - highlight(iid,q,parser.getWordsClean(),searcher,res,parser.hasPhrases()); |
216 | | - } else{ |
217 | | - res.setSuccess(true); |
218 | | - res.setNumHits(0); |
| 209 | + |
| 210 | + IndexSearcherMul searcher = cache.getLocalSearcher(rel); |
| 211 | + IndexReader reader = searcher.getIndexReader(); |
| 212 | + String key = getKey(searchterm,iid); |
| 213 | + TermDocs td = reader.termDocs(new Term("key",key)); |
| 214 | + if(td.next()){ |
| 215 | + ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection()); |
| 216 | + res.setNumHits(col.size()); |
| 217 | + res.setSuccess(true); |
| 218 | + for(int i=offset;i<offset+limit && i<col.size();i++){ |
| 219 | + RelatedTitle rt = col.get(i); |
| 220 | + Title t = rt.getRelated(); |
| 221 | + ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle()); |
| 222 | + res.addResult(rs); |
219 | 223 | } |
220 | | - } catch (IOException e) { |
221 | | - e.printStackTrace(); |
222 | | - log.error("I/O error in relatedSearch on "+rel+" : "+e.getMessage()); |
223 | | - res.setErrorMsg("I/O Error processing index for "+rel); |
| 224 | + // highlight stuff |
| 225 | + Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid); |
| 226 | + NamespaceFilter nsDefault = new NamespaceFilter(key.substring(0,key.indexOf(':'))); |
| 227 | + FieldBuilder.BuilderSet bs = new FieldBuilder(iid).getBuilder(); |
| 228 | + HashSet<String> stopWords = StopWords.getPredefinedSet(iid); |
| 229 | + WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,NamespacePolicy.IGNORE,stopWords); |
| 230 | + Query q = parser.parse(key.substring(key.indexOf(':')+1),new WikiQueryParser.ParsingOptions(true)); |
| 231 | + highlight(iid,q,parser.getWordsClean(),searcher,res,true,true); |
| 232 | + } else{ |
| 233 | + res.setSuccess(true); |
| 234 | + res.setNumHits(0); |
224 | 235 | } |
| 236 | + |
225 | 237 | return res; |
226 | 238 | } |
227 | 239 | |
— | — | @@ -236,7 +248,7 @@ |
237 | 249 | } |
238 | 250 | } |
239 | 251 | |
240 | | - protected SearchResults prefixSearch(IndexId iid, String searchterm, int limit) { |
| 252 | + protected SearchResults searchPrefix(IndexId iid, String searchterm, int limit) { |
241 | 253 | readLocalization(iid); |
242 | 254 | IndexId pre = iid.getPrefix(); |
243 | 255 | SearcherCache cache = SearcherCache.getInstance(); |
— | — | @@ -313,7 +325,7 @@ |
314 | 326 | // search |
315 | 327 | SearchResults res = makeTitlesSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
316 | 328 | // highlight |
317 | | - highlightTitles(iid,q,words,searcher,res,sortByPhrases); |
| 329 | + highlightTitles(iid,q,words,searcher,res,sortByPhrases,false); |
318 | 330 | return res; |
319 | 331 | } catch (IOException e) { |
320 | 332 | e.printStackTrace(); |
— | — | @@ -362,7 +374,8 @@ |
363 | 375 | * Search on iid, with query searchterm. View results from offset to offset+limit, using |
364 | 376 | * the default namespaces filter |
365 | 377 | */ |
366 | | - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, int iwlimit, NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw, boolean searchOnly){ |
| 378 | + public SearchResults search(IndexId iid, String searchterm, int offset, int limit, int iwoffset, int iwlimit, |
| 379 | + NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw, boolean searchOnly){ |
367 | 380 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase); |
368 | 381 | if(nsDefault == null || nsDefault.cardinality() == 0) |
369 | 382 | nsDefault = new NamespaceFilter("0"); // default to main namespace |
— | — | @@ -429,8 +442,8 @@ |
430 | 443 | HighlightPack pack = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host); |
431 | 444 | res = pack.res; |
432 | 445 | if(!searchOnly){ |
433 | | - highlight(iid,q,parser.getWordsClean(),pack.terms,pack.dfs,pack.maxDoc,res,exactCase,null,parser.hasPhrases()); |
434 | | - fetchTitles(res,searchterm,nsfw,iid,parser,offset,0,iwlimit,explain); |
| 446 | + highlight(iid,q,parser.getWordsClean(),pack.terms,pack.dfs,pack.maxDoc,res,exactCase,null,parser.hasPhrases(),false); |
| 447 | + fetchTitles(res,searchterm,nsfw,iid,parser,offset,iwoffset,iwlimit,explain); |
435 | 448 | suggest(iid,searchterm,parser,res,offset,nsfw); |
436 | 449 | } |
437 | 450 | return res; |
— | — | @@ -459,8 +472,8 @@ |
460 | 473 | hits = searcher.search(q,nsfw,offset+limit); |
461 | 474 | res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
462 | 475 | if(!searchOnly){ |
463 | | - highlight(iid,q,parser.getWordsClean(),searcher,parser.getHighlightTerms(),res,exactCase,parser.hasPhrases()); |
464 | | - fetchTitles(res,searchterm,nsfw,iid,parser,offset,0,iwlimit,explain); |
| 476 | + highlight(iid,q,parser.getWordsClean(),searcher,parser.getHighlightTerms(),res,exactCase,parser.hasPhrases(),false); |
| 477 | + fetchTitles(res,searchterm,nsfw,iid,parser,offset,iwoffset,iwlimit,explain); |
465 | 478 | suggest(iid,searchterm,parser,res,offset,nsfw); |
466 | 479 | } |
467 | 480 | return res; |
— | — | @@ -506,7 +519,8 @@ |
507 | 520 | RMIMessengerClient messenger = new RMIMessengerClient(); |
508 | 521 | // find host |
509 | 522 | String host = cache.getRandomHost(iid.getSpell()); |
510 | | - SuggestQuery sq = messenger.suggest(host,iid.toString(),searchterm,tokens,res.getPhrases(),res.getFoundInContext(),res.getFirstHitRank(),nsfw.getFilter()); |
| 523 | + Suggest.ExtraInfo info = new Suggest.ExtraInfo(res.getPhrases(),res.getFoundInContext(),res.getFoundInTitles(),res.getFirstHitRank()); |
| 524 | + SuggestQuery sq = messenger.suggest(host,iid.toString(),searchterm,tokens,info,nsfw.getFilter()); |
511 | 525 | res.setSuggest(sq); |
512 | 526 | } |
513 | 527 | } |
— | — | @@ -595,7 +609,7 @@ |
596 | 610 | |
597 | 611 | TopDocs hits = searcher.search(q,wrap,iwoffset+iwlimit); |
598 | 612 | SearchResults r = makeTitlesSearchResults(searcher,hits,iwoffset,iwlimit,main,searchterm,q,searchStart,explain); |
599 | | - highlightTitles(main,q,words,searcher,r,parser.hasWildcards()); |
| 613 | + highlightTitles(main,q,words,searcher,r,parser.hasWildcards(),false); |
600 | 614 | |
601 | 615 | if(r.isSuccess()){ |
602 | 616 | res.setTitles(r.getResults()); |
— | — | @@ -697,38 +711,38 @@ |
698 | 712 | } |
699 | 713 | |
700 | 714 | /** Highlight search results, and set the property in ResultSet */ |
701 | | - protected void highlight(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, Term[] terms, SearchResults res, boolean exactCase, boolean sortByPhrases) throws IOException{ |
| 715 | + protected void highlight(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, Term[] terms, SearchResults res, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{ |
702 | 716 | int[] df = searcher.docFreqs(terms); |
703 | 717 | int maxDoc = searcher.maxDoc(); |
704 | | - highlight(iid,q,words,terms,df,maxDoc,res,exactCase,null,sortByPhrases); |
| 718 | + highlight(iid,q,words,terms,df,maxDoc,res,exactCase,null,sortByPhrases,alwaysIncludeFirst); |
705 | 719 | } |
706 | 720 | |
707 | 721 | /** Highlight search results, and set the property in ResultSet */ |
708 | | - protected void highlight(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases) throws IOException{ |
| 722 | + protected void highlight(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{ |
709 | 723 | Term[] terms = getTerms(q,"contents"); |
710 | 724 | int[] df = searcher.docFreqs(terms); |
711 | 725 | int maxDoc = searcher.maxDoc(); |
712 | | - highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases); |
| 726 | + highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases,alwaysIncludeFirst); |
713 | 727 | } |
714 | 728 | |
715 | 729 | /** Highlight search results from titles index */ |
716 | | - protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases) throws IOException{ |
| 730 | + protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{ |
717 | 731 | Term[] terms = getTerms(q,"alttitle"); |
718 | 732 | int[] df = searcher.docFreqs(terms); |
719 | 733 | int maxDoc = searcher.maxDoc(); |
720 | | - highlight(iid,q,words,terms,df,maxDoc,res,false,searcher.getIndexReader(),sortByPhrases); |
| 734 | + highlight(iid,q,words,terms,df,maxDoc,res,false,searcher.getIndexReader(),sortByPhrases,alwaysIncludeFirst); |
721 | 735 | } |
722 | 736 | |
723 | 737 | /** Highlight search results from titles index using a wikisearcher */ |
724 | | - protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, SearchResults res, boolean sortByPhrases) throws IOException{ |
| 738 | + protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{ |
725 | 739 | Term[] terms = getTerms(q,"alttitle"); |
726 | 740 | int[] df = searcher.docFreqs(terms); |
727 | 741 | int maxDoc = searcher.maxDoc(); |
728 | | - highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases); |
| 742 | + highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases,alwaysIncludeFirst); |
729 | 743 | } |
730 | 744 | |
731 | 745 | /** Highlight article (don't call directly, use one of the interfaces above instead) */ |
732 | | - protected void highlight(IndexId iid, Query q, ArrayList<String> words, Term[] terms, int[] df, int maxDoc, SearchResults res, boolean exactCase, IndexReader reader, boolean sortByPhrases) throws IOException{ |
| 746 | + protected void highlight(IndexId iid, Query q, ArrayList<String> words, Term[] terms, int[] df, int maxDoc, SearchResults res, boolean exactCase, IndexReader reader, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{ |
733 | 747 | // iid -> array of keys |
734 | 748 | HashMap<IndexId,ArrayList<String>> map = new HashMap<IndexId,ArrayList<String>>(); |
735 | 749 | iid = iid.getHighlight(); |
— | — | @@ -755,17 +769,18 @@ |
756 | 770 | Highlight.ResultSet rs = null; |
757 | 771 | if(reader != null){ |
758 | 772 | // we got a local reader, use it |
759 | | - rs = Highlight.highlight(e.getValue(),hiid,terms,df,maxDoc,words,stopWords,exactCase,reader,sortByPhrases); |
| 773 | + rs = Highlight.highlight(e.getValue(),hiid,terms,df,maxDoc,words,stopWords,exactCase,reader,sortByPhrases,alwaysIncludeFirst); |
760 | 774 | } else{ |
761 | 775 | // remote call |
762 | 776 | String host = cache.getRandomHost(hiid); |
763 | | - rs = messenger.highlight(host,e.getValue(),hiid.toString(),terms,df,maxDoc,words,exactCase,sortByPhrases); |
| 777 | + rs = messenger.highlight(host,e.getValue(),hiid.toString(),terms,df,maxDoc,words,exactCase,sortByPhrases,alwaysIncludeFirst); |
764 | 778 | } |
765 | 779 | results.putAll(rs.highlighted); |
766 | 780 | res.getPhrases().addAll(rs.phrases); |
767 | 781 | res.getFoundInContext().addAll(rs.foundInContext); |
768 | 782 | if(rs.foundAllInTitle && words.size()>1) |
769 | | - res.setFoundAllInTitle(true); |
| 783 | + res.setFoundAllInTitle(true); |
| 784 | + res.getFoundInTitles().addAll(rs.foundInTitles); |
770 | 785 | } |
771 | 786 | } |
772 | 787 | res.addToFirstHitRank(res.getNumHits()); |
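Beyond renames (prefixSearch to searchPrefix, relatedSearch to searchRelated), this hunk threads two new knobs through the search path: a wire-protocol version (clients at version <= 2 are forced into searchOnly mode, presumably because they cannot parse the extra result info) and an interwiki offset that pages interwiki title results just as offset/limit page the main results. Related search is also dispatched over RMI to a host holding the related index, with the old local logic preserved in searchRelatedLocal. A hedged sketch of a frontend call against the new signature (values and the searchEngine instance are hypothetical):

    HashMap query = new HashMap();                 // raw type, as in the diff
    query.put("offset", "0");    query.put("limit", "20");
    query.put("iwoffset", "0");  query.put("iwlimit", "10");
    // version comes from the frontend protocol; <= 2 implies searchOnly
    SearchResults res = searchEngine.search(iid, "search", "main page", query, 2.1);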
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java |
— | — | @@ -3,6 +3,9 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.Collection; |
6 | 6 | import java.util.HashMap; |
| 7 | +import java.util.HashSet; |
| 8 | +import java.util.Hashtable; |
| 9 | +import java.util.Set; |
7 | 10 | import java.util.StringTokenizer; |
8 | 11 | import java.util.WeakHashMap; |
9 | 12 | |
— | — | @@ -21,10 +24,16 @@ |
22 | 25 | * |
23 | 26 | */ |
24 | 27 | public class AggregateMetaField { |
25 | | - static Logger log = Logger.getLogger(RankField.class); |
| 28 | + static Logger log = Logger.getLogger(AggregateMetaField.class); |
26 | 29 | protected static WeakHashMap<IndexReader,HashMap<String,AggregateMetaFieldSource>> cache = new WeakHashMap<IndexReader,HashMap<String,AggregateMetaFieldSource>>(); |
27 | 30 | protected static Object lock = new Object(); |
| 31 | + protected static Hashtable<IndexReader,AggregateMetaFieldSource> cachingInProgress = new Hashtable<IndexReader,AggregateMetaFieldSource>(); |
28 | 32 | |
| 33 | + /** Check if background caching is currently in progress for a reader */ |
| 34 | + public static boolean isBeingCached(IndexReader reader){ |
| 35 | + return cachingInProgress.containsKey(reader); |
| 36 | + } |
| 37 | + |
29 | 38 | /** Get a cached field source |
30 | 39 | * @throws IOException */ |
31 | 40 | public static AggregateMetaFieldSource getCachedSource(IndexReader reader, String field) throws IOException{ |
— | — | @@ -64,56 +73,63 @@ |
65 | 74 | |
66 | 75 | protected class CachingThread extends Thread { |
67 | 76 | public void run(){ |
68 | | - log.info("Caching aggregate field "+field+" for "+reader.directory()); |
69 | | - int maxdoc = reader.maxDoc(); |
70 | | - index = new int[maxdoc]; |
71 | | - int count = 0; |
72 | | - length = new byte[maxdoc]; // estimate maxdoc values |
73 | | - lengthNoStopWords = new byte[maxdoc]; |
74 | | - lengthComplete = new byte[maxdoc]; |
75 | | - boost = new float[maxdoc]; |
76 | | - namespaces = new byte[maxdoc]; |
77 | | - for(int i=0;i<maxdoc;i++){ |
78 | | - byte[] stored = null; |
79 | | - try{ |
80 | | - Document doc = reader.document(i); |
81 | | - stored = doc.getBinaryValue(field); |
82 | | - namespaces[i] = (byte)Integer.parseInt(doc.get("namespace")); |
83 | | - index[i] = count; |
84 | | - if(stored == null) |
85 | | - continue; |
86 | | - for(int j=0;j<stored.length/7;j++){ |
87 | | - if(count >= length.length){ |
88 | | - length = extendBytes(length); |
89 | | - lengthNoStopWords = extendBytes(lengthNoStopWords); |
90 | | - lengthComplete = extendBytes(lengthComplete); |
91 | | - boost = extendFloats(boost); |
92 | | - } |
93 | | - length[count] = stored[j*7]; |
94 | | - if(length[count] == 0){ |
95 | | - log.debug("Broken length=0 for docid="+i+", at position "+j); |
96 | | - } |
97 | | - lengthNoStopWords[count] = stored[j*7+1]; |
98 | | - int boostInt = (((stored[j*7+2]&0xff) << 24) + ((stored[j*7+3]&0xff) << 16) + ((stored[j*7+4]&0xff) << 8) + ((stored[j*7+5]&0xff) << 0)); |
99 | | - boost[count] = Float.intBitsToFloat(boostInt); |
100 | | - lengthComplete[count] = stored[j*7+6]; |
101 | | - |
102 | | - count++; |
103 | | - } |
104 | | - } catch(Exception e){ |
105 | | - log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage()); |
106 | | - e.printStackTrace(); |
| 77 | + cachingInProgress.put(reader,AggregateMetaFieldSource.this); |
| 78 | + try{ |
| 79 | + log.info("Caching aggregate field "+field+" for "+reader.directory()); |
| 80 | + int maxdoc = reader.maxDoc(); |
| 81 | + index = new int[maxdoc]; |
| 82 | + int count = 0; |
| 83 | + length = new byte[maxdoc]; // estimate maxdoc values |
| 84 | + lengthNoStopWords = new byte[maxdoc]; |
| 85 | + lengthComplete = new byte[maxdoc]; |
| 86 | + boost = new float[maxdoc]; |
| 87 | + namespaces = new byte[maxdoc]; |
| 88 | + for(int i=0;i<maxdoc;i++){ |
| 89 | + byte[] stored = null; |
| 90 | + try{ |
| 91 | + Document doc = reader.document(i); |
| 92 | + stored = doc.getBinaryValue(field); |
| 93 | + namespaces[i] = (byte)Integer.parseInt(doc.get("namespace")); |
| 94 | + index[i] = count; |
| 95 | + if(stored == null) |
| 96 | + continue; |
| 97 | + for(int j=0;j<stored.length/7;j++){ |
| 98 | + if(count >= length.length){ |
| 99 | + length = extendBytes(length); |
| 100 | + lengthNoStopWords = extendBytes(lengthNoStopWords); |
| 101 | + lengthComplete = extendBytes(lengthComplete); |
| 102 | + boost = extendFloats(boost); |
| 103 | + } |
| 104 | + length[count] = stored[j*7]; |
| 105 | + if(length[count] == 0){ |
| 106 | + log.debug("Broken length=0 for docid="+i+", at position "+j); |
| 107 | + } |
| 108 | + lengthNoStopWords[count] = stored[j*7+1]; |
| 109 | + int boostInt = (((stored[j*7+2]&0xff) << 24) + ((stored[j*7+3]&0xff) << 16) + ((stored[j*7+4]&0xff) << 8) + ((stored[j*7+5]&0xff) << 0)); |
| 110 | + boost[count] = Float.intBitsToFloat(boostInt); |
| 111 | + lengthComplete[count] = stored[j*7+6]; |
| 112 | + |
| 113 | + count++; |
| 114 | + } |
| 115 | + } catch(Exception e){ |
| 116 | + log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage()); |
| 117 | + e.printStackTrace(); |
| 118 | + } |
107 | 119 | } |
| 120 | + // compact arrays |
| 121 | + if(count < length.length - 1){ |
| 122 | + length = resizeBytes(length,count); |
| 123 | + lengthNoStopWords = resizeBytes(lengthNoStopWords,count); |
| 124 | + boost = resizeFloats(boost,count); |
| 125 | + lengthComplete = resizeBytes(lengthComplete,count); |
| 126 | + } |
| 127 | + log.info("Finished caching aggregate "+field+" for "+reader.directory()); |
| 128 | + cachingFinished = true; |
| 129 | + } catch(Exception e){ |
| 130 | + e.printStackTrace(); |
| 131 | + log.error("Whole caching failed on field="+field+", reader="+reader); |
108 | 132 | } |
109 | | - // compact arrays |
110 | | - if(count < length.length - 1){ |
111 | | - length = resizeBytes(length,count); |
112 | | - lengthNoStopWords = resizeBytes(lengthNoStopWords,count); |
113 | | - boost = resizeFloats(boost,count); |
114 | | - lengthComplete = resizeBytes(lengthComplete,count); |
115 | | - } |
116 | | - log.info("Finished caching aggregate "+field+" for "+reader.directory()); |
117 | | - cachingFinished = true; |
| 133 | + cachingInProgress.remove(reader); |
118 | 134 | } |
119 | 135 | protected byte[] extendBytes(byte[] array){ |
120 | 136 | return resizeBytes(array,array.length*2); |
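isBeingCached lets Warmup block until the background CachingThread finishes: the thread registers itself in cachingInProgress before building the arrays and deregisters afterwards, even when caching fails. A Hashtable (rather than a plain HashMap) fits here since registration and polling happen on different threads. A minimal polling sketch, mirroring the Warmup.java hunk:

    // Poll until the background CachingThread deregisters itself; this sketch
    // adds a timeout as a safety net (the diff's loop in Warmup.java has none).
    long deadline = System.currentTimeMillis() + 60000L;
    while (AggregateMetaField.isBeingCached(reader)
            && System.currentTimeMillis() < deadline) {
        Thread.sleep(100);   // InterruptedException must be caught or declared
    }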
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java |
— | — | @@ -39,22 +39,24 @@ |
40 | 40 | included = new BitSet(64); |
41 | 41 | } |
42 | 42 | |
| 43 | + /** "all" filter */ |
43 | 44 | public NamespaceFilter() { |
44 | 45 | init(); |
45 | 46 | } |
46 | 47 | |
| 48 | + /** filter on a collection of namespaces */ |
47 | 49 | public NamespaceFilter(Collection<Integer> namespaces){ |
48 | 50 | init(); |
49 | 51 | for(Integer namespace : namespaces){ |
50 | 52 | included.set(namespace.intValue()); |
51 | 53 | } |
52 | 54 | } |
53 | | - |
| 55 | + /** filter on one namespace */ |
54 | 56 | public NamespaceFilter(int namespace){ |
55 | 57 | init(); |
56 | 58 | included.set(namespace); |
57 | 59 | } |
58 | | - |
| 60 | + /** filter on a comma-separated list of namespaces, e.g. 0,2,10 */ |
59 | 61 | public NamespaceFilter(String namespaces) { |
60 | 62 | init(); |
61 | 63 | if (namespaces != null && !namespaces.equals("")) { |
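The newly documented constructors cover the three common cases; for example:

    NamespaceFilter all   = new NamespaceFilter();          // matches all namespaces
    NamespaceFilter main  = new NamespaceFilter(0);         // main namespace only
    NamespaceFilter multi = new NamespaceFilter("0,2,10");  // main, user and template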
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java |
— | — | @@ -70,12 +70,15 @@ |
71 | 71 | public boolean foundAllInTitle; |
72 | 72 | /** Rank of the first hit, used as title-suggestion threshold */ |
73 | 73 | public int firstHitRank = 0; |
74 | | - public ResultSet(HashMap<String, HighlightResult> highlighted, HashSet<String> phrases, HashSet<String> foundInContext, boolean foundAllInTitle, int firstHitRank) { |
| 74 | + /** Words found in titles */ |
| 75 | + public HashSet<String> foundInTitles; |
| 76 | + public ResultSet(HashMap<String, HighlightResult> highlighted, HashSet<String> phrases, HashSet<String> foundInContext, boolean foundAllInTitle, int firstHitRank, HashSet<String> foundInTitles) { |
75 | 77 | this.highlighted = highlighted; |
76 | 78 | this.phrases = phrases; |
77 | 79 | this.foundInContext = foundInContext; |
78 | 80 | this.foundAllInTitle = foundAllInTitle; |
79 | 81 | this.firstHitRank = firstHitRank; |
| 82 | + this.foundInTitles = foundInTitles; |
80 | 83 | } |
81 | 84 | } |
82 | 85 | /** |
— | — | @@ -87,10 +90,12 @@ |
88 | 91 | * @param words - in order words (from main phrase) |
89 | 92 | * @param exactCase - if these are results from exactCase search |
90 | 93 | * @throws IOException |
91 | | - * @returns map: key -> what to highlight |
| 94 | + * @returns resultset |
92 | 95 | */ |
93 | 96 | @SuppressWarnings("unchecked") |
94 | | - public static ResultSet highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc, ArrayList<String> words, HashSet<String> stopWords, boolean exactCase, IndexReader reader, boolean sortByPhrases) throws IOException{ |
| 97 | + public static ResultSet highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc, |
| 98 | + ArrayList<String> words, HashSet<String> stopWords, boolean exactCase, IndexReader reader, |
| 99 | + boolean sortByPhrases, boolean alwaysIncludeFirstLine) throws IOException{ |
95 | 100 | if(cache == null) |
96 | 101 | cache = SearcherCache.getInstance(); |
97 | 102 | |
— | — | @@ -101,6 +106,7 @@ |
102 | 107 | HashSet<String> inContext = new HashSet<String>(); |
103 | 108 | boolean foundAllInTitle = false; |
104 | 109 | int firstHitRank = 0; |
| 110 | + HashSet<String> inTitle = new HashSet<String>(); |
105 | 111 | |
106 | 112 | // terms weighted with idf |
107 | 113 | HashMap<String,Double> weightTerm = new HashMap<String,Double>(); |
— | — | @@ -140,8 +146,8 @@ |
141 | 147 | firstHitRank = alttitles.getTitle().getRank(); |
142 | 148 | |
143 | 149 | HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles,wordIndex); |
144 | | - ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases ); |
145 | | - ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,wordIndex,1,true,stopWords,false,phrases,inContext,false); |
| 150 | + ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine ); |
| 151 | + ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,wordIndex,1,true,stopWords,false,phrases,inContext,false,false); |
146 | 152 | int redirectAdditional = 0; |
147 | 153 | if(titleSnippets.size()>0 && |
148 | 154 | ((titleSnippets.get(0).found.containsAll(words) && textTokenLength(titleSnippets.get(0).tokens) == words.size()) |
— | — | @@ -219,12 +225,14 @@ |
220 | 226 | hr.setTitle(titleSnippets.get(0).makeSnippet(256,true)); |
221 | 227 | if(titleSnippets.get(0).found.containsAll(words)) |
222 | 228 | foundAllInTitle = true; |
| 229 | + inTitle.addAll(titleSnippets.get(0).found); |
223 | 230 | } |
224 | 231 | |
225 | 232 | if(redirectSnippets != null){ |
226 | 233 | hr.setRedirect(redirectSnippets.makeSnippet(MAX_CONTEXT,true)); |
227 | 234 | if(!foundAllInTitle && redirectSnippets.found.containsAll(words)) |
228 | 235 | foundAllInTitle = true; |
| 236 | + inTitle.addAll(redirectSnippets.found); |
229 | 237 | } |
230 | 238 | |
231 | 239 | if(sectionSnippets != null){ |
— | — | @@ -240,7 +248,7 @@ |
241 | 249 | res.put(key,hr); |
242 | 250 | |
243 | 251 | } |
244 | | - return new ResultSet(res,phrases,inContext,foundAllInTitle,firstHitRank); |
| 252 | + return new ResultSet(res,phrases,inContext,foundAllInTitle,firstHitRank,inTitle); |
245 | 253 | } |
246 | 254 | |
247 | 255 | /** Number of tokens excluding aliases and glue stuff */ |
— | — | @@ -423,7 +431,7 @@ |
424 | 432 | } |
425 | 433 | } |
426 | 434 | if((completeMatch && additional >= minAdditional) || additional > minAdditional || (additional != 0 && additional == notInTitle.size())){ |
427 | | - ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 1, false, stopWords, false, phrases, inContext, false); |
| 435 | + ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false); |
428 | 436 | if(snippets.size() > 0){ |
429 | 437 | RawSnippet snippet = snippets.get(0); |
430 | 438 | snippet.setAlttitle(ainf); |
— | — | @@ -498,7 +506,7 @@ |
499 | 507 | /** Highlight text */ |
500 | 508 | protected static ArrayList<RawSnippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms, |
501 | 509 | HashMap<String,Integer> wordIndex, int maxSnippets, boolean ignoreBreaks, HashSet<String> stopWords, boolean showFirstIfNone, |
502 | | - HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases) { |
| 510 | + HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases, final boolean alwaysIncludeFirstLine) { |
503 | 511 | |
504 | 512 | // pieces of text to ge highlighted |
505 | 513 | ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>(); |
— | — | @@ -680,6 +688,12 @@ |
681 | 689 | // find fragments with best score |
682 | 690 | Collections.sort(fragments, new Comparator<FragmentScore>() { |
683 | 691 | public int compare(FragmentScore o1, FragmentScore o2) { |
| 692 | + if(alwaysIncludeFirstLine){ |
| 693 | + if(o1.isFirstSentence) |
| 694 | + return -1; |
| 695 | + if(o2.isFirstSentence) |
| 696 | + return 1; |
| 697 | + } |
684 | 698 | // sort via longest phrase found |
685 | 699 | int c = o2.bestCount - o1.bestCount; |
686 | 700 | if(sortByPhrases && c != 0) |
— | — | @@ -698,7 +712,7 @@ |
699 | 713 | HashSet<String> termsFound = new HashSet<String>(); |
700 | 714 | ArrayList<FragmentScore> resNoNew = new ArrayList<FragmentScore>(); |
701 | 715 | for(FragmentScore f : fragments){ |
702 | | - if(f.score == 0) |
| 716 | + if(f.score == 0 && !(alwaysIncludeFirstLine && f.isFirstSentence)) |
703 | 717 | break; |
704 | 718 | // check if the fragment has new terms |
705 | 719 | boolean hasNew = false; |
— | — | @@ -711,7 +725,7 @@ |
712 | 726 | } |
713 | 727 | } |
714 | 728 | } |
715 | | - if(hasNew){ |
| 729 | + if(hasNew || (alwaysIncludeFirstLine && f.isFirstSentence)){ |
716 | 730 | if(f.found != null) |
717 | 731 | termsFound.addAll(f.found); |
718 | 732 | adjustBest(f,tokens,weightTerms,wordIndex,newTerms); |
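alwaysIncludeFirstLine changes fragment selection in two ways: the comparator sorts the first sentence ahead of everything else, and both the zero-score cutoff and the new-terms filter exempt it, so an article's lead sentence is always emitted even when it matches nothing. The new inTitle set, meanwhile, records every word found in title or redirect snippets and travels back to the caller in ResultSet.foundInTitles. A sketch of the exemption, condensed from the hunks above (hasNewTerms and emit are hypothetical helpers standing in for the inline logic):

    for (FragmentScore f : fragments) {              // sorted best-first
        boolean keepAnyway = alwaysIncludeFirstLine && f.isFirstSentence;
        if (f.score == 0 && !keepAnyway)
            break;                                   // remaining fragments score 0 too
        if (hasNewTerms(f) || keepAnyway)
            emit(f);                                 // include in the snippet set
    }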
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java |
— | — | @@ -437,11 +437,11 @@ |
438 | 438 | dbUpdates = new Hashtable<String,IndexUpdateRecord>(); |
439 | 439 | queuedUpdates.put(iid.toString(), dbUpdates); |
440 | 440 | } |
441 | | - IndexUpdateRecord oldr = dbUpdates.get(record.getKey()); |
| 441 | + IndexUpdateRecord oldr = dbUpdates.get(record.getIndexKey()); |
442 | 442 | // combine a previous delete with current add to form update |
443 | 443 | if(oldr != null && oldr.doDelete() && record.doAdd()) |
444 | 444 | record.setAction(IndexUpdateRecord.Action.UPDATE); |
445 | | - dbUpdates.put(record.getKey(),record); |
| 445 | + dbUpdates.put(record.getIndexKey(),record); |
446 | 446 | } |
447 | 447 | |
448 | 448 | log.debug("Locally queued item: "+record); |
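
The queueing change above keys pending updates by page_id and coalesces a queued delete followed by an add into a single UPDATE, so each page reaches the indexer as one operation. A reduced sketch of that coalescing, with IndexUpdateRecord cut down to the two fields the logic needs:

    import java.util.Hashtable;

    class UpdateQueue {
        enum Action { ADD, DELETE, UPDATE }

        static class Record {
            final String indexKey;
            Action action;
            Record(String indexKey, Action action) { this.indexKey = indexKey; this.action = action; }
        }

        private final Hashtable<String, Record> dbUpdates = new Hashtable<String, Record>();

        /** Queue a record; a delete followed by an add on the same key becomes one UPDATE. */
        void enqueue(Record record) {
            Record oldr = dbUpdates.get(record.indexKey);
            if (oldr != null && oldr.action == Action.DELETE && record.action == Action.ADD)
                record.action = Action.UPDATE;
            dbUpdates.put(record.indexKey, record); // last record per key wins
        }
    }
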
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java |
— | — | @@ -149,16 +149,16 @@ |
150 | 150 | } |
151 | 151 | |
152 | 152 | /** |
153 | | - * @return Returns the page key -- page_id (via article) |
| 153 | + * @return page index key -- page_id (via article) |
154 | 154 | */ |
155 | | - public String getKey(){ |
156 | | - return article.getKey(); |
| 155 | + public String getIndexKey(){ |
| 156 | + return article.getIndexKey(); |
157 | 157 | } |
158 | 158 | |
159 | 159 | /** |
160 | | - * @return Highlight key -- ns:title |
| 160 | + * @return ns:title key, used in the links, highlight, prefix, and similar indexes
161 | 161 | */ |
162 | | - public String getHighlightKey(){ |
| 162 | + public String getNsTitleKey(){ |
163 | 163 | return article.getTitleObject().getKey(); |
164 | 164 | } |
165 | 165 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -59,11 +59,16 @@ |
60 | 60 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
61 | 61 | import org.wikimedia.lsearch.config.IndexId; |
62 | 62 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 63 | +import org.wikimedia.lsearch.prefix.PrefixIndexBuilder; |
63 | 64 | import org.wikimedia.lsearch.ranks.Links; |
64 | 65 | import org.wikimedia.lsearch.ranks.StringList; |
| 66 | +import org.wikimedia.lsearch.related.Related; |
65 | 67 | import org.wikimedia.lsearch.related.RelatedTitle; |
66 | 68 | import org.wikimedia.lsearch.search.NamespaceFilter; |
| 69 | +import org.wikimedia.lsearch.spell.CleanIndexImporter; |
| 70 | +import org.wikimedia.lsearch.spell.CleanIndexWriter; |
67 | 71 | import org.wikimedia.lsearch.spell.api.SpellCheckIndexer; |
| 72 | +import org.wikimedia.lsearch.storage.RelatedStorage; |
68 | 73 | import org.wikimedia.lsearch.util.Buffer; |
69 | 74 | import org.wikimedia.lsearch.util.Localization; |
70 | 75 | import org.wikimedia.lsearch.util.MathFunc; |
— | — | @@ -155,9 +160,9 @@ |
156 | 161 | if(rec.doDelete()){ |
157 | 162 | int count = 0; |
158 | 163 | if(iid.isHighlight()) |
159 | | - count = reader.deleteDocuments(new Term("key", rec.getHighlightKey())); |
| 164 | + count = reader.deleteDocuments(new Term("key", rec.getNsTitleKey())); |
160 | 165 | else // normal or titles index |
161 | | - count = reader.deleteDocuments(new Term("key", rec.getKey())); |
| 166 | + count = reader.deleteDocuments(new Term("key", rec.getIndexKey())); |
162 | 167 | if(count == 0) |
163 | 168 | nonDeleteDocuments.add(rec); |
164 | 169 | IndexReportCard card = getReportCard(rec); |
— | — | @@ -167,7 +172,7 @@ |
168 | 173 | else |
169 | 174 | card.setSuccessfulDelete(); |
170 | 175 | } |
171 | | - log.debug(iid+": Deleting document "+rec.getKey()+" "+rec.getArticle()); |
| 176 | + log.debug(iid+": Deleting document "+rec.getIndexKey()+" "+rec.getArticle()); |
172 | 177 | } |
173 | 178 | } |
174 | 179 | reader.close(); |
— | — | @@ -231,7 +236,7 @@ |
232 | 237 | writer.addDocument(doc,indexAnalyzer); |
233 | 238 | } |
234 | 239 | |
235 | | - log.debug(iid+": Adding document "+rec.getKey()+" "+rec.getArticle()); |
| 240 | + log.debug(iid+": Adding document "+rec.getIndexKey()+" "+rec.getArticle()); |
236 | 241 | if(card != null) |
237 | 242 | card.setSuccessfulAdd(); |
238 | 243 | } catch (IOException e) { |
— | — | @@ -241,7 +246,7 @@ |
242 | 247 | succ = false; // report unsucc, but still continue, to process all cards |
243 | 248 | } catch(Exception e){ |
244 | 249 | e.printStackTrace(); |
245 | | - log.error("Error adding document "+rec.getKey()+" with message: "+e.getMessage()); |
| 250 | + log.error("Error adding document "+rec.getIndexKey()+" with message: "+e.getMessage()); |
246 | 251 | if(card != null) |
247 | 252 | card.setFailedAdd(); |
248 | 253 | succ = false; // report unsucc, but still continue, to process all cards |
— | — | @@ -410,16 +415,124 @@ |
411 | 416 | * |
412 | 417 | * @param iid |
413 | 418 | * @param updateRecords |
| 419 | + * @return success |
414 | 420 | */ |
415 | 421 | public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
416 | | - boolean index = updateDocumentsOn(iid,updateRecords,iid); |
417 | | - boolean highlight = updateDocumentsOn(iid.getHighlight(),updateRecords,iid); |
418 | | - boolean titles = true; |
| 422 | + return updateLinks(iid,updateRecords) |
| 423 | + && fetchLinksInfo(iid,updateRecords) |
| 424 | + && updatePrefix(iid,updateRecords) |
| 425 | + && updateSpell(iid,updateRecords) |
| 426 | + && updateDocumentsOn(iid,updateRecords,iid) |
| 427 | + && updateDocumentsOn(iid.getHighlight(),updateRecords,iid) |
| 428 | + && updateTitles(iid,updateRecords); |
| 429 | + } |
| 430 | + |
| 431 | + public boolean updateTitles(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
419 | 432 | if(iid.hasTitlesIndex()) |
420 | | - titles = updateDocumentsOn(iid.getTitlesIndex(),updateRecords,iid); |
421 | | - return index && highlight && titles; |
| 433 | + return updateDocumentsOn(iid.getTitlesIndex(),updateRecords,iid); |
| 434 | + return true; |
422 | 435 | } |
423 | 436 | |
| 437 | + /** Update articles with latest linking & related information */ |
| 438 | + public boolean fetchLinksInfo(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
| 439 | + try{ |
| 440 | + Links links = Links.openForRead(iid,iid.getIndexPath()); |
| 441 | + RelatedStorage related = new RelatedStorage(iid); |
| 442 | + for(IndexUpdateRecord rec : updateRecords){ |
| 443 | + if(rec.doAdd()){ |
| 444 | + String key = rec.getNsTitleKey(); |
| 445 | + Article article = rec.getArticle(); |
| 446 | + // references, redirect status |
| 447 | + article.setReferences(links.getNumInLinks(key)); |
| 448 | + article.setRedirectTo(links.getRedirectTarget(key)); |
| 449 | + if(article.isRedirect()) |
| 450 | + article.setRedirectTargetNamespace(links.getRedirectTargetNamespace(key)); |
| 451 | + else |
| 452 | + article.setRedirectTargetNamespace(-1); |
| 453 | + |
| 454 | + // redirects |
| 455 | + ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
| 456 | + for(String rk : links.getRedirectsTo(key)){ |
| 457 | + String[] parts = rk.toString().split(":",2); |
| 458 | + int redirectRef = links.getNumInLinks(rk); |
| 459 | + redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef)); |
| 460 | + } |
| 461 | + article.setRedirects(redirects); |
| 462 | + // related |
| 463 | + if(related != null) |
| 464 | + article.setRelated(related.getRelated(key)); |
| 465 | + } |
| 466 | + } |
| 467 | + return true; |
| 468 | + } catch(IOException e){ |
| 469 | + e.printStackTrace(); |
| 470 | + log.error("Cannot fetch links info: "+e.getMessage()); |
| 471 | + return false; |
| 472 | + } |
| 473 | + } |
| 474 | + |
| 475 | + public boolean updateLinks(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
| 476 | + try{ |
| 477 | + Links links = Links.openForModification(iid); |
| 478 | + for(IndexUpdateRecord rec : updateRecords){ |
| 479 | + // TODO: this might do some unnecessary additions/deletions on split index architecture |
| 480 | + if(rec.doDelete()){ |
| 481 | + links.deleteArticleInfoByIndexKey(rec.getIndexKey()); |
| 482 | + } else if(rec.doAdd()){ |
| 483 | + Article a = rec.getArticle(); |
| 484 | + links.addArticleInfo(a.getContents(),a.getTitleObject(),iid.isExactCase(),a.getIndexKey()); |
| 485 | + } |
| 486 | + } |
| 487 | + links.close(); |
| 488 | + return true; |
| 489 | + } catch(IOException e){ |
| 490 | + e.printStackTrace(); |
| 491 | + log.error("Cannot update links index: "+e.getMessage()); |
| 492 | + return false; |
| 493 | + } |
| 494 | + } |
| 495 | + |
| 496 | + public boolean updatePrefix(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
| 497 | + if(!iid.hasPrefix()) |
| 498 | + return true; |
| 499 | + try{ |
| 500 | + PrefixIndexBuilder prefix = PrefixIndexBuilder.forPrecursorModification(iid); |
| 501 | + for(IndexUpdateRecord rec : updateRecords){ |
| 502 | + if(rec.doDelete()){ |
| 503 | + prefix.deleteFromPrecursor(rec.getIndexKey()); |
| 504 | + } else if(rec.doAdd()){ |
| 505 | + Article a = rec.getArticle(); |
| 506 | + prefix.addToPrecursor(rec.getNsTitleKey(),a.getReferences(),a.getRedirectTarget(),rec.getIndexKey()); |
| 507 | + } |
| 508 | + } |
| 509 | + return true; |
| 510 | + } catch(IOException e){ |
| 511 | + e.printStackTrace(); |
| 512 | + log.error("Cannot update prefix index: "+e.getMessage()); |
| 513 | + return false; |
| 514 | + } |
| 515 | + } |
| 516 | + |
| 517 | + public boolean updateSpell(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
| 518 | + if(!iid.hasSpell()) |
| 519 | + return true; |
| 520 | + try{ |
| 521 | + CleanIndexWriter writer = CleanIndexWriter.newForModification(iid); |
| 522 | + for(IndexUpdateRecord rec : updateRecords){ |
| 523 | + if(rec.doDelete()){ |
| 524 | + writer.deleteArticleInfo(rec.getIndexKey()); |
| 525 | + } else if(rec.doAdd()){ |
| 526 | + writer.addArticleInfo(rec.getArticle()); |
| 527 | + } |
| 528 | + } |
| 529 | + return true; |
| 530 | + } catch(IOException e){ |
| 531 | + e.printStackTrace(); |
| 532 | + log.error("Cannot update spellcheck index: "+e.getMessage()); |
| 533 | + return false; |
| 534 | + } |
| 535 | + } |
| 536 | + |
424 | 537 | /** |
425 | 538 | * Update all documents in the collection. If needed the request |
426 | 539 | * is forwarded to a remote object (i.e. if the part of the split |
— | — | @@ -518,7 +631,7 @@ |
519 | 632 | transformArticleForIndexing(article); |
520 | 633 | |
521 | 634 | // page_id from database, used to look up and replace entries on index updates |
522 | | - doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 635 | + doc.add(new Field("key", article.getIndexKey(), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
523 | 636 | |
524 | 637 | // namespace, returned with results |
525 | 638 | doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
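
updateDocuments now chains the per-index stages with short-circuiting &&: every stage catches its own IOException, logs it, and returns false, so a failed links update stops the later stages from indexing against stale link data. The control flow, reduced to a sketch:

    import java.util.List;

    class UpdatePipeline {
        interface Stage {
            /** Returns false on failure (the real stages catch IOException and log it). */
            boolean run();
        }

        /** Mirrors the && chain in updateDocuments: stop at the first failed stage. */
        static boolean runAll(List<Stage> stages) {
            for (Stage s : stages)
                if (!s.run())
                    return false; // remaining stages are skipped, as with a && b && c
            return true;
        }
    }
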
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WordNet.java |
— | — | @@ -6,6 +6,7 @@ |
7 | 7 | import java.util.ArrayList; |
8 | 8 | import java.util.Arrays; |
9 | 9 | import java.util.HashMap; |
| 10 | +import java.util.HashSet; |
10 | 11 | import java.util.List; |
11 | 12 | import java.util.zip.GZIPInputStream; |
12 | 13 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -564,7 +564,8 @@ |
565 | 565 | c = text[cur]; |
566 | 566 | else break; |
567 | 567 | } |
568 | | - cur--; // we moved to next legal char |
| 568 | + if(!noTrailing) |
| 569 | + cur--; // we moved to next legal char |
569 | 570 | } |
570 | 571 | |
571 | 572 | addToken(noTrailing); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/StopWords.java |
— | — | @@ -88,11 +88,11 @@ |
89 | 89 | return ret; |
90 | 90 | } |
91 | 91 | |
92 | | - /** Get a brand new hash set of predifined stop words (i.e. not those generated from lucene indexes) */ |
93 | | - public static HashSet<String> getPredefinedSet(IndexId iid){ |
| 92 | + /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */
| 93 | + public static HashSet<String> getPredefinedSet(String langCode){ |
94 | 94 | loadPredefined(); |
95 | 95 | HashSet<String> ret = new HashSet<String>(); |
96 | | - HashSet<String> cached = cachePredefined.get(iid.getLangCode()); |
| 96 | + HashSet<String> cached = cachePredefined.get(langCode); |
97 | 97 | if(cached != null){ |
98 | 98 | synchronized(cached){ |
99 | 99 | ret.addAll(cached); |
— | — | @@ -100,6 +100,9 @@ |
101 | 101 | } |
102 | 102 | return ret; |
103 | 103 | } |
| 104 | + public static HashSet<String> getPredefinedSet(IndexId iid){ |
| 105 | + return getPredefinedSet(iid.getLangCode()); |
| 106 | + } |
104 | 107 | |
105 | 108 | protected static void loadPredefined(){ |
106 | 109 | if(loadedPredefined) |
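
The new overload keys the predefined stop-word cache by language code directly, and the IndexId variant now just delegates to it, so callers that only know a language no longer need an index id. A usage sketch, assuming the enwiki index id used elsewhere in this branch:

    import java.util.HashSet;
    import org.wikimedia.lsearch.analyzers.StopWords;
    import org.wikimedia.lsearch.config.IndexId;

    class StopWordsDemo {
        static void demo() {
            // by language code (new) and by index id (delegates to the former)
            HashSet<String> byLang = StopWords.getPredefinedSet("en");
            HashSet<String> byIid = StopWords.getPredefinedSet(IndexId.get("enwiki"));
            byLang.add("customword"); // safe: each call returns a fresh copy of the cached set
        }
    }
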
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | import org.apache.lucene.search.BooleanClause; |
22 | 22 | import org.apache.lucene.search.BooleanQuery; |
23 | 23 | import org.apache.lucene.search.LogTransformScore; |
| 24 | +import org.apache.lucene.search.MultiPhraseQuery; |
24 | 25 | import org.apache.lucene.search.PhraseQuery; |
25 | 26 | import org.apache.lucene.search.PositionalMultiQuery; |
26 | 27 | import org.apache.lucene.search.PositionalOptions; |
— | — | @@ -340,24 +341,10 @@ |
341 | 342 | c = text[cur]; |
342 | 343 | if(c == '"'){ |
343 | 344 | inPhrase = !inPhrase; |
344 | | - if(inPhrase) |
345 | | - length = 0; |
346 | | - else{ // end of phrase |
347 | | - int start = cur - length; |
348 | | - analyzeBuffer(); |
349 | | - for(Token t : tokens){ |
350 | | - if(t.getPositionIncrement() > 0) |
351 | | - ret.add(new Token(t.termText(),start+t.startOffset(),start+t.endOffset(),"phrase")); |
352 | | - } |
353 | | - } |
354 | 345 | } |
355 | 346 | |
356 | | - if(inPhrase){ |
357 | | - buffer[length++] = c; |
358 | | - continue; |
359 | | - } |
360 | | - |
361 | | - if(c == ')'){ |
| 347 | + if(inPhrase); |
| 348 | + else if(c == ')'){ |
362 | 349 | level--; |
363 | 350 | if(level < fieldLevel) |
364 | 351 | fieldLevel = -1; |
— | — | @@ -368,9 +355,9 @@ |
369 | 356 | } else if(fieldLevel != -1 && level>fieldLevel) |
370 | 357 | continue; |
371 | 358 | |
372 | | - if(Character.isLetterOrDigit(c) || c=='?' || c=='*' || c=='~'){ |
| 359 | + if(isTermChar(c)){ |
373 | 360 | int start = cur; |
374 | | - tokenType = fetchToken(); |
| 361 | + tokenType = fetchToken(inPhrase); |
375 | 362 | if(tokenType == TokenType.WORD && (start==0 || text[start-1]!='-')){ |
376 | 363 | String type = "word"; |
377 | 364 | if(bufferIsWildCard()) |
— | — | @@ -384,7 +371,7 @@ |
385 | 372 | } |
386 | 373 | } |
387 | 374 | } |
388 | | - } else if(c == '['){ |
| 375 | + } else if(c == '[' && !inPhrase){ |
389 | 376 | fetchGenericPrefix(); |
390 | 377 | } |
391 | 378 | } |
— | — | @@ -420,12 +407,19 @@ |
421 | 408 | return defaultNamespaceFilter; |
422 | 409 | } |
423 | 410 | |
| 411 | + private final boolean isTermChar(char ch){ |
| 412 | + return !Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != ',' && ch != ';' && ch != '"'; |
| 413 | + } |
| 414 | + |
424 | 415 | /** |
425 | 416 | * Fetch token into <code>buffer</code> starting from current position (<code>cur</code>) |
426 | 417 | * |
427 | 418 | * @return type of the token in buffer |
428 | 419 | */ |
429 | 420 | private TokenType fetchToken(){ |
| 421 | + return fetchToken(false); |
| 422 | + } |
| 423 | + private TokenType fetchToken(boolean termOnly){ |
430 | 424 | char ch; |
431 | 425 | prev_cur = cur; |
432 | 426 | for(length = 0; cur < queryLength; cur++){ |
— | — | @@ -434,7 +428,7 @@ |
435 | 429 | continue; // ignore whitespaces |
436 | 430 | |
437 | 431 | 	// pluses, minuses and underscores can occur within words (to prevent them from being misinterpreted); *,? are for wildcard queries
438 | | - if(!Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != ',' && ch != ';' && ch != '"'){ |
| 432 | + if(isTermChar(ch)){ |
439 | 433 | if(length<buffer.length) |
440 | 434 | buffer[length++] = ch; |
441 | 435 | } else{ |
— | — | @@ -445,6 +439,9 @@ |
446 | 440 | if(length == 0) |
447 | 441 | return TokenType.EOF; |
448 | 442 | |
| 443 | + if(termOnly) |
| 444 | + return TokenType.WORD; |
| 445 | + |
449 | 446 | // check for keywords |
450 | 447 | if(length == 3 && buffer[0]=='A' && buffer[1]=='N' && buffer[2]=='D') |
451 | 448 | return TokenType.AND; |
— | — | @@ -553,34 +550,80 @@ |
554 | 551 | * |
555 | 552 | * @return a query, or null if the query is empty |
556 | 553 | */ |
557 | | - private PhraseQuery parsePhrase(){ |
558 | | - PhraseQuery query = null; |
559 | | - |
560 | | - length = 0; |
| 554 | + private Query parsePhrase(){ |
| 555 | + // special case for incategory |
| 556 | + if(currentField!=null && currentField.equals("incategory")){ |
| 557 | + for(; cur < queryLength ; cur++ ){ |
| 558 | + if(text[cur] == '"') |
| 559 | + break; |
| 560 | + else if(length < buffer.length) |
| 561 | + buffer[length++] = text[cur]; |
| 562 | + } |
| 563 | + if(length > 0){ |
| 564 | + // no tokenization, we want whole category name |
| 565 | + return new TermQuery(makeTerm()); |
| 566 | + } |
| 567 | + return null; |
| 568 | + } |
| 569 | + //PositionalMultiQuery query = new PositionalMultiQuery(new PositionalOptions.PhraseQueryFallback()); |
| 570 | + MultiPhraseQuery query = new MultiPhraseQuery(); |
561 | 571 | for(; cur < queryLength ; cur++ ){ |
| 572 | + length = 0; |
| 573 | + // fetch next word |
| 574 | + while(cur<queryLength && isTermChar(text[cur]) && length<buffer.length){ |
| 575 | + buffer[length++] = text[cur++]; |
| 576 | + } |
| 577 | + |
| 578 | + // add to phrase |
| 579 | + if(length > 0){ |
| 580 | + boolean added = false; |
| 581 | + if(bufferIsWildCard()){ |
| 582 | + Term term = makeTerm(); |
| 583 | + Term[] terms = wildcards.makeTerms(term.text(),term.field()); |
| 584 | + if(terms != null){ |
| 585 | + query.add(terms); |
| 586 | + ArrayList<String> words = wildcards.getWords(term.text()); |
| 587 | + expandedWordsFromParser.add(words); |
| 588 | + expandedTypesFromParser.add(ExpandedType.WILDCARD); |
| 589 | + ArrayList<Float> boosts = new ArrayList<Float>(); |
| 590 | + for(int i=0;i<words.size();i++) boosts.add(1f); |
| 591 | + expandedBoostFromParser.add(boosts); |
| 592 | + added = true; |
| 593 | + } |
| 594 | + } |
| 595 | + if(bufferIsFuzzy()){ |
| 596 | + Term term = makeTerm(); |
| 597 | + NamespaceFilter nsf = getNamespaceFilter(currentField); |
| 598 | + Term[] terms = fuzzy.makeTerms(term.text(),term.field(),nsf); |
| 599 | + if(terms != null){ |
| 600 | + //query.add(terms,fuzzy.getBoosts(term.text(),nsf,terms)); |
| 601 | + query.add(terms); |
| 602 | + ArrayList<String> words = fuzzy.getWords(term.text(),nsf); |
| 603 | + expandedWordsFromParser.add(words); |
| 604 | + expandedTypesFromParser.add(ExpandedType.FUZZY); |
| 605 | + expandedBoostFromParser.add(fuzzy.getBoosts(term.text(),nsf,words)); |
| 606 | + added = true; |
| 607 | + } |
| 608 | + } |
| 609 | + if(!added){ |
| 610 | + // fallback to ordinary words |
| 611 | + analyzeBuffer(); |
| 612 | + for(Token token : tokens){ |
| 613 | + if(token.getPositionIncrement()>0){ // ignore aliases and stemmed words |
| 614 | + Term t = makeTerm(token); |
| 615 | + addToWords(t.text(),1,ExpandedType.PHRASE); |
| 616 | + query.add(t); |
| 617 | + } |
| 618 | + } |
| 619 | + } |
| 620 | + } |
562 | 621 | // end of phrase query |
563 | 622 | if(text[cur] == '"') |
564 | 623 | break; |
565 | | - else if(length < buffer.length) |
566 | | - buffer[length++] = text[cur]; |
567 | 624 | } |
568 | | - if(length != 0){ |
569 | | - query = new PhraseQuery(); |
570 | | - // if it's a category don't tokenize it, we want whole category name |
571 | | - if(currentField!=null && currentField.equals("incategory")) |
572 | | - query.add(makeTerm()); |
573 | | - else{ |
574 | | - analyzeBuffer(); |
575 | | - for(Token token : tokens){ |
576 | | - if(token.getPositionIncrement()>0){ // ignore aliases and stemmed words |
577 | | - Term t = makeTerm(token); |
578 | | - addToWords(t.text(),1,ExpandedType.PHRASE); |
579 | | - query.add(t); |
580 | | - } |
581 | | - } |
582 | | - query.setBoost(defaultBoost); |
583 | | - } |
584 | | - return query; |
| 625 | + if(query.getPositions().length > 0){ |
| 626 | + query.setBoost(defaultBoost); |
| 627 | + return query; |
585 | 628 | } else |
586 | 629 | return null; |
587 | 630 | } |
— | — | @@ -1115,20 +1158,22 @@ |
1116 | 1159 | full.add(additional,Occur.SHOULD); |
1117 | 1160 | |
1118 | 1161 | // redirect match (when redirect is not contained in contents or title) |
1119 | | - Query redirects = makeAlttitleForRedirects(words,20,1); |
1120 | | - if(redirects != null) |
1121 | | - full.add(redirects,Occur.SHOULD); |
1122 | | - if(singularWords != null){ |
1123 | | - Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f); |
1124 | | - if(redirectsSing != null) |
1125 | | - full.add(redirectsSing,Occur.SHOULD); |
1126 | | - } |
1127 | 1162 | if(hasWildcards() || hasFuzzy()){ |
1128 | 1163 | Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f); |
1129 | 1164 | if(redirectsMulti != null) |
1130 | 1165 | full.add(redirectsMulti,Occur.SHOULD); |
| 1166 | + } else{ |
| 1167 | + Query redirects = makeAlttitleForRedirects(words,20,1); |
| 1168 | + if(redirects != null) |
| 1169 | + full.add(redirects,Occur.SHOULD); |
| 1170 | + if(singularWords != null){ |
| 1171 | + Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f); |
| 1172 | + if(redirectsSing != null) |
| 1173 | + full.add(redirectsSing,Occur.SHOULD); |
| 1174 | + } |
1131 | 1175 | } |
1132 | 1176 | |
| 1177 | + |
1133 | 1178 | BooleanQuery wrap = new BooleanQuery(true); |
1134 | 1179 | wrap.add(full,Occur.SHOULD); |
1135 | 1180 | wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD); |
— | — | @@ -1324,7 +1369,7 @@ |
1325 | 1370 | Query main = null; |
1326 | 1371 | |
1327 | 1372 | // all words as entered into the query |
1328 | | - PositionalQuery exact = makePositional(words,fields.contents(),new PositionalOptions.Exact(),0,1); |
| 1373 | + Query exact = makePositionalMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,fields.contents(),new PositionalOptions.Exact(),0,1); |
1329 | 1374 | // words + stemmed + singulars + transliterations + wildcards + fuzzy - with slop factor |
1330 | 1375 | Query sloppy = makePositionalMulti(expandedWordsContents,expandedBoostContents,expandedTypes,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1,false); |
1331 | 1376 | |
— | — | @@ -1335,7 +1380,7 @@ |
1336 | 1381 | ArrayList<ArrayList<String>> wordnet = WordNet.replaceOne(words,iid.getLangCode()); |
1337 | 1382 | |
1338 | 1383 | BooleanQuery combined = new BooleanQuery(true); |
1339 | | - if(exact!=null && exact.getTerms().length > 0) |
| 1384 | + if(exact!=null) |
1340 | 1385 | combined.add(exact,Occur.SHOULD); |
1341 | 1386 | // combined various queries into mainphrase |
1342 | 1387 | if(sloppy != null){ |
— | — | @@ -1343,7 +1388,8 @@ |
1344 | 1389 | // wordnet |
1345 | 1390 | if(wordnet != null){ |
1346 | 1391 | for(ArrayList<String> wnwords : wordnet){ |
1347 | | - combined.add(makePositional(wnwords,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1),Occur.SHOULD); |
| 1392 | + if(!allStopWords(wnwords)) |
| 1393 | + combined.add(makePositional(wnwords,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1),Occur.SHOULD); |
1348 | 1394 | } |
1349 | 1395 | } |
1350 | 1396 | } |
— | — | @@ -1367,7 +1413,8 @@ |
1368 | 1414 | ArrayList<Query> altAdd = new ArrayList<Query>(); |
1369 | 1415 | if(wordnet!=null) |
1370 | 1416 | for(ArrayList<String> wnwords : wordnet) |
1371 | | - altAdd.add(makeAlttitleRelevance(wnwords,RELEVANCE_ALTTITLE_BOOST)); |
| 1417 | + if(!allStopWords(wnwords)) |
| 1418 | + altAdd.add(makeAlttitleRelevance(wnwords,RELEVANCE_ALTTITLE_BOOST)); |
1372 | 1419 | alttitle = simplify(combine(alttitle,altAdd)); |
1373 | 1420 | |
1374 | 1421 | // relevance: related |
— | — | @@ -1375,7 +1422,8 @@ |
1376 | 1423 | ArrayList<Query> relAdd = new ArrayList<Query>(); |
1377 | 1424 | if(wordnet!=null) |
1378 | 1425 | for(ArrayList<String> wnwords : wordnet) |
1379 | | - relAdd.add(makeRelatedRelevance(wnwords,RELEVANCE_RELATED_BOOST)); |
| 1426 | + if(!allStopWords(wnwords)) |
| 1427 | + relAdd.add(makeRelatedRelevance(wnwords,RELEVANCE_RELATED_BOOST)); |
1380 | 1428 | related = simplify(combine(related,relAdd)); |
1381 | 1429 | |
1382 | 1430 | BooleanQuery relevances = new BooleanQuery(true); |
— | — | @@ -1546,8 +1594,11 @@ |
1547 | 1595 | // add the whole-only query |
1548 | 1596 | if(whole != null) |
1549 | 1597 | bq.add(makePositional(words,field,whole,slop,1),Occur.SHOULD); |
1550 | | - if(wholeSloppy != null) |
1551 | | - bq.add(makePositional(words,field,wholeSloppy,slop,1,false),Occur.SHOULD); |
| 1598 | + if(wholeSloppy != null){ |
| 1599 | + Query ws = makePositional(words,field,wholeSloppy,slop,1,false); |
| 1600 | + if(ws != null) |
| 1601 | + bq.add(ws,Occur.SHOULD); |
| 1602 | + } |
1552 | 1603 | bq.setBoost(boost); |
1553 | 1604 | |
1554 | 1605 | return bq; |
— | — | @@ -1600,8 +1651,11 @@ |
1601 | 1652 | // add the whole-only query |
1602 | 1653 | if(whole != null) |
1603 | 1654 | bq.add(makePositionalMulti(words,boosts,types,field,whole,slop,1),Occur.SHOULD); |
1604 | | - if(wholeSloppy != null) |
1605 | | - bq.add(makePositionalMulti(words,boosts,types,field,wholeSloppy,slop,0.5f,false),Occur.SHOULD); |
| 1655 | + if(wholeSloppy != null){ |
| 1656 | + Query ws = makePositionalMulti(words,boosts,types,field,wholeSloppy,slop,0.5f,false); |
| 1657 | + if(ws != null) |
| 1658 | + bq.add(ws,Occur.SHOULD); |
| 1659 | + } |
1606 | 1660 | bq.setBoost(boost); |
1607 | 1661 | |
1608 | 1662 | return bq; |
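
The rewritten parsePhrase walks the quoted phrase word by word; a wildcard or fuzzy word contributes its whole expanded term set at a single phrase position via MultiPhraseQuery.add(Term[]), while plain words fall back to single-term adds. A standalone sketch of that construction, with the expansion hard-coded where the parser would consult its wildcard and fuzzy caches:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.MultiPhraseQuery;

    class PhraseBuilder {
        /** Build a phrase whose second slot accepts any of several expanded terms. */
        static MultiPhraseQuery build() {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.add(new Term("contents", "douglas"));
            // wildcard slot: e.g. "adam*" expanded against the index terms
            query.add(new Term[] {
                new Term("contents", "adams"),
                new Term("contents", "adamson")
            });
            return query; // matches "douglas adams" as well as "douglas adamson"
        }
    }
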
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java |
— | — | @@ -56,12 +56,11 @@ |
57 | 57 | } |
58 | 58 | } |
59 | 59 | |
60 | | - public static class HighlightOriginal extends TokenizerOptions { |
| 60 | + /** Used for titles, doesn't simplify glue and has no case detection */
| 61 | + public static class HighlightOriginal extends Highlight { |
61 | 62 | public HighlightOriginal(){ |
62 | | - super(false); |
63 | | - this.highlightParsing = true; |
64 | | - this.relocationParsing = false; |
65 | 63 | this.simplifyGlue = false; |
| 64 | + this.noCaseDetection = true; |
66 | 65 | } |
67 | 66 | } |
68 | 67 | /** Used to filter prefixes (up to FastWikiTokenizer.MAX_WORD_LEN chars) */ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java |
— | — | @@ -37,7 +37,7 @@ |
38 | 38 | /** Invoke ListRecords from a certain timestamp */ |
39 | 39 | public ArrayList<IndexUpdateRecord> getRecords(String from){ |
40 | 40 | try{ |
41 | | - read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&from="+from)); |
| 41 | + read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=mediawiki&from="+from)); |
42 | 42 | return collector.getRecords(); |
43 | 43 | } catch(IOException e){ |
44 | 44 | log.warn("I/O exception listing records: "+e.getMessage()); |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | /** Invoke ListRecords using the last resumption token */ |
60 | 60 | public ArrayList<IndexUpdateRecord> getMoreRecords(){ |
61 | 61 | try{ |
62 | | - read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken)); |
| 62 | + read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=mediawiki&resumptionToken="+resumptionToken)); |
63 | 63 | return collector.getRecords(); |
64 | 64 | } catch(IOException e){ |
65 | 65 | log.warn("I/O exception listing records: "+e.getMessage()); |
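
Both ListRecords calls now request the mediawiki metadata format rather than the old lsearch one; the first request carries metadataPrefix and a from timestamp, and follow-ups replay the resumptionToken from the previous response. The two request shapes, mirroring the URLs above (urlbase is the configured endpoint and already ends in a query parameter, hence the leading &):

    import java.net.MalformedURLException;
    import java.net.URL;

    class OaiUrls {
        /** First ListRecords request: metadata format plus a from-timestamp. */
        static URL initial(String urlbase, String from) throws MalformedURLException {
            return new URL(urlbase + "&verb=ListRecords&metadataPrefix=mediawiki&from=" + from);
        }

        /** Follow-up request passing back the token the previous response returned. */
        static URL more(String urlbase, String token) throws MalformedURLException {
            return new URL(urlbase + "&verb=ListRecords&metadataPrefix=mediawiki&resumptionToken=" + token);
        }
    }
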
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | |
37 | 37 | public IndexUpdatesCollector(IndexId iid){ |
38 | 38 | this.iid = iid; |
39 | | - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
| 39 | + this.langCode = iid.getLangCode(); |
40 | 40 | } |
41 | 41 | |
42 | 42 | public void addRedirect(String redirectTitle, int references) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java |
— | — | @@ -93,7 +93,6 @@ |
94 | 94 | HashSet<String> excludeList = new HashSet<String>(); |
95 | 95 | HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass |
96 | 96 | String defaultTimestamp = "2001-01-01"; |
97 | | - boolean fetchReferences = true; |
98 | 97 | // args |
99 | 98 | for(int i=0; i<args.length; i++){ |
100 | 99 | if(args[i].equals("-d")) |
— | — | @@ -110,8 +109,6 @@ |
111 | 110 | excludeList.add(args[++i]); |
112 | 111 | else if(args[i].equals("-n")) |
113 | 112 | notification = true; |
114 | | - else if(args[i].equals("--no-ranks")) |
115 | | - fetchReferences = false; |
116 | 113 | else if(args[i].equals("--help")) |
117 | 114 | break; |
118 | 115 | else if(args[i].startsWith("-")){ |
— | — | @@ -145,7 +142,6 @@ |
146 | 143 | System.out.println(" -f - dblist file, one dbname per line"); |
147 | 144 | System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")"); |
148 | 145 | System.out.println(" -e - exclude dbname from incremental updates (overrides -f)"); |
149 | | - System.out.println(" --no-ranks - don't try to fetch any article rank data"); |
150 | 146 | return; |
151 | 147 | } |
152 | 148 | // config |
— | — | @@ -190,22 +186,8 @@ |
191 | 187 | ArrayList<IndexUpdateRecord> records = harvester.getRecords(from); |
192 | 188 | if(records.size() == 0) |
193 | 189 | continue; |
194 | | - LinkAnalysisStorage las = new LinkAnalysisStorage(iid); |
195 | | - RelatedStorage related = new RelatedStorage(iid); |
196 | 190 | boolean hasMore = false; |
197 | 191 | do{ |
198 | | - if(fetchReferences){ |
199 | | - try{ |
200 | | - // fetch references for records |
201 | | - fetchReferencesAndRelated(records,las,related); |
202 | | - } catch(IOException e){ |
203 | | - // FIXME: quick hack, if the table cannot be found (e.g. for new wikis) don't abort |
204 | | - if(e.getMessage().contains("Base table or view not found")){ |
205 | | - log.warn("Continuing, but could not fetch references for "+iid+": "+e.getMessage()); |
206 | | - } else |
207 | | - throw e; |
208 | | - } |
209 | | - } |
210 | 192 | for(IndexUpdateRecord rec : records){ |
211 | 193 | Article ar = rec.getArticle(); |
212 | 194 | log.info("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects()); |
— | — | @@ -287,62 +269,5 @@ |
288 | 270 | } |
289 | 271 | } while(daemon); |
290 | 272 | } |
291 | | - |
292 | | - protected static void fetchReferencesAndRelated(ArrayList<IndexUpdateRecord> records, LinkAnalysisStorage las, RelatedStorage related) throws IOException { |
293 | | - ArrayList<Title> titles = new ArrayList<Title>(); |
294 | | - for(IndexUpdateRecord rec : records){ |
295 | | - if(rec.isDelete()) |
296 | | - continue; |
297 | | - Article ar = rec.getArticle(); |
298 | | - titles.add(ar.makeTitle()); |
299 | | - if(ar.getRedirects() != null){ |
300 | | - for(Redirect r : ar.getRedirects()){ |
301 | | - titles.add(r.makeTitle()); |
302 | | - } |
303 | | - } |
304 | | - } |
305 | | - // fetch |
306 | | - //OldLinks links = new OldLinks(store.getPageReferences(titles,dbname)); |
307 | | - //HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname); |
308 | | - // update |
309 | | - // FIXME: wow, this is BCE ... |
310 | | - for(IndexUpdateRecord rec : records){ |
311 | | - if(rec.isDelete()) |
312 | | - continue; |
313 | | - Article ar = rec.getArticle(); |
314 | | - Title t = ar.makeTitle(); |
315 | | - ArticleAnalytics aa = las.getAnaliticsForArticle(t.getKey()); |
316 | | - ArrayList<String> anchors = new ArrayList<String>(); |
317 | | - anchors.addAll(aa.getAnchorText()); |
318 | | - // set references |
319 | | - ar.setReferences(aa.getReferences()); |
320 | | - //ar.setRedirect(aa.isRedirect()); |
321 | | - if(aa.isRedirect()) |
322 | | - ar.setRedirectTargetNamespace(aa.getRedirectTargetNamespace()); |
323 | | - if(ar.getRedirects() != null){ |
324 | | - for(Redirect r : ar.getRedirects()){ |
325 | | - ArticleAnalytics raa = las.getAnaliticsForReferences(r.makeTitle().getKey()); |
326 | | - r.setReferences(raa.getReferences()); |
327 | | - anchors.addAll(raa.getAnchorText()); |
328 | | - } |
329 | | - } |
330 | | - // set anchors |
331 | | - ar.setAnchorText(anchors); |
332 | | - // set related |
333 | | - if(related.canRead()) |
334 | | - ar.setRelated(related.getRelated(t.getKey())); |
335 | | - /*ArrayList<RelatedTitle> rt = rel.get(t.getKey()); |
336 | | - if(rt != null){ |
337 | | - Collections.sort(rt,new Comparator<RelatedTitle>() { |
338 | | - public int compare(RelatedTitle o1, RelatedTitle o2){ |
339 | | - double d = o2.getScore()-o1.getScore(); |
340 | | - if(d == 0) return 0; |
341 | | - else if(d > 0) return 1; |
342 | | - else return -1; |
343 | | - } |
344 | | - }); |
345 | | - ar.setRelated(rt); |
346 | | - }*/ |
347 | | - } |
348 | | - } |
349 | | -} |
| 273 | + |
| 274 | +} |
\ No newline at end of file |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java |
— | — | @@ -46,7 +46,7 @@ |
47 | 47 | langCode = "en"; |
48 | 48 | this.langCode = langCode; |
49 | 49 | this.iid = iid; |
50 | | - this.exactCase = iid.getExactCase(); |
| 50 | + this.exactCase = iid.isExactCase(); |
51 | 51 | interwiki = Localization.getInterwiki(); |
52 | 52 | } |
53 | 53 | public void writeRevision(Revision revision) throws IOException { |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | public void writeEndPage() throws IOException { |
60 | 60 | Title t = new Title(page.Title.Namespace,page.Title.Text); |
61 | 61 | try{ |
62 | | - links.addArticleInfo(revision.Text,t,exactCase); |
| 62 | + links.addArticleInfo(revision.Text,t,exactCase,Integer.toString(page.Id)); |
63 | 63 | } catch(Exception e){ |
64 | 64 | log.error("Error adding article "+t+" : "+e.getMessage()); |
65 | 65 | e.printStackTrace(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -70,16 +70,16 @@ |
71 | 71 | protected Directory directory = null; |
72 | 72 | protected NamespaceFilter nsf; // default search |
73 | 73 | protected ObjectCache cache; |
74 | | - //protected ObjectCache refCache; |
75 | 74 | protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly; |
76 | 75 | protected boolean optimized = false; |
| 76 | + protected boolean autoOptimize = false; |
77 | 77 | |
78 | | - private Links(IndexId iid, String path, IndexWriter writer) throws CorruptIndexException, IOException{ |
| 78 | + private Links(IndexId iid, String path, IndexWriter writer, boolean autoOptimize) throws CorruptIndexException, IOException{ |
79 | 79 | this.writer = writer; |
80 | 80 | this.path = path; |
81 | 81 | this.iid = iid; |
82 | | - GlobalConfiguration global = GlobalConfiguration.getInstance(); |
83 | | - this.langCode = global.getLanguage(iid); |
| 82 | + this.autoOptimize = autoOptimize; |
| 83 | + this.langCode = iid.getLangCode(); |
84 | 84 | String dbname = iid.getDBname(); |
85 | 85 | nsmap = Localization.getLocalizedNamespaces(langCode,dbname); |
86 | 86 | interwiki = Localization.getInterwiki(); |
— | — | @@ -87,13 +87,7 @@ |
88 | 88 | imageLocalized = Localization.getLocalizedImage(langCode,dbname); |
89 | 89 | state = State.FLUSHED; |
90 | 90 | initWriter(writer); |
91 | | - //reader = IndexReader.open(path); |
92 | | - nsf = global.getDefaultNamespace(iid); |
93 | | - cache = new ObjectCache(10000); |
94 | | - // init cache manager |
95 | | - /*CacheManager manager = CacheManager.create(); |
96 | | - cache = new Cache("links", 5000, false, false, 5, 2); |
97 | | - manager.addCache(cache); */ |
| 91 | + nsf = iid.getDefaultNamespace(); |
98 | 92 | keyOnly = makeSelector("article_key"); |
99 | 93 | redirectOnly = makeSelector("redirect"); |
100 | 94 | contextOnly = makeSelector("context"); |
— | — | @@ -122,7 +116,7 @@ |
123 | 117 | String path = iid.getIndexPath(); |
124 | 118 | log.info("Using index at "+path); |
125 | 119 | IndexWriter writer = WikiIndexModifier.openForWrite(path,false); |
126 | | - return new Links(iid,path,writer); |
| 120 | + return new Links(iid,path,writer,false); |
127 | 121 | } |
128 | 122 | |
129 | 123 | public static Links openStandalone(IndexId iid) throws IOException { |
— | — | @@ -138,7 +132,7 @@ |
139 | 133 | public static Links openForRead(IndexId iid, String path) throws IOException { |
140 | 134 | iid = iid.getLinks(); |
141 | 135 | log.info("Opening for read "+path); |
142 | | - return new Links(iid,path,null); |
| 136 | + return new Links(iid,path,null,true); |
143 | 137 | } |
144 | 138 | |
145 | 139 | /** Create new in the import path */ |
— | — | @@ -147,7 +141,7 @@ |
148 | 142 | String path = iid.getImportPath(); |
149 | 143 | log.info("Making index at "+path); |
150 | 144 | IndexWriter writer = WikiIndexModifier.openForWrite(path,true); |
151 | | - Links links = new Links(iid,path,writer); |
| 145 | + Links links = new Links(iid,path,writer,true); |
152 | 146 | return links; |
153 | 147 | } |
154 | 148 | |
— | — | @@ -156,7 +150,7 @@ |
157 | 151 | iid = iid.getLinks(); |
158 | 152 | log.info("Making index in memory"); |
159 | 153 | IndexWriter writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true); |
160 | | - Links links = new Links(iid,null,writer); |
| 154 | + Links links = new Links(iid,null,writer,true); |
161 | 155 | return links; |
162 | 156 | } |
163 | 157 | |
— | — | @@ -172,7 +166,7 @@ |
173 | 167 | nsmap.put(namespace.toLowerCase(),index); |
174 | 168 | } |
175 | 169 | |
176 | | - /** Write all changes, optimize/close everything |
| 170 | + /** Write all changes, optimize if in autoOptimize mode |
177 | 171 | * @throws IOException */ |
178 | 172 | public void flush() throws IOException{ |
179 | 173 | // close & optimize |
— | — | @@ -181,7 +175,8 @@ |
182 | 176 | if(reader != null) |
183 | 177 | reader.close(); |
184 | 178 | if(writer != null){ |
185 | | - writer.optimize(); |
| 179 | + if(autoOptimize) |
| 180 | + writer.optimize(); |
186 | 181 | writer.close(); |
187 | 182 | } |
188 | 183 | state = State.FLUSHED; |
— | — | @@ -193,15 +188,7 @@ |
194 | 189 | * @throws IOException |
195 | 190 | */ |
196 | 191 | protected void flushForRead() throws IOException{ |
197 | | - // close & optimize |
198 | | - if(searcher != null) |
199 | | - searcher.close(); |
200 | | - if(reader != null) |
201 | | - reader.close(); |
202 | | - if(writer != null){ |
203 | | - writer.optimize(); |
204 | | - writer.close(); |
205 | | - } |
| 192 | + flush(); |
206 | 193 | log.debug("Opening index reader"); |
207 | 194 | // reopen |
208 | 195 | reader = IndexReader.open(path); |
— | — | @@ -238,27 +225,28 @@ |
239 | 226 | openForWrite(); |
240 | 227 | } |
241 | 228 | |
242 | | - /** Modify existing article links info */ |
243 | | - public void modifyArticleInfo(String text, Title t, boolean exactCase) throws IOException{ |
| 229 | + /** Delete article info connected to title t */ |
| 230 | + public void deleteArticleInfo(Title t) throws IOException { |
244 | 231 | ensureWrite(); |
245 | 232 | writer.deleteDocuments(new Term("article_key",t.getKey())); |
246 | | - addArticleInfo(text,t,exactCase); |
247 | 233 | } |
| 234 | + /** Delete by page_id, not ns:title key */ |
| 235 | + public void deleteArticleInfoByIndexKey(String key) throws IOException { |
| 236 | + ensureWrite(); |
| 237 | + writer.deleteDocuments(new Term("article_pageid",key)); |
| 238 | + } |
248 | 239 | |
249 | 240 | /** Add links and other info from article |
250 | 241 | * @throws IOException */ |
251 | | - public void addArticleInfo(String text, Title t, boolean exactCase) throws IOException{ |
| 242 | + public void addArticleInfo(String text, Title t, boolean exactCase, String pageId) throws IOException{ |
252 | 243 | ensureWrite(); |
253 | 244 | Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
254 | 245 | int namespace = t.getNamespace(); |
255 | 246 | Matcher matcher = linkPat.matcher(text); |
256 | 247 | int ns; String title; |
257 | 248 | boolean escaped; |
258 | | - //PrefixAnalyzer prefixAnalyzer = new PrefixAnalyzer(); |
259 | 249 | |
260 | 250 | ArrayList<String> pagelinks = new ArrayList<String>(); |
261 | | - // article link -> contexts |
262 | | - //HashMap<String,ArrayList<String>> contextMap = new HashMap<String,ArrayList<String>>(); |
263 | 251 | |
264 | 252 | // use context only for namespace in default search |
265 | 253 | boolean useContext = nsf.contains(t.getNamespace()); |
— | — | @@ -339,6 +327,7 @@ |
340 | 328 | StringList lk = new StringList(pagelinks); |
341 | 329 | Analyzer an = new SplitAnalyzer(1,true); |
342 | 330 | Document doc = new Document(); |
| 331 | + doc.add(new Field("article_pageid",pageId,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
343 | 332 | // ns:title |
344 | 333 | doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
345 | 334 | if(redirectsTo != null) |
— | — | @@ -348,8 +337,6 @@ |
349 | 338 | // a list of all links |
350 | 339 | doc.add(new Field("links",lk.toString(),Field.Store.NO,Field.Index.TOKENIZED)); |
351 | 340 | } |
352 | | - // key split up into prefixes (for prefix index) |
353 | | - // doc.add(new Field("prefix",prefixAnalyzer.tokenStream("prefix",t.getKey()))); |
354 | 341 | |
355 | 342 | writer.addDocument(doc,an); |
356 | 343 | state = State.MODIFIED; |
— | — | @@ -430,18 +417,17 @@ |
431 | 418 | } |
432 | 419 | return false; |
433 | 420 | } |
434 | | - |
435 | | - @Deprecated |
436 | | - /** If article is redirect, get target, else null */ |
437 | | - public String getRedirectTargetOld(String key) throws IOException{ |
| 421 | + |
| 422 | + /** Get page_id for ns:title */ |
| 423 | + public String getPageId(String key) throws IOException { |
438 | 424 | ensureRead(); |
439 | 425 | TermDocs td = reader.termDocs(new Term("article_key",key)); |
440 | 426 | if(td.next()){ |
441 | | - return reader.document(td.doc(),redirectOnly).get("redirect"); |
| 427 | + return reader.document(td.doc()).get("article_pageid"); |
442 | 428 | } |
443 | 429 | return null; |
444 | 430 | } |
445 | | - |
| 431 | + |
446 | 432 | /** If article is redirect, get target key, else null */ |
447 | 433 | public String getRedirectTarget(String key) throws IOException{ |
448 | 434 | ensureRead(); |
— | — | @@ -637,19 +623,16 @@ |
638 | 624 | writer.close(); |
639 | 625 | if(reader != null) |
640 | 626 | reader.close(); |
641 | | - if(directory != null) |
642 | | - directory.close(); |
| 627 | + //if(directory != null) |
| 628 | + // directory.close(); |
643 | 629 | } |
644 | 630 | |
645 | 631 | public ObjectCache getCache() { |
646 | 632 | return cache; |
647 | 633 | } |
648 | 634 | |
649 | | - /*public ObjectCache getRefCache() { |
650 | | - return refCache; |
651 | | - } */ |
652 | | - |
653 | | - |
654 | | - |
655 | | - |
| 635 | + public boolean isAutoOptimize() { |
| 636 | + return autoOptimize; |
| 637 | + } |
| 638 | + |
656 | 639 | } |
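
Links now opens with an autoOptimize flag: bulk builds (createNew, createInMemory) optimize on flush, while openForModification skips the costly segment merge so small incremental updates stay cheap. The conditional flush as a sketch over a plain Lucene IndexWriter:

    import java.io.IOException;
    import org.apache.lucene.index.IndexWriter;

    class LinksFlush {
        /** Flush pending changes; optimize only when the index was opened for bulk building. */
        static void flush(IndexWriter writer, boolean autoOptimize) throws IOException {
            if (writer != null) {
                if (autoOptimize)
                    writer.optimize(); // merges segments; expensive, so skipped for incremental use
                writer.close();
            }
        }
    }
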
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -72,8 +72,10 @@ |
73 | 73 | Configuration.open(); |
74 | 74 | String text = "bre! (ant) and some. it's stupid it's something and 5\"6' or more, links abacus"; |
75 | 75 | showTokens(text); |
76 | | - text = "bre! u.s. {{template|text}} {{template|text2|text3}} [http://ls2.wiki link]"; |
| 76 | + text = "This, is a '''list of [[African]] countries and dependencies by [[population]]'''.\n\n{| border=\"1\" cellpadding=\"2\" cellspacing=\"0\" style=\"border-collapse:collapse; text-align:right;\"\n|- style=\"text-align:center; background:#efefef\"\n!Pos !! Country !! Population\n|-\n| align=\"left\" |-\n| align=\"left\" |'''Africa''' || 934,283,426\n|-\n"; |
77 | 77 | showTokens(text); |
| 78 | + text = "u.s. {{template|text}} {{template|text2|text3}} [http://ls2.wiki link]"; |
| 79 | + showTokens(text); |
78 | 80 | text = "Good-Thomas C# C++ and so on.. "; |
79 | 81 | showTokens(text); |
80 | 82 | text = "[[Image:Argishti monument.JPG|thumb|King Argishti of Urartu riding a chariot with two horses in Yerevan, Armenia in front of the Erebuni Museum.]]'''Urartu''' (Assyrian ''Urarṭu'', [[Urartian language|Urartian]] ''Biainili'') was an ancient [[kingdom (politics)|kingdom]] of [[Armenia]]<ref>"Urartu." Columbia Electronic Encyclopedia. Columbia University Press.</ref> located in the mountainous plateau between [[Asia Minor]], [[Mesopotamia]], and [[Caucasus mountains]], later known as the [[Armenian Highland]], and it centered around [[Lake Van]] (present-day eastern [[Turkey]]). The kingdom existed from ca. [[860s BC|860 BC]], emerging from Late Bronze Age [[Nairi]] polities, until [[585 BC]]. The name corresponds to the [[Bible|Biblical]] '''[[Mount Ararat|Ararat]]'''."; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java |
— | — | @@ -113,6 +113,7 @@ |
114 | 114 | {"ommmmmmmmiteed", "omitted"}, |
115 | 115 | {"ommmmmmmmitted", "omitted"}, |
116 | 116 | {"a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",""}, |
| 117 | + {"Douglas Adams's Guide to The Hitch-Hiker's Guide to the Galaxy",""}, |
117 | 118 | |
118 | 119 | }; |
119 | 120 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java |
— | — | @@ -48,7 +48,7 @@ |
49 | 49 | && res.get(1).getWord().equals(m[1])) |
50 | 50 | good++; |
51 | 51 | else if(r.getDist() > 1){ |
52 | | - SuggestResult split = sc.suggestSplit(m[0],0); |
| 52 | + SuggestResult split = sc.suggestSplit(m[0],null); |
53 | 53 | if(split!=null && m[1].equals(split.getWord())) |
54 | 54 | good++; |
55 | 55 | else{ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java |
— | — | @@ -104,10 +104,10 @@ |
105 | 105 | Analyzer analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("enwiki")); |
106 | 106 | //Analyzer analyzer = Analyzers.getHighlightAnalyzer(IndexId.get("enwiki")); |
107 | 107 | Analyzer old = new EnglishAnalyzer(); |
108 | | - String text = "a-b compatibly compatible Gödel; The who is a band. The who is Pascal's earliest work was in the natural and applied sciences where he made important contributions to the construction of mechanical calculators, the study of fluids, and clarified the concepts of pressure and vacuum by generalizing the work of Evangelista Torricelli. Pascal also wrote powerfully in defense of the scientific method."; |
| 108 | + String text = "Pokémons a-b compatibly compatible Gödel; The who is a band. The who is Pascal's earliest work was in the natural and applied sciences where he made important contributions to the construction of mechanical calculators, the study of fluids, and clarified the concepts of pressure and vacuum by generalizing the work of Evangelista Torricelli. Pascal also wrote powerfully in defense of the scientific method."; |
109 | 109 | displayTokens(analyzer,text); |
110 | 110 | displayTokens(old,text); |
111 | | - text = "links abacus something aries douglas adams boxes bands working s and Frame semantics (linguistics)"; |
| 111 | + text = "Pokémons links abacus something aries douglas adams boxes bands working s and Frame semantics (linguistics)"; |
112 | 112 | displayTokens(analyzer,text); |
113 | 113 | text = "Thomas c# c++ good-thomas Good-Thomas rats RATS Frame semantics (linguistics) 16th century sixteenth .fr web.fr other"; |
114 | 114 | displayTokens(analyzer,text); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -318,7 +318,7 @@ |
319 | 319 | assertTrue(ents1.isTitlesBySuffix()); |
320 | 320 | assertEquals("w",ents1.getInterwikiBySuffix("wiki")); |
321 | 321 | assertEquals(ents1,IndexId.get("enwiki").getTitlesIndex()); |
322 | | - assertEquals("en",testgc.getLanguage(ents1)); |
| 322 | + assertEquals("en",ents1.getLangCode()); |
323 | 323 | assertEquals("{wiki=enwiki}",ents1.getSuffixToDbname().toString()); |
324 | 324 | IndexId ents2 = IndexId.get("en-titles.tspart2"); |
325 | 325 | assertEquals("{wikisource=enwikisource, wiktionary=enwiktionary, test=entest}",ents2.getSuffixToDbname().toString()); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/HighlightTest.java |
— | — | @@ -49,7 +49,7 @@ |
50 | 50 | Term[] terms = termSet.toArray(new Term[] {}); |
51 | 51 | IndexSearcher searcher = SearcherCache.getInstance().getLocalSearcher(iid); |
52 | 52 | int[] df = searcher.docFreqs(terms); |
53 | | - Highlight.highlight(hits,iid,terms,df,searcher.maxDoc(),parser.getWordsClean(),StopWords.getPredefinedSet(iid),false,null,false); |
| 53 | + Highlight.highlight(hits,iid,terms,df,searcher.maxDoc(),parser.getWordsClean(),StopWords.getPredefinedSet(iid),false,null,false,false); |
54 | 54 | } |
55 | 55 | |
56 | 56 | public static void timeTest(String dbname, String dbnameSrc) throws Exception { |
— | — | @@ -86,7 +86,7 @@ |
87 | 87 | Document doc = reader.document(docid); |
88 | 88 | hits.add(doc.get("namespace")+":"+doc.get("title")); |
89 | 89 | } |
90 | | - Highlight.ResultSet rs = Highlight.highlight(hits,iid,terms,df,maxDoc,words,stopWords,false,null,false); |
| 90 | + Highlight.ResultSet rs = Highlight.highlight(hits,iid,terms,df,maxDoc,words,stopWords,false,null,false,false); |
91 | 91 | HashMap<String,HighlightResult> res = rs.highlighted; |
92 | 92 | count += res.size(); |
93 | 93 | if(i!=0 && i % 200 == 0){ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java |
— | — | @@ -221,7 +221,7 @@ |
222 | 222 | "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"+ |
223 | 223 | "<head>\n<title>Error: " + code + " " + message + "</title>\n"+ |
224 | 224 | "</head>\n<body>\n<h1>" + code + " " + message + "</h1>\n"+ |
225 | | - "<p>" + detail + "</p>\n<hr />\n<p><i>MWSearch on localhost" + |
| 225 | + "<div>" + detail + "</div>\n<hr />\n<p><i>LSearch daemon on localhost" + |
226 | 226 | "</i></p>\n</body>\n</html>"); |
227 | 227 | } |
228 | 228 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java |
— | — | @@ -36,6 +36,8 @@ |
37 | 37 | String what; |
38 | 38 | /** Client-supplied database we should operate on */ |
39 | 39 | String dbname; |
| 40 | + |
| 41 | + public static final double CURRENT_VERSION = 2.1; |
40 | 42 | |
41 | 43 | public SearchDaemon(Socket sock) { |
42 | 44 | super(sock); |
— | — | @@ -68,7 +70,8 @@ |
69 | 71 | try{ |
70 | 72 | SearchEngine engine = new SearchEngine(); |
71 | 73 | HashMap query = new QueryStringMap(uri); |
72 | | - SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query); |
| 74 | + double version = getVersion(query); |
| 75 | + SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query,version); |
73 | 76 | contentType = "text/plain"; |
74 | 77 | // format: |
75 | 78 | // <num of hits> |
— | — | @@ -85,45 +88,49 @@ |
86 | 89 | } |
87 | 90 | } else{ |
88 | 91 | sendOutputLine(Integer.toString(res.getNumHits())); |
89 | | - SuggestQuery sq = res.getSuggest(); |
90 | | - if(sq != null && sq.hasSuggestion()){ |
91 | | - sendOutputLine("#suggest ["+sq.getRangesSerialized()+"] "+encode(sq.getSearchterm())); |
92 | | - } else |
93 | | - sendOutputLine("#no suggestion"); |
94 | | - if(res.getTitles() != null){ |
95 | | - sendOutputLine("#interwiki "+res.getTitles().size()); |
96 | | - for(ResultSet rs : res.getTitles()){ |
97 | | - sendOutputLine(rs.getScore()+" "+encode(rs.getInterwiki())+" "+rs.getNamespace()+" "+encodeTitle(rs.getTitle())); |
| 92 | + if(version>=2.1){ |
| 93 | + SuggestQuery sq = res.getSuggest(); |
| 94 | + if(sq != null && sq.hasSuggestion()){ |
| 95 | + sendOutputLine("#suggest ["+sq.getRangesSerialized()+"] "+encode(sq.getSearchterm())); |
| 96 | + } else |
| 97 | + sendOutputLine("#no suggestion"); |
| 98 | + if(res.getTitles() != null){ |
| 99 | + sendOutputLine("#interwiki "+res.getTitles().size()); |
| 100 | + for(ResultSet rs : res.getTitles()){ |
| 101 | + sendOutputLine(rs.getScore()+" "+encode(rs.getInterwiki())+" "+rs.getNamespace()+" "+encodeTitle(rs.getTitle())); |
| 102 | + if(rs.getExplanation() != null) |
| 103 | + sendOutputLine(rs.getExplanation().toString()); |
| 104 | + if(rs.getHighlight() != null){ |
| 105 | + HighlightResult hr = rs.getHighlight(); |
| 106 | + sendHighlight("title",hr.getTitle()); |
| 107 | + sendHighlightWithTitle("redirect",hr.getRedirect()); |
| 108 | + } |
| 109 | + } |
| 110 | + } else |
| 111 | + sendOutputLine("#interwiki 0"); |
| 112 | + sendOutputLine("#results"); |
| 113 | + } |
| 114 | + for(ResultSet rs : res.getResults()){ |
| 115 | + sendResultLine(rs.score, rs.namespace, rs.title); |
| 116 | + if(version>=2.1){ |
| 117 | + if(rs.getContext() != null){ |
| 118 | + for(String c : rs.getContext()) |
| 119 | + sendOutputLine("#context "+c); |
| 120 | + } |
98 | 121 | if(rs.getExplanation() != null) |
99 | 122 | sendOutputLine(rs.getExplanation().toString()); |
100 | 123 | if(rs.getHighlight() != null){ |
101 | 124 | HighlightResult hr = rs.getHighlight(); |
102 | | - sendHighlight("title",hr.getTitle()); |
| 125 | + sendHighlight("title",hr.getTitle()); |
| 126 | + for(Snippet sn : hr.getText()) |
| 127 | + sendHighlight("text",sn); |
103 | 128 | sendHighlightWithTitle("redirect",hr.getRedirect()); |
| 129 | + sendHighlightWithFragment("section",hr.getSection()); |
| 130 | + if(hr.getDate() != null) |
| 131 | + sendHighlight("date",hr.getDate()); |
| 132 | + sendHighlight("wordcount",Integer.toString(hr.getWordCount())); |
104 | 133 | } |
105 | 134 | } |
106 | | - } else |
107 | | - sendOutputLine("#interwiki 0"); |
108 | | - sendOutputLine("#results"); |
109 | | - for(ResultSet rs : res.getResults()){ |
110 | | - sendResultLine(rs.score, rs.namespace, rs.title); |
111 | | - if(rs.getContext() != null){ |
112 | | - for(String c : rs.getContext()) |
113 | | - sendOutputLine("#context "+c); |
114 | | - } |
115 | | - if(rs.getExplanation() != null) |
116 | | - sendOutputLine(rs.getExplanation().toString()); |
117 | | - if(rs.getHighlight() != null){ |
118 | | - HighlightResult hr = rs.getHighlight(); |
119 | | - sendHighlight("title",hr.getTitle()); |
120 | | - for(Snippet sn : hr.getText()) |
121 | | - sendHighlight("text",sn); |
122 | | - sendHighlightWithTitle("redirect",hr.getRedirect()); |
123 | | - sendHighlightWithFragment("section",hr.getSection()); |
124 | | - if(hr.getDate() != null) |
125 | | - sendHighlight("date",hr.getDate()); |
126 | | - sendHighlight("wordcount",Integer.toString(hr.getWordCount())); |
127 | | - } |
128 | 135 | } |
129 | 136 | } |
130 | 137 | } else if(res.getFormat() == Format.JSON){ |
— | — | @@ -170,7 +177,17 @@ |
171 | 178 | } |
172 | 179 | } |
173 | 180 | |
174 | | - |
| 181 | + |
| 182 | + private double getVersion(HashMap query) { |
| 183 | + String v = (String)query.get("version"); |
| 184 | + if(v == null) |
| 185 | + v = (String)query.get("ver"); |
| 186 | + if(v != null) |
| 187 | + return Double.parseDouble(v); |
| 188 | + return CURRENT_VERSION; |
| 189 | + } |
| 190 | + |
| 191 | + |
175 | 192 | private String makeHighlight(String type, Snippet snippet){ |
176 | 193 | if(snippet == null) |
177 | 194 | return null; |
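
getVersion reads an optional version (or ver) query parameter and defaults to CURRENT_VERSION, letting pre-2.1 clients keep the old wire format while newer ones opt into suggestions, interwiki blocks, and highlighting. A sketch of the same negotiation that additionally guards the parse, which the code above leaves free to throw NumberFormatException on a malformed value:

    import java.util.HashMap;

    class ProtocolVersion {
        static final double CURRENT_VERSION = 2.1;

        /** Client-declared protocol version; absent or malformed values count as current. */
        static double get(HashMap<String, String> query) {
            String v = query.get("version");
            if (v == null)
                v = query.get("ver");
            if (v != null) {
                try {
                    return Double.parseDouble(v);
                } catch (NumberFormatException e) {
                    // fall through and treat the client as current
                }
            }
            return CURRENT_VERSION;
        }
    }
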
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -1,6 +1,7 @@ |
2 | 2 | package org.wikimedia.lsearch.spell; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.io.Serializable; |
5 | 6 | import java.util.ArrayList; |
6 | 7 | import java.util.Collection; |
7 | 8 | import java.util.Collections; |
— | — | @@ -56,6 +57,7 @@ |
57 | 58 | protected NamespaceFilter defaultNs; |
58 | 59 | protected HashMap<String,Boolean> wordExistCache = new HashMap<String,Boolean>(); |
59 | 60 | protected enum Filtering { STRONG, WEAK }; |
| 61 | + protected boolean useLogging = true; |
60 | 62 | |
61 | 63 | 	/** Distance and metaphone metrics */
62 | 64 | static public class Metric { |
— | — | @@ -123,6 +125,8 @@ |
124 | 126 | |
125 | 127 | protected static class Namespaces { |
126 | 128 | HashSet<Integer> namespaces = new HashSet<Integer>(); |
| 129 | + /** If true, these namespaces are additional to the default namespaces, |
| 130 | + * if false, there is no intersection between these namespaces and default namespaces */ |
127 | 131 | boolean additional = false; |
128 | 132 | String prefix = "ns_"; |
129 | 133 | public Namespaces(HashSet<Integer> namespaces, boolean additional) { |
— | — | @@ -155,19 +159,26 @@ |
156 | 160 | /** Number of results to fetch for titles */ |
157 | 161 | public static final int POOL_TITLE = 100; |
158 | 162 | /** Number of results to fetch for fuzzy word matches */ |
159 | | - public static final int POOL_FUZZY = 1000; |
| 163 | + public static final int POOL_FUZZY = 500; |
160 | 164 | /** Number of words to return for fuzzy queries */ |
161 | 165 | public static final int MAX_FUZZY = 50; |
162 | 166 | |
163 | 167 | /** Lower limit to hit rate for joining */ |
164 | 168 | public static final int JOIN_FREQ = 1; |
165 | 169 | |
166 | | - public Suggest(IndexId iid) throws IOException{ |
| 170 | + public Suggest(IndexId iid) throws IOException { |
| 171 | + this(iid,null,true); |
| 172 | + } |
| 173 | + |
| 174 | + public Suggest(IndexId iid, IndexSearcher searcher, boolean useLogging) throws IOException{ |
167 | 175 | SearcherCache cache = SearcherCache.getInstance(); |
168 | 176 | this.iid = iid; |
169 | | - this.searcher = cache.getLocalSearcher(iid.getSpell()); |
| 177 | + if(searcher == null) |
| 178 | + searcher = cache.getLocalSearcher(iid.getSpell()); |
| 179 | + this.searcher = searcher; |
170 | 180 | this.reader = searcher.getIndexReader(); |
171 | 181 | this.defaultNs = iid.getDefaultNamespace(); |
| 182 | + this.useLogging = useLogging; |
172 | 183 | |
173 | 184 | synchronized(stopWordsIndexes){ |
174 | 185 | if(!stopWordsIndexes.containsKey(searcher)){ |
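
The new three-argument constructor lets callers supply an already-open IndexSearcher and silence per-request logging, e.g. for bulk or embedded spellchecking; the one-argument form simply delegates with (null, true). A hypothetical usage sketch:

    // reuse a searcher opened elsewhere and keep the request log quiet,
    // e.g. when spellchecking a batch of terms offline
    IndexSearcher searcher = cache.getLocalSearcher(iid.getSpell());
    Suggest sc = new Suggest(iid, searcher, false);
    // passing (null, true) reproduces the old single-argument behaviour
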
— | — | @@ -209,18 +220,43 @@ |
210 | 221 | } |
211 | 222 | |
212 | 223 | /** |
| 224 | + * Extra information that helps disambiguate some suggest cases, |
| 225 | + * e.g. words from titles found in search, phrases found in text, ... |
| 226 | + * @author rainman |
| 227 | + * |
| 228 | + */ |
| 229 | + public static class ExtraInfo implements Serializable { |
| 230 | + protected HashSet<String> phrases; |
| 231 | + protected HashSet<String> foundInContext; |
| 232 | + protected HashSet<String> foundInTitles; |
| 233 | + protected int firstRank; |
| 234 | + |
| 235 | + public ExtraInfo(HashSet<String> phrases, HashSet<String> foundInContext, HashSet<String> foundInTitles, int firstRank) { |
| 236 | + this.phrases = phrases; |
| 237 | + this.foundInContext = foundInContext; |
| 238 | + this.foundInTitles = foundInTitles; |
| 239 | + this.firstRank = firstRank; |
| 240 | + } |
| 241 | + |
| 242 | + public ExtraInfo(){ |
| 243 | + this(new HashSet<String>(),new HashSet<String>(),new HashSet<String>(),0); |
| 244 | + } |
| 245 | + |
| 246 | + |
| 247 | + } |
| 248 | + |
| 249 | + /** |
213 | 250 | * Make a suggestion for a query |
214 | 251 | * |
215 | 252 | * @throws IOException |
216 | 253 | */ |
217 | 254 | @SuppressWarnings("unchecked") |
218 | | - public SuggestQuery suggest(String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, |
219 | | - int firstRank, NamespaceFilter nsf) throws IOException{ |
| 255 | + public SuggestQuery suggest(String searchterm, ArrayList<Token> tokens, ExtraInfo info, NamespaceFilter nsf) throws IOException{ |
220 | 256 | FilterFactory filters = new FilterFactory(iid); |
221 | 257 | wordExistCache.clear(); |
222 | 258 | long start = System.currentTimeMillis(); |
223 | 259 | |
224 | | - System.out.println("tokens: "+tokens+" inContext:"+foundInContext+" phrases:"+phrases); |
| 260 | + // System.out.println("tokens: "+tokens+" inContext:"+info.foundInContext+" phrases:"+info.phrases+", inTitles="+info.foundInTitles); |
225 | 261 | |
226 | 262 | if(tokens.size() > 30){ |
227 | 263 | logRequest(searchterm,"too many words to spellcheck ("+tokens.size()+")",start); |
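
ExtraInfo bundles the hint parameters that suggest() previously took one by one: highlight phrases, words found in context, words found in titles, and the first hit rank. A sketch of both call styles, assuming SearchResults exposes the title words and first hit rank through accessors:

    // no hints available: empty sets and rank 0
    SuggestQuery sq = sc.suggest(searchterm, tokens, new Suggest.ExtraInfo(), nsf);

    // hints gathered from highlighting and the result set
    Suggest.ExtraInfo info = new Suggest.ExtraInfo(phrases, foundInContext,
            res.getFoundInTitles(), res.getFirstHitRank());
    SuggestQuery sq2 = sc.suggest(searchterm, tokens, info, nsf);
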
— | — | @@ -262,7 +298,6 @@ |
263 | 299 | } |
264 | 300 | |
265 | 301 | // init suggestions |
266 | | - int minFreq = 0; |
267 | 302 | ArrayList<Change> suggestions = new ArrayList<Change>(); |
268 | 303 | ArrayList<Change> suggestionsTitle = new ArrayList<Change>(); |
269 | 304 | HashMap<String,HashSet<String>> contextCache = new HashMap<String,HashSet<String>>(); |
— | — | @@ -272,7 +307,7 @@ |
273 | 308 | String redirectTarget = followRedirect(joinTokens,ns); |
274 | 309 | if(redirectTarget != null){ |
275 | 310 | EditDistance ed = new EditDistance(joinTokens); |
276 | | - if(ed.getDistance(redirectTarget) <= 2 && betterRank(titleRank(redirectTarget,ns),firstRank)){ |
| 311 | + if(ed.getDistance(redirectTarget) <= 2 && betterRank(titleRank(redirectTarget,ns),info.firstRank)){ |
277 | 312 | HashMap<Integer,String> changes = extractTitleChanges(joinTokens,redirectTarget,tokens); |
278 | 313 | if(changes != null){ |
279 | 314 | SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,new HashSet<Integer>(),ns); |
— | — | @@ -291,7 +326,7 @@ |
292 | 327 | logRequest(searchterm,"CORRECT (exact title match)",start); |
293 | 328 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
294 | 329 | } |
295 | | - if(betterRank(r.frequency,firstRank)){ |
| 330 | + if(betterRank(r.frequency,info.firstRank)){ |
296 | 331 | HashMap<Integer,String> changes = extractTitleChanges(joinTokens,r.word,tokens); |
297 | 332 | if(changes != null){ |
298 | 333 | SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,changes.keySet(),ns); |
— | — | @@ -312,7 +347,7 @@ |
313 | 348 | if(r.isExactMatch()){ |
314 | 349 | logRequest(searchterm,"CORRECT (by single word index)",start); |
315 | 350 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
316 | | - } else if(r.dist == 1 && betterRank(r.frequency,firstRank)){ |
| 351 | + } else if(r.dist == 1 && betterRank(r.frequency,info.firstRank)){ |
317 | 352 | HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>(); |
318 | 353 | proposedChanges.put(0,r.word); |
319 | 354 | SuggestQuery sq = makeSuggestedQuery(tokens,proposedChanges,searchterm,filters,new HashSet<Integer>(),ns); |
— | — | @@ -323,13 +358,13 @@ |
324 | 359 | } |
325 | 360 | |
326 | 361 | // check if all words are found within phrases during highlighting |
327 | | - if(tokens.size() > 1 && tokens.size() == phrases.size() + 1){ |
| 362 | + if(tokens.size() > 1 && tokens.size() == info.phrases.size() + 1){ |
328 | 363 | logRequest(searchterm,"CORRECT (by highlight phrases)",start); |
329 | 364 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
330 | 365 | } |
331 | 366 | |
332 | 367 | // indexes of words found in phrases during highlighting |
333 | | - HashSet<Integer> inPhrases = new HashSet<Integer>(); |
| 368 | + //HashSet<Integer> inPhrases = new HashSet<Integer>(); |
334 | 369 | // words that might spellcheck to stop words |
335 | 370 | ArrayList<SuggestResult> possibleStopWords = new ArrayList<SuggestResult>(); |
336 | 371 | // word suggestions |
— | — | @@ -377,7 +412,7 @@ |
378 | 413 | possibleStopWords.add(null); |
379 | 414 | } |
380 | 415 | // suggest split |
381 | | - SuggestResult split = suggestSplit(w,minFreq); |
| 416 | + SuggestResult split = suggestSplit(w,ns); |
382 | 417 | if(split != null){ |
383 | 418 | Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT); |
384 | 419 | sc.substitutes.put(i,split.word.replace("_"," ")); |
— | — | @@ -388,7 +423,7 @@ |
389 | 424 | if(i-1 >= 0 |
390 | 425 | && (wordSug.get(i-1)==null || !wordSug.get(i-1).get(0).isExactMatch()) |
391 | 426 | && (wordSug.get(i)==null || !wordSug.get(i).get(0).isExactMatch())){ |
392 | | - SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,minFreq); |
| 427 | + SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,ns); |
393 | 428 | if(join != null){ |
394 | 429 | Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN); |
395 | 430 | sc.substitutes.put(i-1,""); |
— | — | @@ -459,27 +494,29 @@ |
460 | 495 | int freq = (Integer)ret[0]; |
461 | 496 | boolean inTitle = (Boolean)ret[1]; |
462 | 497 | |
463 | | - // log.info("Checking "+phrase); |
| 498 | + //log.debug("Checking "+phrase); |
464 | 499 | boolean inContext = inContext(s1.word,s2.word,contextCache,allWords,ns) || inContext(s2.word,s1.word,contextCache,allWords,ns); |
465 | 500 | if(freq > 0 || inContext){ |
466 | 501 | // number of characters added/subtracted |
467 | 502 | int diff1 = Math.abs(s1.word.length()-w1.length()); |
468 | 503 | int diff2 = Math.abs(s2.word.length()-w2.length()); |
469 | | - log.info("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle); |
| 504 | + log.debug("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle); |
470 | 505 | int dist = s1.dist + s2.dist + distOffset; |
471 | 506 | boolean accept = true; |
472 | 507 | Change c = new Change(dist,freq,Change.Type.PHRASE); |
473 | 508 | // register changes |
474 | 509 | if(s1.word.equals(w1)) |
475 | 510 | c.preserves.put(i,w1); |
476 | | - else if(!good1 || ((inTitle||inContext) && diff1 <=2 && !foundInContext.contains(w1)) ) |
| 511 | + else if((!good1 && !info.foundInTitles.contains(w1)) |
| 512 | + || ((inTitle||inContext) && diff1 <=2 && !info.foundInContext.contains(w1)) ) |
477 | 513 | c.substitutes.put(i,s1.word); |
478 | 514 | else |
479 | 515 | accept = false; |
480 | 516 | |
481 | 517 | if(s2.word.equals(w2)) |
482 | 518 | c.preserves.put(i2,w2); |
483 | | - else if(!good2 || ((inTitle||inContext) && diff2 <= 2 && !foundInContext.contains(w2))) |
| 519 | + else if((!good2 && !info.foundInTitles.contains(w2)) |
| 520 | + || ((inTitle||inContext) && diff2 <= 2 && !info.foundInContext.contains(w2))) |
484 | 521 | c.substitutes.put(i2,s2.word); |
485 | 522 | else |
486 | 523 | accept = false; |
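
The reworked conditions accept a substitution for one side of a phrase only when the original word is neither a good word nor one already seen in result titles, or when the corrected phrase occurs in a title or in context, the length change is at most two characters, and the original word was not itself found in context. Restated as a hypothetical helper over the same locals:

    /** sketch: should word w be replaced by its phrase suggestion? */
    static boolean acceptSubstitution(boolean good, boolean inTitle, boolean inContext,
            int lengthDiff, String w, Suggest.ExtraInfo info){
        return (!good && !info.foundInTitles.contains(w))
            || ((inTitle || inContext) && lengthDiff <= 2 && !info.foundInContext.contains(w));
    }
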
— | — | @@ -522,7 +559,7 @@ |
523 | 560 | return sq; |
524 | 561 | } |
525 | 562 | } |
526 | | - log.info("Spell-checking based on phrases..."); |
| 563 | + log.debug("Spell-checking based on phrases..."); |
527 | 564 | // find best suggestion based on phrases |
528 | 565 | HashMap<Integer,String> preserveTokens = new HashMap<Integer,String>(); |
529 | 566 | HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>(); |
— | — | @@ -544,11 +581,12 @@ |
545 | 582 | for(int i=0;i<tokens.size();i++){ |
546 | 583 | if(preserveTokens.containsKey(i) || proposedChanges.containsKey(i)) |
547 | 584 | continue; |
| 585 | + String w = tokens.get(i).termText(); |
548 | 586 | ArrayList<SuggestResult> sug = wordSug.get(i); |
549 | 587 | if(sug == null) |
550 | 588 | continue; |
551 | 589 | SuggestResult s = sug.get(0); |
552 | | - if(!s.isExactMatch() && acceptWordChange(tokens.get(i).termText(),s)){ |
| 590 | + if(!s.isExactMatch() && !info.foundInTitles.contains(w) && acceptWordChange(w,s)){ |
553 | 591 | distance += s.dist; |
554 | 592 | proposedChanges.put(i,s.word); |
555 | 593 | if(using.equals("phrases")) |
— | — | @@ -564,7 +602,7 @@ |
565 | 603 | SuggestResult tr = titleRes.get(0); |
566 | 604 | HashMap<Integer,String> changes = extractTitleChanges(joinTokens,tr.word,tokens); |
567 | 605 | if(changes != null){ |
568 | | - if(tr.dist <= distance && (betterRank(tr.frequency,firstRank) || proposedChanges.equals(changes))){ |
| 606 | + if(tr.dist <= distance && (betterRank(tr.frequency,info.firstRank) || proposedChanges.equals(changes))){ |
569 | 607 | // we found a much better suggestion ! |
570 | 608 | proposedChanges = changes; |
571 | 609 | alwaysReplace.addAll(proposedChanges.keySet()); |
— | — | @@ -690,6 +728,26 @@ |
691 | 729 | } |
692 | 730 | return b; |
693 | 731 | } |
| 732 | + /** Get frequency of a word if it exists (0 if not) */ |
| 733 | + private int wordFrequency(String w, Namespaces ns) throws IOException { |
| 734 | + if(ns == null){ // default |
| 735 | + TermDocs td = reader.termDocs(new Term("word",w)); |
| 736 | + if(td.next()) |
| 737 | + return getFrequency(reader.document(td.doc()),null); |
| 738 | + return 0; |
| 739 | + } else{ // other |
| 740 | + int freq = 0; |
| 741 | + TermDocs td = reader.termDocs(new Term(ns.prefix+"word",w)); |
| 742 | + if(td.next()) |
| 743 | + freq = getFrequency(reader.document(td.doc()),ns); |
| 744 | + if(ns.additional){ // also look in main |
| 745 | + TermDocs td2 = reader.termDocs(new Term("word",w)); |
| 746 | + if(td2.next()) |
| 747 | + freq += getFrequency(reader.document(td2.doc()),null); |
| 748 | + } |
| 749 | + return freq; |
| 750 | + } |
| 751 | + } |
694 | 752 | |
695 | 753 | /** Return true if (stripped) title exists in the index */ |
696 | 754 | private boolean titleExists(String w, Namespaces ns) throws IOException{ |
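
wordFrequency() makes the namespace semantics concrete: for a namespace set flagged as additional, the frequency is the namespace-prefixed count plus the default-namespace count. An illustrative example with hypothetical index contents:

    // assume the spell index stores:
    //   word "foo"    -> freq = 10      (default namespaces)
    //   ns_word "foo" -> ns_freq_4 = 3  (namespace 4)
    HashSet<Integer> ns4 = new HashSet<Integer>();
    ns4.add(4);
    Namespaces both = new Namespaces(ns4, true);  // additional to default
    Namespaces only = new Namespaces(ns4, false); // namespace 4 alone
    // wordFrequency("foo", both) == 13  (3 + 10)
    // wordFrequency("foo", only) == 3
    // wordFrequency("foo", null) == 10  (default namespaces only)
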
— | — | @@ -762,8 +820,9 @@ |
763 | 821 | if(w.equals(nt)) |
764 | 822 | continue; // trying to substitute the same word |
765 | 823 | // incorrect words, or doesn't stem to same |
766 | | - boolean sameStem = (alwaysReplace.contains(e.getKey()))? false : filters.stemsToSame(FastWikiTokenizerEngine.decompose(w),FastWikiTokenizerEngine.decompose(nt)); |
767 | | - if(!sameStem || (sameStem && !wordExists(w,ns))){ |
| 824 | + boolean sameStem = (alwaysReplace.contains(e.getKey()))? false : filters.stemsToSame(FastWikiTokenizerEngine.decompose(w),FastWikiTokenizerEngine.decompose(nt)) || filters.stemsToSame(w,nt); |
| 825 | + //if(!sameStem || (sameStem && !wordExists(w,ns))){ |
| 826 | + if(!sameStem){ |
768 | 827 | int so = t.startOffset(); |
769 | 828 | int eo = t.endOffset(); |
770 | 829 | if(so != start) |
— | — | @@ -940,7 +999,7 @@ |
941 | 1000 | } |
942 | 1001 | }); |
943 | 1002 | |
944 | | - log.info("Sorted changes: "+changes); |
| 1003 | + log.debug("Sorted changes: "+changes); |
945 | 1004 | |
946 | 1005 | HashMap<Integer,String> accept = new HashMap<Integer,String>(); |
947 | 1006 | HashMap<Integer,String> preserve = new HashMap<Integer,String>(); |
— | — | @@ -971,7 +1030,7 @@ |
972 | 1031 | break; |
973 | 1032 | } |
974 | 1033 | if(changesBadWord){ |
975 | | - log.info("Considering "+c); |
| 1034 | + log.debug("Considering "+c); |
976 | 1035 | boolean acceptChange = true; |
977 | 1036 | for(Entry<Integer,String> e : c.substitutes.entrySet()){ |
978 | 1037 | String acceptedTerm = accept.get(e.getKey()); |
— | — | @@ -983,7 +1042,7 @@ |
984 | 1043 | } |
985 | 1044 | } |
986 | 1045 | if(acceptChange && (dist + c.dist < maxDist)){ |
987 | | - log.info("Applying "+c); |
| 1046 | + log.debug("Applying "+c); |
988 | 1047 | processedChange.add(i); |
989 | 1048 | for(Entry<Integer,String> e : c.substitutes.entrySet()){ |
990 | 1049 | accept.put(e.getKey(),e.getValue()); |
— | — | @@ -1014,7 +1073,7 @@ |
1015 | 1074 | } |
1016 | 1075 | } |
1017 | 1076 | if(acceptChange && (dist + c.dist < maxDist)){ |
1018 | | - log.info("Applying "+c); |
| 1077 | + log.debug("Applying "+c); |
1019 | 1078 | processedChange.add(i); |
1020 | 1079 | for(Entry<Integer,String> e : c.substitutes.entrySet()){ |
1021 | 1080 | accept.put(e.getKey(),e.getValue()); |
— | — | @@ -1058,7 +1117,7 @@ |
1059 | 1118 | } |
1060 | 1119 | |
1061 | 1120 | /** Merge two result sets */ |
1062 | | - public ArrayList<SuggestResult> mergeResults(ArrayList<SuggestResult> main, ArrayList<SuggestResult> add, int num){ |
| 1121 | + public ArrayList<SuggestResult> mergeResults(ArrayList<SuggestResult> main, ArrayList<SuggestResult> add, int num, Filtering filter){ |
1063 | 1122 | // merge |
1064 | 1123 | HashMap<String,SuggestResult> map = new HashMap<String,SuggestResult>(); |
1065 | 1124 | ArrayList<SuggestResult> toAdd = new ArrayList<SuggestResult>(); |
— | — | @@ -1074,7 +1133,10 @@ |
1075 | 1134 | } |
1076 | 1135 | main.addAll(toAdd); |
1077 | 1136 | // re-sort |
1078 | | - Collections.sort(main,new SuggestResult.Comparator()); |
| 1137 | + if(filter == Filtering.WEAK) |
| 1138 | + Collections.sort(main,new SuggestResult.ComparatorNoCommonMisspell()); |
| 1139 | + else |
| 1140 | + Collections.sort(main,new SuggestResult.Comparator()); |
1079 | 1141 | // trim |
1080 | 1142 | ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>(); |
1081 | 1143 | for(int i=0;i<num && i<main.size();i++) |
— | — | @@ -1097,7 +1159,7 @@ |
1098 | 1160 | ArrayList<SuggestResult> res = suggestWordsOnNamespaces(word,word,num,num,namespaces,filter); |
1099 | 1161 | if(namespaces.additional){ |
1100 | 1162 | ArrayList<SuggestResult> def = suggestWordsOnNamespaces(word,word,num,num,null,filter); // add from default |
1101 | | - return mergeResults(def,res,num); |
| 1163 | + return mergeResults(def,res,num,filter); |
1102 | 1164 | } |
1103 | 1165 | return res; |
1104 | 1166 | } |
— | — | @@ -1135,7 +1197,10 @@ |
1136 | 1198 | res.add(r); |
1137 | 1199 | } |
1138 | 1200 | // sort |
1139 | | - Collections.sort(res,new SuggestResult.Comparator()); |
| 1201 | + if(filter == Filtering.WEAK) |
| 1202 | + Collections.sort(res,new SuggestResult.ComparatorNoCommonMisspell()); |
| 1203 | + else |
| 1204 | + Collections.sort(res,new SuggestResult.Comparator()); |
1140 | 1205 | ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>(); |
1141 | 1206 | for(int i=0;i<num && i<res.size();i++) |
1142 | 1207 | ret.add(res.get(i)); |
— | — | @@ -1148,9 +1213,7 @@ |
1149 | 1214 | } |
1150 | 1215 | |
1151 | 1216 | private int getFrequency(Document d, Namespaces namespaces) { |
1152 | | - String prefix = ""; |
1153 | | - if(namespaces != null) // namespaces=null -> default namespace, empty -> all |
1154 | | - prefix = namespaces.prefix; |
| 1217 | + String prefix = getPrefix(namespaces); |
1155 | 1218 | int freq = 0; |
1156 | 1219 | if(namespaces == null) |
1157 | 1220 | freq = Integer.parseInt(d.get(prefix+"freq")); |
— | — | @@ -1168,39 +1231,46 @@ |
1169 | 1232 | return freq; |
1170 | 1233 | } |
1171 | 1234 | |
| 1235 | + /** @return {frequency (int), inTitle (boolean)} */ |
1172 | 1236 | private Object[] getPhrase(String phrase, Namespaces namespaces) throws IOException { |
1173 | | - String prefix = ""; |
1174 | | - if(namespaces != null) // namespaces=null -> default namespace, empty -> all |
1175 | | - prefix = namespaces.prefix; |
1176 | | - |
| 1237 | + String prefix = getPrefix(namespaces); |
1177 | 1238 | int freq = 0; |
1178 | | - boolean inTitle = false; |
1179 | | - TermDocs td = reader.termDocs(new Term(prefix+"phrase",phrase)); |
1180 | | - if(td.next()){ |
1181 | | - Document d = reader.document(td.doc()); |
1182 | | - if(namespaces == null){ |
1183 | | - freq = Integer.parseInt(d.get(prefix+"freq")); |
| 1239 | + boolean inTitle = false; |
| 1240 | + // default namespaces |
| 1241 | + if(namespaces == null || namespaces.additional){ |
| 1242 | + TermDocs td = reader.termDocs(new Term("phrase",phrase)); |
| 1243 | + if(td.next()){ |
| 1244 | + Document d = reader.document(td.doc()); |
| 1245 | + String f = d.get("freq"); |
| 1246 | + freq = Integer.parseInt(f); |
1184 | 1247 | String it = d.get("intitle"); |
1185 | 1248 | if(it!=null && it.equals("1")) |
1186 | 1249 | inTitle = true; |
1187 | | - } else{ // all namespaces |
1188 | | - if(namespaces.namespaces.isEmpty()){ |
1189 | | - freq = Integer.parseInt(d.get(prefix+"freq")); |
1190 | | - String it = d.get("intitle"); |
1191 | | - if(it!=null && it.equals("1")) |
1192 | | - inTitle = true; |
1193 | | - |
1194 | | - } else{ |
| 1250 | + } |
| 1251 | + } |
| 1252 | + // other |
| 1253 | + if(namespaces!=null){ |
| 1254 | + TermDocs td = reader.termDocs(new Term(prefix+"phrase",phrase)); |
| 1255 | + if(td.next()){ |
| 1256 | + Document d = reader.document(td.doc()); |
| 1257 | + String it = d.get(prefix+"intitle"); |
| 1258 | + if(it!=null && it.equals("1")) |
| 1259 | + inTitle = true; |
| 1260 | + |
| 1261 | + if(namespaces.namespaces.isEmpty()){ // all |
| 1262 | + String f = d.get(prefix+"freq"); |
| 1263 | + if(f != null) |
| 1264 | + freq += Integer.parseInt(f); |
| 1265 | + } else{ // some subset |
1195 | 1266 | for(Integer i : namespaces.namespaces){ |
1196 | 1267 | String f = d.get(prefix+"freq_"+i); |
1197 | | - if(f != null){ |
| 1268 | + if(f != null) |
1198 | 1269 | freq += Integer.parseInt(f); |
1199 | | - inTitle = true; |
1200 | | - } |
1201 | | - } |
| 1270 | + } |
1202 | 1271 | } |
1203 | 1272 | } |
1204 | 1273 | } |
| 1274 | + |
1205 | 1275 | return new Object[] { freq, inTitle}; |
1206 | 1276 | } |
1207 | 1277 | |
— | — | @@ -1211,7 +1281,7 @@ |
1212 | 1282 | ArrayList<SuggestResult> res = suggestTitlesOnNamespaces(title,num,pool_size,distance,namespaces); |
1213 | 1283 | if(namespaces.additional){ |
1214 | 1284 | ArrayList<SuggestResult> main = suggestTitlesOnNamespaces(title,num,pool_size,distance,null); |
1215 | | - return mergeResults(main,res,num); |
| 1285 | + return mergeResults(main,res,num,Filtering.STRONG); |
1216 | 1286 | } |
1217 | 1287 | return res; |
1218 | 1288 | } |
— | — | @@ -1329,25 +1399,19 @@ |
1330 | 1400 | } |
1331 | 1401 | |
1332 | 1402 | /** Try to split word into 2 words which make up a phrase */ |
1333 | | - public SuggestResult suggestSplit(String word, int minFreq){ |
1334 | | - int freq = 0; |
1335 | | - Hits hits; |
| 1403 | + public SuggestResult suggestSplit(String word, Namespaces ns){ |
1336 | 1404 | ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
1337 | 1405 | try { |
1338 | 1406 | // find frequency |
1339 | | - hits = searcher.search(new TermQuery(new Term("word",word))); |
1340 | | - if(hits.length() == 1) |
1341 | | - freq = Integer.parseInt(hits.doc(0).get("freq")); |
| 1407 | + int wordFreq = wordFrequency(word,ns); |
1342 | 1408 | |
1343 | 1409 | // try different splits |
1344 | 1410 | for(int i=1;i<word.length()-1;i++){ |
1345 | 1411 | String phrase = word.substring(0,i) + "_" + word.substring(i); |
1346 | | - hits = searcher.search(new TermQuery(new Term("phrase",phrase))); |
1347 | | - if(hits.length() > 0){ |
1348 | | - int pfreq = Integer.parseInt(hits.doc(0).get("freq")); |
1349 | | - if(pfreq >= freq && pfreq > minFreq) |
1350 | | - res.add(new SuggestResult(phrase,pfreq,2)); |
1351 | | - } |
| 1412 | + Object[] ret = getPhrase(phrase,ns); |
| 1413 | + int freq = (Integer)ret[0]; |
| 1414 | + if(freq > wordFreq) |
| 1415 | + res.add(new SuggestResult(phrase,freq,2)); |
1352 | 1416 | } |
1353 | 1417 | if(res.size() > 0){ |
1354 | 1418 | Collections.sort(res,new SuggestResult.Comparator()); |
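
suggestSplit() now compares each candidate phrase against the unsplit word's own frequency in the same namespace set, instead of a fixed minimum, so a split is proposed only where the two-word phrase is genuinely more common. Hypothetical usage:

    // propose "main article" for "mainarticle" only if the phrase
    // main_article is more frequent than the joined word itself
    SuggestResult split = sc.suggestSplit("mainarticle", ns);
    if(split != null)
        System.out.println(split.word.replace("_", " ")); // e.g. "main article"
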
— | — | @@ -1361,14 +1425,13 @@ |
1362 | 1426 | } |
1363 | 1427 | |
1364 | 1428 | /** Returns suggestion if joining words makes sense */ |
1365 | | - public SuggestResult suggestJoin(String word1, String word2, int minFreq){ |
| 1429 | + public SuggestResult suggestJoin(String word1, String word2, Namespaces ns){ |
1366 | 1430 | try { |
1367 | | - Hits hits = searcher.search(new TermQuery(new Term("word",word1+word2))); |
1368 | | - if(hits.length() > 0){ |
1369 | | - int freq = Integer.parseInt(hits.doc(0).get("freq")); |
1370 | | - if(freq >= minFreq) |
1371 | | - return new SuggestResult(word1+word2,freq,1); |
1372 | | - } |
| 1431 | + Object[] ret = getPhrase(word1+"_"+word2,ns); |
| 1432 | + int freqPhrase = (Integer)ret[0]; |
| 1433 | + int freqJoin = wordFrequency(word1+word2,ns); |
| 1434 | + if(freqJoin > 0 && freqJoin > freqPhrase) |
| 1435 | + return new SuggestResult(word1+word2,freqJoin,1); |
1373 | 1436 | } catch (IOException e) { |
1374 | 1437 | log.warn("I/O error while suggesting join on "+iid+" : "+e.getMessage()); |
1375 | 1438 | e.printStackTrace(); |
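
suggestJoin() is the symmetric check: the concatenation is suggested only when the joined word exists and is more frequent than the corresponding two-word phrase. Sketch:

    // propose "spellchecker" for "spell checker" only if the joined
    // word beats the phrase spell_checker in frequency
    SuggestResult join = sc.suggestJoin("spell", "checker", ns);
    if(join != null)
        System.out.println(join.word); // "spellchecker"
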
— | — | @@ -1379,7 +1442,10 @@ |
1380 | 1443 | /** Fetch a set of string for fuzzy queries */ |
1381 | 1444 | public ArrayList<SuggestResult> getFuzzy(String word, NamespaceFilter nsf){ |
1382 | 1445 | Namespaces ns = makeNamespaces(nsf); |
1383 | | - ArrayList<SuggestResult> sug = suggestWords(word,POOL_FUZZY,ns,Filtering.WEAK); |
| 1446 | + int pool = POOL_FUZZY; |
| 1447 | + if(word.length() <= 4) |
| 1448 | + pool *= 2; |
| 1449 | + ArrayList<SuggestResult> sug = suggestWords(word,pool,ns,Filtering.WEAK); |
1384 | 1450 | ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>(); |
1385 | 1451 | for(int i=0;i<MAX_FUZZY && i<sug.size();i++){ |
1386 | 1452 | ret.add(sug.get(i)); |
— | — | @@ -1388,7 +1454,8 @@ |
1389 | 1455 | } |
1390 | 1456 | |
1391 | 1457 | protected void logRequest(String searchterm, String using, long start){ |
1392 | | - log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms"); |
| 1458 | + if(useLogging) |
| 1459 | + log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms"); |
1393 | 1460 | } |
1394 | 1461 | |
1395 | 1462 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java |
— | — | @@ -73,15 +73,15 @@ |
74 | 74 | System.out.println(r); |
75 | 75 | } |
76 | 76 | |
77 | | - System.out.println("SPLIT: "+sc.suggestSplit(text,0)); |
| 77 | + System.out.println("SPLIT: "+sc.suggestSplit(text,null)); |
78 | 78 | } |
79 | 79 | if(last != null){ |
80 | | - System.out.println("JOIN: "+sc.suggestJoin(last,text,0)); |
| 80 | + System.out.println("JOIN: "+sc.suggestJoin(last,text,null)); |
81 | 81 | } |
82 | 82 | last = text; |
83 | 83 | } |
84 | 84 | } |
85 | | - System.out.println("#suggest: "+sc.suggest(inputtext,parser.tokenizeBareText(inputtext),new HashSet<String>(),new HashSet<String>(),0,new NamespaceFilter("0"))); |
| 85 | + System.out.println("#suggest: "+sc.suggest(inputtext,parser.tokenizeBareText(inputtext),new Suggest.ExtraInfo(new HashSet<String>(),new HashSet<String>(),new HashSet<String>(),0),new NamespaceFilter("0"))); |
86 | 86 | System.out.println("(finished in "+(System.currentTimeMillis()-start)+" ms)"); |
87 | 87 | } |
88 | 88 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java |
— | — | @@ -14,6 +14,7 @@ |
15 | 15 | import org.apache.lucene.document.Field.Store; |
16 | 16 | import org.apache.lucene.index.CorruptIndexException; |
17 | 17 | import org.apache.lucene.index.IndexWriter; |
| 18 | +import org.apache.lucene.index.Term; |
18 | 19 | import org.wikimedia.lsearch.analyzers.Analyzers; |
19 | 20 | import org.wikimedia.lsearch.analyzers.FieldBuilder; |
20 | 21 | import org.wikimedia.lsearch.analyzers.FilterFactory; |
— | — | @@ -44,6 +45,7 @@ |
45 | 46 | protected String langCode; |
46 | 47 | protected Analyzer analyzer; |
47 | 48 | protected HashSet<String> stopWords; |
| 49 | + protected NamespaceFilter nsf; |
48 | 50 | |
49 | 51 | /** Make a new index, and init writer on it (on importPath())*/ |
50 | 52 | public static CleanIndexWriter newForWrite(IndexId iid) throws IOException{ |
— | — | @@ -63,9 +65,10 @@ |
64 | 66 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
65 | 67 | this.iid = iid; |
66 | 68 | this.builder = new FieldBuilder(iid,FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK); |
67 | | - this.langCode = global.getLanguage(iid.getDBname()); |
| 69 | + this.langCode = iid.getLangCode(); |
68 | 70 | analyzer = Analyzers.getIndexerAnalyzer(builder); |
69 | 71 | this.stopWords = StopWords.getPredefinedSet(iid); |
| 72 | + nsf = global.getDefaultNamespace(iid); |
70 | 73 | |
71 | 74 | HashSet<String> stopWords = new HashSet<String>(); |
72 | 75 | for(String w : StopWords.getStopWords(iid)) |
— | — | @@ -83,6 +86,19 @@ |
84 | 87 | writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH); |
85 | 88 | } |
86 | 89 | |
| 90 | + public void deleteArticleInfo(String pageId) throws IOException { |
| 91 | + writer.deleteDocuments(new Term("key",pageId)); |
| 92 | + } |
| 93 | + |
| 94 | + /** Call this to add information about the article into the index */ |
| 95 | + public void addArticleInfo(Article a){ |
| 96 | + // only for articles in default namespace(s) |
| 97 | + if(nsf.contains(Integer.parseInt(a.getNamespace()))) |
| 98 | + addArticle(a); |
| 99 | + else |
| 100 | + addTitleOnly(a); |
| 101 | + } |
| 102 | + |
87 | 103 | /** Add single article */ |
88 | 104 | protected void addArticle(Article a){ |
89 | 105 | //if(!WikiIndexModifier.checkAddPreconditions(a,langCode)) |
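
addArticleInfo() centralizes the namespace routing that CleanIndexImporter previously did itself (see the CleanIndexImporter hunk below): articles in the default namespace(s) get full indexing, all others title-only. A sketch of an incremental update; calling deleteArticleInfo() first is an assumption, since this patch only adds the method:

    // drop any previous document for this page, then re-add it;
    // the writer decides between full and title-only indexing
    writer.deleteArticleInfo(article.getIndexKey());
    writer.addArticleInfo(article);
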
— | — | @@ -102,8 +118,9 @@ |
103 | 119 | } |
104 | 120 | |
105 | 121 | /** Add title/redirect with ranks information only */ |
106 | | - public void addTitleOnly(Article article) { |
| 122 | + protected void addTitleOnly(Article article) { |
107 | 123 | Document doc = new Document(); |
| 124 | + doc.add(new Field("key",article.getIndexKey(),Store.NO,Index.UN_TOKENIZED)); |
108 | 125 | doc.add(new Field("ns_title",article.getTitle(),Store.YES,Index.TOKENIZED)); |
109 | 126 | doc.add(new Field("ns_namespace",article.getNamespace(),Store.YES,Index.UN_TOKENIZED)); |
110 | 127 | doc.add(new Field("ns_rank",Integer.toString(article.getReferences()),Store.YES,Index.NO)); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java |
— | — | @@ -83,7 +83,7 @@ |
84 | 84 | if(minPhraseFreq < 1) |
85 | 85 | minPhraseFreq = 1; |
86 | 86 | this.createNew = createNew; |
87 | | - this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
| 87 | + this.langCode=iid.getLangCode(); |
88 | 88 | this.ngramWriter = new NgramIndexer(); |
89 | 89 | this.registry = IndexRegistry.getInstance(); |
90 | 90 | } |
— | — | @@ -222,7 +222,7 @@ |
223 | 223 | while((word = dict.next()) != null){ |
224 | 224 | String w = word.getWord(); |
225 | 225 | if(w.contains("_")){ // phrase |
226 | | - addNsPhrase(w,ir); |
| 226 | + addNsPhrase(w,ir,true); |
227 | 227 | } else{ // word |
228 | 228 | addNsWord(w,ir); |
229 | 229 | } |
— | — | @@ -329,7 +329,7 @@ |
330 | 330 | } |
331 | 331 | |
332 | 332 | /** Add phrase in namespace other than default */ |
333 | | - public void addNsPhrase(String phrase, IndexReader ir) throws IOException { |
| 333 | + public void addNsPhrase(String phrase, IndexReader ir, boolean inTitle) throws IOException { |
334 | 334 | if(phrase.length() <= 2){ |
335 | 335 | log.warn("Invalid phrase: "+phrase); |
336 | 336 | return; |
— | — | @@ -342,6 +342,9 @@ |
343 | 343 | for(Entry<String,SimpleInt> e : freq.entrySet()){ |
344 | 344 | doc.add(new Field("ns_freq_"+e.getKey(), Integer.toString(e.getValue().count), Field.Store.YES, Field.Index.NO)); |
345 | 345 | } |
| 346 | + if(inTitle){ |
| 347 | + doc.add(new Field("ns_intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 348 | + } |
346 | 349 | ngramWriter.addDocument(doc); |
347 | 350 | } |
348 | 351 | |
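
The new inTitle flag stores an ns_intitle marker next to the per-namespace frequencies, so getPhrase() in Suggest.java (above) can report title hits for non-default namespaces as well. The stored document, illustratively:

    // fields written for a namespace phrase, e.g. "main_page":
    //   ns_phrase   = "main_page"  (indexed key)
    //   ns_freq_<n> = "<count>"    (frequency in namespace n)
    //   ns_intitle  = "1"          (new: the phrase occurs in a title)
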
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java |
— | — | @@ -47,15 +47,14 @@ |
48 | 48 | CleanIndexWriter writer; |
49 | 49 | String langCode; |
50 | 50 | Links links; |
51 | | - NamespaceFilter nsf; |
52 | 51 | |
53 | 52 | public CleanIndexImporter(IndexId iid, String langCode) throws IOException{ |
54 | 53 | Configuration.open(); // make sure configuration is loaded |
55 | 54 | this.writer = CleanIndexWriter.newForWrite(iid); |
56 | 55 | this.langCode = langCode; |
57 | | - this.links = Links.openForRead(iid,iid.getLinks().getImportPath()); |
58 | | - nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid); |
59 | | - log.info("Rebuilding for namespaces: "+nsf); |
| 56 | + this.links = Links.openStandalone(iid); |
| 57 | + |
| 58 | + //log.info("Rebuilding for namespaces: "+nsf); |
60 | 59 | } |
61 | 60 | public void writeRevision(Revision revision) throws IOException { |
62 | 61 | this.revision = revision; |
— | — | @@ -72,8 +71,8 @@ |
73 | 72 | ArrayList<String> redirectsHere = links.getRedirectsTo(key); |
74 | 73 | references -= redirectsHere.size(); // we want raw rank, without redirects |
75 | 74 | |
76 | | - if(redirectTargetNamespace<0 || !nsf.contains(redirectTargetNamespace)) |
77 | | - redirectTo = null; // redirect to other namespace |
| 75 | + if(redirectTargetNamespace<0 || redirectTargetNamespace != page.Title.Namespace) |
| 76 | + redirectTo = null; // redirect to different namespace |
78 | 77 | } |
79 | 78 | Date date = new Date(revision.Timestamp.getTimeInMillis()); |
80 | 79 | |
— | — | @@ -88,11 +87,7 @@ |
89 | 88 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,redirectTo, |
90 | 89 | references,redirectTargetNamespace,redirects,new ArrayList<RelatedTitle>(),anchors,date); |
91 | 90 | |
92 | | - // only for articles in default namespace(s) |
93 | | - if(nsf.contains(page.Title.Namespace)) |
94 | | - writer.addArticle(article); |
95 | | - else |
96 | | - writer.addTitleOnly(article); |
| 91 | + writer.addArticleInfo(article); |
97 | 92 | } |
98 | 93 | |
99 | 94 | public void close() throws IOException { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java |
— | — | @@ -60,7 +60,7 @@ |
61 | 61 | } |
62 | 62 | long start = System.currentTimeMillis(); |
63 | 63 | try { |
64 | | - rebuildFromLinksNew(iid); |
| 64 | + rebuildFromLinks(iid); |
65 | 65 | } catch (IOException e) { |
66 | 66 | log.fatal("Rebuild I/O error: "+e.getMessage()); |
67 | 67 | e.printStackTrace(); |
— | — | @@ -71,84 +71,9 @@ |
72 | 72 | |
73 | 73 | System.out.println("Finished generating related in "+formatTime(end-start)); |
74 | 74 | } |
75 | | - |
76 | | - @Deprecated |
77 | | - public static void rebuildFromDump(String inputfile, IndexId iid) throws IOException{ |
78 | | - GlobalConfiguration global = GlobalConfiguration.getInstance(); |
79 | | - String langCode = global.getLanguage(iid); |
80 | | - log.info("First pass, getting a list of valid articles..."); |
81 | | - // first pass - titles |
82 | | - InputStream input = null; |
83 | | - input = Tools.openInputFile(inputfile); |
84 | | - NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid); |
85 | | - TitleReader tr = new TitleReader(iid,langCode,nsf); |
86 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
87 | | - reader.readDump(); |
88 | | - input.close(); |
89 | | - CompactLinks links = tr.getTitles(); |
90 | | - tr = null; // GC |
91 | | - |
92 | | - log.info("Second pass, geting in/out links..."); |
93 | | - // second pass - in/out links |
94 | | - input = Tools.openInputFile(inputfile); |
95 | | - LinkReader rr = new LinkReader(links,iid); |
96 | | - reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
97 | | - reader.readDump(); |
98 | | - links.compactAll(); |
99 | | - store(links,iid); |
100 | | - } |
101 | 75 | |
102 | | - /** |
103 | | - * Rebuild related articles index for iid |
104 | | - * @throws IOException |
105 | | - */ |
106 | | - @Deprecated |
107 | | - public static void rebuildFromLinks(IndexId iid) throws IOException { |
108 | | - CompactLinks links = new CompactLinks(); |
109 | | - Links temp = Links.openForRead(iid,iid.getLinks().getImportPath()); |
110 | | - |
111 | | - NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid); |
112 | | - log.info("Reading titles in default search"); |
113 | | - Dictionary dict = temp.getKeys(); |
114 | | - Word w; |
115 | | - HashMap<Integer,CompactArticleLinks> keyCache = new HashMap<Integer,CompactArticleLinks>(); |
116 | | - while((w = dict.next()) != null){ |
117 | | - String key = w.getWord(); |
118 | | - int ns = Integer.parseInt(key.substring(0,key.indexOf(':'))); |
119 | | - if(nsf.contains(ns)){ |
120 | | - links.add(key,temp.getNumInLinks(key)); |
121 | | - keyCache.put(temp.getDocId(key),links.get(key)); |
122 | | - } |
123 | | - } |
124 | | - |
125 | | - log.info("Reading in/out links"); |
126 | | - dict = temp.getKeys(); |
127 | | - while((w = dict.next()) != null){ |
128 | | - String key = w.getWord(); |
129 | | - int ns = Integer.parseInt(key.substring(0,key.indexOf(':'))); |
130 | | - if(nsf.contains(ns)){ |
131 | | - CompactArticleLinks l = links.get(key); |
132 | | - // inlinks |
133 | | - l.setInLinks(temp.getInLinks(l,keyCache)); |
134 | | - // outlinks |
135 | | - ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>(); |
136 | | - for(String k : temp.getOutLinks(key).toCollection()){ |
137 | | - CompactArticleLinks cs = links.get(k); |
138 | | - if(cs != null) |
139 | | - out.add(cs); |
140 | | - } |
141 | | - l.setOutLinks(out); |
142 | | - } |
143 | | - } |
144 | | - temp.close(); |
145 | | - temp = null; // GC |
146 | | - keyCache = null; // GC |
147 | | - |
148 | | - store(links,iid); |
149 | | - } |
150 | | - |
151 | 76 | /** Calculate from links index */ |
152 | | - public static void rebuildFromLinksNew(IndexId iid) throws IOException { |
| 77 | + public static void rebuildFromLinks(IndexId iid) throws IOException { |
153 | 78 | Links links = Links.openForRead(iid,iid.getLinks().getImportPath()); |
154 | 79 | RelatedStorage store = new RelatedStorage(iid); |
155 | 80 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java |
— | — | @@ -132,10 +132,10 @@ |
133 | 133 | } |
134 | 134 | |
135 | 135 | // inherit javadoc |
136 | | - public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int[] df, int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases) throws RemoteException{ |
| 136 | + public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int[] df, int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws RemoteException{ |
137 | 137 | IndexId iid = IndexId.get(dbrole); |
138 | 138 | try{ |
139 | | - return Highlight.highlight(hits,iid,terms,df,maxDoc,words,StopWords.getPredefinedSet(iid),exactCase,null,sortByPhrases); |
| 139 | + return Highlight.highlight(hits,iid,terms,df,maxDoc,words,StopWords.getPredefinedSet(iid),exactCase,null,sortByPhrases,alwaysIncludeFirst); |
140 | 140 | } catch(IOException e){ |
141 | 141 | throw new RemoteException("IOException on "+dbrole,e); |
142 | 142 | } |
— | — | @@ -151,10 +151,10 @@ |
152 | 152 | } |
153 | 153 | } |
154 | 154 | |
155 | | - public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf) throws RemoteException { |
| 155 | + public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf) throws RemoteException { |
156 | 156 | IndexId iid = IndexId.get(dbrole); |
157 | 157 | try{ |
158 | | - return new Suggest(iid).suggest(searchterm,tokens,phrases,foundInContext,firstRank,nsf); |
| 158 | + return new Suggest(iid).suggest(searchterm,tokens,info,nsf); |
159 | 159 | } catch(Exception e){ |
160 | 160 | e.printStackTrace(); |
161 | 161 | throw new RemoteException("Exception on "+dbrole,e); |
— | — | @@ -171,6 +171,16 @@ |
172 | 172 | } |
173 | 173 | } |
174 | 174 | |
| 175 | + public SearchResults searchRelated(String dbrole, String searchterm, int offset, int limit) throws RemoteException { |
| 176 | + IndexId iid = IndexId.get(dbrole); |
| 177 | + try{ |
| 178 | + return new SearchEngine().searchRelatedLocal(iid,searchterm,offset,limit); |
| 179 | + } catch(IOException e){ |
| 180 | + e.printStackTrace(); |
| 181 | + throw new RemoteException("Exception on "+dbrole,e); |
| 182 | + } |
| 183 | + } |
| 184 | + |
175 | 185 | protected RMIMessengerImpl(){ |
176 | 186 | networkStatus = null; |
177 | 187 | indexRegistry = null; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java |
— | — | @@ -32,6 +32,7 @@ |
33 | 33 | import org.wikimedia.lsearch.search.SuffixFilterWrapper; |
34 | 34 | import org.wikimedia.lsearch.search.SuffixNamespaceWrapper; |
35 | 35 | import org.wikimedia.lsearch.search.Wildcards; |
| 36 | +import org.wikimedia.lsearch.spell.Suggest; |
36 | 37 | import org.wikimedia.lsearch.spell.SuggestQuery; |
37 | 38 | import org.wikimedia.lsearch.spell.SuggestResult; |
38 | 39 | |
— | — | @@ -247,13 +248,13 @@ |
248 | 249 | } |
249 | 250 | } |
250 | 251 | |
251 | | - public Highlight.ResultSet highlight(String host, ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases){ |
| 252 | + public Highlight.ResultSet highlight(String host, ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst){ |
252 | 253 | try{ |
253 | 254 | RMIMessenger r = messengerFromCache(host); |
254 | | - return r.highlight(hits,dbrole,terms,df,maxDoc,words,exactCase,sortByPhrases); |
| 255 | + return r.highlight(hits,dbrole,terms,df,maxDoc,words,exactCase,sortByPhrases,alwaysIncludeFirst); |
255 | 256 | } catch(Exception e){ |
256 | 257 | e.printStackTrace(); |
257 | | - return new Highlight.ResultSet(new HashMap<String,HighlightResult>(),new HashSet<String>(),new HashSet<String>(),false,0); |
| 258 | + return new Highlight.ResultSet(new HashMap<String,HighlightResult>(),new HashSet<String>(),new HashSet<String>(),false,0,new HashSet<String>()); |
258 | 259 | } |
259 | 260 | } |
260 | 261 | |
— | — | @@ -279,10 +280,10 @@ |
280 | 281 | } |
281 | 282 | } |
282 | 283 | |
283 | | - public SuggestQuery suggest(String host, String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf){ |
| 284 | + public SuggestQuery suggest(String host, String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf){ |
284 | 285 | try{ |
285 | 286 | RMIMessenger r = messengerFromCache(host); |
286 | | - return r.suggest(dbrole,searchterm,tokens,phrases,foundInContext,firstRank,nsf); |
| 287 | + return r.suggest(dbrole,searchterm,tokens,info,nsf); |
287 | 288 | } catch(Exception e){ |
288 | 289 | if(host == null){ |
289 | 290 | log.warn("Cannot find spell-check host for "+dbrole); |
— | — | @@ -304,9 +305,29 @@ |
305 | 306 | return r.getFuzzy(dbrole,word,nsf); |
306 | 307 | } catch(Exception e){ |
307 | 308 | e.printStackTrace(); |
308 | | - log.warn("Error invoking getFuzzyt() on "+host+" : "+e.getMessage()); |
| 309 | + log.warn("Error invoking getFuzzy() on "+host+" : "+e.getMessage()); |
309 | 310 | return new ArrayList<SuggestResult>(); |
310 | 311 | } |
311 | 312 | } |
| 313 | + |
| 314 | + /** dbrole should be the original dbrole, not the .related one, e.g. wikilucene, not wikilucene.related */ |
| 315 | + public SearchResults searchRelated(String host, String dbrole, String searchterm, int offset, int limit){ |
| 316 | + try{ |
| 317 | + RMIMessenger r = messengerFromCache(host); |
| 318 | + return r.searchRelated(dbrole,searchterm,offset,limit); |
| 319 | + } catch(Exception e){ |
| 320 | + e.printStackTrace(); |
| 321 | + log.warn("Error invoking searchRelated() on "+host+" : "+e.getMessage()); |
| 322 | + if(host!=null && !isLocal(host)){ |
| 323 | + if(cache == null) |
| 324 | + cache = SearcherCache.getInstance(); |
| 325 | + cache.invalidateSearchable(IndexId.get(dbrole),host); |
| 326 | + } |
| 327 | + SearchResults res = new SearchResults(); |
| 328 | + res.setErrorMsg("Error searching related index: "+e.getMessage()); |
| 329 | + return res; |
| 330 | + } |
| 331 | + |
| 332 | + } |
312 | 333 | |
313 | 334 | } |
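
searchRelated() follows the same remote-call pattern as the other messenger methods: look up the host's messenger, delegate, and on failure invalidate the cached searchable and return a SearchResults carrying the error. A hypothetical client-side call, where isSuccess() and getErrorMsg() are assumed accessors on SearchResults:

    // messenger is an RMIMessengerClient; dbrole is the base index name,
    // e.g. "wikilucene" (the .related index is resolved on the remote side)
    SearchResults res = messenger.searchRelated(host, "wikilucene", "apple pie", 0, 20);
    if(!res.isSuccess())
        log.warn(res.getErrorMsg());
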
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java |
— | — | @@ -22,6 +22,7 @@ |
23 | 23 | import org.wikimedia.lsearch.search.NamespaceFilterWrapper; |
24 | 24 | import org.wikimedia.lsearch.search.SuffixFilterWrapper; |
25 | 25 | import org.wikimedia.lsearch.search.SuffixNamespaceWrapper; |
| 26 | +import org.wikimedia.lsearch.spell.Suggest; |
26 | 27 | import org.wikimedia.lsearch.spell.SuggestQuery; |
27 | 28 | import org.wikimedia.lsearch.spell.SuggestResult; |
28 | 29 | |
— | — | @@ -133,9 +134,9 @@ |
134 | 135 | * @param maxDoc - max number of documents in the index (needed for idf calculation) |
135 | 136 | * @param words - main phrase words, gives extra score |
136 | 137 | * @param exactCase - if this is an exact case query |
137 | | - * @return map: key -> highlighting result |
| 138 | + * @return resultset |
138 | 139 | */ |
139 | | - public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases) throws RemoteException; |
| 140 | + public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws RemoteException; |
140 | 141 | |
141 | 142 | /** |
142 | 143 | * Search grouped titles, similar logic to that of searchPart() |
— | — | @@ -161,7 +162,7 @@ |
162 | 163 | * @return |
163 | 164 | * @throws RemoteException |
164 | 165 | */ |
165 | | - public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf) throws RemoteException; |
| 166 | + public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf) throws RemoteException; |
166 | 167 | |
167 | 168 | /** |
168 | 169 | * Fetch words for fuzzy queries (e.g. query~) |
— | — | @@ -172,5 +173,17 @@ |
173 | 174 | * @return |
174 | 175 | * @throws RemoteException |
175 | 176 | */ |
176 | | - public ArrayList<SuggestResult> getFuzzy(String dbrole, String word, NamespaceFilter nsf) throws RemoteException; |
| 177 | + public ArrayList<SuggestResult> getFuzzy(String dbrole, String word, NamespaceFilter nsf) throws RemoteException; |
| 178 | + |
| 179 | + /** |
| 180 | + * Search a remote related index |
| 181 | + * |
| 182 | + * @param dbrole |
| 183 | + * @param searchterm |
| 184 | + * @param limit |
| 185 | + * @param offset |
| 186 | + * @return |
| 187 | + * @throws RemoteException |
| 188 | + */ |
| 189 | + public SearchResults searchRelated(String dbrole, String searchterm, int offset, int limit) throws RemoteException; |
177 | 190 | } |
Index: branches/lucene-search-2.1/webinterface/lsweb.py |
— | — | @@ -401,10 +401,10 @@ |
402 | 402 | self.wfile.write('</body></html>') |
403 | 403 | except HTTPError: |
404 | 404 | self.send_error(400,'Bad request') |
405 | | - self.wfile.write("Error in query") |
| 405 | + self.wfile.write("<div>Error in query</div>") |
406 | 406 | except URLError: |
407 | 407 | self.send_error(500,'Internal Server Error') |
408 | | - self.wfile.write("Cannot connect to lucene search 2 daemon") |
| 408 | + self.wfile.write("<div>Cannot connect to lucene search 2 daemon</div>") |
409 | 409 | delta_time = time.time() - start_time |
410 | 410 | print '[%s] Processed query %s in %d ms' %(time.strftime("%Y-%m-%d %H:%M:%S"),self.path,int(delta_time*1000)) |
411 | 411 | elif s[2] == '/': |