Index: branches/lucene-search-2.1/lib/lucene-core-2.2.0.jar |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/StringUtils.java |
— | — | @@ -0,0 +1,12 @@ |
| 2 | +package org.wikimedia.lsearch.util; |
| 3 | + |
| 4 | +public class StringUtils { |
| 5 | + /** reverse a string */ |
| 6 | + public static String reverseString(String str){ |
| 7 | + int len = str.length(); |
| 8 | + char[] buf = new char[len]; |
| 9 | + for(int i=0;i<len;i++) |
| 10 | + buf[i] = str.charAt(len-i-1); |
| 11 | + return new String(buf,0,len); |
| 12 | + } |
| 13 | +} |
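The reason reverseString is factored out into a utility becomes clear in Wildcards.java below: a trailing-wildcard pattern can be answered by a cheap prefix seek in the term dictionary, and reversing both the indexed title and the pattern turns a leading-wildcard search into the same cheap seek. A minimal, self-contained sketch of that equivalence (plain Java, hypothetical values):

```java
public class ReverseWildcardDemo {
    // equivalent to StringUtils.reverseString above
    static String reverse(String s){
        return new StringBuilder(s).reverse().toString();
    }

    public static void main(String[] args){
        String title = "wikipedia";
        // the leading-wildcard pattern "*pedia" matches iff the reversed
        // title starts with the reversed pattern body "aidep"
        boolean matches = reverse(title).startsWith(reverse("pedia"));
        System.out.println(matches); // true
    }
}
```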
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java |
— | — | @@ -29,6 +29,7 @@ |
30 | 30 | import org.apache.log4j.Logger; |
31 | 31 | import org.apache.lucene.analysis.Analyzer; |
32 | 32 | import org.apache.lucene.analysis.TokenStream; |
| 33 | +import org.wikimedia.lsearch.analyzers.TokenizerOptions; |
33 | 34 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
34 | 35 | import org.wikimedia.lsearch.config.IndexId; |
35 | 36 | |
— | — | @@ -59,6 +60,6 @@ |
60 | 61 | if(streams.get(fieldName) != null) |
61 | 62 | return streams.get(fieldName); |
62 | 63 | |
63 | | - return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),false)); |
| 64 | + return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),new TokenizerOptions(false))); |
64 | 65 | } |
65 | 66 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -11,13 +11,14 @@ |
12 | 12 | import org.apache.lucene.analysis.Token; |
13 | 13 | import org.apache.lucene.analysis.TokenStream; |
14 | 14 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 15 | +import org.wikimedia.lsearch.analyzers.TokenizerOptions; |
15 | 16 | import org.wikimedia.lsearch.config.Configuration; |
16 | 17 | import org.wikimedia.lsearch.config.IndexId; |
17 | 18 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
18 | 19 | |
19 | 20 | public class FastWikiTokenizerTest { |
20 | 21 | public static void displayTokensForParser(String text) { |
21 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false); |
| 22 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions(false)); |
22 | 23 | Token[] tokens = parser.parse().toArray(new Token[] {}); |
23 | 24 | for (int i = 0; i < tokens.length; i++) { |
24 | 25 | Token token = tokens[i]; |
— | — | @@ -125,7 +126,7 @@ |
126 | 127 | for(int i=0;i<2000;i++){ |
127 | 128 | for(TestArticle article : articles){ |
128 | 129 | String text = article.content; |
129 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false); |
| 130 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions(false)); |
130 | 131 | parser.parse(); |
131 | 132 | } |
132 | 133 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java |
— | — | @@ -5,6 +5,7 @@ |
6 | 6 | import java.rmi.RemoteException; |
7 | 7 | import java.rmi.registry.LocateRegistry; |
8 | 8 | import java.rmi.registry.Registry; |
| 9 | +import java.util.ArrayList; |
9 | 10 | import java.util.Arrays; |
10 | 11 | import java.util.Collection; |
11 | 12 | import java.util.Hashtable; |
— | — | @@ -20,6 +21,7 @@ |
21 | 22 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
22 | 23 | import org.wikimedia.lsearch.search.NamespaceFilterWrapper; |
23 | 24 | import org.wikimedia.lsearch.search.SearcherCache; |
| 25 | +import org.wikimedia.lsearch.search.Wildcards; |
24 | 26 | |
25 | 27 | /** |
26 | 28 | * Invokes procedures on a remote RMIMessenger. |
— | — | @@ -217,4 +219,14 @@ |
218 | 220 | return -1; |
219 | 221 | } |
220 | 222 | } |
| 223 | + |
| 224 | + public ArrayList<String> getTerms(String host, String dbrole, String wildcard, boolean exactCase) throws RemoteException { |
| 225 | + try{ |
| 226 | + RMIMessenger r = messengerFromCache(host); |
| 227 | + return r.getTerms(dbrole,wildcard,exactCase); |
| 228 | + } catch(Exception e){ |
| 229 | + e.printStackTrace(); |
| 230 | + return new ArrayList<String>(); |
| 231 | + } |
| 232 | + } |
221 | 233 | } |
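A hedged sketch of how a caller might use the new remote term expansion; the host and dbrole below are made-up example values, and messengerFromCache() resolves the remote stub exactly as in the other methods of this class:

```java
import java.rmi.RemoteException;
import java.util.ArrayList;
import org.wikimedia.lsearch.interoperability.RMIMessengerClient;

public class GetTermsExample {
    public static void main(String[] args) throws RemoteException {
        RMIMessengerClient messenger = new RMIMessengerClient();
        // "search3" and "enwiki.nspart1" are hypothetical values
        ArrayList<String> terms = messenger.getTerms("search3", "enwiki.nspart1", "wiki*", false);
        System.out.println(terms.size() + " title terms match wiki*");
    }
}
```

Note that getTerms never propagates a failure to the caller: on any error it prints the trace and returns an empty list, so a search degrades to finding nothing for the wildcard rather than failing outright.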
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java |
— | — | @@ -2,6 +2,7 @@ |
3 | 3 | |
4 | 4 | import java.rmi.Remote; |
5 | 5 | import java.rmi.RemoteException; |
| 6 | +import java.util.ArrayList; |
6 | 7 | |
7 | 8 | import org.apache.lucene.search.Query; |
8 | 9 | import org.wikimedia.lsearch.beans.IndexReportCard; |
— | — | @@ -95,4 +96,16 @@ |
96 | 97 | * @throws RemoteException |
97 | 98 | */ |
98 | 99 | public Boolean isSuccessfulFlush(String dbname) throws RemoteException; |
| 100 | + |
| 101 | + /** |
| 102 | + * Wildcard matcher. |
| 103 | + * Requests all terms from title and reverse_title that match the wildcard pattern. |
| 104 | + * |
| 105 | + * @param dbrole - part of the index, e.g. enwiki.nspart1 |
| 106 | + * @param wildcard - wildcard pattern with * and ? |
| 107 | + * @param exactCase - whether the pattern uses exact capitalization |
| 108 | + * @return list of matching terms |
| 109 | + * @throws RemoteException |
| 110 | + */ |
| 111 | + public ArrayList<String> getTerms(String dbrole, String wildcard, boolean exactCase) throws RemoteException; |
99 | 112 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java |
— | — | @@ -1,6 +1,8 @@ |
2 | 2 | package org.wikimedia.lsearch.interoperability; |
3 | 3 | |
| 4 | +import java.io.IOException; |
4 | 5 | import java.rmi.RemoteException; |
| 6 | +import java.util.ArrayList; |
5 | 7 | import java.util.Arrays; |
6 | 8 | |
7 | 9 | import org.apache.log4j.Logger; |
— | — | @@ -17,6 +19,8 @@ |
18 | 20 | import org.wikimedia.lsearch.search.NamespaceFilterWrapper; |
19 | 21 | import org.wikimedia.lsearch.search.NetworkStatusThread; |
20 | 22 | import org.wikimedia.lsearch.search.SearchEngine; |
| 23 | +import org.wikimedia.lsearch.search.SearcherCache; |
| 24 | +import org.wikimedia.lsearch.search.Wildcards; |
21 | 25 | |
22 | 26 | /** Local implementation for {@link RMIMessenger} */ |
23 | 27 | public class RMIMessengerImpl implements RMIMessenger { |
— | — | @@ -86,6 +90,14 @@ |
87 | 91 | return new SearchEngine().searchPart(IndexId.get(dbrole),searchterm,query,filter,offset,limit,explain); |
88 | 92 | } |
89 | 93 | |
| 94 | + public ArrayList<String> getTerms(String dbrole, String wildcard, boolean exactCase) throws RemoteException { |
| 95 | + try{ |
| 96 | + return Wildcards.getLocalTerms(IndexId.get(dbrole),wildcard,exactCase); |
| 97 | + } catch(IOException e){ |
| 98 | + throw new RemoteException("IOException on "+dbrole,e); |
| 99 | + } |
| 100 | + } |
| 101 | + |
90 | 102 | // inherit javadoc |
91 | 103 | public int getIndexerQueueSize() throws RemoteException { |
92 | 104 | return IndexThread.getQueueSize(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -156,13 +156,10 @@ |
157 | 157 | ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection()); |
158 | 158 | res.setNumHits(col.size()); |
159 | 159 | res.setSuccess(true); |
160 | | - // TODO: this is extremely slow |
161 | | - //Links links = Links.openForRead(lin,lin.getSearchPath()); |
162 | 160 | for(int i=offset;i<offset+limit && i<col.size();i++){ |
163 | 161 | RelatedTitle rt = col.get(i); |
164 | 162 | Title t = rt.getRelated(); |
165 | 163 | ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle()); |
166 | | - //rs.addContext(links.getContext(t.getKey(),key)); |
167 | 164 | res.addResult(rs); |
168 | 165 | } |
169 | 166 | } else{ |
— | — | @@ -305,7 +302,7 @@ |
306 | 303 | |
307 | 304 | WikiSearcher searcher = null; |
308 | 305 | try { |
309 | | - q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll); |
| 306 | + //q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll); |
310 | 307 | |
311 | 308 | TopDocs hits=null; |
312 | 309 | // see if we can search only part of the index |
— | — | @@ -331,7 +328,11 @@ |
332 | 329 | log.error("Error contacting searcher for "+piid); |
333 | 330 | return res; |
334 | 331 | } |
335 | | - RMIMessengerClient messenger = new RMIMessengerClient(); |
| 332 | + // query |
| 333 | + Wildcards wildcards = new Wildcards(piid,host,exactCase); |
| 334 | + q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll,wildcards); |
| 335 | + |
| 336 | + RMIMessengerClient messenger = new RMIMessengerClient(); |
336 | 337 | res = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host); |
337 | 338 | if(sug != null){ |
338 | 339 | SuggestQuery sq = sug.suggest(searchterm,parser,res); |
— | — | @@ -360,6 +361,10 @@ |
361 | 362 | searcher = new WikiSearcher(iid); |
362 | 363 | // normal search |
363 | 364 | try{ |
| 365 | + // query |
| 366 | + Wildcards wildcards = new Wildcards(searcher.getAllHosts(),exactCase); |
| 367 | + q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll,wildcards); |
| 368 | + |
364 | 369 | hits = searcher.search(q,nsfw,offset+limit); |
365 | 370 | res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
366 | 371 | if(sug != null){ |
— | — | @@ -406,7 +411,7 @@ |
407 | 412 | } |
408 | 413 | } |
409 | 414 | |
410 | | - protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll) throws ParseException { |
| 415 | + protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll, Wildcards wildcards) throws ParseException { |
411 | 416 | Query q = null; |
412 | 417 | if(raw){ |
413 | 418 | // do minimal parsing, make a raw query |
— | — | @@ -414,11 +419,11 @@ |
415 | 420 | q = parser.parseRaw(searchterm); |
416 | 421 | } else if(nsfw == null){ |
417 | 422 | if(searchAll) |
418 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
| 423 | + q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,wildcards); |
419 | 424 | else |
420 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname()); |
| 425 | + q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,wildcards); |
421 | 426 | } else{ |
422 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
| 427 | + q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,wildcards); |
423 | 428 | log.info("Using NamespaceFilterWrapper "+nsfw); |
424 | 429 | } |
425 | 430 | return q; |
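The reordering above is the point of the change: wildcard expansion has to know which host serves each index part, so the query can only be parsed after the searcher (or remote part) is resolved. A condensed restatement of the new distributed-search flow, not a verbatim excerpt; error handling is omitted and the variables are the parameters of SearchEngine.search():

```java
WikiSearcher searcher = new WikiSearcher(iid);
// 1. resolve which host serves each part of the index
Wildcards wildcards = new Wildcards(searcher.getAllHosts(), exactCase);
// 2. only now parse, so the parser can expand wildcards against those hosts
Query q = parseQuery(searchterm, parser, iid, raw, nsfw, searchAll, wildcards);
// 3. search as before
TopDocs hits = searcher.search(q, nsfw, offset + limit);
```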
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java |
— | — | @@ -42,10 +42,11 @@ |
43 | 43 | protected byte[] lengthNoStopWords = null; |
44 | 44 | protected float[] boost = null; |
45 | 45 | protected IndexReader reader = null; |
| 46 | + protected String field; |
46 | 47 | |
47 | 48 | protected AggregateMetaFieldSource(IndexReader reader, String fieldBase) throws IOException{ |
48 | 49 | this.reader = reader; |
49 | | - String field = fieldBase+"_meta"; |
| 50 | + field = fieldBase+"_meta"; |
50 | 51 | Collection fields = reader.getFieldNames(FieldOption.ALL); |
51 | 52 | if(!fields.contains(field)) |
52 | 53 | return; // index doesn't have ranking info |
— | — | @@ -118,10 +119,10 @@ |
119 | 120 | int end = (docid == index.length-1)? length.length : index[docid+1]; |
120 | 121 | if(position >= end-start) |
121 | 122 | try { |
122 | | - throw new ArrayIndexOutOfBoundsException("Requestion position "+position+" for "+docid+" ["+reader.document(docid).get("title")+"], but last valid index is "+(end-start-1)); |
| 123 | + throw new ArrayIndexOutOfBoundsException("Requested position "+position+" on field "+field+" for "+docid+" ["+reader.document(docid).get("title")+"], but last valid index is "+(end-start-1)); |
123 | 124 | } catch (IOException e) { |
124 | 125 | e.printStackTrace(); |
125 | | - throw new ArrayIndexOutOfBoundsException("Requestion position "+position+" unavailable"); |
| 126 | + throw new ArrayIndexOutOfBoundsException("Requested position "+position+" on field "+field+" unavailable"); |
126 | 127 | } |
127 | 128 | return start+position; |
128 | 129 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java |
— | — | @@ -0,0 +1,161 @@ |
| 2 | +package org.wikimedia.lsearch.search; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.rmi.RemoteException; |
| 6 | +import java.util.ArrayList; |
| 7 | +import java.util.Collection; |
| 8 | +import java.util.HashMap; |
| 9 | +import java.util.HashSet; |
| 10 | +import java.util.Map.Entry; |
| 11 | + |
| 12 | +import org.apache.log4j.Logger; |
| 13 | +import org.apache.lucene.index.IndexReader; |
| 14 | +import org.apache.lucene.index.Term; |
| 15 | +import org.apache.lucene.search.DisjunctionMaxQuery; |
| 16 | +import org.apache.lucene.search.Query; |
| 17 | +import org.apache.lucene.search.TermQuery; |
| 18 | +import org.apache.lucene.search.WildcardTermEnum; |
| 19 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
| 20 | +import org.wikimedia.lsearch.config.IndexId; |
| 21 | +import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 22 | +import org.wikimedia.lsearch.util.StringUtils; |
| 23 | + |
| 24 | +/** |
| 25 | + * Wildcard-search related functions |
| 26 | + * @author rainman |
| 27 | + * |
| 28 | + */ |
| 29 | +public class Wildcards { |
| 30 | + protected static Logger log = Logger.getLogger(Wildcards.class); |
| 31 | + public static final int MAX_TERMS = 1024; |
| 32 | + protected static SearcherCache searcherCache = null; |
| 33 | + protected enum WildcardType { PREFIX, SUFFIX, INVALID }; |
| 34 | + |
| 35 | + /** wildcard pattern -> terms */ |
| 36 | + protected HashMap<String,HashSet<String>> wildcardCache = new HashMap<String,HashSet<String>>(); |
| 37 | + /** iid -> host */ |
| 38 | + protected HashMap<String,String> hosts = new HashMap<String,String>(); |
| 39 | + |
| 40 | + protected RMIMessengerClient client = null; |
| 41 | + protected boolean exactCase; |
| 42 | + |
| 43 | + public Wildcards(IndexId iid, String host, boolean exactCase){ |
| 44 | + hosts.put(iid.toString(),host); |
| 45 | + this.exactCase = exactCase; |
| 46 | + } |
| 47 | + |
| 48 | + public Wildcards(HashMap<String,String> iidHostMapping, boolean exactCase){ |
| 49 | + hosts.putAll(iidHostMapping); |
| 50 | + this.exactCase = exactCase; |
| 51 | + } |
| 52 | + |
| 53 | + /** |
| 54 | + * Make a DisjunctionMaxQuery from the expanded wildcard terms |
| 55 | + * |
| 56 | + * @param wildcard |
| 57 | + * @param field |
| 58 | + * @return null if there is no match, or on error |
| 59 | + */ |
| 60 | + public Query makeQuery(String wildcard, String field){ |
| 61 | + if(client == null) |
| 62 | + client = new RMIMessengerClient(); |
| 63 | + |
| 64 | + HashSet<String> terms = wildcardCache.get(wildcard); |
| 65 | + if(terms == null){ |
| 66 | + terms = new HashSet<String>(); |
| 67 | + for(Entry<String,String> e : hosts.entrySet()){ |
| 68 | + try { |
| 69 | + terms.addAll(client.getTerms(e.getValue(),e.getKey(),wildcard,exactCase)); |
| 70 | + } catch (RemoteException e1) { |
| 71 | + e1.printStackTrace(); |
| 72 | + log.warn("Cannot get terms for "+wildcard+" on host "+e.getValue()+" for "+e.getKey()); |
| 73 | + } |
| 74 | + } |
| 75 | + wildcardCache.put(wildcard,terms); |
| 76 | + log.info("Using "+terms.size()+" terms for pattern="+wildcard); |
| 77 | + } |
| 78 | + |
| 79 | + if(terms.size() == 0) |
| 80 | + return null; // no match or error |
| 81 | + |
| 82 | + return makeQuery(terms,field); |
| 83 | + } |
| 84 | + |
| 85 | + /** Construct a DisjunctionMaxQuery from terms */ |
| 86 | + protected Query makeQuery(HashSet<String> terms, String field){ |
| 87 | + if(terms.size() > MAX_TERMS){ |
| 88 | + HashSet<String> temp = new HashSet<String>(); |
| 89 | + int count = 0; |
| 90 | + for(String t : terms){ |
| 91 | + if(count >= MAX_TERMS) |
| 92 | + break; |
| 93 | + temp.add(t); |
| 94 | + count++; |
| 95 | + } |
| 96 | + terms = temp; |
| 97 | + } |
| 98 | + DisjunctionMaxQuery q = new DisjunctionMaxQuery(0); |
| 99 | + for(String t : terms){ |
| 100 | + q.add(new TermQuery(new Term(field,t))); |
| 101 | + } |
| 102 | + return q; |
| 103 | + } |
| 104 | + |
| 105 | + protected static WildcardType getType(String wildcard){ |
| 106 | + if(wildcard == null || wildcard.equals("")) |
| 107 | + return WildcardType.INVALID; |
| 108 | + boolean pre = wildcard.startsWith("*") || wildcard.startsWith("?"); |
| 109 | + boolean suff = wildcard.endsWith("*") || wildcard.endsWith("?"); |
| 110 | + if(pre && !suff) |
| 111 | + return WildcardType.PREFIX; |
| 112 | + else if(suff && !pre) |
| 113 | + return WildcardType.SUFFIX; |
| 114 | + else |
| 115 | + return WildcardType.INVALID; |
| 116 | + } |
| 117 | + |
| 118 | + public static ArrayList<String> getLocalTerms(IndexId iid, String wildcard, boolean exactCase) throws IOException { |
| 119 | + if(searcherCache == null) |
| 120 | + searcherCache = SearcherCache.getInstance(); |
| 121 | + ArrayList<String> ret = new ArrayList<String>(); |
| 122 | + // check type of wildcard |
| 123 | + WildcardType type = getType(wildcard); |
| 124 | + if(type == WildcardType.INVALID) |
| 125 | + return ret; |
| 126 | + // check searcher |
| 127 | + IndexSearcherMul searcher = searcherCache.getLocalSearcher(iid); |
| 128 | + if(searcher == null) |
| 129 | + throw new IOException(iid+" not a local index, or index not available"); |
| 130 | + |
| 131 | + // get field |
| 132 | + IndexReader reader = searcher.getIndexReader(); |
| 133 | + String field = null; |
| 134 | + Term wildcardTerm = null; |
| 135 | + FieldNameFactory fields = new FieldNameFactory(exactCase); |
| 136 | + if(type == WildcardType.PREFIX){ |
| 137 | + field = fields.reverse_title(); |
| 138 | + wildcardTerm = new Term(field,StringUtils.reverseString(wildcard)); |
| 139 | + } else{ |
| 140 | + field = fields.title(); |
| 141 | + wildcardTerm = new Term(field,wildcard); |
| 142 | + } |
| 143 | + |
| 144 | + // get terms |
| 145 | + Term t; |
| 146 | + WildcardTermEnum te = new WildcardTermEnum(reader,wildcardTerm); |
| 147 | + while((t = te.term()) != null){ |
| 148 | + if(type == WildcardType.PREFIX) |
| 149 | + ret.add(StringUtils.reverseString(t.text())); |
| 150 | + else |
| 151 | + ret.add(t.text()); |
| 152 | + |
| 153 | + if(!te.next()) |
| 154 | + break; |
| 155 | + if(ret.size() >= MAX_TERMS) |
| 156 | + break; |
| 157 | + } |
| 158 | + |
| 159 | + return ret; |
| 160 | + } |
| 161 | + |
| 162 | +} |
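A short usage sketch for the class above; the index part and host are hypothetical, and the comments restate how getType() routes each pattern to a field:

```java
// in production the iid -> host mapping comes from WikiSearcher.getAllHosts()
Wildcards wildcards = new Wildcards(IndexId.get("enwiki"), "search3", false);
// "wiki*"  -> SUFFIX : seek prefix "wiki" in the title field
// "*pedia" -> PREFIX : seek prefix "aidep" in reverse_title, reverse matches back
// "*a*"    -> INVALID: no expansion, makeQuery() returns null
Query q = wildcards.makeQuery("wiki*", "title"); // field name as built by FieldNameFactory
if(q != null)
    System.out.println(q); // DisjunctionMaxQuery over at most MAX_TERMS term queries
```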
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/WikiSearcher.java |
— | — | @@ -4,7 +4,9 @@ |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.Arrays; |
7 | 7 | import java.util.Collection; |
| 8 | +import java.util.HashMap; |
8 | 9 | import java.util.Hashtable; |
| 10 | +import java.util.Map.Entry; |
9 | 11 | |
10 | 12 | import org.apache.log4j.Logger; |
11 | 13 | import org.apache.lucene.document.Document; |
— | — | @@ -108,6 +110,15 @@ |
109 | 111 | else |
110 | 112 | return cache.getSearchableHost(s); |
111 | 113 | } |
| 114 | + |
| 115 | + /** Get map iid->host for all parts in this searcher */ |
| 116 | + public HashMap<String,String> getAllHosts(){ |
| 117 | + HashMap<String,String> ret = new HashMap<String,String>(); |
| 118 | + for(Entry<String,Searchable> e : searcherParts.entrySet()){ |
| 119 | + ret.put(e.getKey(),cache.getSearchableHost(e.getValue())); |
| 120 | + } |
| 121 | + return ret; |
| 122 | + } |
112 | 123 | |
113 | 124 | @Override |
114 | 125 | public void close() throws IOException { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -40,6 +40,7 @@ |
41 | 41 | import org.wikimedia.lsearch.analyzers.LanguageAnalyzer; |
42 | 42 | import org.wikimedia.lsearch.analyzers.RelatedAnalyzer; |
43 | 43 | import org.wikimedia.lsearch.analyzers.StopWords; |
| 44 | +import org.wikimedia.lsearch.analyzers.TokenizerOptions; |
44 | 45 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
45 | 46 | import org.wikimedia.lsearch.beans.Article; |
46 | 47 | import org.wikimedia.lsearch.beans.IndexReportCard; |
— | — | @@ -53,6 +54,7 @@ |
54 | 55 | import org.wikimedia.lsearch.spell.api.SpellCheckIndexer; |
55 | 56 | import org.wikimedia.lsearch.util.Localization; |
56 | 57 | import org.wikimedia.lsearch.util.MathFunc; |
| 58 | +import org.wikimedia.lsearch.util.StringUtils; |
57 | 59 | |
58 | 60 | /** |
59 | 61 | * IndexModifier for batch update of local lucene index. |
— | — | @@ -481,7 +483,7 @@ |
482 | 484 | FilterFactory filters = bs.getFilters(); |
483 | 485 | |
484 | 486 | // tokenize the article to fill in pre-analyzed fields |
485 | | - WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,bs.isExactCase()); |
| 487 | + WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,new TokenizerOptions(bs.isExactCase())); |
486 | 488 | tokenizer.tokenize(); |
487 | 489 | |
488 | 490 | // title |
— | — | @@ -510,7 +512,7 @@ |
511 | 513 | } |
512 | 514 | |
513 | 515 | // reverse title for wildcard searches |
514 | | - Field rtitle = new Field(fields.reverse_title(), reverseString(article.getTitle()), Field.Store.NO, Field.Index.TOKENIZED); |
| 516 | + Field rtitle = new Field(fields.reverse_title(), StringUtils.reverseString(article.getTitle()), Field.Store.NO, Field.Index.TOKENIZED); |
515 | 517 | rtitle.setBoost(rankBoost); |
516 | 518 | doc.add(rtitle); |
517 | 519 | |
— | — | @@ -518,20 +520,11 @@ |
519 | 521 | return doc; |
520 | 522 | } |
521 | 523 | |
522 | | - /** reverse a string */ |
523 | | - public static String reverseString(String str){ |
524 | | - int len = str.length(); |
525 | | - char[] buf = new char[len]; |
526 | | - for(int i=0;i<len;i++) |
527 | | - buf[i] = str.charAt(len-i-1); |
528 | | - return new String(buf,0,len); |
529 | | - } |
530 | | - |
531 | 524 | /** add related aggregate field */ |
532 | 525 | protected static void makeRelated(Document doc, String prefix, Article article, IndexId iid, HashSet<String> stopWords){ |
533 | 526 | ArrayList<Aggregate> items = new ArrayList<Aggregate>(); |
534 | 527 | for(RelatedTitle rt : article.getRelated()){ |
535 | | - items.add(new Aggregate(rt.getRelated().getTitle(),transformRelated(rt.getScore()),iid,false,stopWords)); |
| 528 | + addToItems(items,new Aggregate(rt.getRelated().getTitle(),transformRelated(rt.getScore()),iid,false,stopWords)); |
536 | 529 | } |
537 | 530 | makeAggregate(doc,prefix,items); |
538 | 531 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/Transaction.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.wikimedia.lsearch.config.Configuration; |
12 | 12 | import org.wikimedia.lsearch.config.IndexId; |
13 | 13 | import org.wikimedia.lsearch.util.Command; |
| 14 | +import org.wikimedia.lsearch.util.FSUtils; |
14 | 15 | |
15 | 16 | /** |
16 | 17 | * Simple transaction support for indexing. Wrap index operations by |
— | — | @@ -58,20 +59,19 @@ |
59 | 60 | // start new transaction |
60 | 61 | backup.getParentFile().mkdirs(); |
61 | 62 | try{ |
62 | | - if( exec("/bin/cp -lr "+iid.getIndexPath()+" "+backup.getAbsolutePath()) == 0){ |
63 | | - Properties prop = new Properties(); |
64 | | - // write out the status file |
65 | | - prop.setProperty("status","started at "+System.currentTimeMillis()); |
66 | | - FileOutputStream fileos = new FileOutputStream(info,false); |
67 | | - prop.store(fileos,""); |
68 | | - fileos.close(); |
69 | | - // all is good, set transaction flag |
70 | | - inTransaction = true; |
71 | | - log.info("Transaction on index "+iid+" started"); |
72 | | - } else |
73 | | - log.warn("Making a transaction copy for "+iid+" failed."); |
| 63 | + // make a copy |
| 64 | + FSUtils.createHardLinkRecursive(iid.getIndexPath(),backup.getAbsolutePath()); |
| 65 | + Properties prop = new Properties(); |
| 66 | + // write out the status file |
| 67 | + prop.setProperty("status","started at "+System.currentTimeMillis()); |
| 68 | + FileOutputStream fileos = new FileOutputStream(info,false); |
| 69 | + prop.store(fileos,""); |
| 70 | + fileos.close(); |
| 71 | + // all is good, set transaction flag |
| 72 | + inTransaction = true; |
| 73 | + log.info("Transaction on index "+iid+" started"); |
74 | 74 | } catch(Exception e){ |
75 | | - log.warn("Error while intializing transaction: "+e.getMessage()); |
| 75 | + log.error("Error while initializing transaction: "+e.getMessage()); |
76 | 76 | } |
77 | 77 | } |
78 | 78 | |
— | — | @@ -82,11 +82,11 @@ |
83 | 83 | // cleanup before starting new transaction |
84 | 84 | try{ |
85 | 85 | if(trans.exists()) |
86 | | - exec("/bin/rm -rf "+trans.getAbsolutePath()); |
| 86 | + FSUtils.deleteRecursive(trans.getAbsoluteFile()); |
87 | 87 | if(info.exists()) |
88 | | - exec("/bin/rm -rf "+info.getAbsolutePath()); |
| 88 | + FSUtils.deleteRecursive(info.getAbsoluteFile()); |
89 | 89 | } catch(Exception e){ |
90 | | - log.warn("Error removing old transaction data from "+iid.getTransactionPath()+" : "+e.getMessage()); |
| 90 | + log.error("Error removing old transaction data from "+iid.getTransactionPath()+" : "+e.getMessage()); |
91 | 91 | } |
92 | 92 | |
93 | 93 | } |
— | — | @@ -122,15 +122,14 @@ |
123 | 123 | try{ |
124 | 124 | if(index.exists()) // clear locks before recovering |
125 | 125 | WikiIndexModifier.unlockIndex(iid.getIndexPath()); |
126 | | - if( exec("/bin/rm -rf "+iid.getIndexPath()) == 0 ){ |
127 | | - if( exec("/bin/mv "+backup.getAbsolutePath()+" "+iid.getIndexPath()) == 0 ){ |
128 | | - log.info("Successfully recovered index for "+iid); |
129 | | - } else |
130 | | - log.warn("Recovery of "+iid+" failed: cannot move "+backup.getAbsolutePath()); |
131 | | - } else |
132 | | - log.warn("Recovery of "+iid+" failed: cannot delete "+iid.getIndexPath()); |
| 126 | + |
| 127 | + // delete old indexpath |
| 128 | + FSUtils.deleteRecursive(new File(iid.getIndexPath())); |
| 129 | + |
| 130 | + FSUtils.createHardLinkRecursive(backup.getAbsolutePath(),iid.getIndexPath()); |
| 131 | + FSUtils.deleteRecursive(backup.getAbsoluteFile()); // cleanup |
133 | 132 | } catch(Exception e){ |
134 | | - log.warn("Recovery of index "+iid+" failed with error "+e.getMessage()); |
| 133 | + log.error("Recovery of index "+iid+" failed with error "+e.getMessage()); |
135 | 134 | } |
136 | 135 | } |
137 | 136 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ReusableLanguageAnalyzer.java |
— | — | @@ -6,18 +6,19 @@ |
7 | 7 | import org.apache.lucene.analysis.TokenStream; |
8 | 8 | |
9 | 9 | /** |
10 | | - * Reusable language analyzer. Can be used to tokenize arbitrary text. |
| 10 | + * Reusable language analyzer. Should be used to tokenize queries and |
| 11 | + * other non-wikitext content. Template relocation, etc. is turned off. |
11 | 12 | * |
12 | 13 | * @author rainman |
13 | 14 | * |
14 | 15 | */ |
15 | 16 | public class ReusableLanguageAnalyzer extends LanguageAnalyzer { |
16 | 17 | static org.apache.log4j.Logger log = Logger.getLogger(ReusableLanguageAnalyzer.class); |
17 | | - protected boolean exactCase; |
| 18 | + protected TokenizerOptions options; |
18 | 19 | |
19 | 20 | public ReusableLanguageAnalyzer(FilterFactory filters, boolean exactCase){ |
20 | 21 | super(filters,null); |
21 | | - this.exactCase = exactCase; |
| 22 | + this.options = new TokenizerOptions.NoRelocation(exactCase); |
22 | 23 | } |
23 | 24 | |
24 | 25 | /** |
— | — | @@ -25,7 +26,7 @@ |
26 | 27 | */ |
27 | 28 | @Override |
28 | 29 | public TokenStream tokenStream(String fieldName, String text) { |
29 | | - wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase); |
| 30 | + wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),options); |
30 | 31 | return super.tokenStream(fieldName,(Reader)null); |
31 | 32 | } |
32 | 33 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -64,9 +64,10 @@ |
65 | 65 | ArrayList<ArrayList<String>> keywordsBySize = new ArrayList<ArrayList<String>>(); |
66 | 66 | for(int i=0;i<KEYWORD_LEVELS;i++) |
67 | 67 | keywordsBySize.add(new ArrayList<String>()); |
| 68 | + TokenizerOptions options = new TokenizerOptions(exactCase); |
68 | 69 | // arange keywords into a list by token number |
69 | 70 | for(String k : keywords){ |
70 | | - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,exactCase).parse(); |
| 71 | + ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,options).parse(); |
71 | 72 | if(parsed.size() == 0) |
72 | 73 | continue; |
73 | 74 | else if(parsed.size() < KEYWORD_LEVELS) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +/** |
| 5 | + * FastWikiTokenizerEngine options |
| 6 | + * |
| 7 | + * @author rainman |
| 8 | + * |
| 9 | + */ |
| 10 | +public class TokenizerOptions { |
| 11 | + /** whether capitalization should be preserved */ |
| 12 | + boolean exactCase = false; |
| 13 | + /** whether templates should be relocated, etc.; makes sense only if the whole |
| 14 | + * article is parsed (not a query or part of an article) */ |
| 15 | + boolean relocationParsing = true; |
| 16 | + |
| 17 | + public TokenizerOptions(boolean exactCase){ |
| 18 | + this.exactCase = exactCase; |
| 19 | + } |
| 20 | + |
| 21 | + public static class NoRelocation extends TokenizerOptions { |
| 22 | + public NoRelocation(boolean exactCase){ |
| 23 | + super(exactCase); |
| 24 | + this.relocationParsing = false; |
| 25 | + } |
| 26 | + } |
| 27 | +} |
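Both variants are exercised by this commit: plain TokenizerOptions at the article-indexing call sites, and NoRelocation where queries or isolated snippets are analyzed (ReusableLanguageAnalyzer, Aggregate). A minimal sketch mirroring those call sites:

```java
String text = "== Heading ==\nSome [[wiki]] text.<ref>a citation</ref>";
// article indexing: template/reference relocation stays enabled
WikiTokenizer forArticles = new WikiTokenizer(text, IndexId.get("enwiki"),
        new TokenizerOptions(false));
// query analysis: relocation off, since section gaps make no sense for a query string
WikiTokenizer forQueries = new WikiTokenizer(text, IndexId.get("enwiki"),
        new TokenizerOptions.NoRelocation(false));
```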
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java |
— | — | @@ -20,7 +20,8 @@ |
21 | 21 | |
22 | 22 | /** Construct from arbitrary text that will be tokenized */ |
23 | 23 | public Aggregate(String text, float boost, IndexId iid, boolean exactCase, HashSet<String> stopWords){ |
24 | | - tokens = new FastWikiTokenizerEngine(text,iid,exactCase).parse(); |
| 24 | + TokenizerOptions options = new TokenizerOptions.NoRelocation(exactCase); |
| 25 | + tokens = new FastWikiTokenizerEngine(text,iid,options).parse(); |
25 | 26 | this.boost = boost; |
26 | 27 | noStopWordsLength = 0; |
27 | 28 | for(Token t : tokens){ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/AggregateAnalyzer.java |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | if(item >= items.size()) |
26 | 26 | return null; |
27 | 27 | Aggregate ag = items.get(item); |
28 | | - if(token >= ag.length()){ |
| 28 | + if(token >= ag.length() || token >= TOKEN_GAP-1){ |
29 | 29 | gap = true; |
30 | 30 | do{ |
31 | 31 | // find next nonempty item |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java |
— | — | @@ -38,8 +38,8 @@ |
39 | 39 | * @param str |
40 | 40 | */ |
41 | 41 | |
42 | | - public WikiTokenizer(String str, IndexId iid, boolean exactCase){ |
43 | | - parser = new FastWikiTokenizerEngine(str,iid,exactCase); |
| 42 | + public WikiTokenizer(String str, IndexId iid, TokenizerOptions options){ |
| 43 | + parser = new FastWikiTokenizerEngine(str,iid,options); |
44 | 44 | this.input = null; |
45 | 45 | } |
46 | 46 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -56,6 +56,8 @@ |
57 | 57 | private int templateLevel = 0; // level of nestedness of templates |
58 | 58 | private int gap = 1; |
59 | 59 | private ArrayList<Token> nonContentTokens; // tokens from the beginning of the article that are not content, but templates, images, etc.. |
| 60 | + private ArrayList<Token> references; // stuff between <ref></ref> tags should go to the end |
| 61 | + private boolean inRef = false, firstRef = false; // if we are within a ref tag |
60 | 62 | |
61 | 63 | private int prefixLen = 0; |
62 | 64 | private final char[] prefixBuf = new char[MAX_WORD_LEN]; |
— | — | @@ -77,6 +79,8 @@ |
78 | 80 | public static int BULLETIN_GAP = 10; |
79 | 81 | /** Gap between sentences */ |
80 | 82 | public static int SENTENCE_GAP = 2; |
| 83 | + /** Gap between references */ |
| 84 | + public static int REFERENCE_GAP = 20; |
81 | 85 | |
82 | 86 | /** language code */ |
83 | 87 | private String language; |
— | — | @@ -87,9 +91,8 @@ |
88 | 92 | private static Hashtable<String,HashSet<String>> categoryLocalized = new Hashtable<String,HashSet<String>>(); |
89 | 93 | private static HashSet<String> interwiki; |
90 | 94 | |
91 | | - /** if true, words won't be lowercased */ |
92 | | - private boolean exactCase = false; |
93 | | - private UnicodeDecomposer decomposer; |
| 95 | + private UnicodeDecomposer decomposer; |
| 96 | + private TokenizerOptions options; |
94 | 97 | |
95 | 98 | enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD, |
96 | 99 | LINK_FETCH, IGNORE, EXTERNAL_URL, EXTERNAL_WORDS, |
— | — | @@ -108,6 +111,9 @@ |
109 | 112 | numberToken = false; |
110 | 113 | headingText = new ArrayList<String>(); |
111 | 114 | nonContentTokens = new ArrayList<Token>(); |
| 115 | + inRef = false; |
| 116 | + firstRef = false; |
| 117 | + references = new ArrayList<Token>(); |
112 | 118 | } |
113 | 119 | |
114 | 120 | /** Note: this will read only 1024 bytes of reader, it's |
— | — | @@ -127,12 +133,12 @@ |
128 | 134 | } |
129 | 135 | } |
130 | 136 | |
131 | | - public FastWikiTokenizerEngine(String text, IndexId iid, boolean exactCase){ |
| 137 | + public FastWikiTokenizerEngine(String text, IndexId iid, TokenizerOptions options){ |
132 | 138 | this.text = text.toCharArray(); |
133 | 139 | this.textString = text; |
134 | 140 | this.language = iid.getLangCode(); |
135 | 141 | this.iid = iid; |
136 | | - this.exactCase = exactCase; |
| 142 | + this.options = options; |
137 | 143 | textLength = text.length(); |
138 | 144 | init(); |
139 | 145 | } |
— | — | @@ -175,7 +181,7 @@ |
176 | 182 | boolean addDecomposed = false; |
177 | 183 | for(int i=0;i<length;i++){ |
178 | 184 | addToAlias = true; |
179 | | - if( ! exactCase ) |
| 185 | + if( ! options.exactCase ) |
180 | 186 | cl = Character.toLowerCase(buffer[i]); |
181 | 187 | else{ |
182 | 188 | cl = buffer[i]; |
— | — | @@ -303,7 +309,19 @@ |
304 | 310 | * @param t |
305 | 311 | */ |
306 | 312 | private final void addToTokens(Token t){ |
307 | | - if(templateLevel > 0 && keywordTokens < FIRST_SECTION_GAP){ |
| 313 | + if(!options.relocationParsing){ |
| 314 | + tokens.add(t); |
| 315 | + return; |
| 316 | + } |
| 317 | + // and now, relocation parsing: |
| 318 | + if(inRef){ |
| 319 | + if(firstRef){ // delimit whole references from each other |
| 320 | + firstRef = false; |
| 321 | + t.setPositionIncrement(REFERENCE_GAP); |
| 322 | + } |
| 323 | + references.add(t); |
| 324 | + return; |
| 325 | + } else if(templateLevel > 0 && keywordTokens < FIRST_SECTION_GAP){ |
308 | 326 | nonContentTokens.add(t); |
309 | 327 | return; |
310 | 328 | } else if(t.getPositionIncrement() == FIRST_SECTION_GAP){ |
— | — | @@ -478,11 +496,34 @@ |
479 | 497 | // check |
480 | 498 | if(start == end && start != 0 && start+end<endOfLine-cur && start>=2 && start<=4){ |
481 | 499 | headings++; |
482 | | - headingText.add(new String(text,cur+start,endOfLine-(cur+start+end))); |
| 500 | + headingText.add(deleteRefs(new String(text,cur+start,endOfLine-(cur+start+end)))); |
483 | 501 | } |
484 | 502 | } |
485 | 503 | } |
486 | 504 | |
| 505 | + /** Delete <ref></ref> text from a string */ |
| 506 | + protected String deleteRefs(String str){ |
| 507 | + int start; |
| 508 | + while((start = str.indexOf("<ref>")) != -1){ |
| 509 | + int end = str.indexOf("</ref>",start+1); |
| 510 | + if(end == -1) |
| 511 | + break; |
| 512 | + str = str.substring(0,start)+((end+6<str.length())? str.substring(end+6) : ""); |
| 513 | + } |
| 514 | + return str; |
| 515 | + } |
| 516 | + |
| 517 | + /** Check if, starting from the current position, the given string is matched (text is lowercased for the comparison) */ |
| 518 | + protected boolean matchesString(String target){ |
| 519 | + if(cur + target.length() > textLength) |
| 520 | + return false; |
| 521 | + for(lookup=cur,lc=0;lc<target.length();lookup++,lc++){ |
| 522 | + if(target.charAt(lc) != Character.toLowerCase(text[lookup])) |
| 523 | + return false; |
| 524 | + } |
| 525 | + return true; |
| 526 | + } |
| 527 | + |
487 | 528 | /** |
488 | 529 | * Parse Wiki text, and produce an arraylist of tokens. |
489 | 530 | * Also fills the lists categories and interwikis. |
— | — | @@ -517,14 +558,16 @@ |
518 | 559 | case '=': |
519 | 560 | addToken(); |
520 | 561 | checkHeadings(); |
521 | | - if(headings == 1) |
522 | | - gap = FIRST_SECTION_GAP; |
523 | | - else if(headings > 1) |
524 | | - gap = SECTION_GAP; |
| 562 | + if(options.relocationParsing){ |
| 563 | + if(headings == 1) |
| 564 | + gap = FIRST_SECTION_GAP; |
| 565 | + else if(headings > 1) |
| 566 | + gap = SECTION_GAP; |
| 567 | + } |
525 | 568 | continue; |
526 | 569 | case '\n': |
527 | 570 | addToken(); |
528 | | - if(cur + 1 < textLength){ |
| 571 | + if(options.relocationParsing && cur + 1 < textLength){ |
529 | 572 | switch(text[cur+1]){ |
530 | 573 | case '\n': gap = PARAGRAPH_GAP; break; |
531 | 574 | case '*': case ':': case '#': gap = BULLETIN_GAP; break; |
— | — | @@ -539,11 +582,19 @@ |
540 | 583 | case ':': |
541 | 584 | case ';': |
542 | 585 | addToken(); |
543 | | - if(gap == 1) |
| 586 | + if(options.relocationParsing && gap == 1) |
544 | 587 | gap = SENTENCE_GAP; |
545 | 588 | continue; |
546 | 589 | case '<': |
547 | 590 | addToken(); |
| 591 | + if(matchesString("<ref>")){ |
| 592 | + inRef = true; |
| 593 | + firstRef = true; |
| 594 | + } |
| 595 | + if(matchesString("</ref>")){ |
| 596 | + inRef = false; |
| 597 | + gap = 1; |
| 598 | + } |
548 | 599 | state = ParserState.IGNORE; |
549 | 600 | ignoreEnd = '>'; |
550 | 601 | continue; |
— | — | @@ -817,6 +868,12 @@ |
818 | 869 | } |
819 | 870 | nonContentTokens.clear(); |
820 | 871 | } |
| 872 | + // add references to end |
| 873 | + if(references.size() != 0){ |
| 874 | + for(Token tt : references){ |
| 875 | + tokens.add(tt); |
| 876 | + } |
| 877 | + } |
821 | 878 | return tokens; |
822 | 879 | } |
823 | 880 | |
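The intended effect of the new ref handling: tokens inside <ref>...</ref> are buffered in the references list and appended after the article body, with REFERENCE_GAP separating successive references, so phrase matches cannot cross a citation boundary and citations no longer pollute the opening section. A sketch of the expected ordering, derived from the code above:

```java
String text = "Paris is a city.<ref>Jones 2001</ref> It is in France.";
FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(
        text, IndexId.get("enwiki"), new TokenizerOptions(false));
// expected order: paris, is, a, city, it, is, in, france,
// then jones, 2001 after a REFERENCE_GAP position increment
for(Token t : parser.parse())
    System.out.println(t.termText() + " +" + t.getPositionIncrement());
```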
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -42,6 +42,7 @@ |
43 | 43 | import org.wikimedia.lsearch.search.NamespaceFilter; |
44 | 44 | import org.wikimedia.lsearch.search.RankField; |
45 | 45 | import org.wikimedia.lsearch.search.RankValue; |
| 46 | +import org.wikimedia.lsearch.search.Wildcards; |
46 | 47 | import org.wikimedia.lsearch.search.RankField.RankFieldSource; |
47 | 48 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
48 | 49 | |
— | — | @@ -79,7 +80,7 @@ |
80 | 81 | private float defaultBoost = 1; |
81 | 82 | private float defaultAliasBoost = ALIAS_BOOST; |
82 | 83 | protected enum TokenType {WORD, FIELD, AND, OR, EOF }; |
83 | | - |
| 84 | + |
84 | 85 | private TokenStream tokenStream; |
85 | 86 | private ArrayList<Token> tokens; // tokens from analysis |
86 | 87 | |
— | — | @@ -140,6 +141,7 @@ |
141 | 142 | protected FieldBuilder.BuilderSet builder; |
142 | 143 | protected FieldNameFactory fields; |
143 | 144 | protected HashSet<String> stopWords; |
| 145 | + protected Wildcards wildcards = null; |
144 | 146 | |
145 | 147 | /** default value for boolean queries */ |
146 | 148 | public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST; |
— | — | @@ -620,7 +622,7 @@ |
621 | 623 | continue; |
622 | 624 | |
623 | 625 | // terms, fields |
624 | | - if(Character.isLetterOrDigit(c) || c == '['){ |
| 626 | + if(Character.isLetterOrDigit(c) || c == '[' || c=='*' || c=='?'){ |
625 | 627 | // check for generic namespace prefixes, e.g. [0,1]: |
626 | 628 | if(c == '['){ |
627 | 629 | if(fetchGenericPrefix()) |
— | — | @@ -780,6 +782,33 @@ |
781 | 783 | return query; |
782 | 784 | } |
783 | 785 | |
| 786 | + /** Return true if the buffer holds a valid wildcard pattern */ |
| 787 | + private boolean bufferIsWildCard(){ |
| 788 | + if(length < 1) |
| 789 | + return false; |
| 790 | + boolean wild = false; |
| 791 | + int index = -1; |
| 792 | + for(int i=0;i<length;i++){ |
| 793 | + if(buffer[i] == '*' || buffer[i] == '?'){ |
| 794 | + wild = true; |
| 795 | + index = i; |
| 796 | + break; |
| 797 | + } |
| 798 | + } |
| 799 | + // check if it's a valid wildcard |
| 800 | + if(wild){ |
| 801 | + if((buffer[0] == '*' || buffer[0] == '?') && (buffer[length-1]=='*' || buffer[length-1]=='?')) |
| 802 | + return false; // don't support patterns like *a* |
| 803 | + if(index == length-1 && buffer[index]=='?') |
| 804 | + return false; // probably just an ordinary question mark |
| 805 | + for(int i=0;i<length;i++){ |
| 806 | + if(Character.isLetterOrDigit(buffer[i])) |
| 807 | + return true; // +card :P |
| 808 | + } |
| 809 | + } |
| 810 | + return false; |
| 811 | + } |
| 812 | + |
784 | 813 | /** |
785 | 814 | * Constructs either a termquery or a boolean query depending on |
786 | 815 | * analysis of the fetched token. A single "word" might be analyzed |
— | — | @@ -798,11 +827,16 @@ |
799 | 828 | |
800 | 829 | // check for wildcard seaches, they are also not analyzed/stemmed, only for titles |
801 | 830 | // wildcard signs are allowed only at the end of the word, minimum one letter word |
802 | | - if(length>1 && Character.isLetter(buffer[0]) && buffer[length-1]=='*' && |
803 | | - defaultField.equals(fields.title())){ |
804 | | - Query ret = new WildcardQuery(makeTerm()); |
805 | | - ret.setBoost(defaultBoost); |
806 | | - return ret; |
| 831 | + if(length>1 && wildcards != null && bufferIsWildCard()){ |
| 832 | + Term term = makeTerm(); |
| 833 | + Query ret = wildcards.makeQuery(term.text(),term.field()); |
| 834 | + if(ret != null){ |
| 835 | + ret.setBoost(defaultBoost); |
| 836 | + return ret; |
| 837 | + } else{ |
| 838 | + // something is wrong, try making normal query |
| 839 | + return new TermQuery(term); |
| 840 | + } |
807 | 841 | } |
808 | 842 | |
809 | 843 | if(toplevelOccur == BooleanClause.Occur.MUST_NOT) |
— | — | @@ -1432,15 +1466,31 @@ |
1433 | 1467 | /** Make the main phrase query, finds exact phrases, and sloppy phrases without stop words */ |
1434 | 1468 | public Query makeMainPhrase(ArrayList<String> words, String field, int slop, float boost, Query stemtitle, Query related, HashSet<String> preStopWords){ |
1435 | 1469 | RankValue val = new RankValue(); |
1436 | | - CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.ContentsSloppyOptions(val,stemtitle,related), |
1437 | | - new QueryOptions.ContentsExactOptions(val,stemtitle,related),preStopWords); |
| 1470 | + boolean allStopWords = true; |
1438 | 1471 | for(String w : words){ |
1439 | | - pq.add(new Term(field,w)); |
| 1472 | + if(!preStopWords.contains(w)){ |
| 1473 | + allStopWords = false; |
| 1474 | + break; |
| 1475 | + } |
1440 | 1476 | } |
1441 | | - pq.setSlop(slop); |
1442 | | - pq.setBoost(boost); |
1443 | | - return pq; |
1444 | | - |
| 1477 | + if(allStopWords){ |
| 1478 | + CustomPhraseQuery pq = new CustomPhraseQuery(new QueryOptions.ContentsExactOptions(val,stemtitle,related)); |
| 1479 | + for(String w : words){ |
| 1480 | + pq.add(new Term(field,w)); |
| 1481 | + } |
| 1482 | + pq.setSlop(slop); |
| 1483 | + pq.setBoost(boost); |
| 1484 | + return pq; |
| 1485 | + } else{ |
| 1486 | + CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.ContentsSloppyOptions(val,stemtitle,related), |
| 1487 | + new QueryOptions.ContentsExactOptions(val,stemtitle,related),preStopWords); |
| 1488 | + for(String w : words){ |
| 1489 | + pq.add(new Term(field,w)); |
| 1490 | + } |
| 1491 | + pq.setSlop(slop); |
| 1492 | + pq.setBoost(boost); |
| 1493 | + return pq; |
| 1494 | + } |
1445 | 1495 | } |
1446 | 1496 | |
1447 | 1497 | /** make single phrase for related field */ |
— | — | @@ -1616,14 +1666,31 @@ |
1617 | 1667 | /** Make the phrase that will match redirects, etc.. */ |
1618 | 1668 | public Query makeAlttitlePhrase(ArrayList<String> words, String field, int slop, float boost, HashSet<String> preStopWords){ |
1619 | 1669 | AggregatePhraseInfo ap = new AggregatePhraseInfo(); |
1620 | | - CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.AlttitleSloppyOptions(ap), |
1621 | | - new QueryOptions.AlttitleExactOptions(ap),preStopWords); |
| 1670 | + boolean allStopWords = true; |
1622 | 1671 | for(String w : words){ |
1623 | | - pq.add(new Term(field,w)); |
| 1672 | + if(!preStopWords.contains(w)){ |
| 1673 | + allStopWords = false; |
| 1674 | + break; |
| 1675 | + } |
1624 | 1676 | } |
1625 | | - pq.setSlop(slop); |
1626 | | - pq.setBoost(boost); |
1627 | | - return pq; |
| 1677 | + if(allStopWords){ |
| 1678 | + CustomPhraseQuery pq = new CustomPhraseQuery(new QueryOptions.AlttitleExactOptions(ap)); |
| 1679 | + for(String w : words){ |
| 1680 | + pq.add(new Term(field,w)); |
| 1681 | + } |
| 1682 | + pq.setSlop(slop); |
| 1683 | + pq.setBoost(boost); |
| 1684 | + return pq; |
| 1685 | + } else{ |
| 1686 | + CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.AlttitleSloppyOptions(ap), |
| 1687 | + new QueryOptions.AlttitleExactOptions(ap),preStopWords); |
| 1688 | + for(String w : words){ |
| 1689 | + pq.add(new Term(field,w)); |
| 1690 | + } |
| 1691 | + pq.setSlop(slop); |
| 1692 | + pq.setBoost(boost); |
| 1693 | + return pq; |
| 1694 | + } |
1628 | 1695 | |
1629 | 1696 | } |
1630 | 1697 | |
— | — | @@ -1650,7 +1717,8 @@ |
1651 | 1718 | * @return |
1652 | 1719 | */ |
1653 | 1720 | @SuppressWarnings("unchecked") |
1654 | | - protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){ |
| 1721 | + protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords, Wildcards wildcards){ |
| 1722 | + this.wildcards = wildcards; |
1655 | 1723 | queryText = quoteCJK(queryText); |
1656 | 1724 | if(policy != null) |
1657 | 1725 | this.namespacePolicy = policy; |
— | — | @@ -1669,6 +1737,9 @@ |
1670 | 1738 | BooleanQuery bq = new BooleanQuery(true); |
1671 | 1739 | bq.add(qc,BooleanClause.Occur.SHOULD); |
1672 | 1740 | bq.add(qt,BooleanClause.Occur.SHOULD); |
| 1741 | + |
| 1742 | + if(words.size() == 0) |
| 1743 | + return bq; |
1673 | 1744 | |
1674 | 1745 | HashSet<String> preStopWords = StopWords.getPredefinedSet(builder.getFilters().getIndexId()); |
1675 | 1746 | Query alttitleQuery = makeAlttitlePhrase(words,fields.alttitle(),10,1,preStopWords); |
— | — | @@ -1701,7 +1772,11 @@ |
1702 | 1773 | return coreQuery; |
1703 | 1774 | |
1704 | 1775 | } |
1705 | | - |
| 1776 | + |
| 1777 | + public Query parseWithWildcards(String queryText, NamespacePolicy policy, Wildcards wildcards){ |
| 1778 | + return parseMultiPass(queryText,policy,false,false,wildcards); |
| 1779 | + } |
| 1780 | + |
1706 | 1781 | /** |
1707 | 1782 | * Three parse pases: contents, title, redirect |
1708 | 1783 | * |
— | — | @@ -1711,7 +1786,7 @@ |
1712 | 1787 | * @throws ParseException |
1713 | 1788 | */ |
1714 | 1789 | public Query parseThreePass(String queryText, NamespacePolicy policy) throws ParseException{ |
1715 | | - return parseMultiPass(queryText,policy,true,false); |
| 1790 | + return parseMultiPass(queryText,policy,true,false,null); |
1716 | 1791 | } |
1717 | 1792 | |
1718 | 1793 | /** |
— | — | @@ -1723,11 +1798,11 @@ |
1724 | 1799 | */ |
1725 | 1800 | public Query parseFourPass(String queryText, NamespacePolicy policy, String dbname) throws ParseException{ |
1726 | 1801 | boolean makeKeywords = global.useKeywordScoring(dbname); |
1727 | | - return parseMultiPass(queryText,policy,true,makeKeywords); |
| 1802 | + return parseMultiPass(queryText,policy,true,makeKeywords,null); |
1728 | 1803 | } |
1729 | 1804 | |
1730 | 1805 | public Query parseFourPass(String queryText, NamespacePolicy policy, boolean makeKeywords) throws ParseException{ |
1731 | | - return parseMultiPass(queryText,policy,true,makeKeywords); |
| 1806 | + return parseMultiPass(queryText,policy,true,makeKeywords,null); |
1732 | 1807 | } |
1733 | 1808 | |
1734 | 1809 | /** |
— | — | @@ -1740,7 +1815,7 @@ |
1741 | 1816 | * @throws ParseException |
1742 | 1817 | */ |
1743 | 1818 | public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{ |
1744 | | - return parseMultiPass(queryText,policy,false,false); |
| 1819 | + return parseMultiPass(queryText,policy,false,false,null); |
1745 | 1820 | } |
1746 | 1821 | |
1747 | 1822 | public NamespacePolicy getNamespacePolicy() { |
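Taken together with the Wildcards plumbing, the parser changes mean a wildcard term is no longer handed to Lucene's WildcardQuery (and no longer restricted to the title field): it is expanded up front into a bounded disjunction of concrete terms, and only when bufferIsWildCard() accepts the pattern. A sketch of the call, with searcher and parser as in SearchEngine:

```java
Wildcards wildcards = new Wildcards(searcher.getAllHosts(), false);
Query q = parser.parseWithWildcards("wiki* guide",
        WikiQueryParser.NamespacePolicy.IGNORE, wildcards);
// accepted patterns: "wiki*", "w?ki*", "*pedia"
// rejected patterns: "*a*" (wildcard at both ends), "what?" (lone trailing '?')
```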