Index: trunk/lucene-search-3/src/main/java/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -21,6 +21,7 @@ |
22 | 22 | import org.apache.lucene.search.ArticleQueryWrap; |
23 | 23 | import org.apache.lucene.search.ArticleScaling; |
24 | 24 | import org.apache.lucene.search.BooleanClause; |
| 25 | +import org.apache.lucene.search.BooleanClause.Occur; |
25 | 26 | import org.apache.lucene.search.BooleanQuery; |
26 | 27 | import org.apache.lucene.search.MultiPhraseQuery; |
27 | 28 | import org.apache.lucene.search.PositionalMultiQuery; |
— | — | @@ -29,7 +30,6 @@ |
30 | 31 | import org.apache.lucene.search.Query; |
31 | 32 | import org.apache.lucene.search.RelevanceQuery; |
32 | 33 | import org.apache.lucene.search.TermQuery; |
33 | | -import org.apache.lucene.search.BooleanClause.Occur; |
34 | 34 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
35 | 35 | import org.wikimedia.lsearch.config.IndexId; |
36 | 36 | import org.wikimedia.lsearch.config.IndexId.AgeScaling; |
— | — | @@ -44,13 +44,12 @@ |
45 | 45 | * Parser for wiki query syntax |
46 | 46 | * |
47 | 47 | * @author rainman |
48 | | - * |
49 | 48 | */ |
50 | 49 | public class WikiQueryParser { |
51 | 50 | private static final int MAX_TERM_LEN = 255; |
52 | | - private final char[] buffer = new char[MAX_TERM_LEN+1]; |
| 51 | + private final char[] buffer = new char[MAX_TERM_LEN + 1]; |
53 | 52 | private int length; // length of the token in the buffer |
54 | | - private Analyzer analyzer; |
| 53 | + private Analyzer analyzer; |
55 | 54 | private char[] text; // text that is being parsed |
56 | 55 | private int cur; // current position in text |
57 | 56 | private int prev_cur; // cur before parsing this token (for backToken()) |
— | — | @@ -61,76 +60,90 @@ |
62 | 61 | private String defaultField; // the default field value |
63 | 62 | private float defaultBoost = 1; |
64 | 63 | private float defaultAliasBoost = ALIAS_BOOST; |
65 | | - protected enum TokenType {WORD, FIELD, AND, OR, EOF }; |
66 | | - |
67 | | - private TokenStream tokenStream; |
| 64 | + |
| 65 | + protected enum TokenType { |
| 66 | + WORD, FIELD, AND, OR, EOF |
| 67 | + }; |
| 68 | + |
| 69 | + private TokenStream tokenStream; |
68 | 70 | private ArrayList<Token> tokens; // tokens from analysis |
69 | 71 | protected ParsedWords parsedWords; |
70 | 72 | protected String[] prefixFilters; |
71 | | - protected enum ExpandedType { WORD, WILDCARD, FUZZY, PHRASE }; |
| 73 | + |
| 74 | + protected enum ExpandedType { |
| 75 | + WORD, WILDCARD, FUZZY, PHRASE |
| 76 | + }; |
| 77 | + |
72 | 78 | protected Term[] highlightTerms = null; |
73 | | - |
| 79 | + |
74 | 80 | protected ArrayList<ArrayList<Term>> urls; |
75 | | - |
76 | | - /** sometimes the fieldsubquery takes the bool modifier, to retrieve it, use this variable, |
77 | | - * this will always point to the last unused bool modifier */ |
78 | | - BooleanClause.Occur explicitOccur = null; |
79 | | - |
| 81 | + |
| 82 | + /** |
 | 83 | +	 * Sometimes the fieldsubquery takes the bool modifier; to retrieve it, use |
 | 84 | +	 * this variable, |
 | 85 | +	 * which will always point to the last unused bool modifier. |
| 86 | + */ |
| 87 | + BooleanClause.Occur explicitOccur = null; |
| 88 | + |
80 | 89 | 	/** Whether to include aliases during title rewrite */ |
81 | 90 | protected boolean disableTitleAliases; |
82 | | - |
| 91 | + |
83 | 92 | /** boost for alias words from analyzer */ |
84 | | - public static float ALIAS_BOOST = 0.5f; |
| 93 | + public static float ALIAS_BOOST = 0.5f; |
85 | 94 | /** boost for title field */ |
86 | | - public static float TITLE_BOOST = 6; |
| 95 | + public static float TITLE_BOOST = 6; |
87 | 96 | public static float TITLE_ALIAS_BOOST = 0.2f; |
88 | 97 | public static float TITLE_PHRASE_BOOST = 2; |
89 | | - public static float STEM_TITLE_BOOST = 0.8f; |
| 98 | + public static float STEM_TITLE_BOOST = 0.8f; |
90 | 99 | public static float STEM_TITLE_ALIAS_BOOST = 0.4f; |
91 | 100 | public static float ALT_TITLE_BOOST = 4; |
92 | 101 | public static float ALT_TITLE_ALIAS_BOOST = 0.4f; |
93 | 102 | public static float CONTENTS_BOOST = 0.2f; |
94 | | - |
| 103 | + |
95 | 104 | public static float STEM_WORD_BOOST = 0.01f; |
96 | 105 | public static float SINGULAR_WORD_BOOST = 0.5f; |
97 | | - |
| 106 | + |
98 | 107 | // main phrase stuff: |
99 | 108 | public static int MAINPHRASE_SLOP = 100; |
100 | 109 | public static float MAINPHRASE_BOOST = 2f; |
101 | | - public static float RELEVANCE_RELATED_BOOST = 12f; |
| 110 | + public static float RELEVANCE_RELATED_BOOST = 12f; |
102 | 111 | public static float RELEVANCE_ALTTITLE_BOOST = 2.5f; |
103 | 112 | public static float SECTIONS_BOOST = 0.25f; |
104 | 113 | public static float ALTTITLE_BOOST = 0.5f; |
105 | 114 | public static float RELATED_BOOST = 1f; |
106 | 115 | // additional to main phrase: |
107 | 116 | public static float ADD_RELATED_BOOST = 4f; |
108 | | - |
| 117 | + |
109 | 118 | public static float WILDCARD_BOOST = 2f; |
110 | 119 | public static float FUZZY_BOOST = 4f; |
111 | | - |
| 120 | + |
112 | 121 | public static boolean ADD_STEM_TITLE = true; |
113 | 122 | public static boolean ADD_TITLE_PHRASES = true; |
114 | | - |
115 | | - /** Policies in treating field names: |
116 | | - * |
| 123 | + |
| 124 | + /** |
| 125 | + * Policies in treating field names: |
117 | 126 | * LEAVE - don't mess with field rewriting |
118 | 127 | * IGNORE - convert all field names to contents (except category) |
119 | | - * REWRITE - rewrite (help:searchterm) => (+namespace:12 contents:searchterm) |
| 128 | + * REWRITE - rewrite (help:searchterm) => (+namespace:12 |
| 129 | + * contents:searchterm) |
120 | 130 | */ |
121 | | - public enum NamespacePolicy { LEAVE, IGNORE, REWRITE }; |
| 131 | + public enum NamespacePolicy { |
| 132 | + LEAVE, IGNORE, REWRITE |
| 133 | + }; |
| 134 | + |
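
The REWRITE policy described above replaces a recognised field prefix with a namespace clause and re-parses the rest of the clause against the default field. As a rough, hand-built illustration of the query shape it produces for `help:searchterm` (using only the Lucene classes imported at the top of this file; the namespace id 12 comes from the javadoc example, and both clauses are required, matching how `fieldQuery` and `fieldsubquery` are combined in `parseClause` further down):

```java
// Illustrative sketch only: the shape of the REWRITE expansion for "help:searchterm".
BooleanQuery rewritten = new BooleanQuery();
rewritten.add(new TermQuery(new Term("namespace", "12")),        // +namespace:12
		BooleanClause.Occur.MUST);
rewritten.add(new TermQuery(new Term("contents", "searchterm")), // the re-parsed term
		BooleanClause.Occur.MUST);
```
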
122 | 135 | /** Rewritten namespace queries. prefix => query */ |
123 | | - static protected Hashtable<String,Query> namespaceQueries = null; |
| 136 | + static protected Hashtable<String, Query> namespaceQueries = null; |
124 | 137 | /** The 'all' keyword */ |
125 | 138 | static protected String namespaceAllKeyword = null; |
126 | 139 | /** Prefixes and associated filters. prefix -> filter */ |
127 | | - static protected Hashtable<String,NamespaceFilter> namespaceFilters = null; |
| 140 | + static protected Hashtable<String, NamespaceFilter> namespaceFilters = null; |
128 | 141 | 	/** nsfilter -> prefix (reverse table to namespaceFilters) */ |
129 | | - static protected Hashtable<NamespaceFilter,String> namespacePrefixes = null; |
| 142 | + static protected Hashtable<NamespaceFilter, String> namespacePrefixes = null; |
130 | 143 | private String defaultNamespaceName; |
131 | 144 | private Query namespaceRewriteQuery; |
132 | 145 | private NamespacePolicy namespacePolicy; |
133 | 146 | protected NamespaceFilter defaultNamespaceFilter; |
134 | | - protected static GlobalConfiguration global=null; |
| 147 | + protected static GlobalConfiguration global = null; |
135 | 148 | protected FieldBuilder.BuilderSet builder; |
136 | 149 | protected FieldNameFactory fields; |
137 | 150 | protected FilterFactory filters; |
— | — | @@ -140,219 +153,246 @@ |
141 | 154 | protected IndexId iid; |
142 | 155 | protected boolean isInTitle = false; |
143 | 156 | protected int isInTitleLevel = 0; |
144 | | - |
| 157 | + |
145 | 158 | /** Raw fields to append to queries like ondiscussionpage */ |
146 | | - protected HashMap<String,String> rawFields = new HashMap<String,String>(); |
147 | | - |
148 | | - Hashtable<String,String> keywordFieldMapping = new Hashtable<String,String>(); |
149 | | - |
150 | | - protected Pattern urlPattern = Pattern.compile("(\\w+:{0,1}\\w*@)?(\\S+)(:[0-9]+)?(\\/|\\/([\\w#!:.?+=&%@!\\-\\/]))?"); |
151 | | - |
| 159 | + protected HashMap<String, String> rawFields = new HashMap<String, String>(); |
| 160 | + |
| 161 | + Hashtable<String, String> keywordFieldMapping = new Hashtable<String, String>(); |
| 162 | + |
| 163 | + protected Pattern urlPattern = Pattern |
| 164 | + .compile("(\\w+:{0,1}\\w*@)?(\\S+)(:[0-9]+)?(\\/|\\/([\\w#!:.?+=&%@!\\-\\/]))?"); |
| 165 | + |
152 | 166 | /** default operator (must = AND, should = OR) for boolean queries */ |
153 | 167 | public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST; |
154 | | - |
| 168 | + |
155 | 169 | /** Word + boost for expanded term */ |
156 | 170 | static class WordBoost { |
157 | 171 | String word; |
158 | 172 | float boost; |
| 173 | + |
159 | 174 | public WordBoost(String word, float boost) { |
160 | 175 | this.word = word; |
161 | 176 | this.boost = boost; |
162 | 177 | } |
163 | 178 | } |
164 | | - |
| 179 | + |
165 | 180 | /** Descriptor for words within queries */ |
166 | 181 | static class WordsDesc { |
167 | 182 | /** original term text */ |
168 | | - String original = null; |
| 183 | + String original = null; |
169 | 184 | 	/** words into which the term is expanded */ |
170 | | - ArrayList<WordBoost> expanded = new ArrayList<WordBoost>(); |
| 185 | + ArrayList<WordBoost> expanded = new ArrayList<WordBoost>(); |
171 | 186 | ExpandedType type = ExpandedType.WORD; |
172 | 187 | int position; |
173 | | - |
| 188 | + |
174 | 189 | public WordsDesc(String original, ExpandedType type, int position) { |
175 | 190 | this.original = original; |
176 | 191 | this.type = type; |
177 | 192 | this.position = position; |
178 | 193 | } |
179 | 194 | |
180 | | - void add(WordBoost wb){ |
| 195 | + void add(WordBoost wb) { |
181 | 196 | expanded.add(wb); |
182 | 197 | } |
183 | | - |
184 | | - String first(){ |
| 198 | + |
| 199 | + String first() { |
185 | 200 | return expanded.get(0).word; |
186 | 201 | } |
187 | | - |
188 | | - WordBoost firstWordBoost(){ |
| 202 | + |
| 203 | + WordBoost firstWordBoost() { |
189 | 204 | return expanded.get(0); |
190 | 205 | } |
| 206 | + |
191 | 207 | /** new word desc with first word extracted only */ |
192 | | - WordsDesc firstWordsDesc(){ |
193 | | - WordsDesc d = new WordsDesc(original,type,position); |
| 208 | + WordsDesc firstWordsDesc() { |
| 209 | + WordsDesc d = new WordsDesc(original, type, position); |
194 | 210 | d.add(firstWordBoost()); |
195 | 211 | return d; |
196 | 212 | } |
197 | | - |
| 213 | + |
198 | 214 | /** create search terms */ |
199 | | - Term[] getTerms(String field){ |
| 215 | + Term[] getTerms(String field) { |
200 | 216 | Term[] terms = new Term[expanded.size()]; |
201 | | - for(int i=0;i<expanded.size();i++) |
202 | | - terms[i] = new Term(field,expanded.get(i).word); |
| 217 | + for (int i = 0; i < expanded.size(); i++) |
| 218 | + terms[i] = new Term(field, expanded.get(i).word); |
203 | 219 | return terms; |
204 | 220 | } |
205 | | - |
206 | | - ArrayList<Float> getBoosts(){ |
| 221 | + |
| 222 | + ArrayList<Float> getBoosts() { |
207 | 223 | ArrayList<Float> boosts = new ArrayList<Float>(); |
208 | | - for(WordBoost w : expanded) |
| 224 | + for (WordBoost w : expanded) |
209 | 225 | boosts.add(w.boost); |
210 | 226 | return boosts; |
211 | 227 | } |
212 | | - |
213 | | - int getPosition(){ |
| 228 | + |
| 229 | + int getPosition() { |
214 | 230 | return position; |
215 | 231 | } |
216 | | - |
217 | | - boolean isWildcardOrFuzzy(){ |
218 | | - return type == ExpandedType.WILDCARD || type == ExpandedType.FUZZY; |
| 232 | + |
| 233 | + boolean isWildcardOrFuzzy() { |
| 234 | + return type == ExpandedType.WILDCARD || type == ExpandedType.FUZZY; |
219 | 235 | } |
220 | | - |
| 236 | + |
221 | 237 | } |
222 | | - |
| 238 | + |
223 | 239 | /** Words from parser */ |
224 | 240 | static class ParsedWords { |
225 | 241 | ArrayList<WordsDesc> words = new ArrayList<WordsDesc>(); |
226 | | - |
227 | | - void add(String original, ArrayList<String> words, ArrayList<Float> boosts, ExpandedType type){ |
| 242 | + |
| 243 | + void add(String original, ArrayList<String> words, |
| 244 | + ArrayList<Float> boosts, ExpandedType type) { |
228 | 245 | int pos = this.words.size(); |
229 | | - WordsDesc wd = new WordsDesc(original,type,pos); |
230 | | - for(int i=0;i<words.size();i++){ |
231 | | - wd.add(new WordBoost(words.get(i),boosts.get(i))); |
| 246 | + WordsDesc wd = new WordsDesc(original, type, pos); |
| 247 | + for (int i = 0; i < words.size(); i++) { |
| 248 | + wd.add(new WordBoost(words.get(i), boosts.get(i))); |
232 | 249 | } |
233 | 250 | this.words.add(wd); |
234 | 251 | } |
235 | | - |
236 | | - void add(String original, ArrayList<String> words, float boost, ExpandedType type){ |
| 252 | + |
| 253 | + void add(String original, ArrayList<String> words, float boost, |
| 254 | + ExpandedType type) { |
237 | 255 | int pos = this.words.size(); |
238 | | - WordsDesc wd = new WordsDesc(original,type,pos); |
239 | | - for(int i=0;i<words.size();i++){ |
240 | | - wd.add(new WordBoost(words.get(i),boost)); |
| 256 | + WordsDesc wd = new WordsDesc(original, type, pos); |
| 257 | + for (int i = 0; i < words.size(); i++) { |
| 258 | + wd.add(new WordBoost(words.get(i), boost)); |
241 | 259 | } |
242 | 260 | this.words.add(wd); |
243 | 261 | } |
244 | | - |
245 | | - void add(String original, String word, float boost, ExpandedType type){ |
| 262 | + |
| 263 | + void add(String original, String word, float boost, ExpandedType type) { |
246 | 264 | int pos = this.words.size(); |
247 | | - WordsDesc wd = new WordsDesc(original,type,pos); |
248 | | - wd.add(new WordBoost(word,boost)); |
| 265 | + WordsDesc wd = new WordsDesc(original, type, pos); |
| 266 | + wd.add(new WordBoost(word, boost)); |
249 | 267 | this.words.add(wd); |
250 | 268 | } |
251 | | - |
252 | | - WordsDesc last(){ |
253 | | - return words.get(words.size()-1); |
| 269 | + |
| 270 | + WordsDesc last() { |
| 271 | + return words.get(words.size() - 1); |
254 | 272 | } |
255 | | - |
| 273 | + |
256 | 274 | /** Extract the main stream of words, excludes wildcards and such */ |
257 | | - ArrayList<String> extractFirst(){ |
| 275 | + ArrayList<String> extractFirst() { |
258 | 276 | ArrayList<String> ret = new ArrayList<String>(); |
259 | | - for(WordsDesc d : words){ |
260 | | - if(d.type==ExpandedType.WORD || d.type==ExpandedType.PHRASE) |
| 277 | + for (WordsDesc d : words) { |
| 278 | + if (d.type == ExpandedType.WORD |
| 279 | + || d.type == ExpandedType.PHRASE) |
261 | 280 | ret.add(d.first()); |
262 | 281 | } |
263 | 282 | return ret; |
264 | 283 | } |
265 | | - |
| 284 | + |
266 | 285 | /** First string at index of expanded */ |
267 | | - String firstAt(int index){ |
| 286 | + String firstAt(int index) { |
268 | 287 | return words.get(index).first(); |
269 | 288 | } |
270 | | - |
271 | | - int size(){ |
| 289 | + |
| 290 | + int size() { |
272 | 291 | return words.size(); |
273 | 292 | } |
274 | | - |
| 293 | + |
275 | 294 | /** get ParsedWords with only a single word on given position */ |
276 | | - ParsedWords cloneSingleWord(int index){ |
277 | | - return cloneRange(index,index); |
| 295 | + ParsedWords cloneSingleWord(int index) { |
| 296 | + return cloneRange(index, index); |
278 | 297 | } |
| 298 | + |
279 | 299 | /** get ParsedWords with a range of words (both i1, i2 inclusive) */ |
280 | | - ParsedWords cloneRange(int i1, int i2){ |
| 300 | + ParsedWords cloneRange(int i1, int i2) { |
281 | 301 | ParsedWords ret = new ParsedWords(); |
282 | | - for(int i=i1;i<=i2;i++) |
| 302 | + for (int i = i1; i <= i2; i++) |
283 | 303 | ret.words.add(words.get(i)); |
284 | 304 | return ret; |
285 | 305 | } |
| 306 | + |
286 | 307 | /** Get ParsedWords of first words */ |
287 | | - ParsedWords cloneFirst(){ |
| 308 | + ParsedWords cloneFirst() { |
288 | 309 | ParsedWords ret = new ParsedWords(); |
289 | | - for(WordsDesc d : words){ |
290 | | - if(d.type==ExpandedType.WORD || d.type==ExpandedType.PHRASE) |
| 310 | + for (WordsDesc d : words) { |
| 311 | + if (d.type == ExpandedType.WORD |
| 312 | + || d.type == ExpandedType.PHRASE) |
291 | 313 | ret.add(d.firstWordsDesc()); |
292 | 314 | } |
293 | 315 | return ret; |
294 | 316 | } |
295 | | - |
296 | | - /** Get ParsedWords of first words, or whole ParsedWords if wildcard/fuzzy */ |
297 | | - ParsedWords cloneFirstWithWildcards(){ |
| 317 | + |
| 318 | + /** |
| 319 | + * Get ParsedWords of first words, or whole ParsedWords if |
| 320 | + * wildcard/fuzzy |
| 321 | + */ |
| 322 | + ParsedWords cloneFirstWithWildcards() { |
298 | 323 | ParsedWords ret = new ParsedWords(); |
299 | | - for(WordsDesc d : words){ |
300 | | - if(d.type==ExpandedType.WORD || d.type==ExpandedType.PHRASE) |
| 324 | + for (WordsDesc d : words) { |
| 325 | + if (d.type == ExpandedType.WORD |
| 326 | + || d.type == ExpandedType.PHRASE) |
301 | 327 | ret.add(d.firstWordsDesc()); |
302 | | - else if(d.isWildcardOrFuzzy()) |
| 328 | + else if (d.isWildcardOrFuzzy()) |
303 | 329 | ret.add(d); |
304 | 330 | } |
305 | 331 | return ret; |
306 | 332 | } |
307 | | - |
308 | | - void add(WordsDesc desc){ |
| 333 | + |
| 334 | + void add(WordsDesc desc) { |
309 | 335 | words.add(desc); |
310 | 336 | } |
311 | | - |
| 337 | + |
312 | 338 | } |
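
`WordBoost`, `WordsDesc` and `ParsedWords` above simply record, for each original token, the words it expands to (stems, wildcard or fuzzy variants) and a boost per word. A minimal usage sketch, assuming package access to these nested classes and the usual `java.util` imports; the example words and wildcard expansions are invented:

```java
ParsedWords words = new ParsedWords();

// a plain word plus a stemmed alias with a reduced boost
words.add("parsing", "parsing", 1f, ExpandedType.WORD);
words.last().add(new WordBoost("parse", STEM_WORD_BOOST));

// a wildcard term expanded to a few concrete words (hypothetical expansions)
ArrayList<String> expansions = new ArrayList<String>();
expansions.add("query");
expansions.add("queries");
words.add("quer*", expansions, WILDCARD_BOOST, ExpandedType.WILDCARD);

// extractFirst() keeps only the main stream (WORD/PHRASE), skipping wildcards
ArrayList<String> mainStream = words.extractFirst();    // ["parsing"]
// getTerms() turns one expansion into search terms for a concrete field
Term[] titleTerms = words.last().getTerms("title");     // title:query, title:queries
```
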
313 | | - |
| 339 | + |
314 | 340 | /** Init namespace queries */ |
315 | | - protected void initNamespaces(){ |
316 | | - if(namespaceQueries != null) |
| 341 | + protected void initNamespaces() { |
| 342 | + if (namespaceQueries != null) |
317 | 343 | return; |
318 | | - if(global == null) |
319 | | - global = GlobalConfiguration.getInstance(); |
| 344 | + if (global == null) |
| 345 | + global = GlobalConfiguration.getInstance(); |
320 | 346 | namespaceAllKeyword = global.getNamespacePrefixAll(); |
321 | | - namespaceQueries = new Hashtable<String,Query>(); |
322 | | - namespacePrefixes = new Hashtable<NamespaceFilter,String>(); |
| 347 | + namespaceQueries = new Hashtable<String, Query>(); |
| 348 | + namespacePrefixes = new Hashtable<NamespaceFilter, String>(); |
323 | 349 | namespaceFilters = global.getNamespacePrefixes(); |
324 | | - for(Entry<String,NamespaceFilter> prefix : namespaceFilters.entrySet()){ |
325 | | - namespaceQueries.put(prefix.getKey(),generateRewrite(prefix.getValue())); |
326 | | - namespacePrefixes.put(prefix.getValue(),prefix.getKey()); |
| 350 | + for (Entry<String, NamespaceFilter> prefix : namespaceFilters |
| 351 | + .entrySet()) { |
| 352 | + namespaceQueries.put(prefix.getKey(), |
| 353 | + generateRewrite(prefix.getValue())); |
| 354 | + namespacePrefixes.put(prefix.getValue(), prefix.getKey()); |
327 | 355 | } |
328 | 356 | } |
329 | | - |
| 357 | + |
330 | 358 | /** |
331 | 359 | * Construct using default policy (LEAVE), without any namespace rewriting |
332 | | - * @param field default field name |
| 360 | + * |
| 361 | + * @param field |
| 362 | + * default field name |
333 | 363 | * @param analyzer |
334 | 364 | */ |
335 | | - public WikiQueryParser(String field, Analyzer analyzer, FieldBuilder.BuilderSet builder, Collection<String> stopWords){ |
336 | | - this(field,(NamespaceFilter)null,analyzer,builder,NamespacePolicy.LEAVE,stopWords); |
| 365 | + public WikiQueryParser(String field, Analyzer analyzer, |
| 366 | + FieldBuilder.BuilderSet builder, Collection<String> stopWords) { |
| 367 | + this(field, (NamespaceFilter) null, analyzer, builder, |
| 368 | + NamespacePolicy.LEAVE, stopWords); |
337 | 369 | } |
338 | | - |
| 370 | + |
339 | 371 | /** |
340 | 372 | * Construct with default field (e.g. contents), with default namespace |
341 | 373 | * (e.g. main), and with analyzer and namespace policy |
| 374 | + * |
342 | 375 | * @param field |
343 | 376 | * @param namespace |
344 | 377 | * @param analyzer |
345 | 378 | * @param nsPolicy |
346 | 379 | */ |
347 | | - public WikiQueryParser(String field, String namespace, Analyzer analyzer, FieldBuilder.BuilderSet builder, NamespacePolicy nsPolicy, Collection<String> stopWords){ |
348 | | - this(field,new NamespaceFilter(namespace),analyzer,builder,nsPolicy,stopWords); |
| 380 | + public WikiQueryParser(String field, String namespace, Analyzer analyzer, |
| 381 | + FieldBuilder.BuilderSet builder, NamespacePolicy nsPolicy, |
| 382 | + Collection<String> stopWords) { |
| 383 | + this(field, new NamespaceFilter(namespace), analyzer, builder, |
| 384 | + nsPolicy, stopWords); |
349 | 385 | } |
350 | | - |
351 | | - public WikiQueryParser(String field, String namespace, Analyzer analyzer, FieldBuilder.BuilderSet builder, NamespacePolicy nsPolicy){ |
352 | | - this(field,new NamespaceFilter(namespace),analyzer,builder,nsPolicy,null); |
| 386 | + |
| 387 | + public WikiQueryParser(String field, String namespace, Analyzer analyzer, |
| 388 | + FieldBuilder.BuilderSet builder, NamespacePolicy nsPolicy) { |
| 389 | + this(field, new NamespaceFilter(namespace), analyzer, builder, |
| 390 | + nsPolicy, null); |
353 | 391 | } |
354 | | - |
355 | | - public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, FieldBuilder.BuilderSet builder, NamespacePolicy nsPolicy, Collection<String> stopWords){ |
356 | | - defaultField = field; |
| 392 | + |
| 393 | + public WikiQueryParser(String field, NamespaceFilter nsfilter, |
| 394 | + Analyzer analyzer, FieldBuilder.BuilderSet builder, |
| 395 | + NamespacePolicy nsPolicy, Collection<String> stopWords) { |
| 396 | + defaultField = field; |
357 | 397 | this.analyzer = analyzer; |
358 | 398 | this.builder = builder; |
359 | 399 | this.fields = builder.getFields(); |
— | — | @@ -361,299 +401,313 @@ |
362 | 402 | tokens = new ArrayList<Token>(); |
363 | 403 | this.namespacePolicy = nsPolicy; |
364 | 404 | disableTitleAliases = true; |
365 | | - keywordFieldMapping = new Hashtable<String,String>(); |
366 | | - keywordFieldMapping.put("inthread", "ThreadAncestor"); |
| 405 | + keywordFieldMapping = new Hashtable<String, String>(); |
| 406 | + keywordFieldMapping.put("inthread", "ThreadAncestor"); |
367 | 407 | keywordFieldMapping.put("ondiscussionpage", "ThreadPage"); |
368 | 408 | initNamespaces(); |
369 | 409 | this.stopWords = new HashSet<String>(); |
370 | | - if(stopWords != null) |
| 410 | + if (stopWords != null) |
371 | 411 | this.stopWords.addAll(stopWords); |
372 | | - this.defaultNamespaceFilter=nsfilter; |
373 | | - if(nsfilter != null){ |
374 | | - namespaceRewriteQuery = generateRewrite(nsfilter); |
375 | | - if(namespaceRewriteQuery != null && namespacePrefixes.containsKey(nsfilter)) |
| 412 | + this.defaultNamespaceFilter = nsfilter; |
| 413 | + if (nsfilter != null) { |
| 414 | + namespaceRewriteQuery = generateRewrite(nsfilter); |
| 415 | + if (namespaceRewriteQuery != null |
| 416 | + && namespacePrefixes.containsKey(nsfilter)) |
376 | 417 | defaultNamespaceName = namespacePrefixes.get(nsfilter); |
377 | 418 | else |
378 | 419 | defaultNamespaceName = null; |
379 | | - } |
380 | | - else{ |
| 420 | + } else { |
381 | 421 | namespaceRewriteQuery = null; |
382 | 422 | defaultNamespaceName = null; |
383 | 423 | } |
384 | 424 | } |
385 | | - |
| 425 | + |
386 | 426 | /** Generate a rewrite query for a collection of namespaces */ |
387 | | - public static Query generateRewrite(NamespaceFilter nsfilter){ |
388 | | - if(nsfilter.cardinality() == 0) |
| 427 | + public static Query generateRewrite(NamespaceFilter nsfilter) { |
| 428 | + if (nsfilter.cardinality() == 0) |
389 | 429 | return null; |
390 | | - else if(nsfilter.cardinality() == 1) |
391 | | - return new TermQuery(new Term("namespace",Integer.toString(nsfilter.getNamespace()))); |
392 | | - |
| 430 | + else if (nsfilter.cardinality() == 1) |
| 431 | + return new TermQuery(new Term("namespace", |
| 432 | + Integer.toString(nsfilter.getNamespace()))); |
| 433 | + |
393 | 434 | BooleanQuery bq = new BooleanQuery(); |
394 | 435 | BitSet bs = nsfilter.getIncluded(); |
395 | 436 | // iterate over set bits |
396 | | - for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){ |
397 | | - bq.add(new TermQuery(new Term("namespace",Integer.toString(i))), |
| 437 | + for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) { |
| 438 | + bq.add(new TermQuery(new Term("namespace", Integer.toString(i))), |
398 | 439 | BooleanClause.Occur.SHOULD); |
399 | | - bq.add(new TermQuery(new Term("redirect_namespace",Integer.toString(i))), |
400 | | - BooleanClause.Occur.MUST_NOT); |
| 440 | + bq.add(new TermQuery(new Term("redirect_namespace", Integer |
| 441 | + .toString(i))), BooleanClause.Occur.MUST_NOT); |
401 | 442 | } |
402 | 443 | return bq; |
403 | 444 | } |
404 | | - |
| 445 | + |
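
As a usage sketch of `generateRewrite()` above: a single-namespace filter collapses to one `namespace:<id>` term query, while a multi-namespace filter becomes a `BooleanQuery` of SHOULD clauses over `namespace`, with the matching `redirect_namespace` terms excluded. The `NamespaceFilter` constructor forms are the ones used elsewhere in this file; the concrete namespace ids are examples only:

```java
// one namespace: a plain TermQuery  namespace:0
Query mainOnly = WikiQueryParser.generateRewrite(new NamespaceFilter("0"));

// several namespaces: SHOULD namespace:0 / namespace:12 clauses,
// each with its redirect_namespace counterpart marked MUST_NOT
Query mainAndHelp = WikiQueryParser.generateRewrite(new NamespaceFilter("0,12"));
```
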
405 | 446 | /** Generate a rewrite query for a collection of namespaces */ |
406 | | - public static Query generateRedirectRewrite(NamespaceFilter nsfilter){ |
407 | | - if(nsfilter.cardinality() == 0) |
| 447 | + public static Query generateRedirectRewrite(NamespaceFilter nsfilter) { |
| 448 | + if (nsfilter.cardinality() == 0) |
408 | 449 | return null; |
409 | | - else if(nsfilter.cardinality() == 1) |
410 | | - return new TermQuery(new Term("redirect_namespace",Integer.toString(nsfilter.getNamespace()))); |
411 | | - |
| 450 | + else if (nsfilter.cardinality() == 1) |
| 451 | + return new TermQuery(new Term("redirect_namespace", |
| 452 | + Integer.toString(nsfilter.getNamespace()))); |
| 453 | + |
412 | 454 | BooleanQuery bq = new BooleanQuery(); |
413 | 455 | BitSet bs = nsfilter.getIncluded(); |
414 | 456 | // iterate over set bits |
415 | | - for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){ |
416 | | - bq.add(new TermQuery(new Term("redirect_namespace",Integer.toString(i))), |
417 | | - BooleanClause.Occur.SHOULD); |
| 457 | + for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) { |
| 458 | + bq.add(new TermQuery(new Term("redirect_namespace", Integer |
| 459 | + .toString(i))), BooleanClause.Occur.SHOULD); |
418 | 460 | } |
419 | 461 | return bq; |
420 | 462 | } |
421 | | - |
422 | | - /** |
| 463 | + |
| 464 | + /** |
423 | 465 | * Get a hashset of namespace numbers for fields that are |
424 | | - * valid namespace keys. |
| 466 | + * valid namespace keys. |
| 467 | + * |
425 | 468 | * @param queryText |
426 | 469 | * @return |
427 | 470 | */ |
428 | | - public HashSet<NamespaceFilter> getFieldNamespaces(String queryText){ |
| 471 | + public HashSet<NamespaceFilter> getFieldNamespaces(String queryText) { |
429 | 472 | HashSet<String> fields = getFields(queryText); |
430 | 473 | HashSet<NamespaceFilter> ret = new HashSet<NamespaceFilter>(); |
431 | 474 | List ThreadingKeywords = new ArrayList(); |
432 | | - ThreadingKeywords.add("inthread"); |
433 | | - |
434 | | - for(String field : fields){ |
| 475 | + ThreadingKeywords.add("inthread"); |
| 476 | + |
| 477 | + for (String field : fields) { |
435 | 478 | field = field.toLowerCase(); |
436 | | - if(namespaceFilters.containsKey(field)) |
| 479 | + if (namespaceFilters.containsKey(field)) |
437 | 480 | ret.add(namespaceFilters.get(field)); |
438 | | - else if(field.equals(namespaceAllKeyword)) |
| 481 | + else if (field.equals(namespaceAllKeyword)) |
439 | 482 | ret.add(new NamespaceFilter()); |
440 | | - else if(field.equals(defaultField) && defaultNamespaceFilter != null) |
| 483 | + else if (field.equals(defaultField) |
| 484 | + && defaultNamespaceFilter != null) |
441 | 485 | ret.add(defaultNamespaceFilter); |
442 | | - else if(field.startsWith("[")){ |
443 | | - ret.add(new NamespaceFilter(field.substring(1,field.length()-1))); |
| 486 | + else if (field.startsWith("[")) { |
| 487 | + ret.add(new NamespaceFilter(field.substring(1, |
| 488 | + field.length() - 1))); |
444 | 489 | } else if (ThreadingKeywords.contains(field)) { |
445 | | - ret.add( new NamespaceFilter(90) ); |
| 490 | + ret.add(new NamespaceFilter(90)); |
446 | 491 | } |
447 | 492 | } |
448 | | - |
| 493 | + |
449 | 494 | return ret; |
450 | 495 | } |
451 | | - |
| 496 | + |
452 | 497 | /** get all fields that appear in a query */ |
453 | | - public HashSet<String> getFields(String queryText){ |
| 498 | + public HashSet<String> getFields(String queryText) { |
454 | 499 | int level = 0; // parenthesis count |
455 | 500 | HashSet<String> fields = new HashSet<String>(); |
456 | 501 | int fieldLevel = -1; |
457 | 502 | TokenType tokenType; |
458 | 503 | boolean inPhrase = false; |
459 | | - |
| 504 | + |
460 | 505 | reset(); |
461 | | - |
462 | | - queryLength = queryText.length(); |
| 506 | + |
| 507 | + queryLength = queryText.length(); |
463 | 508 | text = queryText.toCharArray(); |
464 | | - |
465 | | - for(cur = 0; cur < text.length; cur++ ){ |
| 509 | + |
| 510 | + for (cur = 0; cur < text.length; cur++) { |
466 | 511 | c = text[cur]; |
467 | | - if(c == '"'){ |
| 512 | + if (c == '"') { |
468 | 513 | inPhrase = !inPhrase; |
469 | | - if(inPhrase && fieldLevel == -1) |
| 514 | + if (inPhrase && fieldLevel == -1) |
470 | 515 | fields.add(defaultField); |
471 | 516 | } |
472 | | - |
473 | | - if(inPhrase) |
| 517 | + |
| 518 | + if (inPhrase) |
474 | 519 | continue; // ignore stuff between "" |
475 | | - |
476 | | - if(c == ')'){ |
| 520 | + |
| 521 | + if (c == ')') { |
477 | 522 | level--; |
478 | | - if(level < fieldLevel) |
| 523 | + if (level < fieldLevel) |
479 | 524 | fieldLevel = -1; |
480 | 525 | continue; |
481 | | - } else if(c == '('){ |
482 | | - level++; |
| 526 | + } else if (c == '(') { |
| 527 | + level++; |
483 | 528 | continue; |
484 | | - } else if(fieldLevel != -1 && level>fieldLevel) |
| 529 | + } else if (fieldLevel != -1 && level > fieldLevel) |
485 | 530 | continue; |
486 | | - |
487 | | - if(Character.isLetterOrDigit(c)){ |
| 531 | + |
| 532 | + if (Character.isLetterOrDigit(c)) { |
488 | 533 | tokenType = fetchToken(); |
489 | | - if(tokenType == TokenType.FIELD){ |
| 534 | + if (tokenType == TokenType.FIELD) { |
490 | 535 | fieldLevel = level; |
491 | | - fields.add(new String(buffer,0,length)); |
492 | | - } else if(tokenType == TokenType.WORD){ |
493 | | - if(fieldLevel == -1) |
| 536 | + fields.add(new String(buffer, 0, length)); |
| 537 | + } else if (tokenType == TokenType.WORD) { |
| 538 | + if (fieldLevel == -1) |
494 | 539 | fields.add(defaultField); |
495 | 540 | } |
496 | | - } else if(c == '['){ |
497 | | - if(fetchGenericPrefix()){ |
| 541 | + } else if (c == '[') { |
| 542 | + if (fetchGenericPrefix()) { |
498 | 543 | fieldLevel = level; |
499 | | - fields.add(new String(buffer,0,length)); |
| 544 | + fields.add(new String(buffer, 0, length)); |
500 | 545 | } |
501 | 546 | } |
502 | 547 | } |
503 | | - |
504 | | - |
| 548 | + |
505 | 549 | return fields; |
506 | 550 | } |
507 | | - |
| 551 | + |
508 | 552 | /** Find and delete all valid prefixes, return search terms in tokens */ |
509 | | - public ArrayList<Token> tokenizeForSpellCheck(String queryText){ |
| 553 | + public ArrayList<Token> tokenizeForSpellCheck(String queryText) { |
510 | 554 | int level = 0; // parenthesis count |
511 | 555 | int fieldLevel = -1; |
512 | 556 | TokenType tokenType; |
513 | 557 | boolean inPhrase = false; |
514 | | - |
| 558 | + |
515 | 559 | Analyzer oldAnalyzer = this.analyzer; |
516 | | - this.analyzer = Analyzers.getReusableAnalyzer(filters,new TokenizerOptions.SpellCheckSearch()); |
517 | | - |
| 560 | + this.analyzer = Analyzers.getReusableAnalyzer(filters, |
| 561 | + new TokenizerOptions.SpellCheckSearch()); |
| 562 | + |
518 | 563 | ArrayList<Token> ret = new ArrayList<Token>(); |
519 | | - |
| 564 | + |
520 | 565 | reset(); |
521 | | - |
522 | | - queryLength = queryText.length(); |
| 566 | + |
| 567 | + queryLength = queryText.length(); |
523 | 568 | text = queryText.toCharArray(); |
524 | 569 | String oldDefault = defaultField; |
525 | 570 | defaultField = "title"; // no stemming |
526 | | - |
527 | | - for(cur = 0; cur < text.length; cur++ ){ |
| 571 | + |
| 572 | + for (cur = 0; cur < text.length; cur++) { |
528 | 573 | c = text[cur]; |
529 | | - if(c == '"'){ |
| 574 | + if (c == '"') { |
530 | 575 | inPhrase = !inPhrase; |
531 | 576 | } |
532 | | - |
533 | | - if(inPhrase) // skip words in phrases |
534 | | - continue; |
535 | | - else if(c == ')'){ |
| 577 | + |
| 578 | + if (inPhrase) // skip words in phrases |
| 579 | + continue; |
| 580 | + else if (c == ')') { |
536 | 581 | level--; |
537 | | - if(level < fieldLevel) |
| 582 | + if (level < fieldLevel) |
538 | 583 | fieldLevel = -1; |
539 | 584 | continue; |
540 | | - } else if(c == '('){ |
541 | | - level++; |
| 585 | + } else if (c == '(') { |
| 586 | + level++; |
542 | 587 | continue; |
543 | | - } else if(fieldLevel != -1 && level>fieldLevel) |
| 588 | + } else if (fieldLevel != -1 && level > fieldLevel) |
544 | 589 | continue; |
545 | | - |
| 590 | + |
546 | 591 | // include exclusion/inclusion marks |
547 | | - if(isTermChar(c) && text[cur]!='-' && text[cur]!='+'){ |
| 592 | + if (isTermChar(c) && text[cur] != '-' && text[cur] != '+') { |
548 | 593 | int start = cur; |
549 | 594 | tokenType = fetchToken(inPhrase); |
550 | 595 | // ignore excluded words |
551 | | - if(tokenType == TokenType.WORD && (start==0 || text[start-1]!='-')){ |
| 596 | + if (tokenType == TokenType.WORD |
| 597 | + && (start == 0 || text[start - 1] != '-')) { |
552 | 598 | String type = "word"; |
553 | | - if(bufferIsWildCard()) |
| 599 | + if (bufferIsWildCard()) |
554 | 600 | type = "wildcard"; |
555 | | - else if(bufferIsFuzzy()) |
| 601 | + else if (bufferIsFuzzy()) |
556 | 602 | type = "fuzzy"; |
557 | 603 | analyzeBuffer(); |
558 | | - for(Token t : tokens){ |
559 | | - if(t.getPositionIncrement() > 0){ |
560 | | - ret.add(new Token(t.termText(),start+t.startOffset(),start+t.endOffset(),type)); |
| 604 | + for (Token t : tokens) { |
| 605 | + if (t.getPositionIncrement() > 0) { |
| 606 | + ret.add(new Token(t.termText(), start |
| 607 | + + t.startOffset(), start + t.endOffset(), |
| 608 | + type)); |
561 | 609 | } |
562 | | - } |
| 610 | + } |
563 | 611 | } |
564 | | - } else if(c == '[' && !inPhrase){ |
| 612 | + } else if (c == '[' && !inPhrase) { |
565 | 613 | fetchGenericPrefix(); |
566 | 614 | } |
567 | 615 | } |
568 | | - |
| 616 | + |
569 | 617 | this.analyzer = oldAnalyzer; |
570 | 618 | defaultField = oldDefault; |
571 | | - |
| 619 | + |
572 | 620 | return ret; |
573 | | - |
| 621 | + |
574 | 622 | } |
575 | | - |
| 623 | + |
576 | 624 | /** rewrite field name (e.g. help) into a term query like namespace:12 */ |
577 | | - private Query getNamespaceQuery(String fieldName){ |
578 | | - if(fieldName == null || namespacePolicy != NamespacePolicy.REWRITE) |
| 625 | + private Query getNamespaceQuery(String fieldName) { |
| 626 | + if (fieldName == null || namespacePolicy != NamespacePolicy.REWRITE) |
579 | 627 | return null; |
580 | | - |
| 628 | + |
581 | 629 | Query q; |
582 | | - if((q = namespaceQueries.get(fieldName))!=null){ |
| 630 | + if ((q = namespaceQueries.get(fieldName)) != null) { |
583 | 631 | return q; |
584 | | - } else if(fieldName.startsWith("[")){ |
585 | | - return generateRewrite(new NamespaceFilter(fieldName.substring(1,fieldName.length()-1))); |
| 632 | + } else if (fieldName.startsWith("[")) { |
| 633 | + return generateRewrite(new NamespaceFilter(fieldName.substring(1, |
| 634 | + fieldName.length() - 1))); |
586 | 635 | } else |
587 | 636 | return null; |
588 | 637 | } |
589 | | - |
590 | | - private NamespaceFilter getNamespaceFilter(String fieldName){ |
591 | | - if(fieldName == null) |
| 638 | + |
| 639 | + private NamespaceFilter getNamespaceFilter(String fieldName) { |
| 640 | + if (fieldName == null) |
592 | 641 | return defaultNamespaceFilter; |
593 | | - else if(namespaceFilters.contains(fieldName)) |
| 642 | + else if (namespaceFilters.contains(fieldName)) |
594 | 643 | return namespaceFilters.get(fieldName); |
595 | | - else if(fieldName.startsWith("[")) |
596 | | - return new NamespaceFilter(fieldName.substring(1,fieldName.length()-1)); |
| 644 | + else if (fieldName.startsWith("[")) |
| 645 | + return new NamespaceFilter(fieldName.substring(1, |
| 646 | + fieldName.length() - 1)); |
597 | 647 | else |
598 | 648 | return defaultNamespaceFilter; |
599 | 649 | } |
600 | | - |
601 | | - private final boolean isTermChar(char ch){ |
602 | | - return !Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != ',' && ch != ';' && ch != '"'; |
| 650 | + |
| 651 | + private final boolean isTermChar(char ch) { |
| 652 | + return !Character.isWhitespace(ch) && ch != ':' && ch != '(' |
| 653 | + && ch != ')' && ch != '[' && ch != ']' && ch != ',' |
| 654 | + && ch != ';' && ch != '"'; |
603 | 655 | } |
604 | | - |
| 656 | + |
605 | 657 | /** |
606 | | - * Fetch token into <code>buffer</code> starting from current position (<code>cur</code>) |
| 658 | + * Fetch token into <code>buffer</code> starting from current position ( |
| 659 | + * <code>cur</code>) |
607 | 660 | * |
608 | 661 | * @return type of the token in buffer |
609 | 662 | */ |
610 | | - private TokenType fetchToken(){ |
| 663 | + private TokenType fetchToken() { |
611 | 664 | return fetchToken(false); |
612 | 665 | } |
613 | | - private TokenType fetchToken(boolean termOnly){ |
| 666 | + |
| 667 | + private TokenType fetchToken(boolean termOnly) { |
614 | 668 | char ch; |
615 | 669 | prev_cur = cur; |
616 | | - for(length = 0; cur < queryLength; cur++){ |
| 670 | + for (length = 0; cur < queryLength; cur++) { |
617 | 671 | ch = text[cur]; |
618 | | - if(length == 0 && ch == ' ') |
| 672 | + if (length == 0 && ch == ' ') |
619 | 673 | continue; // ignore whitespaces |
620 | | - |
621 | | - // pluses and minuses, underscores can be within words (to prevent to be missinterpeted), *,? are for wildcard queries |
622 | | - if(isTermChar(ch)){ |
623 | | - if(length<buffer.length) |
| 674 | + |
 | 675 | +	// pluses, minuses and underscores can appear within words (to prevent |
 | 676 | +	// them from being misinterpreted); *,? are for wildcard queries |
| 677 | + if (isTermChar(ch)) { |
| 678 | + if (length < buffer.length) |
624 | 679 | buffer[length++] = ch; |
625 | | - } else{ |
| 680 | + } else { |
626 | 681 | cur--; // position before the nonletter character |
627 | 682 | break; |
628 | 683 | } |
629 | 684 | } |
630 | | - if(length == 0) |
| 685 | + if (length == 0) |
631 | 686 | return TokenType.EOF; |
632 | | - |
633 | | - if(termOnly) |
634 | | - return TokenType.WORD; |
635 | | - |
| 687 | + |
| 688 | + if (termOnly) |
| 689 | + return TokenType.WORD; |
| 690 | + |
636 | 691 | // check for keywords |
637 | | - if(length == 3 && buffer[0]=='A' && buffer[1]=='N' && buffer[2]=='D') |
| 692 | + if (length == 3 && buffer[0] == 'A' && buffer[1] == 'N' |
| 693 | + && buffer[2] == 'D') |
638 | 694 | return TokenType.AND; |
639 | | - else if(length == 2 && buffer[0]=='O' && buffer[1]=='R') |
| 695 | + else if (length == 2 && buffer[0] == 'O' && buffer[1] == 'R') |
640 | 696 | return TokenType.OR; |
641 | | - |
642 | | - |
| 697 | + |
643 | 698 | // lookahead to see if this is a field |
644 | | - for(lookup = cur+1; lookup < queryLength; lookup++ ){ |
| 699 | + for (lookup = cur + 1; lookup < queryLength; lookup++) { |
645 | 700 | ch = text[lookup]; |
646 | | - if(ch == ' ') |
| 701 | + if (ch == ' ') |
647 | 702 | continue; |
648 | | - else if(ch == ':'){ |
| 703 | + else if (ch == ':') { |
649 | 704 | // check if it's a valid field |
650 | | - String f = new String(buffer,0,length); |
651 | | - |
| 705 | + String f = new String(buffer, 0, length); |
| 706 | + |
652 | 707 | List<String> fieldOperators = getFieldOperators(); |
653 | | - |
654 | | - if( f.equals(namespaceAllKeyword) |
655 | | - || fieldOperators.contains(f) |
| 708 | + |
| 709 | + if (f.equals(namespaceAllKeyword) || fieldOperators.contains(f) |
656 | 710 | || namespaceFilters.containsKey(f) |
657 | | - || namespacePolicy == NamespacePolicy.LEAVE){ |
| 711 | + || namespacePolicy == NamespacePolicy.LEAVE) { |
658 | 712 | cur = lookup; |
659 | 713 | return TokenType.FIELD; |
660 | 714 | } else |
— | — | @@ -661,35 +715,35 @@ |
662 | 716 | } else |
663 | 717 | break; |
664 | 718 | } |
665 | | - |
666 | | - return TokenType.WORD; |
| 719 | + |
| 720 | + return TokenType.WORD; |
667 | 721 | } |
668 | | - |
| 722 | + |
669 | 723 | private List<String> getFieldOperators() { |
670 | 724 | List<String> fieldOperators = new ArrayList<String>(); |
671 | 725 | fieldOperators.add("intitle"); |
672 | 726 | fieldOperators.add("incategory"); |
673 | | - fieldOperators.add("inthread"); |
674 | | - |
| 727 | + fieldOperators.add("inthread"); |
| 728 | + |
675 | 729 | return fieldOperators; |
676 | 730 | } |
677 | | - |
| 731 | + |
678 | 732 | /** |
679 | | - * Fetches prefixes like [0,1,2] (in [0,1,2]:query) |
| 733 | + * Fetches prefixes like [0,1,2] (in [0,1,2]:query) |
680 | 734 | * |
681 | 735 | * @return true if search prefixes is successfully fetched |
682 | 736 | */ |
683 | | - private boolean fetchGenericPrefix(){ |
| 737 | + private boolean fetchGenericPrefix() { |
684 | 738 | char ch; |
685 | 739 | prev_cur = cur; |
686 | | - if(text[cur] != '[') |
| 740 | + if (text[cur] != '[') |
687 | 741 | return false; // sanity check |
688 | 742 | buffer[0] = '['; |
689 | | - for(length = 1, cur++; cur < queryLength; cur++){ |
| 743 | + for (length = 1, cur++; cur < queryLength; cur++) { |
690 | 744 | ch = text[cur]; |
691 | | - if(Character.isDigit(ch) || ch ==',') |
| 745 | + if (Character.isDigit(ch) || ch == ',') |
692 | 746 | buffer[length++] = ch; |
693 | | - else if(ch == ']' && cur+1 < queryLength && text[cur+1]==':'){ |
| 747 | + else if (ch == ']' && cur + 1 < queryLength && text[cur + 1] == ':') { |
694 | 748 | cur++; // position on : |
695 | 749 | buffer[length++] = ch; |
696 | 750 | return true; |
— | — | @@ -698,323 +752,359 @@ |
699 | 753 | } |
700 | 754 | cur = prev_cur; // traceback |
701 | 755 | return false; |
702 | | - |
| 756 | + |
703 | 757 | } |
704 | | - |
| 758 | + |
705 | 759 | /** Go back one token */ |
706 | | - private void backToken(){ |
| 760 | + private void backToken() { |
707 | 761 | cur = prev_cur; |
708 | 762 | } |
709 | 763 | |
710 | 764 | 	/** analyze buffer into tokens using the default analyzer */ |
711 | | - private void analyzeBuffer(){ |
| 765 | + private void analyzeBuffer() { |
712 | 766 | String analysisField = defaultField; |
713 | | - if(defaultField.equals("contents") && isInTitle) |
| 767 | + if (defaultField.equals("contents") && isInTitle) |
714 | 768 | analysisField = "title"; |
715 | | - tokenStream = analyzer.tokenStream(analysisField, |
716 | | - new String(buffer,0,length)); |
717 | | - |
| 769 | + tokenStream = analyzer.tokenStream(analysisField, new String(buffer, 0, |
| 770 | + length)); |
| 771 | + |
718 | 772 | Token token; |
719 | 773 | tokens.clear(); |
720 | | - try{ |
721 | | - while((token = tokenStream.next()) != null){ |
| 774 | + try { |
| 775 | + while ((token = tokenStream.next()) != null) { |
722 | 776 | tokens.add(token); |
723 | 777 | } |
724 | | - } catch (IOException e){ |
| 778 | + } catch (IOException e) { |
725 | 779 | e.printStackTrace(); |
726 | | - } |
| 780 | + } |
727 | 781 | } |
728 | | - |
729 | | - /** Analyze a string, and return tokens (doesn't use any of the object storage attributes) */ |
730 | | - private ArrayList<Token> analyzeString(String input){ |
| 782 | + |
| 783 | + /** |
| 784 | + * Analyze a string, and return tokens (doesn't use any of the object |
| 785 | + * storage attributes) |
| 786 | + */ |
| 787 | + private ArrayList<Token> analyzeString(String input) { |
731 | 788 | tokenStream = analyzer.tokenStream("contents", input); |
732 | | - |
| 789 | + |
733 | 790 | ArrayList<Token> ret = new ArrayList<Token>(); |
734 | 791 | Token token; |
735 | | - try{ |
736 | | - while((token = tokenStream.next()) != null){ |
| 792 | + try { |
| 793 | + while ((token = tokenStream.next()) != null) { |
737 | 794 | ret.add(token); |
738 | 795 | } |
739 | | - } catch (IOException e){ |
| 796 | + } catch (IOException e) { |
740 | 797 | e.printStackTrace(); |
741 | 798 | } |
742 | 799 | return ret; |
743 | 800 | } |
744 | | - |
745 | | - |
| 801 | + |
746 | 802 | 	/** Make term from Lucene token */ |
747 | | - private Term makeTerm(Token token){ |
| 803 | + private Term makeTerm(Token token) { |
748 | 804 | return makeTerm(token.termText()); |
749 | 805 | } |
750 | | - |
| 806 | + |
751 | 807 | /** Make term from <code>buffer</code> */ |
752 | | - private Term makeTerm(){ |
753 | | - return makeTerm(new String(buffer,0,length)); |
| 808 | + private Term makeTerm() { |
| 809 | + return makeTerm(new String(buffer, 0, length)); |
754 | 810 | } |
755 | | - |
| 811 | + |
756 | 812 | /** Make a lucene term from string */ |
757 | | - private Term makeTerm(String t){ |
758 | | - |
759 | | - |
760 | | - if(currentField == null) |
761 | | - return new Term(defaultField,builder.isExactCase()? t : t.toLowerCase()); |
762 | | - else if(defaultField.equals("contents") && isInTitle) |
763 | | - return new Term("title",builder.isExactCase()? t : t.toLowerCase()); |
764 | | - else if(currentField.equals("incategory")){ |
765 | | - String norm = t.replace("_"," "); // bug 10822 |
766 | | - return new Term("category",builder.isExactCase()? norm : norm.toLowerCase()); |
767 | | - } else if( keywordFieldMapping.containsKey(currentField) ) { |
| 813 | + private Term makeTerm(String t) { |
| 814 | + |
| 815 | + if (currentField == null) |
| 816 | + return new Term(defaultField, builder.isExactCase() ? t |
| 817 | + : t.toLowerCase()); |
| 818 | + else if (defaultField.equals("contents") && isInTitle) |
| 819 | + return new Term("title", builder.isExactCase() ? t |
| 820 | + : t.toLowerCase()); |
| 821 | + else if (currentField.equals("incategory")) { |
| 822 | + String norm = t.replace("_", " "); // bug 10822 |
| 823 | + return new Term("category", builder.isExactCase() ? norm |
| 824 | + : norm.toLowerCase()); |
| 825 | + } else if (keywordFieldMapping.containsKey(currentField)) { |
768 | 826 | String field = keywordFieldMapping.get(currentField); |
769 | | - |
| 827 | + |
770 | 828 | return new Term(field, t); |
771 | | - } else if(!"incategory".equals(currentField) && |
772 | | - (namespacePolicy == NamespacePolicy.IGNORE || |
773 | | - namespacePolicy == NamespacePolicy.REWRITE)) |
774 | | - return new Term(defaultField,t); |
| 829 | + } else if (!"incategory".equals(currentField) |
| 830 | + && (namespacePolicy == NamespacePolicy.IGNORE || namespacePolicy == NamespacePolicy.REWRITE)) |
| 831 | + return new Term(defaultField, t); |
775 | 832 | else |
776 | | - return new Term(currentField,t); |
| 833 | + return new Term(currentField, t); |
777 | 834 | } |
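
`makeTerm()` is where per-field quirks are applied: `incategory` terms land on the `category` field with underscores turned into spaces (bug 10822), the keyword operators registered in the constructor (`inthread`, `ondiscussionpage`) map onto their raw fields with case preserved, and everything else falls back to the default field, lowercased unless the builder is exact-case. Illustrative terms only, with invented values:

```java
// incategory:"Living_people"   ->  category:"living people"
Term category = new Term("category", "living people");
// ondiscussionpage:Foo         ->  ThreadPage:Foo   (keyword mapping, case kept)
Term threadPage = new Term("ThreadPage", "Foo");
// plain word under the default field, lowercased unless exact-case
Term contents = new Term("contents", "foo");
```
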
778 | | - |
779 | | - /** |
| 835 | + |
| 836 | + /** |
780 | 837 | * Parses a phrase query (i.e. between ""), the cur |
781 | | - * should be set to the char just after the first |
782 | | - * quotation mark |
783 | | - * |
| 838 | + * should be set to the char just after the first |
| 839 | + * quotation mark |
| 840 | + * |
784 | 841 | * @return a query, or null if the query is empty |
785 | 842 | */ |
786 | | - private Query parsePhrase(){ |
787 | | - // special case for incategory |
788 | | - if(currentField!=null && currentField.equals("incategory")){ |
| 843 | + private Query parsePhrase() { |
| 844 | + // special case for incategory |
| 845 | + if (currentField != null && currentField.equals("incategory")) { |
789 | 846 | length = 0; |
790 | | - for(; cur < queryLength ; cur++ ){ |
791 | | - if(text[cur] == '"') |
| 847 | + for (; cur < queryLength; cur++) { |
| 848 | + if (text[cur] == '"') |
792 | 849 | break; |
793 | | - else if(length < buffer.length) |
| 850 | + else if (length < buffer.length) |
794 | 851 | buffer[length++] = text[cur]; |
795 | 852 | } |
796 | | - if(length > 0){ |
| 853 | + if (length > 0) { |
797 | 854 | // no tokenization, we want whole category name |
798 | 855 | return new TermQuery(makeTerm()); |
799 | 856 | } |
800 | 857 | return null; |
801 | | - } |
802 | | - //PositionalMultiQuery query = new PositionalMultiQuery(new PositionalOptions.PhraseQueryFallback()); |
| 858 | + } |
| 859 | + // PositionalMultiQuery query = new PositionalMultiQuery(new |
| 860 | + // PositionalOptions.PhraseQueryFallback()); |
803 | 861 | MultiPhraseQuery query = new MultiPhraseQuery(); |
804 | | - for(; cur < queryLength ; cur++ ){ |
| 862 | + for (; cur < queryLength; cur++) { |
805 | 863 | length = 0; |
806 | 864 | // fetch next word |
807 | | - while(cur<queryLength && isTermChar(text[cur]) && length<buffer.length){ |
| 865 | + while (cur < queryLength && isTermChar(text[cur]) |
| 866 | + && length < buffer.length) { |
808 | 867 | buffer[length++] = text[cur++]; |
809 | 868 | } |
810 | | - |
| 869 | + |
811 | 870 | // add to phrase |
812 | | - if(length > 0){ |
| 871 | + if (length > 0) { |
813 | 872 | boolean added = false; |
814 | | - if(bufferIsWildCard()){ |
| 873 | + if (bufferIsWildCard()) { |
815 | 874 | Term term = makeTerm(); |
816 | | - Term[] terms = wildcards.makeTerms(term.text(),term.field()); |
817 | | - if(terms != null){ |
| 875 | + Term[] terms = wildcards.makeTerms(term.text(), |
| 876 | + term.field()); |
| 877 | + if (terms != null) { |
818 | 878 | query.add(terms); |
819 | | - ArrayList<String> words = wildcards.getWords(term.text()); |
820 | | - parsedWords.add(term.text(),words,1f,ExpandedType.WILDCARD); |
| 879 | + ArrayList<String> words = wildcards.getWords(term |
| 880 | + .text()); |
| 881 | + parsedWords.add(term.text(), words, 1f, |
| 882 | + ExpandedType.WILDCARD); |
821 | 883 | added = true; |
822 | 884 | } |
823 | 885 | } |
824 | | - if(bufferIsFuzzy()){ |
| 886 | + if (bufferIsFuzzy()) { |
825 | 887 | Term term = makeTerm(); |
826 | 888 | NamespaceFilter nsf = getNamespaceFilter(currentField); |
827 | | - Term[] terms = fuzzy.makeTerms(term.text(),term.field(),nsf); |
828 | | - if(terms != null){ |
829 | | - //query.add(terms,fuzzy.getBoosts(term.text(),nsf,terms)); |
| 889 | + Term[] terms = fuzzy.makeTerms(term.text(), term.field(), |
| 890 | + nsf); |
| 891 | + if (terms != null) { |
| 892 | + // query.add(terms,fuzzy.getBoosts(term.text(),nsf,terms)); |
830 | 893 | query.add(terms); |
831 | | - ArrayList<String> words = fuzzy.getWords(term.text(),nsf); |
832 | | - parsedWords.add(term.text(),words,fuzzy.getBoosts(term.text(),nsf,words),ExpandedType.FUZZY); |
| 894 | + ArrayList<String> words = fuzzy.getWords(term.text(), |
| 895 | + nsf); |
| 896 | + parsedWords.add(term.text(), words, |
| 897 | + fuzzy.getBoosts(term.text(), nsf, words), |
| 898 | + ExpandedType.FUZZY); |
833 | 899 | added = true; |
834 | 900 | } |
835 | 901 | } |
836 | | - if(!added){ |
| 902 | + if (!added) { |
837 | 903 | // fallback to ordinary words |
838 | 904 | analyzeBuffer(); |
839 | | - for(Token token : tokens){ |
840 | | - if(token.getPositionIncrement()>0){ // ignore aliases and stemmed words |
| 905 | + for (Token token : tokens) { |
| 906 | + if (token.getPositionIncrement() > 0) { // ignore |
| 907 | + // aliases and |
| 908 | + // stemmed words |
841 | 909 | Term t = makeTerm(token); |
842 | | - addToWords(t,1,ExpandedType.PHRASE); |
| 910 | + addToWords(t, 1, ExpandedType.PHRASE); |
843 | 911 | query.add(t); |
844 | 912 | } |
845 | | - } |
| 913 | + } |
846 | 914 | } |
847 | | - } |
| 915 | + } |
848 | 916 | // end of phrase query |
849 | | - if(cur < queryLength && text[cur] == '"') |
| 917 | + if (cur < queryLength && text[cur] == '"') |
850 | 918 | break; |
851 | 919 | } |
852 | | - if(query.getPositions().length > 0){ |
| 920 | + if (query.getPositions().length > 0) { |
853 | 921 | query.setBoost(defaultBoost); |
854 | 922 | return query; |
855 | 923 | } else |
856 | 924 | return null; |
857 | 925 | } |
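
For a quoted phrase with no wildcard or fuzzy terms, the fallback branch above adds one term per analyzed token, so the result is equivalent to building the `MultiPhraseQuery` (the same class the method instantiates) by hand. A minimal sketch; the field name and words are illustrative:

```java
// roughly what parsePhrase() produces for  "free software"  on the contents field
MultiPhraseQuery phrase = new MultiPhraseQuery();
phrase.add(new Term("contents", "free"));
phrase.add(new Term("contents", "software"));
phrase.setBoost(1f); // defaultBoost
```
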
858 | | - |
859 | | - final private Query parseClause(int level){ |
860 | | - return parseClause(level,false,null); |
| 926 | + |
| 927 | + final private Query parseClause(int level) { |
| 928 | + return parseClause(level, false, null); |
861 | 929 | } |
862 | | - |
863 | | - private final boolean needsRewrite(){ |
864 | | - return namespaceRewriteQuery != null && namespacePolicy == NamespacePolicy.REWRITE; |
| 930 | + |
| 931 | + private final boolean needsRewrite() { |
| 932 | + return namespaceRewriteQuery != null |
| 933 | + && namespacePolicy == NamespacePolicy.REWRITE; |
865 | 934 | } |
866 | | - |
867 | | - /** Parses a clause: (in regexp-like notation) |
| 935 | + |
| 936 | + /** |
| 937 | + * Parses a clause: (in regexp-like notation) |
| 938 | + * Clause := ([+-]? (<field>:)? <term> | [AND,OR] | \( Clause \) )+ |
868 | 939 | * |
869 | | - * Clause := ([+-]? (<field>:)? <term> | [AND,OR] | \( Clause \) )+ |
870 | | - * |
871 | | - * @param level - level of recurstion |
872 | | - * @param returnOnFieldDef - if this is a nested field rewrite call |
| 940 | + * @param level |
 | 941 | +	 *            - level of recursion |
| 942 | + * @param returnOnFieldDef |
| 943 | + * - if this is a nested field rewrite call |
873 | 944 | * @return |
874 | 945 | */ |
875 | | - private Query parseClause(int level, boolean returnOnFieldDef, String topFieldName){ |
| 946 | + private Query parseClause(int level, boolean returnOnFieldDef, |
| 947 | + String topFieldName) { |
876 | 948 | // the whole query |
877 | | - Query query = null; |
| 949 | + Query query = null; |
878 | 950 | // reference to boolean query if one is constructed |
879 | 951 | BooleanQuery boolquery = null; |
880 | 952 | BooleanClause.Occur occur = boolDefault; |
881 | 953 | // the first query |
882 | | - BooleanClause.Occur firstOccur = boolDefault; |
| 954 | + BooleanClause.Occur firstOccur = boolDefault; |
883 | 955 | // state |
884 | 956 | TokenType tokenType; |
885 | | - Query subquery = null; |
| 957 | + Query subquery = null; |
886 | 958 | boolean definedField = false; |
887 | 959 | boolean definedExplicitField = false; |
888 | 960 | Query fieldQuery = null; // the namespace term, e.g. namespace:0 |
889 | | - Query fieldsubquery = null; // e.g. 'all:something else' will be parsed 'something else' |
890 | | - |
| 961 | + Query fieldsubquery = null; // e.g. 'all:something else' will be parsed |
| 962 | + // 'something else' |
| 963 | + |
891 | 964 | // assume default namespace value on rewrite |
892 | | - if(!returnOnFieldDef && currentField == null && needsRewrite()){ |
893 | | - fieldQuery = namespaceRewriteQuery; |
| 965 | + if (!returnOnFieldDef && currentField == null && needsRewrite()) { |
| 966 | + fieldQuery = namespaceRewriteQuery; |
894 | 967 | } |
895 | | - |
896 | | - mainloop: for( ; cur < queryLength; cur++ ){ |
| 968 | + |
| 969 | + mainloop: for (; cur < queryLength; cur++) { |
897 | 970 | c = text[cur]; |
898 | | - |
899 | | - if(c == ' ') |
| 971 | + |
| 972 | + if (c == ' ') |
900 | 973 | continue; |
901 | | - |
| 974 | + |
902 | 975 | // terms, fields |
903 | | - if(Character.isLetterOrDigit(c) || c=='.' || c == '[' || c=='*'){ |
| 976 | + if (Character.isLetterOrDigit(c) || c == '.' || c == '[' |
| 977 | + || c == '*') { |
904 | 978 | // check for generic namespace prefixes, e.g. [0,1]: |
905 | | - if(c == '['){ |
906 | | - if(fetchGenericPrefix()) |
| 979 | + if (c == '[') { |
| 980 | + if (fetchGenericPrefix()) |
907 | 981 | tokenType = TokenType.FIELD; |
908 | 982 | else |
909 | 983 | continue; |
910 | | - } else // fetch next token |
| 984 | + } else |
| 985 | + // fetch next token |
911 | 986 | tokenType = fetchToken(); |
912 | | - |
913 | | - switch(tokenType){ |
| 987 | + |
| 988 | + switch (tokenType) { |
914 | 989 | case FIELD: |
915 | 990 | // this is where the function returns if called from the |
916 | 991 | // next if (i.e. some 10 lines down) |
917 | | - if(returnOnFieldDef){ |
918 | | - String newfield = new String(buffer,0,length); |
919 | | - if(!newfield.equals("incategory") && !newfield.equals(topFieldName)){ |
920 | | - backToken(); cur--; |
| 992 | + if (returnOnFieldDef) { |
| 993 | + String newfield = new String(buffer, 0, length); |
| 994 | + if (!newfield.equals("incategory") |
| 995 | + && !newfield.equals(topFieldName)) { |
| 996 | + backToken(); |
| 997 | + cur--; |
921 | 998 | break mainloop; |
922 | 999 | } |
923 | 1000 | } |
924 | | - if(currentField == null || definedExplicitField){ |
| 1001 | + if (currentField == null || definedExplicitField) { |
925 | 1002 | // set field name |
926 | | - currentField = new String(buffer,0,length); |
927 | | - if("intitle".equals(currentField)){ |
| 1003 | + currentField = new String(buffer, 0, length); |
| 1004 | + if ("intitle".equals(currentField)) { |
928 | 1005 | isInTitle = true; |
929 | 1006 | isInTitleLevel = level; |
930 | 1007 | } |
931 | | - if((defaultNamespaceName!=null && currentField.equals(defaultNamespaceName)) || currentField.equals(defaultField)){ |
| 1008 | + if ((defaultNamespaceName != null && currentField |
| 1009 | + .equals(defaultNamespaceName)) |
| 1010 | + || currentField.equals(defaultField)) { |
932 | 1011 | currentField = null; |
933 | 1012 | break; // repeated definition of field, ignore |
934 | 1013 | } |
935 | 1014 | definedExplicitField = true; |
936 | | - |
937 | | - fieldQuery = getNamespaceQuery(currentField); // depending on policy rewrite this field |
938 | | - if(fieldQuery != null){ |
939 | | - // save field, we will need it to be set to null to fetch categories |
| 1015 | + |
| 1016 | + fieldQuery = getNamespaceQuery(currentField); // depending |
| 1017 | + // on |
| 1018 | + // policy |
| 1019 | + // rewrite |
| 1020 | + // this |
| 1021 | + // field |
| 1022 | + if (fieldQuery != null) { |
| 1023 | + // save field, we will need it to be set to null to |
| 1024 | + // fetch categories |
940 | 1025 | String myfield = currentField; |
941 | 1026 | currentField = null; |
942 | 1027 | // fetch the clause until the next field |
943 | | - fieldsubquery = parseClause(level+1,true,myfield); |
| 1028 | + fieldsubquery = parseClause(level + 1, true, |
| 1029 | + myfield); |
944 | 1030 | currentField = myfield; |
945 | 1031 | } |
946 | | - } else{ |
| 1032 | + } else { |
947 | 1033 | // nested field names, don't allow, just add to query |
948 | 1034 | analyzeBuffer(); |
949 | 1035 | subquery = makeQueryFromTokens(occur); |
950 | 1036 | } |
951 | 1037 | break; |
952 | 1038 | case WORD: |
953 | | - if(fieldQuery != null){ |
| 1039 | + if (fieldQuery != null) { |
954 | 1040 | backToken(); |
955 | | - String myfield = (topFieldName != null)? topFieldName : (currentField !=null)? currentField : (defaultNamespaceName!=null)? defaultNamespaceName : defaultField; |
956 | | - fieldsubquery = parseClause(level+1,true,myfield); |
957 | | - } else{ |
| 1041 | + String myfield = (topFieldName != null) ? topFieldName |
| 1042 | + : (currentField != null) ? currentField |
| 1043 | + : (defaultNamespaceName != null) ? defaultNamespaceName |
| 1044 | + : defaultField; |
| 1045 | + fieldsubquery = parseClause(level + 1, true, myfield); |
| 1046 | + } else { |
958 | 1047 | analyzeBuffer(); |
959 | | - subquery = makeQueryFromTokens(explicitOccur!=null? explicitOccur : occur); |
| 1048 | + subquery = makeQueryFromTokens(explicitOccur != null ? explicitOccur |
| 1049 | + : occur); |
960 | 1050 | } |
961 | 1051 | break; |
962 | 1052 | case AND: |
963 | 1053 | firstOccur = BooleanClause.Occur.MUST; |
964 | 1054 | occur = BooleanClause.Occur.MUST; |
965 | | - if(returnOnFieldDef) |
| 1055 | + if (returnOnFieldDef) |
966 | 1056 | explicitOccur = BooleanClause.Occur.MUST; |
967 | 1057 | continue; |
968 | 1058 | case OR: |
969 | 1059 | firstOccur = BooleanClause.Occur.SHOULD; |
970 | 1060 | occur = BooleanClause.Occur.SHOULD; |
971 | | - if(returnOnFieldDef) |
| 1061 | + if (returnOnFieldDef) |
972 | 1062 | explicitOccur = BooleanClause.Occur.SHOULD; |
973 | 1063 | continue; |
974 | 1064 | case EOF: |
975 | | - break mainloop; |
976 | | - } |
| 1065 | + break mainloop; |
| 1066 | + } |
977 | 1067 | } |
978 | | - |
| 1068 | + |
979 | 1069 | // field subquery, the fetched clause while doing rewriting |
980 | | - if(fieldsubquery != null){ |
| 1070 | + if (fieldsubquery != null) { |
981 | 1071 | // this is not the first field definition at this level
982 | | - if(definedField){ |
| 1072 | + if (definedField) { |
983 | 1073 | // embed the old query |
984 | 1074 | BooleanQuery bq = new BooleanQuery(); |
985 | | - bq.add(query,BooleanClause.Occur.SHOULD); |
| 1075 | + bq.add(query, BooleanClause.Occur.SHOULD); |
986 | 1076 | query = boolquery = bq; |
987 | 1077 | } |
988 | | - |
| 1078 | + |
989 | 1079 | BooleanQuery bq = new BooleanQuery(); |
990 | | - bq.add(fieldQuery,BooleanClause.Occur.MUST); |
991 | | - bq.add(fieldsubquery,BooleanClause.Occur.MUST); |
992 | | - |
| 1080 | + bq.add(fieldQuery, BooleanClause.Occur.MUST); |
| 1081 | + bq.add(fieldsubquery, BooleanClause.Occur.MUST); |
| 1082 | + |
993 | 1083 | // add to existing queries |
994 | | - if(boolquery != null) |
995 | | - boolquery.add(bq,BooleanClause.Occur.SHOULD); |
996 | | - else if(query != null){ |
| 1084 | + if (boolquery != null) |
| 1085 | + boolquery.add(bq, BooleanClause.Occur.SHOULD); |
| 1086 | + else if (query != null) { |
997 | 1087 | boolquery = new BooleanQuery(); |
998 | | - boolquery.add(query,firstOccur); |
999 | | - boolquery.add(bq,BooleanClause.Occur.SHOULD); |
| 1088 | + boolquery.add(query, firstOccur); |
| 1089 | + boolquery.add(bq, BooleanClause.Occur.SHOULD); |
1000 | 1090 | query = boolquery; |
1001 | 1091 | } else |
1002 | 1092 | query = bq; |
1003 | | - |
| 1093 | + |
1004 | 1094 | fieldQuery = null; |
1005 | 1095 | definedField = true; |
1006 | 1096 | fieldsubquery = null; |
1007 | 1097 | } |
1008 | | - |
| 1098 | + |
1009 | 1099 | // modifiers |
1010 | | - switch(c){ |
| 1100 | + switch (c) { |
1011 | 1101 | case '+': |
1012 | 1102 | occur = BooleanClause.Occur.MUST; |
1013 | | - if(returnOnFieldDef) |
1014 | | - explicitOccur = BooleanClause.Occur.MUST; |
| 1103 | + if (returnOnFieldDef) |
| 1104 | + explicitOccur = BooleanClause.Occur.MUST; |
1015 | 1105 | continue; |
1016 | 1106 | case '-': |
1017 | 1107 | occur = BooleanClause.Occur.MUST_NOT; |
1018 | | - if(returnOnFieldDef) |
| 1108 | + if (returnOnFieldDef) |
1019 | 1109 | explicitOccur = BooleanClause.Occur.MUST_NOT; |
1020 | 1110 | continue; |
1021 | 1111 | case '"': |
— | — | @@ -1023,12 +1113,12 @@ |
1024 | 1114 | break; |
1025 | 1115 | case '(': |
1026 | 1116 | cur++; |
1027 | | - subquery = parseClause(level+1); |
| 1117 | + subquery = parseClause(level + 1); |
1028 | 1118 | break; |
1029 | 1119 | case ')': |
1030 | | - if(level > 0){ |
| 1120 | + if (level > 0) { |
1031 | 1121 | // get out of titles on appropriate level of parenthesis |
1032 | | - if(isInTitle && level <= isInTitleLevel) |
| 1122 | + if (isInTitle && level <= isInTitleLevel) |
1033 | 1123 | isInTitle = false; |
1034 | 1124 | break mainloop; |
1035 | 1125 | } |
— | — | @@ -1036,23 +1126,22 @@ |
1037 | 1127 | } |
1038 | 1128 | |
1039 | 1129 | // if we fetched some tokens or a subquery add it to main query |
1040 | | - if(subquery != null){ |
1041 | | - if(query == null){ |
| 1130 | + if (subquery != null) { |
| 1131 | + if (query == null) { |
1042 | 1132 | query = subquery; |
1043 | 1133 | firstOccur = occur; // save the boolean modifier |
1044 | 1134 | occur = boolDefault; // return to default |
1045 | | - } |
1046 | | - else{ |
1047 | | - if(explicitOccur != null) |
| 1135 | + } else { |
| 1136 | + if (explicitOccur != null) |
1048 | 1137 | occur = explicitOccur; |
1049 | | - if(boolquery == null){ |
| 1138 | + if (boolquery == null) { |
1050 | 1139 | // we have found the second term, make boolean query |
1051 | 1140 | boolquery = new BooleanQuery(); |
1052 | | - boolquery.add(query,firstOccur); |
1053 | | - boolquery.add(subquery,occur); |
| 1141 | + boolquery.add(query, firstOccur); |
| 1142 | + boolquery.add(subquery, occur); |
1054 | 1143 | query = boolquery; |
1055 | | - } else{ |
1056 | | - boolquery.add(subquery,occur); |
| 1144 | + } else { |
| 1145 | + boolquery.add(subquery, occur); |
1057 | 1146 | } |
1058 | 1147 | occur = boolDefault; // return to default |
1059 | 1148 | explicitOccur = null; |
— | — | @@ -1060,193 +1149,199 @@ |
1061 | 1150 | subquery = null; |
1062 | 1151 | } |
1063 | 1152 | } |
1064 | | - |
1065 | | - if(definedExplicitField) |
| 1153 | + |
| 1154 | + if (definedExplicitField) |
1066 | 1155 | currentField = null; |
1067 | 1156 | return query; |
1068 | 1157 | } |
1069 | | - |
1070 | | - /** |
| 1158 | + |
| 1159 | + /** |
1071 | 1160 | * return true if buffer is wildcard |
1072 | | - * the only allowed patterns are *q and q* and not other combinations like *q* or q*r |
1073 | | - * |
| 1161 | + * the only allowed patterns are *q and q* and no other combinations like
| 1162 | + * *q* or q*r |
1074 | 1163 | */ |
1075 | | - private boolean bufferIsWildCard(){ |
1076 | | - if(length < 2) |
| 1164 | + private boolean bufferIsWildCard() { |
| 1165 | + if (length < 2) |
1077 | 1166 | return false; |
1078 | 1167 | boolean wild = false; |
1079 | 1168 | int index = -1; |
1080 | 1169 | // only allow '*' at the beginning and end
1081 | | - if(buffer[0] == '*'){ |
| 1170 | + if (buffer[0] == '*') { |
1082 | 1171 | index = 0; |
1083 | 1172 | wild = true; |
1084 | | - } else if( buffer[length-1] == '*' ){ |
1085 | | - index = length-1; |
| 1173 | + } else if (buffer[length - 1] == '*') { |
| 1174 | + index = length - 1; |
1086 | 1175 | wild = true; |
1087 | 1176 | } |
1088 | 1177 | |
1089 | 1178 | // check if it's a valid wildcard |
1090 | | - if(wild){ |
| 1179 | + if (wild) { |
1091 | 1180 | // check if this is the only asterix |
1092 | | - for(int i=0;i<length;i++){ |
1093 | | - if( i!= index && buffer[i] == '*'){ |
| 1181 | + for (int i = 0; i < length; i++) { |
| 1182 | + if (i != index && buffer[i] == '*') { |
1094 | 1183 | return false; // more than one '*' |
1095 | 1184 | } |
1096 | 1185 | } |
1097 | | - |
| 1186 | + |
1098 | 1187 | // require at least one letter besides the wildcard sign |
1099 | | - for(int i=0;i<length;i++){ |
1100 | | - if(Character.isLetterOrDigit(buffer[i])) |
| 1188 | + for (int i = 0; i < length; i++) { |
| 1189 | + if (Character.isLetterOrDigit(buffer[i])) |
1101 | 1190 | return true; // found it! |
1102 | 1191 | } |
1103 | 1192 | } |
1104 | 1193 | return false; |
1105 | 1194 | } |
1106 | | - |
1107 | | - private boolean bufferIsFuzzy(){ |
1108 | | - return length>1 && (buffer[0]=='~' || buffer[length-1]=='~'); |
| 1195 | + |
| 1196 | + private boolean bufferIsFuzzy() { |
| 1197 | + return length > 1 && (buffer[0] == '~' || buffer[length - 1] == '~'); |
1109 | 1198 | } |
1110 | | - |
1111 | | - private boolean bufferContains(char c){ |
1112 | | - for(int i=0;i<length;i++){ |
1113 | | - if(buffer[i] == c) |
| 1199 | + |
| 1200 | + private boolean bufferContains(char c) { |
| 1201 | + for (int i = 0; i < length; i++) { |
| 1202 | + if (buffer[i] == c) |
1114 | 1203 | return true; |
1115 | 1204 | } |
1116 | 1205 | return false; |
1117 | 1206 | } |
1118 | | - |
1119 | | - private void addToWords(Term t){ |
1120 | | - addToWords(t,1,ExpandedType.WORD); |
| 1207 | + |
| 1208 | + private void addToWords(Term t) { |
| 1209 | + addToWords(t, 1, ExpandedType.WORD); |
1121 | 1210 | } |
1122 | | - private void addToWords(Term t, float boost, ExpandedType type){ |
1123 | | - parsedWords.add(t.text(),t.text(),boost,type); |
| 1211 | + |
| 1212 | + private void addToWords(Term t, float boost, ExpandedType type) { |
| 1213 | + parsedWords.add(t.text(), t.text(), boost, type); |
1124 | 1214 | } |
1125 | | - |
1126 | | - private void addToWordsAsAlias(Token t){ |
| 1215 | + |
| 1216 | + private void addToWordsAsAlias(Token t) { |
1127 | 1217 | float boost = STEM_WORD_BOOST; |
1128 | | - if(t.type().equals("singular")) |
| 1218 | + if (t.type().equals("singular")) |
1129 | 1219 | boost = SINGULAR_WORD_BOOST; |
1130 | | - parsedWords.last().add(new WordBoost(t.termText(),boost)); |
| 1220 | + parsedWords.last().add(new WordBoost(t.termText(), boost)); |
1131 | 1221 | } |
1132 | | - |
1133 | | - /** |
| 1222 | + |
| 1223 | + /** |
1134 | 1224 | * Constructs either a termquery or a boolean query depending on |
1135 | 1225 | * analysis of the fetched token. A single "word" might be analyzed |
1136 | | - * into many tokens, and some of them might be aliases |
| 1226 | + * into many tokens, and some of them might be aliases |
| 1227 | + * |
1137 | 1228 | * @return |
1138 | 1229 | */ |
1139 | | - private Query makeQueryFromTokens(BooleanClause.Occur toplevelOccur){ |
| 1230 | + private Query makeQueryFromTokens(BooleanClause.Occur toplevelOccur) { |
1140 | 1231 | BooleanQuery bq = null; |
1141 | 1232 | TermQuery t; |
1142 | 1233 | boolean addAliases = true; |
1143 | | - |
| 1234 | + |
1144 | 1235 | // check for urls |
1145 | | - Matcher urlMatcher = urlPattern.matcher(new String(buffer,0,length)); |
1146 | | - while(bufferContains('.') && urlMatcher.find()){ |
| 1236 | + Matcher urlMatcher = urlPattern.matcher(new String(buffer, 0, length)); |
| 1237 | + while (bufferContains('.') && urlMatcher.find()) { |
1147 | 1238 | ArrayList<Token> urlTokens = analyzeString(urlMatcher.group()); |
1148 | 1239 | ArrayList<Term> urlTerms = new ArrayList<Term>(); |
1149 | | - for(Token tt : urlTokens) |
| 1240 | + for (Token tt : urlTokens) |
1150 | 1241 | urlTerms.add(makeTerm(tt.termText())); |
1151 | | - urls.add(urlTerms); |
| 1242 | + urls.add(urlTerms); |
1152 | 1243 | } |
1153 | | - |
| 1244 | + |
1154 | 1245 | // categories should not be analyzed |
1155 | | - if(currentField != null && currentField.equals("incategory")){ |
| 1246 | + if (currentField != null && currentField.equals("incategory")) { |
1156 | 1247 | return new TermQuery(makeTerm()); |
1157 | 1248 | } |
1158 | | - |
1159 | | - // check for wildcard seaches, they are also not analyzed/stemmed, only for titles |
1160 | | - // wildcard signs are allowed only at the end of the word, minimum one letter word |
1161 | | - if(length>1 && wildcards != null && bufferIsWildCard()){ |
1162 | | - Term term = makeTerm(); |
1163 | | - Query ret = wildcards.makeQuery(term.text(),term.field()); |
1164 | | - if(ret != null){ |
| 1249 | + |
| 1250 | + // check for wildcard searches, they are also not analyzed/stemmed, only
| 1251 | + // for titles |
| 1252 | + // wildcard signs are allowed only at the end of the word, minimum one |
| 1253 | + // letter word |
| 1254 | + if (length > 1 && wildcards != null && bufferIsWildCard()) { |
| 1255 | + Term term = makeTerm(); |
| 1256 | + Query ret = wildcards.makeQuery(term.text(), term.field()); |
| 1257 | + if (ret != null) { |
1165 | 1258 | ArrayList<String> words = wildcards.getWords(term.text()); |
1166 | | - parsedWords.add(term.text(),words,1,ExpandedType.WILDCARD); |
| 1259 | + parsedWords.add(term.text(), words, 1, ExpandedType.WILDCARD); |
1167 | 1260 | ret.setBoost(WILDCARD_BOOST); |
1168 | 1261 | return ret; |
1169 | | - } else{ |
| 1262 | + } else { |
1170 | 1263 | // something is wrong, try making normal query |
1171 | 1264 | addToWords(term); |
1172 | 1265 | return new TermQuery(term); |
1173 | 1266 | } |
1174 | 1267 | } |
1175 | 1268 | // parse fuzzy queries |
1176 | | - if(length>1 && fuzzy != null && bufferIsFuzzy()){ |
| 1269 | + if (length > 1 && fuzzy != null && bufferIsFuzzy()) { |
1177 | 1270 | Term term = makeTerm(); |
1178 | | - String termText = term.text().replaceAll("~",""); |
| 1271 | + String termText = term.text().replaceAll("~", ""); |
1179 | 1272 | NamespaceFilter nsf = getNamespaceFilter(currentField); |
1180 | | - Query ret = fuzzy.makeQuery(termText,term.field(),nsf); |
1181 | | - if(ret != null){ |
1182 | | - ArrayList<String> words = fuzzy.getWords(termText,nsf); |
1183 | | - parsedWords.add(term.text(),words,fuzzy.getBoosts(termText,nsf,words),ExpandedType.FUZZY); |
| 1273 | + Query ret = fuzzy.makeQuery(termText, term.field(), nsf); |
| 1274 | + if (ret != null) { |
| 1275 | + ArrayList<String> words = fuzzy.getWords(termText, nsf); |
| 1276 | + parsedWords.add(term.text(), words, |
| 1277 | + fuzzy.getBoosts(termText, nsf, words), |
| 1278 | + ExpandedType.FUZZY); |
1184 | 1279 | ret.setBoost(FUZZY_BOOST); |
1185 | 1280 | return ret; |
1186 | 1281 | } |
1187 | 1282 | } |
1188 | | - |
1189 | | - if(toplevelOccur == BooleanClause.Occur.MUST_NOT) |
| 1283 | + |
| 1284 | + if (toplevelOccur == BooleanClause.Occur.MUST_NOT) |
1190 | 1285 | addAliases = false; |
1191 | 1286 | |
1192 | | - if(tokens.size() == 1){ |
| 1287 | + if (tokens.size() == 1) { |
1193 | 1288 | t = new TermQuery(makeTerm(tokens.get(0))); |
1194 | 1289 | t.setBoost(defaultBoost); |
1195 | | - if(toplevelOccur != Occur.MUST_NOT) |
| 1290 | + if (toplevelOccur != Occur.MUST_NOT) |
1196 | 1291 | addToWords(t.getTerm()); |
1197 | 1292 | return t; |
1198 | | - } else{ |
| 1293 | + } else { |
1199 | 1294 | // make a nested boolean query |
1200 | 1295 | ArrayList<BooleanQuery> queries = new ArrayList<BooleanQuery>(); |
1201 | 1296 | ArrayList<Token> aliases = new ArrayList<Token>(); |
1202 | | - for(int i=0; i<tokens.size(); i++){ |
| 1297 | + for (int i = 0; i < tokens.size(); i++) { |
1203 | 1298 | BooleanQuery query = new BooleanQuery(); |
1204 | 1299 | // main token |
1205 | 1300 | Token token = tokens.get(i); |
1206 | 1301 | t = new TermQuery(makeTerm(token)); |
1207 | 1302 | t.setBoost(defaultBoost); |
1208 | | - if(toplevelOccur != Occur.MUST_NOT) |
| 1303 | + if (toplevelOccur != Occur.MUST_NOT) |
1209 | 1304 | addToWords(t.getTerm()); |
1210 | | - query.add(t,Occur.SHOULD); |
| 1305 | + query.add(t, Occur.SHOULD); |
1211 | 1306 | // group aliases together |
1212 | 1307 | aliases.clear(); |
1213 | | - for(int j=i+1;j<tokens.size();j++){ |
1214 | | - if(tokens.get(j).getPositionIncrement() == 0){ |
| 1308 | + for (int j = i + 1; j < tokens.size(); j++) { |
| 1309 | + if (tokens.get(j).getPositionIncrement() == 0) { |
1215 | 1310 | aliases.add(tokens.get(j)); |
1216 | 1311 | i = j; |
1217 | 1312 | } else |
1218 | 1313 | break; |
1219 | | - } |
1220 | | - if(addAliases){ |
1221 | | - for(Token alias : aliases){ |
| 1314 | + } |
| 1315 | + if (addAliases) { |
| 1316 | + for (Token alias : aliases) { |
1222 | 1317 | t = new TermQuery(makeTerm(alias)); |
1223 | | - t.setBoost(defaultAliasBoost*defaultBoost); |
1224 | | - query.add(t,Occur.SHOULD); |
| 1318 | + t.setBoost(defaultAliasBoost * defaultBoost); |
| 1319 | + query.add(t, Occur.SHOULD); |
1225 | 1320 | addToWordsAsAlias(alias); |
1226 | 1321 | } |
1227 | 1322 | } |
1228 | 1323 | queries.add(query); |
1229 | 1324 | } |
1230 | 1325 | // don't return nested if only one query
1231 | | - if(queries.size() == 1){ |
1232 | | - BooleanQuery q = (BooleanQuery)queries.get(0); |
| 1326 | + if (queries.size() == 1) { |
| 1327 | + BooleanQuery q = (BooleanQuery) queries.get(0); |
1233 | 1328 | // one nested clause |
1234 | | - if(q.getClauses().length == 1) |
| 1329 | + if (q.getClauses().length == 1) |
1235 | 1330 | return q.getClauses()[0].getQuery(); |
1236 | 1331 | return queries.get(0); |
1237 | 1332 | } |
1238 | 1333 | // multiple tokens, e.g. super-hero -> +super +hero |
1239 | 1334 | bq = new BooleanQuery(); |
1240 | | - for(BooleanQuery q : queries){ |
1241 | | - if(q.getClauses().length == 1) |
1242 | | - bq.add(q.getClauses()[0].getQuery(),boolDefault); |
| 1335 | + for (BooleanQuery q : queries) { |
| 1336 | + if (q.getClauses().length == 1) |
| 1337 | + bq.add(q.getClauses()[0].getQuery(), boolDefault); |
1243 | 1338 | else |
1244 | | - bq.add(q,boolDefault); |
| 1339 | + bq.add(q, boolDefault); |
1245 | 1340 | } |
1246 | 1341 | return bq; |
1247 | | - |
| 1342 | + |
1248 | 1343 | } |
1249 | 1344 | } |
1250 | | - |
| 1345 | + |
1251 | 1346 | /** |
1252 | 1347 | * Extract prefix: field from the query and put it into prefixFilter |
1253 | 1348 | * variable for later retrieval |
— | — | @@ -1254,94 +1349,101 @@ |
1255 | 1350 | * @param queryText |
1256 | 1351 | * @return queryText with prefix part deleted |
1257 | 1352 | */ |
1258 | | - public String extractPrefixFilter(String queryText){ |
| 1353 | + public String extractPrefixFilter(String queryText) { |
1259 | 1354 | this.prefixFilters = null; |
1260 | | - ArrayList<String> filters = new ArrayList<String>(); |
| 1355 | + ArrayList<String> filters = new ArrayList<String>(); |
1261 | 1356 | int start = 0; |
1262 | | - while(start < queryText.length()){ |
1263 | | - int end = indexOf(queryText,'"',start); // begin of phrase |
1264 | | - int inx = queryText.indexOf("prefix:"); |
1265 | | - if(inx >=0 && inx < end){ |
1266 | | - String[] prefixes = queryText.substring(inx+"prefix:".length()).split("\\|"); |
| 1357 | + while (start < queryText.length()) { |
| 1358 | + int end = indexOf(queryText, '"', start); // beginning of phrase
| 1359 | + int inx = queryText.indexOf("prefix:"); |
| 1360 | + if (inx >= 0 && inx < end) { |
| 1361 | + String[] prefixes = queryText.substring( |
| 1362 | + inx + "prefix:".length()).split("\\|"); |
1267 | 1363 | |
1268 | | - for(String prefix : prefixes){ |
| 1364 | + for (String prefix : prefixes) { |
1269 | 1365 | String full = null; |
1270 | | - if(prefix.startsWith("[") && prefix.contains("]:")){ |
| 1366 | + if (prefix.startsWith("[") && prefix.contains("]:")) { |
1271 | 1367 | // convert from [2]:query to 2:query form |
1272 | | - full = prefix.replace("[","").replace("]:",":"); |
1273 | | - } else // default to main namespace |
1274 | | - full = "0:"+prefix ; |
1275 | | - |
| 1368 | + full = prefix.replace("[", "").replace("]:", ":"); |
| 1369 | + } else |
| 1370 | + // default to main namespace |
| 1371 | + full = "0:" + prefix; |
| 1372 | + |
1276 | 1373 | // add lowercase nonempty prefixes |
1277 | | - if(full != null && full.length()>0) |
| 1374 | + if (full != null && full.length() > 0) |
1278 | 1375 | filters.add(full.toLowerCase()); |
1279 | | - |
| 1376 | + |
1280 | 1377 | } |
1281 | | - this.prefixFilters = filters.toArray(new String[]{}); |
| 1378 | + this.prefixFilters = filters.toArray(new String[] {}); |
1282 | 1379 | // return the actual query without prefix |
1283 | | - return queryText.substring(0,inx); |
| 1380 | + return queryText.substring(0, inx); |
1284 | 1381 | } |
1285 | | - start = end+1; |
1286 | | - if(start < queryText.length()){ |
| 1382 | + start = end + 1; |
| 1383 | + if (start < queryText.length()) { |
1287 | 1384 | // skip phrase |
1288 | | - start = indexOf(queryText,'"',start) + 1; |
| 1385 | + start = indexOf(queryText, '"', start) + 1; |
1289 | 1386 | } |
1290 | 1387 | } |
1291 | | - |
| 1388 | + |
1292 | 1389 | return queryText; |
1293 | 1390 | } |
1294 | | - |
| 1391 | + |
1295 | 1392 | /** |
1296 | 1393 | * Extract prefix: field from the query and put it into prefixFilter |
1297 | 1394 | * variable for later retrieval |
1298 | 1395 | * |
1299 | 1396 | * @param queryText |
1300 | | - * @param field (like "ondiscussionthread:") |
| 1397 | + * @param field |
| 1398 | + * (like "ondiscussionthread:") |
1301 | 1399 | * @return [0] - queryText with field part deleted |
1302 | 1400 | * [1] - the field part |
1303 | 1401 | */ |
1304 | | - public static String[] extractRawField(String queryText, String field){ |
1305 | | - ArrayList<String> filters = new ArrayList<String>(); |
| 1402 | + public static String[] extractRawField(String queryText, String field) { |
| 1403 | + ArrayList<String> filters = new ArrayList<String>(); |
1306 | 1404 | int start = 0; |
1307 | | - while(start < queryText.length()){ |
1308 | | - int end = indexOf(queryText,'"',start); // begin of phrase |
1309 | | - int inx = queryText.indexOf(field); |
1310 | | - if(inx >=0 && inx < end){ |
1311 | | - String prefix = queryText.substring(inx+field.length()); |
| 1405 | + while (start < queryText.length()) { |
| 1406 | + int end = indexOf(queryText, '"', start); // beginning of phrase
| 1407 | + int inx = queryText.indexOf(field); |
| 1408 | + if (inx >= 0 && inx < end) { |
| 1409 | + String prefix = queryText.substring(inx + field.length()); |
1312 | 1410 | |
1313 | 1411 | String full = null; |
1314 | | - if(prefix.startsWith("[") && prefix.contains("]:")){ |
| 1412 | + if (prefix.startsWith("[") && prefix.contains("]:")) { |
1315 | 1413 | // convert from [2]:query to 2:query form |
1316 | | - full = prefix.replace("[","").replace("]:",":"); |
1317 | | - } else // default to main namespace |
1318 | | - full = "0:"+prefix ; |
1319 | | - |
| 1414 | + full = prefix.replace("[", "").replace("]:", ":"); |
| 1415 | + } else |
| 1416 | + // default to main namespace |
| 1417 | + full = "0:" + prefix; |
| 1418 | + |
1320 | 1419 | // add lowercase nonempty prefixes |
1321 | | - if(full != null && full.length()>0) |
| 1420 | + if (full != null && full.length() > 0) |
1322 | 1421 | filters.add(full); |
1323 | | - |
1324 | | - return new String[]{ queryText.substring(0,inx), full }; |
1325 | | - |
| 1422 | + |
| 1423 | + return new String[] { queryText.substring(0, inx), full }; |
| 1424 | + |
1326 | 1425 | } |
1327 | | - start = end+1; |
1328 | | - if(start < queryText.length()){ |
| 1426 | + start = end + 1; |
| 1427 | + if (start < queryText.length()) { |
1329 | 1428 | // skip phrase |
1330 | | - start = indexOf(queryText,'"',start) + 1; |
| 1429 | + start = indexOf(queryText, '"', start) + 1; |
1331 | 1430 | } |
1332 | 1431 | } |
1333 | | - |
1334 | | - return new String[]{ queryText, null }; |
| 1432 | + |
| 1433 | + return new String[] { queryText, null }; |
1335 | 1434 | } |
1336 | | - |
1337 | | - /** Like string.indexOf but return end of string instead of -1 when needle is not found */ |
1338 | | - protected static int indexOf(String string, char needle, int start){ |
1339 | | - int inx = string.indexOf(needle,start); |
1340 | | - if(inx == -1) |
| 1435 | + |
| 1436 | + /** |
| 1437 | + * Like string.indexOf but return end of string instead of -1 when needle is |
| 1438 | + * not found |
| 1439 | + */ |
| 1440 | + protected static int indexOf(String string, char needle, int start) { |
| 1441 | + int inx = string.indexOf(needle, start); |
| 1442 | + if (inx == -1) |
1341 | 1443 | return string.length(); |
1342 | 1444 | else |
1343 | 1445 | return inx; |
1344 | 1446 | } |
1345 | | - |
| 1447 | + |
1346 | 1448 | public boolean isDisableTitleAliases() { |
1347 | 1449 | return disableTitleAliases; |
1348 | 1450 | } |
— | — | @@ -1351,183 +1453,207 @@ |
1352 | 1454 | } |
1353 | 1455 | |
1354 | 1456 | /** Reset the parser state */ |
1355 | | - private void reset(){ |
1356 | | - cur = 0; |
| 1457 | + private void reset() { |
| 1458 | + cur = 0; |
1357 | 1459 | length = 0; |
1358 | | - currentField = null; |
| 1460 | + currentField = null; |
1359 | 1461 | prev_cur = 0; |
1360 | 1462 | explicitOccur = null; |
1361 | 1463 | parsedWords = new ParsedWords(); |
1362 | 1464 | urls = new ArrayList<ArrayList<Term>>(); |
1363 | 1465 | isInTitle = false; |
1364 | 1466 | } |
1365 | | - |
1366 | | - /** Init parsing, call this function to parse text */ |
1367 | | - private Query startParsing(){ |
1368 | | - reset(); |
| 1467 | + |
| 1468 | + /** Init parsing, call this function to parse text */ |
| 1469 | + private Query startParsing() { |
| 1470 | + reset(); |
1369 | 1471 | return parseClause(0); |
1370 | 1472 | } |
1371 | | - |
1372 | | - /** |
| 1473 | + |
| 1474 | + /** |
1373 | 1475 | * Simple parse on one default field, no rewrites. |
1374 | 1476 | * |
1375 | 1477 | * @param queryText |
1376 | 1478 | * @return |
1377 | 1479 | */ |
1378 | | - public Query parseRaw(String queryText){ |
| 1480 | + public Query parseRaw(String queryText) { |
1379 | 1481 | queryText = extractPrefixFilter(queryText); |
1380 | | - if(queryText.trim().length()==0 && hasPrefixFilters()) |
| 1482 | + if (queryText.trim().length() == 0 && hasPrefixFilters()) |
1381 | 1483 | return new MatchAllTitlesQuery(fields.title()); |
1382 | | - queryLength = queryText.length(); |
| 1484 | + queryLength = queryText.length(); |
1383 | 1485 | text = queryText.toCharArray(); |
1384 | | - |
| 1486 | + |
1385 | 1487 | Query query = null; |
1386 | 1488 | query = startParsing(); |
1387 | | - |
1388 | | - return query; |
| 1489 | + |
| 1490 | + return query; |
1389 | 1491 | } |
1390 | 1492 | |
1391 | 1493 | /* ======================= FULL-QUERY PARSING ========================= */ |
1392 | | - |
| 1494 | + |
1393 | 1495 | public static class ParsingOptions { |
1394 | 1496 | /** use a custom namespace-transformation policy */ |
1395 | 1497 | NamespacePolicy policy = null; |
1396 | | - /** only parse the main query (on contents and title) without relevance stuff */ |
| 1498 | + /** |
| 1499 | + * only parse the main query (on contents and title) without relevance |
| 1500 | + * stuff |
| 1501 | + */ |
1397 | 1502 | boolean coreQueryOnly = false; |
1398 | 1503 | /** interface to fetch wildcard hits */ |
1399 | 1504 | Wildcards wildcards = null; |
1400 | 1505 | /** fuzzy queries interface */ |
1401 | 1506 | Fuzzy fuzzy = null; |
1402 | | - |
1403 | | - public ParsingOptions() {} |
1404 | | - public ParsingOptions(NamespacePolicy policy){ |
| 1507 | + |
| 1508 | + public ParsingOptions() { |
| 1509 | + } |
| 1510 | + |
| 1511 | + public ParsingOptions(NamespacePolicy policy) { |
1405 | 1512 | this.policy = policy; |
1406 | 1513 | } |
1407 | | - public ParsingOptions(boolean coreQueryOnly){ |
| 1514 | + |
| 1515 | + public ParsingOptions(boolean coreQueryOnly) { |
1408 | 1516 | this.coreQueryOnly = coreQueryOnly; |
1409 | 1517 | } |
1410 | | - public ParsingOptions(Wildcards wildcards){ |
| 1518 | + |
| 1519 | + public ParsingOptions(Wildcards wildcards) { |
1411 | 1520 | this.wildcards = wildcards; |
1412 | 1521 | } |
1413 | | - public ParsingOptions(NamespacePolicy policy, Wildcards wildcards, Fuzzy fuzzy){ |
| 1522 | + |
| 1523 | + public ParsingOptions(NamespacePolicy policy, Wildcards wildcards, |
| 1524 | + Fuzzy fuzzy) { |
1414 | 1525 | this.policy = policy; |
1415 | 1526 | this.wildcards = wildcards; |
1416 | 1527 | this.fuzzy = fuzzy; |
1417 | 1528 | } |
1418 | 1529 | } |
1419 | | - |
| 1530 | + |
1420 | 1531 | /** Parse a full query with default options */ |
1421 | | - public Query parse(String queryText){ |
1422 | | - return parse(queryText,new ParsingOptions()); |
| 1532 | + public Query parse(String queryText) { |
| 1533 | + return parse(queryText, new ParsingOptions()); |
1423 | 1534 | } |
1424 | | - |
| 1535 | + |
1425 | 1536 | /** |
1426 | 1537 | * Construct a full query on all the fields in the index from search text |
1427 | | - * |
1428 | 1538 | */ |
1429 | 1539 | @SuppressWarnings("unchecked") |
1430 | | - public Query parse(String queryText, ParsingOptions options){ |
| 1540 | + public Query parse(String queryText, ParsingOptions options) { |
1431 | 1541 | this.wildcards = options.wildcards; |
1432 | 1542 | this.fuzzy = options.fuzzy; |
1433 | 1543 | queryText = quoteCJK(queryText); |
1434 | 1544 | NamespacePolicy defaultPolicy = this.namespacePolicy; |
1435 | | - if(options.policy != null) |
1436 | | - this.namespacePolicy = options.policy; |
| 1545 | + if (options.policy != null) |
| 1546 | + this.namespacePolicy = options.policy; |
1437 | 1547 | defaultBoost = CONTENTS_BOOST; |
1438 | 1548 | defaultAliasBoost = ALIAS_BOOST; |
1439 | | - |
1440 | | - this.rawFields = new HashMap<String,String>(); |
| 1549 | + |
| 1550 | + this.rawFields = new HashMap<String, String>(); |
1441 | 1551 | // parse out raw queries |
1442 | | - for(String field : new String[] {"ondiscussionpage:"}){ |
| 1552 | + for (String field : new String[] { "ondiscussionpage:" }) { |
1443 | 1553 | String[] ret = extractRawField(queryText, field); |
1444 | 1554 | queryText = ret[0]; |
1445 | | - if( ret[1] != null ) |
1446 | | - this.rawFields.put(field,ret[1]); |
| 1555 | + if (ret[1] != null) |
| 1556 | + this.rawFields.put(field, ret[1]); |
1447 | 1557 | } |
1448 | | - |
1449 | | - |
1450 | | - Query qc = parseRaw(queryText); |
| 1558 | + |
| 1559 | + Query qc = parseRaw(queryText); |
1451 | 1560 | ParsedWords words = parsedWords; |
1452 | 1561 | this.namespacePolicy = defaultPolicy; |
1453 | | - if(qc == null) // empty |
| 1562 | + if (qc == null) // empty |
1454 | 1563 | return null; |
1455 | | - |
1456 | | - highlightTerms = extractHighlightTerms(qc); |
1457 | | - |
1458 | | - if(options.coreQueryOnly || words.words.size()==0) |
| 1564 | + |
| 1565 | + highlightTerms = extractHighlightTerms(qc); |
| 1566 | + |
| 1567 | + if (options.coreQueryOnly || words.words.size() == 0) |
1459 | 1568 | return qc; |
1460 | | - |
| 1569 | + |
1461 | 1570 | ParsedWords nostopWords = filterStopWords(words); |
1462 | | - |
| 1571 | + |
1463 | 1572 | // main phrase combined with relevance metrics
1464 | | - Query mainPhrase = makeMainPhraseWithRelevance(words,nostopWords); |
1465 | | - if(mainPhrase == null) |
| 1573 | + Query mainPhrase = makeMainPhraseWithRelevance(words, nostopWords); |
| 1574 | + if (mainPhrase == null) |
1466 | 1575 | return qc; |
1467 | 1576 | |
1468 | 1577 | // additional queries |
1469 | | - //Query related = new LogTransformScore(makeRelatedRelevance(words,ADD_RELATED_BOOST)); |
| 1578 | + // Query related = new |
| 1579 | + // LogTransformScore(makeRelatedRelevance(words,ADD_RELATED_BOOST)); |
1470 | 1580 | // Query related = makeRelatedRelevance(words,ADD_RELATED_BOOST); |
1471 | | - |
| 1581 | + |
1472 | 1582 | // mainphrase + related |
1473 | | - /* BooleanQuery additional = new BooleanQuery(true); |
1474 | | - additional.add(mainPhrase,Occur.MUST); |
1475 | | - if(related != null) |
1476 | | - additional.add(related,Occur.SHOULD); */ |
1477 | | - |
1478 | | - /* BooleanQuery full = new BooleanQuery(true); |
1479 | | - full.add(bq,Occur.MUST); |
1480 | | - full.add(additional,Occur.SHOULD); */ |
1481 | | - |
| 1583 | + /* |
| 1584 | + * BooleanQuery additional = new BooleanQuery(true); |
| 1585 | + * additional.add(mainPhrase,Occur.MUST); |
| 1586 | + * if(related != null) |
| 1587 | + * additional.add(related,Occur.SHOULD); |
| 1588 | + */ |
| 1589 | + |
| 1590 | + /* |
| 1591 | + * BooleanQuery full = new BooleanQuery(true); |
| 1592 | + * full.add(bq,Occur.MUST); |
| 1593 | + * full.add(additional,Occur.SHOULD); |
| 1594 | + */ |
| 1595 | + |
1482 | 1596 | // redirect match (when redirect is not contained in contents or title) |
1483 | | - Query redirectMatch = makeAlttitleForRedirectsMulti(makeFirstAndSingular(words),20,1f); |
1484 | | - |
| 1597 | + Query redirectMatch = makeAlttitleForRedirectsMulti( |
| 1598 | + makeFirstAndSingular(words), 20, 1f); |
| 1599 | + |
1485 | 1600 | BooleanQuery full = new BooleanQuery(true); |
1486 | 1601 | full.add(qc, Occur.MUST); |
1487 | | - if(mainPhrase != null) |
| 1602 | + if (mainPhrase != null) |
1488 | 1603 | full.add(mainPhrase, Occur.SHOULD); |
1489 | | - if(redirectMatch != null) |
| 1604 | + if (redirectMatch != null) |
1490 | 1605 | full.add(redirectMatch, Occur.SHOULD); |
1491 | | - |
| 1606 | + |
1492 | 1607 | // add raw fields as global constrains |
1493 | | - for(Entry<String,String> e : rawFields.entrySet()){ |
| 1608 | + for (Entry<String, String> e : rawFields.entrySet()) { |
1494 | 1609 | String field = e.getKey(); |
1495 | | - if(field.endsWith(":")) |
1496 | | - field = field.substring(0, field.length()-1); |
1497 | | - // find target field in the index, e.g. ondiscussionpage -> ThreadPage |
| 1610 | + if (field.endsWith(":")) |
| 1611 | + field = field.substring(0, field.length() - 1); |
| 1612 | + // find target field in the index, e.g. ondiscussionpage -> |
| 1613 | + // ThreadPage |
1498 | 1614 | String targetField = keywordFieldMapping.get(field); |
1499 | | - if( targetField != null) |
1500 | | - full.add(new TermQuery(new Term(targetField, e.getValue())),Occur.MUST); |
| 1615 | + if (targetField != null) |
| 1616 | + full.add(new TermQuery(new Term(targetField, e.getValue())), |
| 1617 | + Occur.MUST); |
1501 | 1618 | } |
1502 | | - |
1503 | | - // init global scaling of articles |
| 1619 | + |
| 1620 | + // init global scaling of articles |
1504 | 1621 | ArticleScaling scale = new ArticleScaling.None(); |
1505 | 1622 | // based on age |
1506 | 1623 | AgeScaling age = iid.getAgeScaling(); |
1507 | | - if(age != AgeScaling.NONE){ |
1508 | | - switch(age){ |
1509 | | - case STRONG: scale = new ArticleScaling.StepScale(0.3f,1); break; |
1510 | | - case MEDIUM: scale = new ArticleScaling.StepScale(0.6f,1); break; |
1511 | | - case WEAK: scale = new ArticleScaling.StepScale(0.9f,1); break; |
1512 | | - default: throw new RuntimeException("Unsupported age scaling "+age); |
1513 | | - } |
1514 | | - |
| 1624 | + if (age != AgeScaling.NONE) { |
| 1625 | + switch (age) { |
| 1626 | + case STRONG: |
| 1627 | + scale = new ArticleScaling.StepScale(0.3f, 1); |
| 1628 | + break; |
| 1629 | + case MEDIUM: |
| 1630 | + scale = new ArticleScaling.StepScale(0.6f, 1); |
| 1631 | + break; |
| 1632 | + case WEAK: |
| 1633 | + scale = new ArticleScaling.StepScale(0.9f, 1); |
| 1634 | + break; |
| 1635 | + default: |
| 1636 | + throw new RuntimeException("Unsupported age scaling " + age); |
| 1637 | + } |
| 1638 | + |
1515 | 1639 | } |
1516 | | - |
| 1640 | + |
1517 | 1641 | // additional rank |
1518 | | - AggregateInfo rank = iid.useAdditionalRank()? new AggregateInfoImpl() : null; |
| 1642 | + AggregateInfo rank = iid.useAdditionalRank() ? new AggregateInfoImpl() |
| 1643 | + : null; |
1519 | 1644 | ArticleNamespaceScaling nsScale = iid.getNamespaceScaling(); |
1520 | | - return new ArticleQueryWrap(full,new ArticleInfoImpl(),scale,rank,nsScale); |
1521 | | - |
| 1645 | + return new ArticleQueryWrap(full, new ArticleInfoImpl(), scale, rank, |
| 1646 | + nsScale); |
| 1647 | + |
1522 | 1648 | } |
1523 | | - |
| 1649 | + |
1524 | 1650 | /** Return terms that should be highlighted in snippets */ |
1525 | 1651 | private Term[] extractHighlightTerms(Query query) { |
1526 | 1652 | HashSet<Term> terms = new HashSet<Term>(); |
1527 | 1653 | query.extractTerms(terms); |
1528 | | - |
| 1654 | + |
1529 | 1655 | // subtract forbidden terms
1530 | 1656 | BooleanQuery forbidden = extractForbidden(query); |
1531 | | - if(forbidden != null){ |
| 1657 | + if (forbidden != null) { |
1532 | 1658 | HashSet<Term> forbiddenTerms = new HashSet<Term>(); |
1533 | 1659 | forbidden.extractTerms(forbiddenTerms); |
1534 | 1660 | terms.removeAll(forbiddenTerms); |
— | — | @@ -1536,18 +1662,19 @@ |
1537 | 1663 | } |
1538 | 1664 | |
1539 | 1665 | /** Generate singular parsed words coupled with first() words */ |
1540 | | - private ParsedWords makeFirstAndSingular(ParsedWords words){ |
| 1666 | + private ParsedWords makeFirstAndSingular(ParsedWords words) { |
1541 | 1667 | ParsedWords ret = words.cloneFirstWithWildcards(); |
1542 | | - if(filters.hasSingular()){ |
| 1668 | + if (filters.hasSingular()) { |
1543 | 1669 | Singular singular = filters.getSingular(); |
1544 | 1670 | // generate singular forms if any |
1545 | | - for(WordsDesc wd : ret.words){ |
1546 | | - if(wd.isWildcardOrFuzzy()) |
| 1671 | + for (WordsDesc wd : ret.words) { |
| 1672 | + if (wd.isWildcardOrFuzzy()) |
1547 | 1673 | continue; |
1548 | 1674 | String w = wd.first(); |
1549 | 1675 | String sw = singular.getSingular(w); |
1550 | | - if( sw!=null && !w.equals(sw) ){ |
1551 | | - wd.add( new WordBoost( sw, wd.firstWordBoost().boost * SINGULAR_WORD_BOOST ) ); |
| 1676 | + if (sw != null && !w.equals(sw)) { |
| 1677 | + wd.add(new WordBoost(sw, wd.firstWordBoost().boost |
| 1678 | + * SINGULAR_WORD_BOOST)); |
1552 | 1679 | } |
1553 | 1680 | } |
1554 | 1681 | } |
— | — | @@ -1556,7 +1683,7 @@ |
1557 | 1684 | |
1558 | 1685 | private ArrayList<String> cleanupWords(ArrayList<String> words) { |
1559 | 1686 | ArrayList<String> ret = new ArrayList<String>(); |
1560 | | - for(String w : words){ |
| 1687 | + for (String w : words) { |
1561 | 1688 | ret.add(FastWikiTokenizerEngine.clearTrailing(w)); |
1562 | 1689 | } |
1563 | 1690 | return ret; |
— | — | @@ -1564,85 +1691,89 @@ |
1565 | 1692 | |
1566 | 1693 | /** Recursively traverse queries and put stop words to SHOULD */
1567 | 1694 | private void filterStopWords(BooleanQuery bq) { |
1568 | | - if(stopWords==null && stopWords.size()==0) |
| 1695 | + if (stopWords == null && stopWords.size() == 0) |
1569 | 1696 | return; |
1570 | | - for(BooleanClause cl : bq.getClauses()){ |
| 1697 | + for (BooleanClause cl : bq.getClauses()) { |
1571 | 1698 | Query q = cl.getQuery(); |
1572 | 1699 | Occur o = cl.getOccur(); |
1573 | | - if(q instanceof BooleanQuery){ |
1574 | | - filterStopWords((BooleanQuery)q); |
1575 | | - } else if(q instanceof TermQuery && o.equals(Occur.MUST) |
1576 | | - && stopWords.contains(((TermQuery)q).getTerm().text())){ |
| 1700 | + if (q instanceof BooleanQuery) { |
| 1701 | + filterStopWords((BooleanQuery) q); |
| 1702 | + } else if (q instanceof TermQuery && o.equals(Occur.MUST) |
| 1703 | + && stopWords.contains(((TermQuery) q).getTerm().text())) { |
1577 | 1704 | cl.setOccur(Occur.SHOULD); |
1578 | 1705 | } |
1579 | 1706 | } |
1580 | 1707 | } |
1581 | | - |
| 1708 | + |
1582 | 1709 | /** @return new ParsedWords with stop words deleted */ |
1583 | | - private ParsedWords filterStopWords(ParsedWords words){ |
| 1710 | + private ParsedWords filterStopWords(ParsedWords words) { |
1584 | 1711 | // if all stop words, don't filter |
1585 | 1712 | boolean allStop = true; |
1586 | | - for(WordsDesc d : words.words){ |
1587 | | - if(!stopWords.contains(d.first())){ |
| 1713 | + for (WordsDesc d : words.words) { |
| 1714 | + if (!stopWords.contains(d.first())) { |
1588 | 1715 | allStop = false; |
1589 | 1716 | break; |
1590 | 1717 | } |
1591 | 1718 | } |
1592 | 1719 | ParsedWords ret = new ParsedWords(); |
1593 | | - for(WordsDesc d : words.words){ |
1594 | | - if(allStop || !stopWords.contains(d.first())) |
| 1720 | + for (WordsDesc d : words.words) { |
| 1721 | + if (allStop || !stopWords.contains(d.first())) |
1595 | 1722 | ret.words.add(d); |
1596 | 1723 | } |
1597 | 1724 | return ret; |
1598 | 1725 | } |
1599 | 1726 | |
1600 | 1727 | /** Quote CJK chars to avoid frequency-based analysis */ |
1601 | | - protected String quoteCJK(String queryText){ |
1602 | | - if(!builder.filters.isUsingCJK()) |
| 1728 | + protected String quoteCJK(String queryText) { |
| 1729 | + if (!builder.filters.isUsingCJK()) |
1603 | 1730 | return queryText; |
1604 | | - |
| 1731 | + |
1605 | 1732 | StringBuilder sb = new StringBuilder(); |
1606 | 1733 | int c; |
1607 | 1734 | boolean prevCJK = false; |
1608 | 1735 | int offset = 0; |
1609 | 1736 | boolean closeQuote = false; |
1610 | 1737 | boolean inQuotes = false; |
1611 | | - for(int i=0;i<queryText.length();i++){ |
| 1738 | + for (int i = 0; i < queryText.length(); i++) { |
1612 | 1739 | c = queryText.codePointAt(i); |
1613 | | - if(c == '"') inQuotes = !inQuotes; |
1614 | | - if(inQuotes) |
| 1740 | + if (c == '"') |
| 1741 | + inQuotes = !inQuotes; |
| 1742 | + if (inQuotes) |
1615 | 1743 | continue; |
1616 | | - if(CJKFilter.isCJKChar(c)){ |
1617 | | - if(!prevCJK){ // begin of CJK stream |
1618 | | - if(i!=0) |
1619 | | - sb.append(queryText.substring(offset,i)); |
| 1744 | + if (CJKFilter.isCJKChar(c)) { |
| 1745 | + if (!prevCJK) { // beginning of CJK stream
| 1746 | + if (i != 0) |
| 1747 | + sb.append(queryText.substring(offset, i)); |
1620 | 1748 | offset = i; |
1621 | 1749 | sb.append('"'); |
1622 | 1750 | closeQuote = true; |
1623 | 1751 | prevCJK = true; |
1624 | 1752 | } |
1625 | | - } else if(prevCJK){ |
| 1753 | + } else if (prevCJK) { |
1626 | 1754 | // end of CJK stream |
1627 | | - sb.append(queryText.substring(offset,i)); |
| 1755 | + sb.append(queryText.substring(offset, i)); |
1628 | 1756 | offset = i; |
1629 | 1757 | sb.append('"'); |
1630 | 1758 | closeQuote = true; |
1631 | 1759 | prevCJK = false; |
1632 | 1760 | } |
1633 | 1761 | } |
1634 | | - if(offset == 0 && !closeQuote) |
| 1762 | + if (offset == 0 && !closeQuote) |
1635 | 1763 | return queryText; |
1636 | | - else{ |
1637 | | - sb.append(queryText.substring(offset,queryText.length())); |
1638 | | - if(closeQuote) |
| 1764 | + else { |
| 1765 | + sb.append(queryText.substring(offset, queryText.length())); |
| 1766 | + if (closeQuote) |
1639 | 1767 | sb.append('"'); |
1640 | 1768 | return sb.toString(); |
1641 | 1769 | } |
1642 | 1770 | } |
1643 | | - |
1644 | | - /** Make title query in format: title:query stemtitle:stemmedquery |
1645 | | - * Also extract words from query (to be used for phrases additional scores) |
1646 | | - * @return query */ |
| 1771 | + |
| 1772 | + /** |
| 1773 | + * Make title query in format: title:query stemtitle:stemmedquery |
| 1774 | + * Also extract words from query (to be used for phrases additional scores) |
| 1775 | + * |
| 1776 | + * @return query |
| 1777 | + */ |
1647 | 1778 | protected Query makeTitlePart(String queryText) { |
1648 | 1779 | // push on stack |
1649 | 1780 | String contentField = defaultField; |
— | — | @@ -1650,437 +1781,488 @@ |
1651 | 1782 | |
1652 | 1783 | // stemmed title |
1653 | 1784 | Query qs = null; |
1654 | | - if(ADD_STEM_TITLE && builder.getFilters().hasStemmer()){ |
1655 | | - defaultField = fields.stemtitle(); |
| 1785 | + if (ADD_STEM_TITLE && builder.getFilters().hasStemmer()) { |
| 1786 | + defaultField = fields.stemtitle(); |
1656 | 1787 | defaultBoost = STEM_TITLE_BOOST; |
1657 | 1788 | defaultAliasBoost = STEM_TITLE_ALIAS_BOOST; |
1658 | 1789 | qs = parseRaw(queryText); |
1659 | 1790 | } |
1660 | 1791 | // title |
1661 | | - defaultField = fields.title(); |
1662 | | - defaultBoost = (qs!= null)? TITLE_BOOST : TITLE_BOOST+STEM_TITLE_BOOST; |
1663 | | - defaultAliasBoost = TITLE_ALIAS_BOOST; |
| 1792 | + defaultField = fields.title(); |
| 1793 | + defaultBoost = (qs != null) ? TITLE_BOOST : TITLE_BOOST |
| 1794 | + + STEM_TITLE_BOOST; |
| 1795 | + defaultAliasBoost = TITLE_ALIAS_BOOST; |
1664 | 1796 | Query qt = parseRaw(queryText); |
1665 | | - |
| 1797 | + |
1666 | 1798 | // pop stack |
1667 | 1799 | defaultField = contentField; |
1668 | 1800 | defaultBoost = olfDefaultBoost; |
1669 | 1801 | defaultAliasBoost = ALIAS_BOOST; |
1670 | 1802 | |
1671 | | - |
1672 | | - if(qt==qs || qt.equals(qs)) // either null, or category query |
| 1803 | + if (qt == qs || qt.equals(qs)) // either null, or category query |
1673 | 1804 | return qt; |
1674 | | - if(qt == null) |
| 1805 | + if (qt == null) |
1675 | 1806 | return qs; |
1676 | | - if(qs == null) |
| 1807 | + if (qs == null) |
1677 | 1808 | return qt; |
1678 | 1809 | BooleanQuery bq = new BooleanQuery(true); |
1679 | | - bq.add(qt,Occur.SHOULD); |
1680 | | - bq.add(qs,Occur.SHOULD); |
| 1810 | + bq.add(qt, Occur.SHOULD); |
| 1811 | + bq.add(qs, Occur.SHOULD); |
1681 | 1812 | return bq; |
1682 | 1813 | } |
1683 | | - |
| 1814 | + |
1684 | 1815 | /** Extract MUST_NOT clauses form a query */ |
1685 | | - protected static BooleanQuery extractForbidden(Query q){ |
| 1816 | + protected static BooleanQuery extractForbidden(Query q) { |
1686 | 1817 | BooleanQuery bq = new BooleanQuery(); |
1687 | | - extractForbiddenRecursive(bq,q); |
1688 | | - if(bq.getClauses().length == 0) |
| 1818 | + extractForbiddenRecursive(bq, q); |
| 1819 | + if (bq.getClauses().length == 0) |
1689 | 1820 | return null; |
1690 | | - |
| 1821 | + |
1691 | 1822 | return bq; |
1692 | 1823 | } |
1693 | | - /** Recursivily extract all MUST_NOT clauses from query */ |
1694 | | - protected static void extractForbiddenRecursive(BooleanQuery forbidden, Query q){ |
1695 | | - if(q instanceof BooleanQuery){ |
1696 | | - BooleanQuery bq = (BooleanQuery)q; |
1697 | | - for(BooleanClause cl : bq.getClauses()){ |
1698 | | - if(cl.getOccur() == Occur.MUST_NOT) |
1699 | | - forbidden.add(cl.getQuery(),Occur.SHOULD); |
| 1824 | + |
| 1825 | + /** Recursively extract all MUST_NOT clauses from query */
| 1826 | + protected static void extractForbiddenRecursive(BooleanQuery forbidden, |
| 1827 | + Query q) { |
| 1828 | + if (q instanceof BooleanQuery) { |
| 1829 | + BooleanQuery bq = (BooleanQuery) q; |
| 1830 | + for (BooleanClause cl : bq.getClauses()) { |
| 1831 | + if (cl.getOccur() == Occur.MUST_NOT) |
| 1832 | + forbidden.add(cl.getQuery(), Occur.SHOULD); |
1700 | 1833 | else |
1701 | | - extractForbiddenRecursive(forbidden,cl.getQuery()); |
| 1834 | + extractForbiddenRecursive(forbidden, cl.getQuery()); |
1702 | 1835 | } |
1703 | 1836 | } |
1704 | 1837 | } |
1705 | | - /** Extract forbidden terms from a query into a hashset */ |
1706 | | - public static void extractForbiddenInto(Query q, HashSet<Term> forbidden){ |
| 1838 | + |
| 1839 | + /** Extract forbidden terms from a query into a hashset */ |
| 1840 | + public static void extractForbiddenInto(Query q, HashSet<Term> forbidden) { |
1707 | 1841 | BooleanQuery bq = extractForbidden(q); |
1708 | | - if(bq != null) |
| 1842 | + if (bq != null) |
1709 | 1843 | bq.extractTerms(forbidden); |
1710 | 1844 | } |
1711 | | - |
| 1845 | + |
1712 | 1846 | /** Valid after parse(), returns if the last query had phrases in it */ |
1713 | | - public boolean hasPhrases(){ |
1714 | | - for(WordsDesc wd : parsedWords.words){ |
1715 | | - if(wd.type == ExpandedType.PHRASE) |
| 1847 | + public boolean hasPhrases() { |
| 1848 | + for (WordsDesc wd : parsedWords.words) { |
| 1849 | + if (wd.type == ExpandedType.PHRASE) |
1716 | 1850 | return true; |
1717 | 1851 | } |
1718 | 1852 | return false; |
1719 | 1853 | } |
1720 | | - |
1721 | | - /** Make the main phrases with relevance metrics */ |
1722 | | - protected Query makeMainPhraseWithRelevance(ParsedWords words, ParsedWords noStopWords){ |
| 1854 | + |
| 1855 | + /** Make the main phrases with relevance metrics */ |
| 1856 | + protected Query makeMainPhraseWithRelevance(ParsedWords words, |
| 1857 | + ParsedWords noStopWords) { |
1723 | 1858 | Query main = null; |
1724 | 1859 | String field = fields.contents(); // put to begin() for performance |
1725 | | - |
| 1860 | + |
1726 | 1861 | // all words as entered into the query |
1727 | | - Query phrase = makePositionalMulti(noStopWords,field,new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1); |
1728 | | - |
1729 | | - Query sections = makeSectionsQuery(noStopWords,SECTIONS_BOOST); |
| 1862 | + Query phrase = makePositionalMulti(noStopWords, field, |
| 1863 | + new PositionalOptions.Sloppy(), MAINPHRASE_SLOP, 1); |
| 1864 | + |
| 1865 | + Query sections = makeSectionsQuery(noStopWords, SECTIONS_BOOST); |
1730 | 1866 | // wordnet synonyms |
1731 | | - ArrayList<ArrayList<String>> wordnet = WordNet.replaceOne(words.extractFirst(),iid.getLangCode()); |
1732 | | - |
| 1867 | + ArrayList<ArrayList<String>> wordnet = WordNet.replaceOne( |
| 1868 | + words.extractFirst(), iid.getLangCode()); |
| 1869 | + |
1733 | 1870 | BooleanQuery combined = new BooleanQuery(true); |
1734 | | - // combined various queries into mainphrase |
1735 | | - if(phrase != null){ |
1736 | | - combined.add(phrase,Occur.SHOULD); |
1737 | | - // wordnet |
1738 | | - if(wordnet != null){ |
1739 | | - for(ArrayList<String> wnwords : wordnet){ |
1740 | | - if(!allStopWords(wnwords)) |
1741 | | - combined.add(makePositional(wnwords,field,new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1),Occur.SHOULD); |
| 1871 | + // combine various queries into mainphrase
| 1872 | + if (phrase != null) { |
| 1873 | + combined.add(phrase, Occur.SHOULD); |
| 1874 | + // wordnet |
| 1875 | + if (wordnet != null) { |
| 1876 | + for (ArrayList<String> wnwords : wordnet) { |
| 1877 | + if (!allStopWords(wnwords)) |
| 1878 | + combined.add( |
| 1879 | + makePositional(wnwords, field, |
| 1880 | + new PositionalOptions.Sloppy(), |
| 1881 | + MAINPHRASE_SLOP, 1), Occur.SHOULD); |
1742 | 1882 | } |
1743 | 1883 | } |
1744 | 1884 | // urls |
1745 | | - if(urls.size() > 0){ |
1746 | | - for(ArrayList<Term> terms : urls){ |
1747 | | - combined.add(makePositional(extractTermText(terms), extractField(terms), new PositionalOptions.Sloppy(),0,1), Occur.SHOULD); |
| 1885 | + if (urls.size() > 0) { |
| 1886 | + for (ArrayList<Term> terms : urls) { |
| 1887 | + combined.add( |
| 1888 | + makePositional(extractTermText(terms), |
| 1889 | + extractField(terms), |
| 1890 | + new PositionalOptions.Sloppy(), 0, 1), |
| 1891 | + Occur.SHOULD); |
1748 | 1892 | } |
1749 | 1893 | } |
1750 | 1894 | } |
1751 | | - if(sections!=null) |
1752 | | - combined.add(sections,Occur.SHOULD); |
1753 | | - |
1754 | | - if(combined.getClauses().length == 1) |
| 1895 | + if (sections != null) |
| 1896 | + combined.add(sections, Occur.SHOULD); |
| 1897 | + |
| 1898 | + if (combined.getClauses().length == 1) |
1755 | 1899 | main = combined.getClauses()[0].getQuery(); |
1756 | 1900 | else |
1757 | 1901 | main = combined; |
1758 | | - |
1759 | | - |
| 1902 | + |
1760 | 1903 | main.setBoost(MAINPHRASE_BOOST); |
1761 | | - |
| 1904 | + |
1762 | 1905 | // relevance: alttitle |
1763 | | - Query alttitle = makeAlttitleRelevance(words,RELEVANCE_ALTTITLE_BOOST); |
| 1906 | + Query alttitle = makeAlttitleRelevance(words, RELEVANCE_ALTTITLE_BOOST); |
1764 | 1907 | ArrayList<Query> altAdd = new ArrayList<Query>(); |
1765 | | - if(wordnet!=null) |
1766 | | - for(ArrayList<String> wnwords : wordnet) |
1767 | | - if(!allStopWords(wnwords)) |
1768 | | - altAdd.add(makeAlttitleRelevance(wnwords,RELEVANCE_ALTTITLE_BOOST)); |
1769 | | - alttitle = simplify(combine(alttitle,altAdd)); |
1770 | | - |
| 1908 | + if (wordnet != null) |
| 1909 | + for (ArrayList<String> wnwords : wordnet) |
| 1910 | + if (!allStopWords(wnwords)) |
| 1911 | + altAdd.add(makeAlttitleRelevance(wnwords, |
| 1912 | + RELEVANCE_ALTTITLE_BOOST)); |
| 1913 | + alttitle = simplify(combine(alttitle, altAdd)); |
| 1914 | + |
1771 | 1915 | // relevance: related |
1772 | | - Query related = makeRelatedRelevance(words,RELEVANCE_RELATED_BOOST); |
| 1916 | + Query related = makeRelatedRelevance(words, RELEVANCE_RELATED_BOOST); |
1773 | 1917 | ArrayList<Query> relAdd = new ArrayList<Query>(); |
1774 | | - if(wordnet!=null) |
1775 | | - for(ArrayList<String> wnwords : wordnet) |
1776 | | - if(!allStopWords(wnwords)) |
1777 | | - relAdd.add(makeRelatedRelevance(wnwords,RELEVANCE_RELATED_BOOST)); |
1778 | | - related = simplify(combine(related,relAdd)); |
1779 | | - |
| 1918 | + if (wordnet != null) |
| 1919 | + for (ArrayList<String> wnwords : wordnet) |
| 1920 | + if (!allStopWords(wnwords)) |
| 1921 | + relAdd.add(makeRelatedRelevance(wnwords, |
| 1922 | + RELEVANCE_RELATED_BOOST)); |
| 1923 | + related = simplify(combine(related, relAdd)); |
| 1924 | + |
1780 | 1925 | BooleanQuery relevances = new BooleanQuery(true); |
1781 | | - relevances.add(alttitle,Occur.SHOULD); |
1782 | | - relevances.add(related,Occur.SHOULD); |
1783 | | - |
| 1926 | + relevances.add(alttitle, Occur.SHOULD); |
| 1927 | + relevances.add(related, Occur.SHOULD); |
| 1928 | + |
1784 | 1929 | RelevanceQuery whole = new RelevanceQuery(main); |
1785 | 1930 | whole.addRelevanceMeasure(relevances); |
1786 | | - |
| 1931 | + |
1787 | 1932 | return whole; |
1788 | 1933 | } |
1789 | | - |
| 1934 | + |
1790 | 1935 | private String extractField(ArrayList<Term> terms) { |
1791 | | - if(terms.size() > 0) |
| 1936 | + if (terms.size() > 0) |
1792 | 1937 | return terms.get(0).field(); |
1793 | 1938 | else |
1794 | | - throw new RuntimeException("Trying to extract field from zero-length list of terms"); |
| 1939 | + throw new RuntimeException( |
| 1940 | + "Trying to extract field from zero-length list of terms"); |
1795 | 1941 | } |
1796 | 1942 | |
1797 | 1943 | private ArrayList<String> extractTermText(ArrayList<Term> terms) { |
1798 | 1944 | ArrayList<String> tt = new ArrayList<String>(); |
1799 | | - for(Term t : terms) |
| 1945 | + for (Term t : terms) |
1800 | 1946 | tt.add(t.text()); |
1801 | 1947 | return tt; |
1802 | 1948 | } |
1803 | 1949 | |
1804 | | - /** Combine one main query with a number of other queries into a boolean query */ |
| 1950 | + /** |
| 1951 | + * Combine one main query with a number of other queries into a boolean |
| 1952 | + * query |
| 1953 | + */ |
1805 | 1954 | private Query combine(Query query, ArrayList<Query> additional) { |
1806 | | - if(additional.size()==0) |
| 1955 | + if (additional.size() == 0) |
1807 | 1956 | return query; |
1808 | 1957 | BooleanQuery bq = new BooleanQuery(true); |
1809 | | - bq.add(query,Occur.SHOULD); |
1810 | | - for(Query q : additional){ |
1811 | | - if(q != null) |
1812 | | - bq.add(q,Occur.SHOULD); |
| 1958 | + bq.add(query, Occur.SHOULD); |
| 1959 | + for (Query q : additional) { |
| 1960 | + if (q != null) |
| 1961 | + bq.add(q, Occur.SHOULD); |
1813 | 1962 | } |
1814 | | - if(bq.clauses().size()==1) |
| 1963 | + if (bq.clauses().size() == 1) |
1815 | 1964 | return query; |
1816 | 1965 | return bq; |
1817 | | - } |
1818 | | - |
| 1966 | + } |
| 1967 | + |
1819 | 1968 | /** Convert multiple OR-like queries into one with larger boost */ |
1820 | | - protected Query simplify(Query q){ |
1821 | | - if(q instanceof BooleanQuery){ |
1822 | | - BooleanQuery bq = (BooleanQuery)q; |
1823 | | - if(!allShould(bq)) |
| 1969 | + protected Query simplify(Query q) { |
| 1970 | + if (q instanceof BooleanQuery) { |
| 1971 | + BooleanQuery bq = (BooleanQuery) q; |
| 1972 | + if (!allShould(bq)) |
1824 | 1973 | return q; |
1825 | 1974 | // query -> boost |
1826 | | - HashMap<Query,Float> map = new HashMap<Query,Float>(); |
1827 | | - extractAndSimplify(bq,map,1); |
1828 | | - |
| 1975 | + HashMap<Query, Float> map = new HashMap<Query, Float>(); |
| 1976 | + extractAndSimplify(bq, map, 1); |
| 1977 | + |
1829 | 1978 | // simplify |
1830 | 1979 | BooleanQuery ret = new BooleanQuery(true); |
1831 | | - for(Entry<Query,Float> e : map.entrySet()){ |
| 1980 | + for (Entry<Query, Float> e : map.entrySet()) { |
1832 | 1981 | Query qt = (Query) e.getKey(); |
1833 | 1982 | qt.setBoost(e.getValue()); |
1834 | | - ret.add(qt,Occur.SHOULD); |
| 1983 | + ret.add(qt, Occur.SHOULD); |
1835 | 1984 | } |
1836 | 1985 | return ret; |
1837 | 1986 | } |
1838 | 1987 | return q; |
1839 | 1988 | } |
1840 | | - |
1841 | | - private boolean allShould(BooleanQuery bq){ |
1842 | | - for(BooleanClause cl : bq.getClauses()){ |
1843 | | - if(!cl.getOccur().equals(Occur.SHOULD)) |
| 1989 | + |
| 1990 | + private boolean allShould(BooleanQuery bq) { |
| 1991 | + for (BooleanClause cl : bq.getClauses()) { |
| 1992 | + if (!cl.getOccur().equals(Occur.SHOULD)) |
1844 | 1993 | return false; |
1845 | | - if(cl.getQuery() instanceof BooleanQuery){ |
1846 | | - if(!allShould((BooleanQuery)cl.getQuery())) |
| 1994 | + if (cl.getQuery() instanceof BooleanQuery) { |
| 1995 | + if (!allShould((BooleanQuery) cl.getQuery())) |
1847 | 1996 | return false; |
1848 | 1997 | } |
1849 | 1998 | } |
1850 | 1999 | return true; |
1851 | 2000 | } |
1852 | | - |
1853 | | - private void extractAndSimplify(BooleanQuery bq, HashMap<Query,Float> map, float parentBoost){ |
1854 | | - for(BooleanClause cl : bq.getClauses()){ |
| 2001 | + |
| 2002 | + private void extractAndSimplify(BooleanQuery bq, HashMap<Query, Float> map, |
| 2003 | + float parentBoost) { |
| 2004 | + for (BooleanClause cl : bq.getClauses()) { |
1855 | 2005 | Query q = cl.getQuery(); |
1856 | | - if(q instanceof BooleanQuery) |
1857 | | - extractAndSimplify((BooleanQuery)q,map,parentBoost*bq.getBoost()); |
1858 | | - else{ |
| 2006 | + if (q instanceof BooleanQuery) |
| 2007 | + extractAndSimplify((BooleanQuery) q, map, |
| 2008 | + parentBoost * bq.getBoost()); |
| 2009 | + else { |
1859 | 2010 | Float boost = map.get(q); |
1860 | | - float b = boost==null? 0 : boost; |
1861 | | - b += q.getBoost()*bq.getBoost()*parentBoost; |
1862 | | - map.put(q,b); |
| 2011 | + float b = boost == null ? 0 : boost; |
| 2012 | + b += q.getBoost() * bq.getBoost() * parentBoost; |
| 2013 | + map.put(q, b); |
1863 | 2014 | } |
1864 | 2015 | } |
1865 | 2016 | } |
1866 | | - |
| 2017 | + |
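The simplify()/extractAndSimplify() pair above flattens a tree of SHOULD-only BooleanQuery clauses into a single level: boosts are multiplied down each nesting path and summed per leaf query. A minimal sketch of that arithmetic, using the Lucene 2.x-era API already imported by this file (the field name and term are invented for illustration; this is not part of the changeset):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

public class SimplifySketch {
    public static void main(String[] args) {
        TermQuery rain = new TermQuery(new Term("contents", "rain"));

        BooleanQuery inner = new BooleanQuery(true); // coord disabled, as in the parser
        inner.setBoost(2f);
        inner.add(rain, Occur.SHOULD);               // path boost: 1 * 2 = 2

        BooleanQuery outer = new BooleanQuery(true);
        outer.add(rain, Occur.SHOULD);               // path boost: 1
        outer.add(inner, Occur.SHOULD);

        // simplify(outer) would keep a single SHOULD clause for "rain"
        // whose boost is the sum over paths: 1 + 2 = 3.
    }
}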
1867 | 2018 | /** Make positional query by including all of the stop words */ |
1868 | | - protected PositionalQuery makePositional(ArrayList<String> words, String field, PositionalOptions options, int slop, float boost){ |
1869 | | - return makePositional(words,field,options,slop,boost,true); |
| 2019 | + protected PositionalQuery makePositional(ArrayList<String> words, |
| 2020 | + String field, PositionalOptions options, int slop, float boost) { |
| 2021 | + return makePositional(words, field, options, slop, boost, true); |
1870 | 2022 | } |
1871 | | - |
| 2023 | + |
1872 | 2024 | /** Make generic positional query */ |
1873 | | - protected PositionalQuery makePositional(ArrayList<String> words, String field, PositionalOptions options, int slop, float boost, boolean includeStopWords){ |
| 2025 | + protected PositionalQuery makePositional(ArrayList<String> words, |
| 2026 | + String field, PositionalOptions options, int slop, float boost, |
| 2027 | + boolean includeStopWords) { |
1874 | 2028 | PositionalQuery pq = new PositionalQuery(options); |
1875 | 2029 | int pos = 0; |
1876 | | - for(String w : words){ |
| 2030 | + for (String w : words) { |
1877 | 2031 | boolean isStop = stopWords.contains(w); |
1878 | | - if(!(isStop && !includeStopWords)) |
1879 | | - pq.add(new Term(field,w),pos,isStop); |
| 2032 | + if (!(isStop && !includeStopWords)) |
| 2033 | + pq.add(new Term(field, w), pos, isStop); |
1880 | 2034 | pos++; |
1881 | 2035 | } |
1882 | | - if(slop != 0) |
| 2036 | + if (slop != 0) |
1883 | 2037 | pq.setSlop(slop); |
1884 | 2038 | pq.setBoost(boost); |
1885 | | - if(pq.getPositions().length > 0) |
| 2039 | + if (pq.getPositions().length > 0) |
1886 | 2040 | return pq; |
1887 | | - else return null; |
| 2041 | + else |
| 2042 | + return null; |
1888 | 2043 | } |
1889 | | - |
1890 | | - protected Query makePositionalMulti(ParsedWords parsed, String field, PositionalOptions options, int slop, float boost){ |
| 2044 | + |
| 2045 | + protected Query makePositionalMulti(ParsedWords parsed, String field, |
| 2046 | + PositionalOptions options, int slop, float boost) { |
1891 | 2047 | PositionalMultiQuery mq = new PositionalMultiQuery(options); |
1892 | | - for(WordsDesc wd : parsed.words){ |
1893 | | - mq.addWithBoost(wd.getTerms(field),wd.getPosition(),wd.getBoosts()); |
| 2048 | + for (WordsDesc wd : parsed.words) { |
| 2049 | + mq.addWithBoost(wd.getTerms(field), wd.getPosition(), |
| 2050 | + wd.getBoosts()); |
1894 | 2051 | } |
1895 | 2052 | mq.setSlop(slop); |
1896 | 2053 | mq.setBoost(boost); |
1897 | | - if(mq.getPositions().length > 0) |
| 2054 | + if (mq.getPositions().length > 0) |
1898 | 2055 | return mq; |
1899 | | - else |
| 2056 | + else |
1900 | 2057 | return null; |
1901 | 2058 | } |
1902 | 2059 | |
1903 | 2060 | /** Make query with short subphrases anchored in non-stop words */ |
1904 | | - protected Query makeAnchoredQuery(ArrayList<String> words, String field, |
1905 | | - PositionalOptions options, PositionalOptions whole, PositionalOptions wholeSloppy, |
1906 | | - float boost, int slop){ |
| 2061 | + protected Query makeAnchoredQuery(ArrayList<String> words, String field, |
| 2062 | + PositionalOptions options, PositionalOptions whole, |
| 2063 | + PositionalOptions wholeSloppy, float boost, int slop) { |
1907 | 2064 | BooleanQuery bq = new BooleanQuery(true); |
1908 | | - if(words.size() == 1){ |
1909 | | - PositionalQuery pq = makePositional(words,field,options,0,1f); |
1910 | | - bq.add(pq,Occur.SHOULD); |
1911 | | - } else{ |
| 2065 | + if (words.size() == 1) { |
| 2066 | + PositionalQuery pq = makePositional(words, field, options, 0, 1f); |
| 2067 | + bq.add(pq, Occur.SHOULD); |
| 2068 | + } else { |
1912 | 2069 | // add words |
1913 | | - for(String w : words){ |
| 2070 | + for (String w : words) { |
1914 | 2071 | PositionalQuery pq = new PositionalQuery(options); |
1915 | | - pq.add(new Term(field,w)); |
1916 | | - bq.add(pq,Occur.SHOULD); |
| 2072 | + pq.add(new Term(field, w)); |
| 2073 | + bq.add(pq, Occur.SHOULD); |
1917 | 2074 | } |
1918 | 2075 | // phrases |
1919 | | - int i =0; |
| 2076 | + int i = 0; |
1920 | 2077 | ArrayList<String> phrase = new ArrayList<String>(); |
1921 | | - while(i < words.size()){ |
| 2078 | + while (i < words.size()) { |
1922 | 2079 | phrase.clear(); |
1923 | | - for(;i<words.size();i++){ |
| 2080 | + for (; i < words.size(); i++) { |
1924 | 2081 | String w = words.get(i); |
1925 | | - if(phrase.size() == 0 || stopWords.contains(w)) |
| 2082 | + if (phrase.size() == 0 || stopWords.contains(w)) |
1926 | 2083 | phrase.add(w); |
1927 | | - else{ |
1928 | | - phrase.add(w); |
| 2084 | + else { |
| 2085 | + phrase.add(w); |
1929 | 2086 | break; |
1930 | 2087 | } |
1931 | 2088 | } |
1932 | | - if(phrase.size() > 1) |
1933 | | - bq.add(makePositional(phrase,field,options,0,phrase.size()),Occur.SHOULD); |
| 2089 | + if (phrase.size() > 1) |
| 2090 | + bq.add(makePositional(phrase, field, options, 0, |
| 2091 | + phrase.size()), Occur.SHOULD); |
1934 | 2092 | } |
1935 | 2093 | } |
1936 | 2094 | // add the whole-only query |
1937 | | - if(whole != null) |
1938 | | - bq.add(makePositional(words,field,whole,slop,1),Occur.SHOULD); |
1939 | | - if(wholeSloppy != null){ |
1940 | | - Query ws = makePositional(words,field,wholeSloppy,slop,1,false); |
1941 | | - if(ws != null) |
1942 | | - bq.add(ws,Occur.SHOULD); |
| 2095 | + if (whole != null) |
| 2096 | + bq.add(makePositional(words, field, whole, slop, 1), Occur.SHOULD); |
| 2097 | + if (wholeSloppy != null) { |
| 2098 | + Query ws = makePositional(words, field, wholeSloppy, slop, 1, false); |
| 2099 | + if (ws != null) |
| 2100 | + bq.add(ws, Occur.SHOULD); |
1943 | 2101 | } |
1944 | 2102 | bq.setBoost(boost); |
1945 | | - |
| 2103 | + |
1946 | 2104 | return bq; |
1947 | 2105 | } |
1948 | | - |
| 2106 | + |
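The phrase loop in makeAnchoredQuery() above groups the input into short subphrases that start at a word, swallow any following stop words, and end at the next non-stop word, boosting each subphrase by its length; because the loop breaks before advancing, the anchoring non-stop word also starts the next subphrase. A standalone re-implementation of just that grouping rule, with an assumed stop-word set (illustration only, not part of the changeset):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class AnchoredPhraseSketch {
    // assumed stop words for the example
    static final Set<String> STOP = new HashSet<String>(Arrays.asList("the", "in"));

    public static void main(String[] args) {
        List<String> words = Arrays.asList("the", "rain", "in", "spain", "falls");
        int i = 0;
        while (i < words.size()) {
            ArrayList<String> phrase = new ArrayList<String>();
            // same grouping as the inner for-loop in makeAnchoredQuery()
            for (; i < words.size(); i++) {
                String w = words.get(i);
                if (phrase.size() == 0 || STOP.contains(w))
                    phrase.add(w);
                else {
                    phrase.add(w);
                    break;
                }
            }
            if (phrase.size() > 1)
                System.out.println(phrase + " boost=" + phrase.size());
        }
        // prints:
        //   [the, rain] boost=2
        //   [rain, in, spain] boost=3
        //   [spain, falls] boost=2
    }
}

Single leftover words are skipped here, matching the phrase.size() > 1 check above, since every individual word already gets its own SHOULD clause earlier in the method.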
1949 | 2107 | /** Make query with short subphrases anchored in non-stop words */ |
1950 | | - protected Query makeAnchoredQueryMulti(ParsedWords words, String field, |
1951 | | - PositionalOptions options, PositionalOptions whole, int slopWhole, float boost){ |
| 2108 | + protected Query makeAnchoredQueryMulti(ParsedWords words, String field, |
| 2109 | + PositionalOptions options, PositionalOptions whole, int slopWhole, |
| 2110 | + float boost) { |
1952 | 2111 | BooleanQuery bq = new BooleanQuery(true); |
1953 | 2112 | // for one word will make whole only |
1954 | | - if(words.size() >= 2){ |
| 2113 | + if (words.size() >= 2) { |
1955 | 2114 | // add single words |
1956 | | - for(int i=0;i<words.size();i++){ |
1957 | | - if(!stopWords.contains(words.firstAt(i))) // skip single stop words |
1958 | | - bq.add(makePositionalMulti(words.cloneSingleWord(i),field,options,0,1),Occur.SHOULD); |
| 2115 | + for (int i = 0; i < words.size(); i++) { |
| 2116 | + if (!stopWords.contains(words.firstAt(i))) // skip single stop |
| 2117 | + // words |
| 2118 | + bq.add(makePositionalMulti(words.cloneSingleWord(i), field, |
| 2119 | + options, 0, 1), Occur.SHOULD); |
1959 | 2120 | } |
1960 | 2121 | // add two words to score higher two-word correlations |
1961 | | - if(words.size() >= 3){ |
1962 | | - for(int i=0;i<words.size()-1;){ |
| 2122 | + if (words.size() >= 3) { |
| 2123 | + for (int i = 0; i < words.size() - 1;) { |
1963 | 2124 | int i1 = i; // first word |
1964 | 2125 | int i2 = i1 + 1; // second non-stop word |
1965 | | - for(; i2<words.size()-1; i2++){ |
1966 | | - if(!stopWords.contains(words.firstAt(i2))) |
1967 | | - break; |
| 2126 | + for (; i2 < words.size() - 1; i2++) { |
| 2127 | + if (!stopWords.contains(words.firstAt(i2))) |
| 2128 | + break; |
1968 | 2129 | } |
1969 | | - bq.add(makePositionalMulti(words.cloneRange(i1,i2),field,options,10,2),Occur.SHOULD); |
| 2130 | + bq.add(makePositionalMulti(words.cloneRange(i1, i2), field, |
| 2131 | + options, 10, 2), Occur.SHOULD); |
1970 | 2132 | i = i2; |
1971 | 2133 | } |
1972 | | - } |
| 2134 | + } |
1973 | 2135 | } |
1974 | 2136 | // add the whole-only query |
1975 | | - if(whole != null) |
1976 | | - bq.add(makePositionalMulti(words,field,whole,slopWhole,1),Occur.SHOULD); |
1977 | | - |
| 2137 | + if (whole != null) |
| 2138 | + bq.add(makePositionalMulti(words, field, whole, slopWhole, 1), |
| 2139 | + Occur.SHOULD); |
| 2140 | + |
1978 | 2141 | bq.setBoost(boost); |
1979 | | - |
| 2142 | + |
1980 | 2143 | return bq; |
1981 | 2144 | } |
1982 | | - |
| 2145 | + |
1983 | 2146 | /** Query for section headings */ |
1984 | | - protected Query makeSectionsQuery(ParsedWords words, float boost){ |
1985 | | - return makeAnchoredQueryMulti(words,fields.sections(),new PositionalOptions.Sections(),new PositionalOptions.SectionsWhole(),0,boost); |
| 2147 | + protected Query makeSectionsQuery(ParsedWords words, float boost) { |
| 2148 | + return makeAnchoredQueryMulti(words, fields.sections(), |
| 2149 | + new PositionalOptions.Sections(), |
| 2150 | + new PositionalOptions.SectionsWhole(), 0, boost); |
1986 | 2151 | } |
1987 | | - |
| 2152 | + |
1988 | 2153 | /** Relevance metrics based on rank (of titles and redirects) */ |
1989 | | - protected Query makeAlttitleRelevance(ParsedWords words, float boost){ |
1990 | | - return makeAnchoredQueryMulti(words,fields.alttitle(),new PositionalOptions.Alttitle(),new PositionalOptions.AlttitleWholeSloppy(),20,boost); |
| 2154 | + protected Query makeAlttitleRelevance(ParsedWords words, float boost) { |
| 2155 | + return makeAnchoredQueryMulti(words, fields.alttitle(), |
| 2156 | + new PositionalOptions.Alttitle(), |
| 2157 | + new PositionalOptions.AlttitleWholeSloppy(), 20, boost); |
1991 | 2158 | } |
1992 | | - |
| 2159 | + |
1993 | 2160 | /** Make relevance metrics based on context via related articles */ |
1994 | | - protected Query makeRelatedRelevance(ParsedWords words, float boost){ |
1995 | | - return makeAnchoredQueryMulti(words,fields.related(),new PositionalOptions.Related(),new PositionalOptions.RelatedWhole(),0,boost); |
| 2161 | + protected Query makeRelatedRelevance(ParsedWords words, float boost) { |
| 2162 | + return makeAnchoredQueryMulti(words, fields.related(), |
| 2163 | + new PositionalOptions.Related(), |
| 2164 | + new PositionalOptions.RelatedWhole(), 0, boost); |
1996 | 2165 | } |
1997 | | - |
| 2166 | + |
1998 | 2167 | /** Relevance metrics based on rank (of titles and redirects) */ |
1999 | | - protected Query makeAlttitleRelevance(ArrayList<String> words, float boost){ |
2000 | | - return makeAnchoredQuery(words,fields.alttitle(),new PositionalOptions.Alttitle(),new PositionalOptions.AlttitleWhole(), new PositionalOptions.AlttitleWholeSloppy(),boost,20); |
| 2168 | + protected Query makeAlttitleRelevance(ArrayList<String> words, float boost) { |
| 2169 | + return makeAnchoredQuery(words, fields.alttitle(), |
| 2170 | + new PositionalOptions.Alttitle(), |
| 2171 | + new PositionalOptions.AlttitleWhole(), |
| 2172 | + new PositionalOptions.AlttitleWholeSloppy(), boost, 20); |
2001 | 2173 | } |
2002 | 2174 | |
2003 | | - |
2004 | 2175 | /** Make relevance metrics based on context via related articles */ |
2005 | | - protected Query makeRelatedRelevance(ArrayList<String> words, float boost){ |
2006 | | - return makeAnchoredQuery(words,fields.related(),new PositionalOptions.Related(),null,null,boost,0); |
| 2176 | + protected Query makeRelatedRelevance(ArrayList<String> words, float boost) { |
| 2177 | + return makeAnchoredQuery(words, fields.related(), |
| 2178 | + new PositionalOptions.Related(), null, null, boost, 0); |
2007 | 2179 | } |
2008 | 2180 | |
2009 | | - |
2010 | | - /** Additional query to match words in redirects that are not in title or article */ |
2011 | | - protected Query makeAlttitleForRedirects(ArrayList<String> words, int slop, float boost){ |
2012 | | - return makePositional(words,fields.alttitle(),new PositionalOptions.RedirectMatch(),slop,boost); |
| 2181 | + /** |
| 2182 | + * Additional query to match words in redirects that are not in title or |
| 2183 | + * article |
| 2184 | + */ |
| 2185 | + protected Query makeAlttitleForRedirects(ArrayList<String> words, int slop, |
| 2186 | + float boost) { |
| 2187 | + return makePositional(words, fields.alttitle(), |
| 2188 | + new PositionalOptions.RedirectMatch(), slop, boost); |
2013 | 2189 | } |
2014 | 2190 | |
2015 | | - protected Query makeAlttitleForRedirectsMulti(ParsedWords words, int slop, float boost){ |
2016 | | - return makePositionalMulti(words,fields.alttitle(),new PositionalOptions.RedirectMatch(),slop,boost); |
| 2191 | + protected Query makeAlttitleForRedirectsMulti(ParsedWords words, int slop, |
| 2192 | + float boost) { |
| 2193 | + return makePositionalMulti(words, fields.alttitle(), |
| 2194 | + new PositionalOptions.RedirectMatch(), slop, boost); |
2017 | 2195 | } |
2018 | | - |
2019 | | - /** Make alttitle phrase for titles indexes */ |
2020 | | - public Query makeAlttitleForTitles(List<String> words){ |
| 2196 | + |
| 2197 | + /** Make alttitle phrase for titles indexes */ |
| 2198 | + public Query makeAlttitleForTitles(List<String> words) { |
2021 | 2199 | BooleanQuery main = new BooleanQuery(true); |
2022 | 2200 | |
2023 | | - PositionalQuery exact = new PositionalQuery(new PositionalOptions.AlttitleExact()); |
2024 | | - PositionalQuery sloppy = new PositionalQuery(new PositionalOptions.AlttitleSloppy()); |
| 2201 | + PositionalQuery exact = new PositionalQuery( |
| 2202 | + new PositionalOptions.AlttitleExact()); |
| 2203 | + PositionalQuery sloppy = new PositionalQuery( |
| 2204 | + new PositionalOptions.AlttitleSloppy()); |
2025 | 2205 | |
2026 | 2206 | // make exact + sloppy |
2027 | 2207 | int pos = 0; |
2028 | | - for(String w : words){ |
2029 | | - Term term = new Term(fields.alttitle(),w); |
| 2208 | + for (String w : words) { |
| 2209 | + Term term = new Term(fields.alttitle(), w); |
2030 | 2210 | boolean isStop = stopWords.contains(w); |
2031 | | - exact.add(term,isStop); |
2032 | | - if(!isStop) |
2033 | | - sloppy.add(term,pos,isStop); // maintain gaps |
| 2211 | + exact.add(term, isStop); |
| 2212 | + if (!isStop) |
| 2213 | + sloppy.add(term, pos, isStop); // maintain gaps |
2034 | 2214 | pos++; |
2035 | 2215 | } |
2036 | | - if(sloppy.getTerms().length == 0) |
| 2216 | + if (sloppy.getTerms().length == 0) |
2037 | 2217 | return exact; |
2038 | | - |
| 2218 | + |
2039 | 2219 | sloppy.setSlop(10); |
2040 | | - main.add(exact,Occur.SHOULD); |
2041 | | - main.add(sloppy,Occur.SHOULD); |
| 2220 | + main.add(exact, Occur.SHOULD); |
| 2221 | + main.add(sloppy, Occur.SHOULD); |
2042 | 2222 | main.setBoost(1); |
2043 | 2223 | return main; |
2044 | | - |
| 2224 | + |
2045 | 2225 | } |
2046 | | - |
| 2226 | + |
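makeAlttitleForTitles() above pairs an exact clause that keeps every word with a sloppy clause (slop 10) that drops stop words but preserves their position gaps; when every word is a stop word the sloppy clause stays empty and the exact clause is returned alone. A plain-Java illustration of which word/position pairs land in each clause, with an assumed stop-word set (not part of the changeset):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class AlttitleSplitSketch {
    // assumed stop words for the example
    static final Set<String> STOP = new HashSet<String>(Arrays.asList("the", "of"));

    public static void main(String[] args) {
        List<String> words = Arrays.asList("history", "of", "france");
        List<String> exact = new ArrayList<String>();
        List<String> sloppy = new ArrayList<String>();
        int pos = 0;
        for (String w : words) {
            boolean isStop = STOP.contains(w);
            exact.add(w);                  // every word goes into the exact clause
            if (!isStop)
                sloppy.add(w + "@" + pos); // non-stop words keep their original position
            pos++;
        }
        System.out.println(exact);  // [history, of, france]
        System.out.println(sloppy); // [history@0, france@2]
    }
}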
2047 | 2227 | /** Make a query to search grouped titles indexes */ |
2048 | | - public Query parseForTitles(String queryText){ |
| 2228 | + public Query parseForTitles(String queryText) { |
2049 | 2229 | String oldDefaultField = this.defaultField; |
2050 | 2230 | NamespacePolicy oldPolicy = this.namespacePolicy; |
2051 | 2231 | FieldBuilder.BuilderSet oldBuilder = this.builder; |
2052 | 2232 | this.defaultField = "alttitle"; |
2053 | 2233 | this.namespacePolicy = NamespacePolicy.IGNORE; |
2054 | | - |
| 2234 | + |
2055 | 2235 | Query q = parseRaw(queryText); |
2056 | 2236 | |
2057 | 2237 | ParsedWords words = parsedWords; |
2058 | | - |
2059 | | - this.builder = oldBuilder; |
| 2238 | + |
| 2239 | + this.builder = oldBuilder; |
2060 | 2240 | this.defaultField = oldDefaultField; |
2061 | 2241 | this.namespacePolicy = oldPolicy; |
2062 | | - |
| 2242 | + |
2063 | 2243 | BooleanQuery full = new BooleanQuery(true); |
2064 | | - full.add(q,Occur.MUST); |
| 2244 | + full.add(q, Occur.MUST); |
2065 | 2245 | |
2066 | | - if(words.size() == 0) |
| 2246 | + if (words.size() == 0) |
2067 | 2247 | return q; |
2068 | | - |
2069 | | - // match whole titles |
2070 | | - Query redirectsMulti = makeAlttitleForRedirectsMulti(makeFirstAndSingular(words),20,1f); |
2071 | | - if(redirectsMulti != null) |
2072 | | - full.add(redirectsMulti,Occur.SHOULD); |
2073 | | - |
| 2248 | + |
| 2249 | + // match whole titles |
| 2250 | + Query redirectsMulti = makeAlttitleForRedirectsMulti( |
| 2251 | + makeFirstAndSingular(words), 20, 1f); |
| 2252 | + if (redirectsMulti != null) |
| 2253 | + full.add(redirectsMulti, Occur.SHOULD); |
| 2254 | + |
2074 | 2255 | ArticleNamespaceScaling nsScale = iid.getNamespaceScaling(); |
2075 | | - return new ArticleQueryWrap(full,new ArticleInfoImpl(),null,null,nsScale); |
2076 | | - |
| 2256 | + return new ArticleQueryWrap(full, new ArticleInfoImpl(), null, null, |
| 2257 | + nsScale); |
| 2258 | + |
2077 | 2259 | } |
2078 | | - |
| 2260 | + |
2079 | 2261 | /** check if all the words in the array are stop words */ |
2080 | | - private boolean allStopWords(ArrayList<String> words){ |
2081 | | - if(words == null || words.size() == 0) |
| 2262 | + private boolean allStopWords(ArrayList<String> words) { |
| 2263 | + if (words == null || words.size() == 0) |
2082 | 2264 | return false; |
2083 | | - for(String w : words){ |
2084 | | - if(!stopWords.contains(w)){ |
| 2265 | + for (String w : words) { |
| 2266 | + if (!stopWords.contains(w)) { |
2085 | 2267 | return false; |
2086 | 2268 | } |
2087 | 2269 | } |
— | — | @@ -2091,16 +2273,17 @@ |
2092 | 2274 | public Term[] getHighlightTerms() { |
2093 | 2275 | return highlightTerms; |
2094 | 2276 | } |
2095 | | - |
| 2277 | + |
2096 | 2278 | /** @return if last parsed query had wildcards in it */ |
2097 | | - public boolean hasWildcards(){ |
2098 | | - return wildcards!=null && wildcards.hasWildcards(); |
| 2279 | + public boolean hasWildcards() { |
| 2280 | + return wildcards != null && wildcards.hasWildcards(); |
2099 | 2281 | } |
| 2282 | + |
2100 | 2283 | /** @return if last parsed query has fuzzy words in it */ |
2101 | | - public boolean hasFuzzy(){ |
2102 | | - return fuzzy!=null && fuzzy.hasFuzzy(); |
| 2284 | + public boolean hasFuzzy() { |
| 2285 | + return fuzzy != null && fuzzy.hasFuzzy(); |
2103 | 2286 | } |
2104 | | - |
| 2287 | + |
2105 | 2288 | public void setNamespacePolicy(NamespacePolicy namespacePolicy) { |
2106 | 2289 | this.namespacePolicy = namespacePolicy; |
2107 | 2290 | } |
— | — | @@ -2108,13 +2291,13 @@ |
2109 | 2292 | public ArrayList<String> getWordsClean() { |
2110 | 2293 | return cleanupWords(parsedWords.extractFirst()); |
2111 | 2294 | } |
2112 | | - |
2113 | | - public boolean hasPrefixFilters(){ |
2114 | | - return prefixFilters != null && prefixFilters.length>0; |
| 2295 | + |
| 2296 | + public boolean hasPrefixFilters() { |
| 2297 | + return prefixFilters != null && prefixFilters.length > 0; |
2115 | 2298 | } |
2116 | | - |
| 2299 | + |
2117 | 2300 | /** Gets the raw prefix text, e.g. project:npov */ |
2118 | | - public String[] getPrefixFilters(){ |
| 2301 | + public String[] getPrefixFilters() { |
2119 | 2302 | return prefixFilters; |
2120 | 2303 | } |
2121 | 2304 | |
— | — | @@ -2123,7 +2306,4 @@ |
2124 | 2307 | return urls; |
2125 | 2308 | } |
2126 | 2309 | |
2127 | | - |
2128 | | - |
2129 | | - |
2130 | 2310 | } |