Index: trunk/lucene-search-2.0/lsearch-global.conf |
— | — | @@ -31,6 +31,14 @@ |
32 | 32 | [Index-Path] |
33 | 33 | <default> : /mwsearch |
34 | 34 | |
| 35 | +# Global properties |
| 36 | +[Properties] |
| 37 | +# suffixes to the database name; the rest is assumed to be the language code |
| 38 | +Database.suffix=wiki wiktionary |
| 39 | + |
| 40 | +# dbnames that end with the suffix will use additional keyword scores |
| 41 | +KeywordScoring.suffix=wiki wikilucene wikidev |
| 42 | + |
35 | 43 | # Put here you custom namespace prefixes |
36 | 44 | # Syntax: <prefix_name> : <coma separated list of namespaces> |
37 | 45 | # <all> is a special keyword meaning all namespaces |
— | — | @@ -54,4 +62,3 @@ |
55 | 63 | [14] : 14 |
56 | 64 | [15] : 15 |
57 | 65 | |
58 | | - |
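The new [Properties] block above moves Database.suffix out of the local lsearch.conf (removed in the next file): a dbname is matched against each space-separated suffix, and whatever precedes a matching suffix is treated as the language code (e.g. srwiki -> sr, as the updated GlobalConfigurationTest below asserts). A minimal sketch of that split; languageFromDbname is a hypothetical helper for illustration, not the actual GlobalConfiguration API:

    // Sketch of suffix-based language detection, per the Database.suffix
    // comment above. The fallback when nothing matches is an assumption.
    public class SuffixSketch {
        static String languageFromDbname(String dbname, String[] suffixes) {
            if (suffixes != null) {
                for (String suffix : suffixes) {
                    if (dbname.endsWith(suffix))
                        // strip the suffix; the rest is the language code
                        return dbname.substring(0, dbname.length() - suffix.length());
                }
            }
            return dbname; // assumed: treat the whole name as a language code
        }

        public static void main(String[] args) {
            String[] suffixes = "wiki wiktionary".split(" ");
            System.out.println(languageFromDbname("srwiki", suffixes));       // sr
            System.out.println(languageFromDbname("enwiktionary", suffixes)); // en
        }
    }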
Index: trunk/lucene-search-2.0/lsearch.conf |
— | — | @@ -57,9 +57,6 @@ |
58 | 58 | # URL to message files, {0} is replaced with language code, i.e. En |
59 | 59 | Localization.url=file:///var/www/html/wiki-lucene/phase3/languages/messages/Messages{0}.php |
60 | 60 | |
61 | | -# suffixes to database name, the rest is assumed to be language code |
62 | | -Database.suffix=test |
63 | | - |
64 | 61 | # Pattern for OAI repo. {0} is replaced with dbname, {1} with language |
65 | 62 | OAI.repo=http://localhost/wiki-lucene/phase3/index.php/Special:OAIRepository |
66 | 63 | |
Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | entest : (ngram), (aspell,en) |
14 | 14 | detest,rutest : (single,true,2,10) |
15 | 15 | frtest : (split,3) (part1) (part2) (part3) |
| 16 | +srwiki : (single) |
16 | 17 | |
17 | 18 | # Search nodes |
18 | 19 | # host : db1.role, db2.role |
— | — | @@ -34,6 +35,7 @@ |
35 | 36 | 192.168.0.5 : detest, rutest, frtest |
36 | 37 | 192.168.0.2 : entest.ngram |
37 | 38 | 192.168.0.2 : frtest.part1, frtest.part2, frtest.part3 |
| 39 | +192.168.0.10 : srwiki |
38 | 40 | |
39 | 41 | # Path where indexes are on hosts, after default value put hosts where |
40 | 42 | # the location differs |
— | — | @@ -41,6 +43,13 @@ |
42 | 44 | <default> : /mwsearch |
43 | 45 | 192.168.0.5 : mwsearch2 |
44 | 46 | |
| 47 | +[Properties] |
| 48 | +# suffixes to the database name; the rest is assumed to be the language code |
| 49 | +Database.suffix=wiki wiktionary test |
| 50 | + |
| 51 | +# dbnames that end with the suffix will use additional keyword scores |
| 52 | +KeywordScoring.suffix=wiki rutest |
| 53 | + |
45 | 54 | # databases can be writen as {file}, where file contains list of dbs |
46 | 55 | |
47 | 56 | # Put here you custom namespace prefixes |
Index: trunk/lucene-search-2.0/src/org/apache/commons/lang/WordUtils.java |
— | — | @@ -0,0 +1,584 @@ |
| 2 | +/* |
| 3 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 4 | + * contributor license agreements. See the NOTICE file distributed with |
| 5 | + * this work for additional information regarding copyright ownership. |
| 6 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 7 | + * (the "License"); you may not use this file except in compliance with |
| 8 | + * the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, software |
| 13 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | + * See the License for the specific language governing permissions and |
| 16 | + * limitations under the License. |
| 17 | + */ |
| 18 | +package org.apache.commons.lang; |
| 19 | + |
| 20 | +/** |
| 21 | + * <p>Operations on Strings that contain words.</p> |
| 22 | + * |
| 23 | + * <p>This class tries to handle <code>null</code> input gracefully. |
| 24 | + * An exception will not be thrown for a <code>null</code> input. |
| 25 | + * Each method documents its behaviour in more detail.</p> |
| 26 | + * |
| 27 | + * @author Apache Jakarta Velocity |
| 28 | + * @author Stephen Colebourne |
| 29 | + * @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a> |
| 30 | + * @author Gary Gregory |
| 31 | + * @since 2.0 |
| 32 | + * @version $Id: WordUtils.java 471626 2006-11-06 04:02:09Z bayard $ |
| 33 | + */ |
| 34 | +public class WordUtils { |
| 35 | + |
| 36 | + /** |
| 37 | + * <p><code>WordUtils</code> instances should NOT be constructed in |
| 38 | + * standard programming. Instead, the class should be used as |
| 39 | + * <code>WordUtils.wrap("foo bar", 20);</code>.</p> |
| 40 | + * |
| 41 | + * <p>This constructor is public to permit tools that require a JavaBean |
| 42 | + * instance to operate.</p> |
| 43 | + */ |
| 44 | + public WordUtils() { |
| 45 | + super(); |
| 46 | + } |
| 47 | + |
| 48 | + // Wrapping |
| 49 | + //-------------------------------------------------------------------------- |
| 50 | +// /** |
| 51 | +// * <p>Wraps a block of text to a specified line length using '\n' as |
| 52 | +// * a newline.</p> |
| 53 | +// * |
| 54 | +// * <p>This method takes a block of text, which might have long lines in it |
| 55 | +// * and wraps the long lines based on the supplied lineLength parameter.</p> |
| 56 | +// * |
| 57 | +// * <p>If a single word is longer than the line length (eg. a URL), it will |
| 58 | +// * not be broken, and will display beyond the expected width.</p> |
| 59 | +// * |
| 60 | +// * <p>If there are tabs in inString, you are going to get results that are |
| 61 | +// * a bit strange. Tabs are a single character but are displayed as 4 or 8 |
| 62 | +// * spaces. Remove the tabs.</p> |
| 63 | +// * |
| 64 | +// * @param str text which is in need of word-wrapping, may be null |
| 65 | +// * @param lineLength the column to wrap the words at |
| 66 | +// * @return the text with all the long lines word-wrapped |
| 67 | +// * <code>null</code> if null string input |
| 68 | +// */ |
| 69 | +// public static String wrapText(String str, int lineLength) { |
| 70 | +// return wrap(str, null, lineLength); |
| 71 | +// } |
| 72 | + |
| 73 | +// /** |
| 74 | +// * <p>Wraps a block of text to a specified line length.</p> |
| 75 | +// * |
| 76 | +// * <p>This method takes a block of text, which might have long lines in it |
| 77 | +// * and wraps the long lines based on the supplied lineLength parameter.</p> |
| 78 | +// * |
| 79 | +// * <p>If a single word is longer than the wrapColumn (eg. a URL), it will |
| 80 | +// * not be broken, and will display beyond the expected width.</p> |
| 81 | +// * |
| 82 | +// * <p>If there are tabs in inString, you are going to get results that are |
| 83 | +// * a bit strange. Tabs are a single character but are displayed as 4 or 8 |
| 84 | +// * spaces. Remove the tabs.</p> |
| 85 | +// * |
| 86 | +// * @param str text which is in need of word-wrapping, may be null |
| 87 | +// * @param newLineChars the characters that define a newline, null treated as \n |
| 88 | +// * @param lineLength the column to wrap the words at |
| 89 | +// * @return the text with all the long lines word-wrapped |
| 90 | +// * <code>null</code> if null string input |
| 91 | +// */ |
| 92 | +// public static String wrapText(String str, String newLineChars, int lineLength) { |
| 93 | +// if (str == null) { |
| 94 | +// return null; |
| 95 | +// } |
| 96 | +// if (newLineChars == null) { |
| 97 | +// newLineChars = "\n"; |
| 98 | +// } |
| 99 | +// StringTokenizer lineTokenizer = new StringTokenizer(str, newLineChars, true); |
| 100 | +// StringBuffer stringBuffer = new StringBuffer(); |
| 101 | +// |
| 102 | +// while (lineTokenizer.hasMoreTokens()) { |
| 103 | +// try { |
| 104 | +// String nextLine = lineTokenizer.nextToken(); |
| 105 | +// |
| 106 | +// if (nextLine.length() > lineLength) { |
| 107 | +// // This line is long enough to be wrapped. |
| 108 | +// nextLine = wrapLine(nextLine, null, lineLength, false); |
| 109 | +// } |
| 110 | +// |
| 111 | +// stringBuffer.append(nextLine); |
| 112 | +// |
| 113 | +// } catch (NoSuchElementException nsee) { |
| 114 | +// // thrown by nextToken(), but I don't know why it would |
| 115 | +// break; |
| 116 | +// } |
| 117 | +// } |
| 118 | +// |
| 119 | +// return stringBuffer.toString(); |
| 120 | +// } |
| 121 | + |
| 122 | + // Wrapping |
| 123 | + //----------------------------------------------------------------------- |
| 124 | + /** |
| 125 | + * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p> |
| 126 | + * |
| 127 | + * <p>New lines will be separated by the system property line separator. |
| 128 | + * Very long words, such as URLs will <i>not</i> be wrapped.</p> |
| 129 | + * |
| 130 | + * <p>Leading spaces on a new line are stripped. |
| 131 | + * Trailing spaces are not stripped.</p> |
| 132 | + * |
| 133 | + * <pre> |
| 134 | + * WordUtils.wrap(null, *) = null |
| 135 | + * WordUtils.wrap("", *) = "" |
| 136 | + * </pre> |
| 137 | + * |
| 138 | + * @param str the String to be word wrapped, may be null |
| 139 | + * @param wrapLength the column to wrap the words at, less than 1 is treated as 1 |
| 140 | + * @return a line with newlines inserted, <code>null</code> if null input |
| 141 | + */ |
| 142 | + public static String wrap(String str, int wrapLength) { |
| 143 | + return wrap(str, wrapLength, null, false); |
| 144 | + } |
| 145 | + |
| 146 | + /** |
| 147 | + * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p> |
| 148 | + * |
| 149 | + * <p>Leading spaces on a new line are stripped. |
| 150 | + * Trailing spaces are not stripped.</p> |
| 151 | + * |
| 152 | + * <pre> |
| 153 | + * WordUtils.wrap(null, *, *, *) = null |
| 154 | + * WordUtils.wrap("", *, *, *) = "" |
| 155 | + * </pre> |
| 156 | + * |
| 157 | + * @param str the String to be word wrapped, may be null |
| 158 | + * @param wrapLength the column to wrap the words at, less than 1 is treated as 1 |
| 159 | + * @param newLineStr the string to insert for a new line, |
| 160 | + * <code>null</code> uses the system property line separator |
| 161 | + * @param wrapLongWords true if long words (such as URLs) should be wrapped |
| 162 | + * @return a line with newlines inserted, <code>null</code> if null input |
| 163 | + */ |
| 164 | + public static String wrap(String str, int wrapLength, String newLineStr, boolean wrapLongWords) { |
| 165 | + if (str == null) { |
| 166 | + return null; |
| 167 | + } |
| 168 | + if (newLineStr == null) { |
| 169 | + newLineStr = System.getProperty("line.separator"); |
| 170 | + } |
| 171 | + if (wrapLength < 1) { |
| 172 | + wrapLength = 1; |
| 173 | + } |
| 174 | + int inputLineLength = str.length(); |
| 175 | + int offset = 0; |
| 176 | + StringBuffer wrappedLine = new StringBuffer(inputLineLength + 32); |
| 177 | + |
| 178 | + while ((inputLineLength - offset) > wrapLength) { |
| 179 | + if (str.charAt(offset) == ' ') { |
| 180 | + offset++; |
| 181 | + continue; |
| 182 | + } |
| 183 | + int spaceToWrapAt = str.lastIndexOf(' ', wrapLength + offset); |
| 184 | + |
| 185 | + if (spaceToWrapAt >= offset) { |
| 186 | + // normal case |
| 187 | + wrappedLine.append(str.substring(offset, spaceToWrapAt)); |
| 188 | + wrappedLine.append(newLineStr); |
| 189 | + offset = spaceToWrapAt + 1; |
| 190 | + |
| 191 | + } else { |
| 192 | + // really long word or URL |
| 193 | + if (wrapLongWords) { |
| 194 | + // wrap really long word one line at a time |
| 195 | + wrappedLine.append(str.substring(offset, wrapLength + offset)); |
| 196 | + wrappedLine.append(newLineStr); |
| 197 | + offset += wrapLength; |
| 198 | + } else { |
| 199 | + // do not wrap really long word, just extend beyond limit |
| 200 | + spaceToWrapAt = str.indexOf(' ', wrapLength + offset); |
| 201 | + if (spaceToWrapAt >= 0) { |
| 202 | + wrappedLine.append(str.substring(offset, spaceToWrapAt)); |
| 203 | + wrappedLine.append(newLineStr); |
| 204 | + offset = spaceToWrapAt + 1; |
| 205 | + } else { |
| 206 | + wrappedLine.append(str.substring(offset)); |
| 207 | + offset = inputLineLength; |
| 208 | + } |
| 209 | + } |
| 210 | + } |
| 211 | + } |
| 212 | + |
| 213 | + // Whatever is left in line is short enough to just pass through |
| 214 | + wrappedLine.append(str.substring(offset)); |
| 215 | + |
| 216 | + return wrappedLine.toString(); |
| 217 | + } |
| 218 | + |
| 219 | + // Capitalizing |
| 220 | + //----------------------------------------------------------------------- |
| 221 | + /** |
| 222 | + * <p>Capitalizes all the whitespace separated words in a String. |
| 223 | + * Only the first letter of each word is changed. To convert the |
| 224 | + * rest of each word to lowercase at the same time, |
| 225 | + * use {@link #capitalizeFully(String)}.</p> |
| 226 | + * |
| 227 | + * <p>Whitespace is defined by {@link Character#isWhitespace(char)}. |
| 228 | + * A <code>null</code> input String returns <code>null</code>. |
| 229 | + * Capitalization uses the unicode title case, normally equivalent to |
| 230 | + * upper case.</p> |
| 231 | + * |
| 232 | + * <pre> |
| 233 | + * WordUtils.capitalize(null) = null |
| 234 | + * WordUtils.capitalize("") = "" |
| 235 | + * WordUtils.capitalize("i am FINE") = "I Am FINE" |
| 236 | + * </pre> |
| 237 | + * |
| 238 | + * @param str the String to capitalize, may be null |
| 239 | + * @return capitalized String, <code>null</code> if null String input |
| 240 | + * @see #uncapitalize(String) |
| 241 | + * @see #capitalizeFully(String) |
| 242 | + */ |
| 243 | + public static String capitalize(String str) { |
| 244 | + return capitalize(str, null); |
| 245 | + } |
| 246 | + |
| 247 | + /** |
| 248 | + * <p>Capitalizes all the delimiter separated words in a String. |
| 249 | + * Only the first letter of each word is changed. To convert the |
| 250 | + * rest of each word to lowercase at the same time, |
| 251 | + * use {@link #capitalizeFully(String, char[])}.</p> |
| 252 | + * |
| 253 | + * <p>The delimiters represent a set of characters understood to separate words. |
| 254 | + * The first string character and the first non-delimiter character after a |
| 255 | + * delimiter will be capitalized. </p> |
| 256 | + * |
| 257 | + * <p>A <code>null</code> input String returns <code>null</code>. |
| 258 | + * Capitalization uses the unicode title case, normally equivalent to |
| 259 | + * upper case.</p> |
| 260 | + * |
| 261 | + * <pre> |
| 262 | + * WordUtils.capitalize(null, *) = null |
| 263 | + * WordUtils.capitalize("", *) = "" |
| 264 | + * WordUtils.capitalize(*, new char[0]) = * |
| 265 | + * WordUtils.capitalize("i am fine", null) = "I Am Fine" |
| 266 | + * WordUtils.capitalize("i aM.fine", {'.'}) = "I aM.Fine" |
| 267 | + * </pre> |
| 268 | + * |
| 269 | + * @param str the String to capitalize, may be null |
| 270 | + * @param delimiters set of characters to determine capitalization, null means whitespace |
| 271 | + * @return capitalized String, <code>null</code> if null String input |
| 272 | + * @see #uncapitalize(String) |
| 273 | + * @see #capitalizeFully(String) |
| 274 | + * @since 2.1 |
| 275 | + */ |
| 276 | + public static String capitalize(String str, char[] delimiters) { |
| 277 | + int delimLen = (delimiters == null ? -1 : delimiters.length); |
| 278 | + if (str == null || str.length() == 0 || delimLen == 0) { |
| 279 | + return str; |
| 280 | + } |
| 281 | + int strLen = str.length(); |
| 282 | + StringBuffer buffer = new StringBuffer(strLen); |
| 283 | + boolean capitalizeNext = true; |
| 284 | + for (int i = 0; i < strLen; i++) { |
| 285 | + char ch = str.charAt(i); |
| 286 | + |
| 287 | + if (isDelimiter(ch, delimiters)) { |
| 288 | + buffer.append(ch); |
| 289 | + capitalizeNext = true; |
| 290 | + } else if (capitalizeNext) { |
| 291 | + buffer.append(Character.toTitleCase(ch)); |
| 292 | + capitalizeNext = false; |
| 293 | + } else { |
| 294 | + buffer.append(ch); |
| 295 | + } |
| 296 | + } |
| 297 | + return buffer.toString(); |
| 298 | + } |
| 299 | + |
| 300 | + //----------------------------------------------------------------------- |
| 301 | + /** |
| 302 | + * <p>Converts all the whitespace separated words in a String into capitalized words, |
| 303 | + * that is each word is made up of a titlecase character and then a series of |
| 304 | + * lowercase characters. </p> |
| 305 | + * |
| 306 | + * <p>Whitespace is defined by {@link Character#isWhitespace(char)}. |
| 307 | + * A <code>null</code> input String returns <code>null</code>. |
| 308 | + * Capitalization uses the unicode title case, normally equivalent to |
| 309 | + * upper case.</p> |
| 310 | + * |
| 311 | + * <pre> |
| 312 | + * WordUtils.capitalizeFully(null) = null |
| 313 | + * WordUtils.capitalizeFully("") = "" |
| 314 | + * WordUtils.capitalizeFully("i am FINE") = "I Am Fine" |
| 315 | + * </pre> |
| 316 | + * |
| 317 | + * @param str the String to capitalize, may be null |
| 318 | + * @return capitalized String, <code>null</code> if null String input |
| 319 | + */ |
| 320 | + public static String capitalizeFully(String str) { |
| 321 | + return capitalizeFully(str, null); |
| 322 | + } |
| 323 | + |
| 324 | + /** |
| 325 | + * <p>Converts all the delimiter separated words in a String into capitalized words, |
| 326 | + * that is each word is made up of a titlecase character and then a series of |
| 327 | + * lowercase characters. </p> |
| 328 | + * |
| 329 | + * <p>The delimiters represent a set of characters understood to separate words. |
| 330 | + * The first string character and the first non-delimiter character after a |
| 331 | + * delimiter will be capitalized. </p> |
| 332 | + * |
| 333 | + * <p>A <code>null</code> input String returns <code>null</code>. |
| 334 | + * Capitalization uses the unicode title case, normally equivalent to |
| 335 | + * upper case.</p> |
| 336 | + * |
| 337 | + * <pre> |
| 338 | + * WordUtils.capitalizeFully(null, *) = null |
| 339 | + * WordUtils.capitalizeFully("", *) = "" |
| 340 | + * WordUtils.capitalizeFully(*, null) = * |
| 341 | + * WordUtils.capitalizeFully(*, new char[0]) = * |
| 342 | + * WordUtils.capitalizeFully("i aM.fine", {'.'}) = "I am.Fine" |
| 343 | + * </pre> |
| 344 | + * |
| 345 | + * @param str the String to capitalize, may be null |
| 346 | + * @param delimiters set of characters to determine capitalization, null means whitespace |
| 347 | + * @return capitalized String, <code>null</code> if null String input |
| 348 | + * @since 2.1 |
| 349 | + */ |
| 350 | + public static String capitalizeFully(String str, char[] delimiters) { |
| 351 | + int delimLen = (delimiters == null ? -1 : delimiters.length); |
| 352 | + if (str == null || str.length() == 0 || delimLen == 0) { |
| 353 | + return str; |
| 354 | + } |
| 355 | + str = str.toLowerCase(); |
| 356 | + return capitalize(str, delimiters); |
| 357 | + } |
| 358 | + |
| 359 | + //----------------------------------------------------------------------- |
| 360 | + /** |
| 361 | + * <p>Uncapitalizes all the whitespace separated words in a String. |
| 362 | + * Only the first letter of each word is changed.</p> |
| 363 | + * |
| 364 | + * <p>Whitespace is defined by {@link Character#isWhitespace(char)}. |
| 365 | + * A <code>null</code> input String returns <code>null</code>.</p> |
| 366 | + * |
| 367 | + * <pre> |
| 368 | + * WordUtils.uncapitalize(null) = null |
| 369 | + * WordUtils.uncapitalize("") = "" |
| 370 | + * WordUtils.uncapitalize("I Am FINE") = "i am fINE" |
| 371 | + * </pre> |
| 372 | + * |
| 373 | + * @param str the String to uncapitalize, may be null |
| 374 | + * @return uncapitalized String, <code>null</code> if null String input |
| 375 | + * @see #capitalize(String) |
| 376 | + */ |
| 377 | + public static String uncapitalize(String str) { |
| 378 | + return uncapitalize(str, null); |
| 379 | + } |
| 380 | + |
| 381 | + /** |
| 382 | + * <p>Uncapitalizes all the whitespace separated words in a String. |
| 383 | + * Only the first letter of each word is changed.</p> |
| 384 | + * |
| 385 | + * <p>The delimiters represent a set of characters understood to separate words. |
| 386 | + * The first string character and the first non-delimiter character after a |
| 387 | + * delimiter will be uncapitalized. </p> |
| 388 | + * |
| 389 | + * <p>Whitespace is defined by {@link Character#isWhitespace(char)}. |
| 390 | + * A <code>null</code> input String returns <code>null</code>.</p> |
| 391 | + * |
| 392 | + * <pre> |
| 393 | + * WordUtils.uncapitalize(null, *) = null |
| 394 | + * WordUtils.uncapitalize("", *) = "" |
| 395 | + * WordUtils.uncapitalize(*, null) = * |
| 396 | + * WordUtils.uncapitalize(*, new char[0]) = * |
| 397 | + * WordUtils.uncapitalize("I AM.FINE", {'.'}) = "i AM.fINE" |
| 398 | + * </pre> |
| 399 | + * |
| 400 | + * @param str the String to uncapitalize, may be null |
| 401 | + * @param delimiters set of characters to determine uncapitalization, null means whitespace |
| 402 | + * @return uncapitalized String, <code>null</code> if null String input |
| 403 | + * @see #capitalize(String) |
| 404 | + * @since 2.1 |
| 405 | + */ |
| 406 | + public static String uncapitalize(String str, char[] delimiters) { |
| 407 | + int delimLen = (delimiters == null ? -1 : delimiters.length); |
| 408 | + if (str == null || str.length() == 0 || delimLen == 0) { |
| 409 | + return str; |
| 410 | + } |
| 411 | + int strLen = str.length(); |
| 412 | + StringBuffer buffer = new StringBuffer(strLen); |
| 413 | + boolean uncapitalizeNext = true; |
| 414 | + for (int i = 0; i < strLen; i++) { |
| 415 | + char ch = str.charAt(i); |
| 416 | + |
| 417 | + if (isDelimiter(ch, delimiters)) { |
| 418 | + buffer.append(ch); |
| 419 | + uncapitalizeNext = true; |
| 420 | + } else if (uncapitalizeNext) { |
| 421 | + buffer.append(Character.toLowerCase(ch)); |
| 422 | + uncapitalizeNext = false; |
| 423 | + } else { |
| 424 | + buffer.append(ch); |
| 425 | + } |
| 426 | + } |
| 427 | + return buffer.toString(); |
| 428 | + } |
| 429 | + |
| 430 | + //----------------------------------------------------------------------- |
| 431 | + /** |
| 432 | + * <p>Swaps the case of a String using a word based algorithm.</p> |
| 433 | + * |
| 434 | + * <ul> |
| 435 | + * <li>Upper case character converts to Lower case</li> |
| 436 | + * <li>Title case character converts to Lower case</li> |
| 437 | + * <li>Lower case character after Whitespace or at start converts to Title case</li> |
| 438 | + * <li>Other Lower case character converts to Upper case</li> |
| 439 | + * </ul> |
| 440 | + * |
| 441 | + * <p>Whitespace is defined by {@link Character#isWhitespace(char)}. |
| 442 | + * A <code>null</code> input String returns <code>null</code>.</p> |
| 443 | + * |
| 444 | + * <pre> |
| 445 | + * StringUtils.swapCase(null) = null |
| 446 | + * StringUtils.swapCase("") = "" |
| 447 | + * StringUtils.swapCase("The dog has a BONE") = "tHE DOG HAS A bone" |
| 448 | + * </pre> |
| 449 | + * |
| 450 | + * @param str the String to swap case, may be null |
| 451 | + * @return the changed String, <code>null</code> if null String input |
| 452 | + */ |
| 453 | + public static String swapCase(String str) { |
| 454 | + int strLen; |
| 455 | + if (str == null || (strLen = str.length()) == 0) { |
| 456 | + return str; |
| 457 | + } |
| 458 | + StringBuffer buffer = new StringBuffer(strLen); |
| 459 | + |
| 460 | + boolean whitespace = true; |
| 461 | + char ch = 0; |
| 462 | + char tmp = 0; |
| 463 | + |
| 464 | + for (int i = 0; i < strLen; i++) { |
| 465 | + ch = str.charAt(i); |
| 466 | + if (Character.isUpperCase(ch)) { |
| 467 | + tmp = Character.toLowerCase(ch); |
| 468 | + } else if (Character.isTitleCase(ch)) { |
| 469 | + tmp = Character.toLowerCase(ch); |
| 470 | + } else if (Character.isLowerCase(ch)) { |
| 471 | + if (whitespace) { |
| 472 | + tmp = Character.toTitleCase(ch); |
| 473 | + } else { |
| 474 | + tmp = Character.toUpperCase(ch); |
| 475 | + } |
| 476 | + } else { |
| 477 | + tmp = ch; |
| 478 | + } |
| 479 | + buffer.append(tmp); |
| 480 | + whitespace = Character.isWhitespace(ch); |
| 481 | + } |
| 482 | + return buffer.toString(); |
| 483 | + } |
| 484 | + |
| 485 | + //----------------------------------------------------------------------- |
| 486 | + /** |
| 487 | + * <p>Extracts the initial letters from each word in the String.</p> |
| 488 | + * |
| 489 | + * <p>The first letter of the string and all first letters after |
| 490 | + * whitespace are returned as a new string. |
| 491 | + * Their case is not changed.</p> |
| 492 | + * |
| 493 | + * <p>Whitespace is defined by {@link Character#isWhitespace(char)}. |
| 494 | + * A <code>null</code> input String returns <code>null</code>.</p> |
| 495 | + * |
| 496 | + * <pre> |
| 497 | + * WordUtils.initials(null) = null |
| 498 | + * WordUtils.initials("") = "" |
| 499 | + * WordUtils.initials("Ben John Lee") = "BJL" |
| 500 | + * WordUtils.initials("Ben J.Lee") = "BJ" |
| 501 | + * </pre> |
| 502 | + * |
| 503 | + * @param str the String to get initials from, may be null |
| 504 | + * @return String of initial letters, <code>null</code> if null String input |
| 505 | + * @see #initials(String,char[]) |
| 506 | + * @since 2.2 |
| 507 | + */ |
| 508 | + public static String initials(String str) { |
| 509 | + return initials(str, null); |
| 510 | + } |
| 511 | + |
| 512 | + /** |
| 513 | + * <p>Extracts the initial letters from each word in the String.</p> |
| 514 | + * |
| 515 | + * <p>The first letter of the string and all first letters after the |
| 516 | + * defined delimiters are returned as a new string. |
| 517 | + * Their case is not changed.</p> |
| 518 | + * |
| 519 | + * <p>If the delimiters array is null, then Whitespace is used. |
| 520 | + * Whitespace is defined by {@link Character#isWhitespace(char)}. |
| 521 | + * A <code>null</code> input String returns <code>null</code>. |
| 522 | + * An empty delimiter array returns an empty String.</p> |
| 523 | + * |
| 524 | + * <pre> |
| 525 | + * WordUtils.initials(null, *) = null |
| 526 | + * WordUtils.initials("", *) = "" |
| 527 | + * WordUtils.initials("Ben John Lee", null) = "BJL" |
| 528 | + * WordUtils.initials("Ben J.Lee", null) = "BJ" |
| 529 | + * WordUtils.initials("Ben J.Lee", [' ','.']) = "BJL" |
| 530 | + * WordUtils.initials(*, new char[0]) = "" |
| 531 | + * </pre> |
| 532 | + * |
| 533 | + * @param str the String to get initials from, may be null |
| 534 | + * @param delimiters set of characters to determine words, null means whitespace |
| 535 | + * @return String of initial letters, <code>null</code> if null String input |
| 536 | + * @see #initials(String) |
| 537 | + * @since 2.2 |
| 538 | + */ |
| 539 | + public static String initials(String str, char[] delimiters) { |
| 540 | + if (str == null || str.length() == 0) { |
| 541 | + return str; |
| 542 | + } |
| 543 | + if (delimiters != null && delimiters.length == 0) { |
| 544 | + return ""; |
| 545 | + } |
| 546 | + int strLen = str.length(); |
| 547 | + char[] buf = new char[strLen / 2 + 1]; |
| 548 | + int count = 0; |
| 549 | + boolean lastWasGap = true; |
| 550 | + for (int i = 0; i < strLen; i++) { |
| 551 | + char ch = str.charAt(i); |
| 552 | + |
| 553 | + if (isDelimiter(ch, delimiters)) { |
| 554 | + lastWasGap = true; |
| 555 | + } else if (lastWasGap) { |
| 556 | + buf[count++] = ch; |
| 557 | + lastWasGap = false; |
| 558 | + } else { |
| 559 | + // ignore ch |
| 560 | + } |
| 561 | + } |
| 562 | + return new String(buf, 0, count); |
| 563 | + } |
| 564 | + |
| 565 | + //----------------------------------------------------------------------- |
| 566 | + /** |
| 567 | + * Is the character a delimiter. |
| 568 | + * |
| 569 | + * @param ch the character to check |
| 570 | + * @param delimiters the delimiters |
| 571 | + * @return true if it is a delimiter |
| 572 | + */ |
| 573 | + private static boolean isDelimiter(char ch, char[] delimiters) { |
| 574 | + if (delimiters == null) { |
| 575 | + return Character.isWhitespace(ch); |
| 576 | + } |
| 577 | + for (int i = 0, isize = delimiters.length; i < isize; i++) { |
| 578 | + if (ch == delimiters[i]) { |
| 579 | + return true; |
| 580 | + } |
| 581 | + } |
| 582 | + return false; |
| 583 | + } |
| 584 | + |
| 585 | +} |
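WordUtils is vendored verbatim from Apache Commons Lang so that RankReader (further down in this commit) can generate title-cased variants of redirect targets. A short usage example, with the expected outputs taken from the javadoc above:

    import org.apache.commons.lang.WordUtils;

    public class WordUtilsDemo {
        public static void main(String[] args) {
            System.out.println(WordUtils.capitalize("i am fine"));                    // I Am Fine
            System.out.println(WordUtils.capitalizeFully("i am FINE"));               // I Am Fine
            System.out.println(WordUtils.capitalize("i aM.fine", new char[]{'.'}));   // I aM.Fine
            System.out.println(WordUtils.initials("Ben J.Lee", new char[]{' ','.'})); // BJL
        }
    }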
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -273,8 +273,33 @@ |
274 | 274 | q = parser.parseTwoPass("[1,12]:beans",NamespacePolicy.REWRITE); |
275 | 275 | assertEquals("(+(namespace:1 namespace:12) +(contents:beans contents:bean^0.5)) (+(namespace:1 namespace:12) +title:beans^2.0)",q.toString()); |
276 | 276 | |
| 277 | + q = parser.parseTwoPass("[1,12]:beans and others incategory:food",NamespacePolicy.REWRITE); |
| 278 | + assertEquals("(+(namespace:1 namespace:12) +(+(contents:beans contents:bean^0.5) +contents:and +(contents:others contents:other^0.5) +category:food)) (+(namespace:1 namespace:12) +(+title:beans^2.0 +title:and^2.0 +title:others^2.0 +category:food))",q.toString()); |
| 279 | + |
277 | 280 | q = parser.parseTwoPass("[1,a12]:beans",NamespacePolicy.IGNORE); |
278 | 281 | assertEquals("(+contents:1 +contents:a12 +(contents:beans contents:bean^0.5)) (+title:1^2.0 +title:a12^2.0 +title:beans^2.0)",q.toString()); |
| 282 | + |
| 283 | + // Redirect third pass tests |
| 284 | + q = parser.parseThreePass("beans",NamespacePolicy.IGNORE); |
| 285 | + assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 redirect:beans^2.0",q.toString()); |
| 286 | + |
| 287 | + q = parser.parseThreePass("beans everyone",NamespacePolicy.IGNORE); |
| 288 | + assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false)",q.toString()); |
| 289 | + |
| 290 | + q = parser.parseThreePass("beans everyone incategory:mouse",NamespacePolicy.IGNORE); |
| 291 | + assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) (+spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false) +category:mouse)",q.toString()); |
| 292 | + |
| 293 | + q = parser.parseThreePass("beans OR everyone",NamespacePolicy.IGNORE); |
| 294 | + assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0)",q.toString()); |
| 295 | + |
| 296 | + q = parser.parseThreePass("beans -everyone",NamespacePolicy.IGNORE); |
| 297 | + assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0)",q.toString()); |
| 298 | + |
| 299 | + q = parser.parseThreePass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE); |
| 300 | + assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false))",q.toString()); |
| 301 | + |
| 302 | + q = parser.parseThreePass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE); |
| 303 | + assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0))",q.toString()); |
279 | 304 | |
280 | 305 | // Test field extraction |
281 | 306 | HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja"); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -11,9 +11,11 @@ |
12 | 12 | import java.net.URL; |
13 | 13 | import java.util.ArrayList; |
14 | 14 | import java.util.Hashtable; |
| 15 | +import java.util.Properties; |
15 | 16 | |
16 | 17 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
17 | 18 | import org.wikimedia.lsearch.config.IndexId; |
| 19 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
18 | 20 | |
19 | 21 | import junit.framework.TestCase; |
20 | 22 | |
— | — | @@ -57,7 +59,11 @@ |
58 | 60 | return searchGroup; |
59 | 61 | } |
60 | 62 | |
| 63 | + public Properties getGlobalProps(){ |
| 64 | + return globalProperties; |
| 65 | + } |
61 | 66 | |
| 67 | + |
62 | 68 | } |
63 | 69 | |
64 | 70 | public static GlobalConfigurationTest.TestGC testgc = null; |
— | — | @@ -80,7 +86,7 @@ |
81 | 87 | String testurl = "file://"+System.getProperty("user.dir")+"/test-data/mwsearch-global.test"; |
82 | 88 | try { |
83 | 89 | URL url = new URL(testurl); |
84 | | - testgc.readFromURL(url,"/usr/local/var/mwsearch","",null); |
| 90 | + testgc.readFromURL(url,"/usr/local/var/mwsearch",""); |
85 | 91 | |
86 | 92 | // database |
87 | 93 | Hashtable database = testgc.getDatabase(); |
— | — | @@ -147,6 +153,23 @@ |
148 | 154 | String hostName = host.getHostName(); |
149 | 155 | System.out.println("Verify internet IP: "+hostAddr+", and hostname: "+hostName); |
150 | 156 | |
| 157 | + // test prefixes |
| 158 | + Hashtable<String,NamespaceFilter> p = testgc.getNamespacePrefixes(); |
| 159 | + assertEquals(17,p.size()); |
| 160 | + |
| 161 | + // check global properties |
| 162 | + Properties prop = testgc.getGlobalProps(); |
| 163 | + assertEquals("wiki wiktionary test",prop.get("Database.suffix")); |
| 164 | + assertEquals("wiki rutest",prop.get("KeywordScoring.suffix")); |
| 165 | + |
| 166 | + // check languages and keyword stuff |
| 167 | + assertEquals("en",testgc.getLanguage("entest")); |
| 168 | + assertEquals("sr",testgc.getLanguage("srwiki")); |
| 169 | + assertFalse(testgc.useKeywordScoring("frtest")); |
| 170 | + assertTrue(testgc.useKeywordScoring("srwiki")); |
| 171 | + assertTrue(testgc.useKeywordScoring("rutest")); |
| 172 | + |
| 173 | + |
151 | 174 | } catch (MalformedURLException e) { |
152 | 175 | e.printStackTrace(); |
153 | 176 | } catch (IOException e) { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import java.io.StringReader; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.HashMap; |
| 7 | +import java.util.HashSet; |
7 | 8 | import java.util.Map.Entry; |
8 | 9 | |
9 | 10 | import org.apache.lucene.analysis.Analyzer; |
— | — | @@ -38,6 +39,16 @@ |
39 | 40 | System.out.print("["+t.getKey()+"] => ["+t.getValue()+"] "); |
40 | 41 | } |
41 | 42 | if(iw.size()!=0) System.out.println(); |
| 43 | + |
| 44 | + HashSet<String> keywords = parser.getKeywords(); |
| 45 | + if(keywords.size()!=0){ |
| 46 | + System.out.print("KEYWORDS: "); |
| 47 | + } |
| 48 | + for(String t : keywords){ |
| 49 | + System.out.print("["+t+"] "); |
| 50 | + } |
| 51 | + if(keywords.size()!=0) System.out.println(); |
| 52 | + |
42 | 53 | System.out.println(); |
43 | 54 | } |
44 | 55 | |
— | — | @@ -75,6 +86,10 @@ |
76 | 87 | showTokens(text); |
77 | 88 | text = "{{IPstack|name = Hundai}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages."; |
78 | 89 | showTokens(text); |
| 90 | + text = "[[First link]]\n== Some caption ==\n[[Other link]]"; |
| 91 | + showTokens(text); |
| 92 | + text = "[[First]] second third fourth and so on goes the ... [[last link]]"; |
| 93 | + showTokens(text); |
79 | 94 | |
80 | 95 | ArticlesParser ap = new ArticlesParser("./test-data/indexing-articles.test"); |
81 | 96 | ArrayList<TestArticle> articles = ap.getArticles(); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java |
— | — | @@ -29,7 +29,7 @@ |
30 | 30 | this.page = page; |
31 | 31 | } |
32 | 32 | public void writeEndPage() throws IOException { |
33 | | - String key = page.Title.Namespace+":"+page.Title.Text.toLowerCase(); |
| 33 | + String key = page.Title.Namespace+":"+page.Title.Text; |
34 | 34 | titles.put(key,new Rank(0)); |
35 | 35 | } |
36 | 36 | public HashMap<String,Rank> getTitles() { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -1,7 +1,9 @@ |
2 | 2 | package org.wikimedia.lsearch.importer; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
5 | 6 | import java.util.HashMap; |
| 7 | +import java.util.Map.Entry; |
6 | 8 | import java.util.concurrent.ThreadPoolExecutor.AbortPolicy; |
7 | 9 | import java.util.regex.Matcher; |
8 | 10 | import java.util.regex.Pattern; |
— | — | @@ -43,21 +45,17 @@ |
44 | 46 | } |
45 | 47 | public void writeEndPage() throws IOException { |
46 | 48 | // get rank |
47 | | - String key = page.Title.Namespace+":"+page.Title.Text.toLowerCase(); |
| 49 | + String key = page.Title.Namespace+":"+page.Title.Text; |
48 | 50 | Rank r = ranks.get(key); |
49 | 51 | int rank; |
50 | | - boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode)!=null; |
| 52 | +boolean isRedirect = r != null && r.redirectsTo != null; |
51 | 53 | if(r == null){ |
52 | 54 | rank = 0; |
53 | | - log.error("Rank for "+(page.Title.Namespace+":"+page.Title.Text.toLowerCase())+" is undefined, which should never happen."); |
54 | | - } else{ |
55 | | - if(r.redirect != null && key.equals(r.redirect) && isRedirect){ |
56 | | - rank = 0; |
57 | | - } else |
58 | | - rank = r.links; |
59 | | - } |
| 55 | + log.error("Rank for "+key+" is undefined, which should never happen."); |
| 56 | + } else |
| 57 | + rank = r.links; |
60 | 58 | // make article |
61 | | - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,rank); |
| 59 | +Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,rank,r==null ? null : r.redirected); |
62 | 60 | writer.addArticle(article); |
63 | 61 | count++; |
64 | 62 | if(limit >= 0 && count > limit) |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/RankReader.java |
— | — | @@ -6,6 +6,7 @@ |
7 | 7 | import java.util.regex.Matcher; |
8 | 8 | import java.util.regex.Pattern; |
9 | 9 | |
| 10 | +import org.apache.commons.lang.WordUtils; |
10 | 11 | import org.apache.log4j.Logger; |
11 | 12 | import org.mediawiki.importer.DumpWriter; |
12 | 13 | import org.mediawiki.importer.Page; |
— | — | @@ -49,26 +50,53 @@ |
50 | 51 | this.page = page; |
51 | 52 | } |
52 | 53 | public void writeEndPage() throws IOException { |
53 | | - Rank r = ranks.get(page.Title.Namespace+":"+page.Title.Text.toLowerCase()); |
| 54 | + Rank r = ranks.get(page.Title.Namespace+":"+page.Title.Text); |
54 | 55 | // register redirect |
55 | 56 | String redirect = Localization.getRedirectTarget(revision.Text,langCode); |
56 | 57 | if( redirect !=null ){ |
57 | | - redirect = redirect.toLowerCase(); |
58 | 58 | int ns = 0; |
59 | 59 | String title = redirect; |
60 | 60 | String[] parts = redirect.split(":",2); |
61 | 61 | if(parts.length == 2 && parts[0].length()>1){ |
62 | | - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1)); |
| 62 | + Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
63 | 63 | if(inx != null){ |
64 | 64 | ns = inx; |
65 | 65 | title = parts[1]; |
66 | 66 | } |
67 | 67 | } |
68 | | - r.redirect = ns+":"+title; |
| 68 | + r.redirectsTo = findRank(ns,title); |
69 | 69 | } else // process links |
70 | 70 | processRanks(revision.Text,page.Title.Namespace); |
71 | 71 | } |
72 | 72 | |
| 73 | + /** Find the rank object for the ns:title */ |
| 74 | + protected Rank findRank(int ns, String title){ |
| 75 | + String key; |
| 76 | + Rank rank; |
| 77 | + // try exact match |
| 78 | + key = ns+":"+title; |
| 79 | + rank = ranks.get(key); |
| 80 | + if(rank != null) |
| 81 | + return rank; |
| 82 | + // try lowercase |
| 83 | + key = ns+":"+title.toLowerCase(); |
| 84 | + rank = ranks.get(key); |
| 85 | + if(rank != null) |
| 86 | + return rank; |
| 87 | + // try title case |
| 88 | + key = ns+":"+WordUtils.capitalize(title); |
| 89 | + rank = ranks.get(key); |
| 90 | + if(rank != null) |
| 91 | + return rank; |
| 92 | + // try capitalizing at word breaks |
| 93 | + key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
| 94 | + rank = ranks.get(key); |
| 95 | + if(rank != null) |
| 96 | + return rank; |
| 97 | + |
| 98 | + return null; |
| 99 | + } |
| 100 | + |
73 | 101 | /** Extract all links from this page, and increment ranks for linked pages */ |
74 | 102 | protected void processRanks(String text, int namespace) { |
75 | 103 | Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
— | — | @@ -76,14 +104,12 @@ |
77 | 105 | int ns; String title; |
78 | 106 | boolean escaped; |
79 | 107 | |
80 | | - HashSet<String> links = new HashSet<String>(); |
| 108 | + HashSet<Rank> links = new HashSet<Rank>(); |
81 | 109 | while(matcher.find()){ |
82 | | - String link = matcher.group(1).toLowerCase(); |
| 110 | + String link = matcher.group(1); |
83 | 111 | int fragment = link.lastIndexOf('#'); |
84 | 112 | if(fragment != -1) |
85 | 113 | link = link.substring(0,fragment); |
86 | | - if(link.length() > 100) |
87 | | - continue; // probably an error |
88 | 114 | //System.out.println("Got link "+link); |
89 | 115 | if(link.startsWith(":")){ |
90 | 116 | escaped = true; |
— | — | @@ -94,7 +120,7 @@ |
95 | 121 | // check for ns:title syntax |
96 | 122 | String[] parts = link.split(":",2); |
97 | 123 | if(parts.length == 2 && parts[0].length() > 1){ |
98 | | - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1)); |
| 124 | + Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
99 | 125 | if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14))) |
100 | 126 | continue; // categories, ignore |
101 | 127 | if(inx!=null && inx < 0) |
— | — | @@ -108,17 +134,17 @@ |
109 | 135 | if(interwiki.contains(parts[0])) |
110 | 136 | continue; |
111 | 137 | } |
| 138 | + if(ns == 0 && namespace!=0) |
| 139 | + continue; // skip links from other namespaces into the main namespace |
| 140 | + |
112 | 141 | // register as link |
113 | | - String key = ns+":"+title; |
114 | | - links.add(key); |
| 142 | + Rank target = findRank(ns,title); |
| 143 | + if(target != null) |
| 144 | + links.add(target); |
115 | 145 | } |
116 | 146 | // increment page ranks |
117 | | - for(String t : links){ |
118 | | - if(t.startsWith("0:") && namespace!=0) |
119 | | - continue; // skip links from other namespaces into the main namespace |
120 | | - Rank rank = ranks.get(t); |
121 | | - if(rank != null) |
122 | | - rank.links++; |
| 147 | + for(Rank rank : links){ |
| 148 | + rank.links++; |
123 | 149 | } |
124 | 150 | } |
125 | 151 | public void writeSiteinfo(Siteinfo info) throws IOException { |
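findRank() above falls back from an exact key to progressively normalized forms, so redirect targets still resolve now that rank keys are no longer lowercased. An illustration with a hypothetical title, chosen so the variants actually differ; only the last one capitalizes after punctuation:

    import org.apache.commons.lang.WordUtils;

    public class FindRankKeyOrder {
        public static void main(String[] args) {
            String title = "baden-württemberg (state)";                // hypothetical
            System.out.println("0:" + title);                          // exact:      0:baden-württemberg (state)
            System.out.println("0:" + title.toLowerCase());            // lowercase:  0:baden-württemberg (state)
            System.out.println("0:" + WordUtils.capitalize(title));    // title case: 0:Baden-württemberg (state)
            System.out.println("0:" + WordUtils.capitalize(title,
                new char[]{' ','-','(',')','}','{','.',',','?','!'})); // word-break: 0:Baden-Württemberg (State)
            // ranks.get() is tried with each key in this order; first hit wins.
        }
    }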
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -86,6 +86,7 @@ |
87 | 87 | else |
88 | 88 | writer.setMaxBufferedDocs(glMaxBufDocs); |
89 | 89 | writer.setUseCompoundFile(true); |
| 90 | + writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH); |
90 | 91 | |
91 | 92 | return writer; |
92 | 93 | } |
— | — | @@ -94,7 +95,7 @@ |
95 | 96 | public void addArticle(Article a){ |
96 | 97 | if(!WikiIndexModifier.checkAddPreconditions(a,langCode)) |
97 | 98 | return; // don't add if preconditions are not met |
98 | | - |
| 99 | + WikiIndexModifier.transformArticleForIndexing(a); |
99 | 100 | IndexId target; |
100 | 101 | if(iid.isSingle()) |
101 | 102 | target = iid; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -5,6 +5,7 @@ |
6 | 6 | import java.util.ArrayList; |
7 | 7 | import java.util.HashMap; |
8 | 8 | import java.util.HashSet; |
| 9 | +import java.util.Map.Entry; |
9 | 10 | |
10 | 11 | import org.apache.log4j.Logger; |
11 | 12 | import org.mediawiki.dumper.ProgressFilter; |
— | — | @@ -95,17 +96,19 @@ |
96 | 97 | long start = System.currentTimeMillis(); |
97 | 98 | |
98 | 99 | HashMap<String,Rank> ranks = processRanks(inputfile,getTitles(inputfile),langCode); |
99 | | - |
| 100 | + |
100 | 101 | // add-up ranks of redirects to pages where they redirect to |
101 | | - for(Rank r : ranks.values()){ |
102 | | - if(r.redirect != null){ |
103 | | - Rank dest = ranks.get(r.redirect); |
104 | | - if(dest != null && dest != r){ |
105 | | - dest.links += r.links; |
106 | | - r.links = 0; |
107 | | - } |
| 102 | + for(Entry<String,Rank> e : ranks.entrySet()){ |
| 103 | + Rank r = e.getValue(); |
| 104 | + if(r.redirectsTo != null && r != r.redirectsTo){ |
| 105 | + r.redirectsTo.links += r.links; |
| 106 | + r.links = 0; |
| 107 | + if(r.redirectsTo.redirected == null) |
| 108 | + r.redirectsTo.redirected = new ArrayList<String>(); |
| 109 | + r.redirectsTo.redirected.add(e.getKey()); |
108 | 110 | } |
109 | 111 | } |
| 112 | + |
110 | 113 | log.info("Third pass, indexing articles..."); |
111 | 114 | |
112 | 115 | // open |
— | — | @@ -119,7 +122,7 @@ |
120 | 123 | |
121 | 124 | // read |
122 | 125 | DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,ranks,langCode); |
123 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100)); |
| 126 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000)); |
124 | 127 | try { |
125 | 128 | reader.readDump(); |
126 | 129 | } catch (IOException e) { |
— | — | @@ -168,7 +171,7 @@ |
169 | 172 | } |
170 | 173 | // calculate ranks |
171 | 174 | RankReader rr = new RankReader(ranks,langCode); |
172 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 100)); |
| 175 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
173 | 176 | try { |
174 | 177 | reader.readDump(); |
175 | 178 | } catch (IOException e) { |
— | — | @@ -189,7 +192,7 @@ |
190 | 193 | } |
191 | 194 | // first pass, get titles |
192 | 195 | TitleReader tr = new TitleReader(); |
193 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 100)); |
| 196 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
194 | 197 | try { |
195 | 198 | reader.readDump(); |
196 | 199 | input.close(); |
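The reworked loop above folds each redirect's accumulated link count into its target Rank and records the redirecting key, which DumpImporter then passes into the Article. A sketch with made-up numbers, assuming direct access to Rank's fields as used elsewhere in this diff:

    // "0:Colour" redirects to "0:Color"; the counts are invented.
    Rank color  = new Rank(10);        // target: 10 incoming links so far
    Rank colour = new Rank(3);         // the redirect page accumulated 3
    colour.redirectsTo = color;
    // After the fold-up loop:
    //   color.links  == 13            (the redirect's 3 links transferred)
    //   colour.links == 0
    //   color.redirected == ["0:Colour"]  (stored with the Article later)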
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java |
— | — | @@ -106,7 +106,6 @@ |
107 | 107 | String globalurl = getString("MWConfig","global"); |
108 | 108 | String indexpath = getString("Indexes","path"); |
109 | 109 | String oairepo = getString("OAI","repo"); |
110 | | - String[] dbsuffixes = getArray("Database","suffix"); |
111 | 110 | if(globalurl==null){ |
112 | 111 | System.out.println("FATAL: Need to define global configuration url in local config file."); |
113 | 112 | System.exit(1); |
— | — | @@ -115,7 +114,7 @@ |
116 | 115 | System.exit(1); |
117 | 116 | } |
118 | 117 | try { |
119 | | - global.readFromURL(new URL(globalurl),indexpath,oairepo,dbsuffixes); |
| 118 | + global.readFromURL(new URL(globalurl),indexpath,oairepo); |
120 | 119 | } catch (MalformedURLException e) { |
121 | 120 | System.out.println("Malformed URL "+globalurl+" cannot read global configuration (check MWConfig.global in "+CONF_FILE_NAME+"), exiting..."); |
122 | 121 | System.exit(1); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -5,8 +5,10 @@ |
6 | 6 | package org.wikimedia.lsearch.config; |
7 | 7 | |
8 | 8 | import java.io.BufferedReader; |
| 9 | +import java.io.ByteArrayInputStream; |
9 | 10 | import java.io.IOException; |
10 | 11 | import java.io.InputStreamReader; |
| 12 | +import java.io.StringReader; |
11 | 13 | import java.net.Inet4Address; |
12 | 14 | import java.net.InetAddress; |
13 | 15 | import java.net.MalformedURLException; |
— | — | @@ -19,6 +21,7 @@ |
20 | 22 | import java.util.HashSet; |
21 | 23 | import java.util.Hashtable; |
22 | 24 | import java.util.Iterator; |
| 25 | +import java.util.Properties; |
23 | 26 | import java.util.Set; |
24 | 27 | import java.util.regex.Matcher; |
25 | 28 | import java.util.regex.Pattern; |
— | — | @@ -63,8 +66,12 @@ |
64 | 67 | /** OAI repo pattern from lsearch2.conf */ |
65 | 68 | protected String OAIRepoPattern; |
66 | 69 | /** Database suffix if dbname, the rest is supposed to be language, e.g srwiki => (suffix wiki) => sr */ |
67 | | - protected String[] databaseSuffixes; |
| 70 | + protected String[] databaseSuffixes = null; |
| 71 | + /** Databases ending in suffix will use additional keyword scores */ |
| 72 | + protected String[] keywordScoringSuffixes = null; |
68 | 73 | |
| 74 | + protected Properties globalProperties = null; |
| 75 | + |
69 | 76 | /** All identifiers of all indexes (dbrole -> IndexId) */ |
70 | 77 | protected static Hashtable<String,IndexId> indexIdPool = new Hashtable<String,IndexId>(); |
71 | 78 | |
— | — | @@ -192,13 +199,13 @@ |
193 | 200 | * @param url |
194 | 201 | * @throws IOException |
195 | 202 | */ |
196 | | - public void readFromURL(URL url, String indexpath, String oaiRepo, String[] dbsuffixes) throws IOException{ |
| 203 | + public void readFromURL(URL url, String indexpath, String oaiRepo) throws IOException{ |
197 | 204 | BufferedReader in; |
198 | 205 | try { |
199 | 206 | in = new BufferedReader( |
200 | 207 | new InputStreamReader( |
201 | 208 | url.openStream())); |
202 | | - read(in,indexpath,oaiRepo,dbsuffixes); |
| 209 | + read(in,indexpath,oaiRepo); |
203 | 210 | } catch (IOException e) { |
204 | 211 | System.out.println("I/O Error in opening or reading global config at url "+url); |
205 | 212 | throw e; |
— | — | @@ -221,6 +228,13 @@ |
222 | 229 | namespacePrefixAll = "all"; // default |
223 | 230 | } |
224 | 231 | |
| 232 | + protected String[] getArrayProperty(String name){ |
| 233 | + String s = globalProperties.getProperty(name); |
| 234 | + if (s != null) |
| 235 | + return s.split(" "); |
| 236 | + return null; |
| 237 | + } |
| 238 | + |
225 | 239 | /** |
226 | 240 | * Reads a config file from a bufferedreader, will |
227 | 241 | * close the reader when done. |
— | — | @@ -228,7 +242,7 @@ |
229 | 243 | * @param in opened reader |
230 | 244 | * @throws IOException |
231 | 245 | */ |
232 | | - protected void read(BufferedReader in, String indexpath, String oaiRepo, String[] dbsuffixes) throws IOException{ |
| 246 | + protected void read(BufferedReader in, String indexpath, String oaiRepo) throws IOException{ |
233 | 247 | String line=""; |
234 | 248 | int section = -1; |
235 | 249 | Pattern roleRegexp = Pattern.compile("\\((.*?)\\)"); |
— | — | @@ -245,7 +259,6 @@ |
246 | 260 | init(); |
247 | 261 | this.indexPath = indexpath; |
248 | 262 | this.OAIRepoPattern = oaiRepo == null? "" : oaiRepo; |
249 | | - this.databaseSuffixes = dbsuffixes; |
250 | 263 | |
251 | 264 | while((line = in.readLine()) != null){ |
252 | 265 | lineNum ++; |
— | — | @@ -260,6 +273,27 @@ |
261 | 274 | if(line.startsWith("[") && line.length()>2 && !Character.isDigit(line.charAt(1))){ // section |
262 | 275 | int last = line.indexOf("]"); |
263 | 276 | String s = line.substring(1,last); |
| 277 | + |
| 278 | + if(s.equalsIgnoreCase("properties")){ |
| 279 | + globalProperties = new Properties(); |
| 280 | + StringBuilder prop = new StringBuilder(line+"\n"); |
| 281 | + while((line = in.readLine()) != null){ |
| 282 | + if(line.startsWith("[") && line.length()>2 && !Character.isDigit(line.charAt(1))) |
| 283 | + break; |
| 284 | + prop.append(line); |
| 285 | + prop.append("\n"); |
| 286 | + } |
| 287 | + globalProperties.load(new ByteArrayInputStream(prop.toString().getBytes("utf-8"))); |
| 288 | + // get some predefined global properties |
| 289 | + this.databaseSuffixes = getArrayProperty("Database.suffix"); |
| 290 | + this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix"); |
| 291 | + if(line == null) |
| 292 | + break; |
| 293 | + // else: line points to beginning of next section |
| 294 | + last = line.indexOf("]"); |
| 295 | + s = line.substring(1,last); |
| 296 | + } |
| 297 | + |
264 | 298 | if(s.equalsIgnoreCase("database")) |
265 | 299 | section = DATABASE; |
266 | 300 | else if(s.equalsIgnoreCase("index")) |
— | — | @@ -314,8 +348,7 @@ |
315 | 349 | if(filter.equalsIgnoreCase("<all>")) |
316 | 350 | namespacePrefixAll = prefix; |
317 | 351 | else |
318 | | - namespacePrefix.put(prefix,new NamespaceFilter(filter)); |
319 | | - |
| 352 | + namespacePrefix.put(prefix,new NamespaceFilter(filter)); |
320 | 353 | } |
321 | 354 | } |
322 | 355 | if( !checkIntegrity() ){ |
— | — | @@ -769,6 +802,24 @@ |
770 | 803 | return namespacePrefixAll; |
771 | 804 | } |
772 | 805 | |
| 806 | + /** Returns whether keyword scoring should be used for this db, using |
| 807 | + * the suffixes from the global configuration |
| 808 | + * |
| 809 | + * @param dbname |
| 810 | + * @return true if additional keyword scores should be used for this db |
| 811 | + */ |
| 812 | + public boolean useKeywordScoring(String dbname){ |
| 813 | + if(keywordScoringSuffixes == null) |
| 814 | + return false; |
| 815 | + else{ |
| 816 | + for (String suffix : keywordScoringSuffixes) { |
| 817 | + if (dbname.endsWith(suffix)) |
| 818 | + return true; |
| 819 | + } |
| 820 | + } |
| 821 | + return false; |
| 822 | + } |
773 | 823 | |
| 824 | + |
774 | 825 | |
775 | 826 | } |
\ No newline at end of file |
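The [Properties] section is parsed by buffering its lines and handing them to java.util.Properties.load(), so ordinary key=value syntax and # comments apply; multi-valued keys are space-separated and split by getArrayProperty(). A standalone sketch of the same round trip, mirroring the endsWith() check in useKeywordScoring():

    import java.io.ByteArrayInputStream;
    import java.util.Properties;

    public class PropertiesRoundTrip {
        public static void main(String[] args) throws Exception {
            String body = "Database.suffix=wiki wiktionary\n"
                        + "KeywordScoring.suffix=wiki wikilucene wikidev\n";
            Properties p = new Properties();
            p.load(new ByteArrayInputStream(body.getBytes("utf-8")));
            for (String suffix : p.getProperty("KeywordScoring.suffix").split(" "))
                System.out.println("wikilucene".endsWith(suffix)); // false, true, false
        }
    }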
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java |
— | — | @@ -188,6 +188,7 @@ |
189 | 189 | } |
190 | 190 | long startTime = System.currentTimeMillis(); |
191 | 191 | // rsync |
| 192 | + log.info("Starting rsync of "+iid); |
192 | 193 | String snapshotpath = iid.getRsyncSnapshotPath()+"/"+li.timestamp; |
193 | 194 | command = "/usr/bin/rsync -W --delete -r rsync://"+iid.getIndexHost()+":"+snapshotpath+" "+iid.getUpdatePath(); |
194 | 195 | log.debug("Running shell command: "+command); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -135,10 +135,10 @@ |
136 | 136 | |
137 | 137 | try { |
138 | 138 | if(nsfw == null){ |
139 | | - q = parser.parseTwoPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE); |
| 139 | + q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname()); |
140 | 140 | } |
141 | 141 | else{ |
142 | | - q = parser.parseTwoPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE); |
| 142 | + q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
143 | 143 | log.info("Using NamespaceFilterWrapper "+nsfw); |
144 | 144 | } |
145 | 145 | |
— | — | @@ -250,10 +250,13 @@ |
251 | 251 | // fetch documents |
252 | 252 | Document[] docs = s.docs(docids); |
253 | 253 | int j=0; |
| 254 | + float maxScore = 1; |
| 255 | + if(numhits>0) |
| 256 | + maxScore = hits.score(0); |
254 | 257 | for(Document doc : docs){ |
255 | 258 | String namespace = doc.get("namespace"); |
256 | 259 | String title = doc.get("title"); |
257 | | - float score = transformScore(scores[j]); |
| 260 | + float score = transformScore(scores[j]/maxScore); |
258 | 261 | ResultSet rs = new ResultSet(score,namespace,title); |
259 | 262 | if(explain) |
260 | 263 | rs.setExplanation(((IndexSearcherMul)s).explain(q,docids[j])); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -65,7 +65,7 @@ |
66 | 66 | |
67 | 67 | try{ |
68 | 68 | for(int i=0; i < count ; i++){ |
69 | | - Query q = parser.parseTwoPass(terms.next(),WikiQueryParser.NamespacePolicy.IGNORE); |
| 69 | + Query q = parser.parseFourPass(terms.next(),WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
70 | 70 | Hits hits = is.search(q); |
71 | 71 | for(int j =0; j<20 && j<hits.length(); j++) |
72 | 72 | hits.doc(j); // retrieve some documents |
— | — | @@ -117,7 +117,7 @@ |
118 | 118 | public static void simpleWarmup(IndexSearcherMul is, IndexId iid){ |
119 | 119 | try{ |
120 | 120 | WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE); |
121 | | - Query q = parser.parseTwoPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE); |
| 121 | + Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
122 | 122 | is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0"))); |
123 | 123 | } catch (IOException e) { |
124 | 124 | log.error("Error warming up local IndexSearcherMul for "+iid); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -6,6 +6,8 @@ |
7 | 7 | |
8 | 8 | import java.io.File; |
9 | 9 | import java.io.IOException; |
| 10 | +import java.util.ArrayList; |
| 11 | +import java.util.Arrays; |
10 | 12 | import java.util.Collection; |
11 | 13 | import java.util.Collections; |
12 | 14 | import java.util.HashSet; |
— | — | @@ -24,7 +26,9 @@ |
25 | 27 | import org.apache.lucene.store.Directory; |
26 | 28 | import org.apache.lucene.store.FSDirectory; |
27 | 29 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 30 | +import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
28 | 31 | import org.wikimedia.lsearch.analyzers.FilterFactory; |
| 32 | +import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
29 | 33 | import org.wikimedia.lsearch.beans.Article; |
30 | 34 | import org.wikimedia.lsearch.beans.IndexReportCard; |
31 | 35 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
— | — | @@ -48,13 +52,13 @@ |
49 | 53 | } |
50 | 54 | } |
51 | 55 | |
| 56 | + static public final int MAX_FIELD_LENGTH = 100000; |
52 | 57 | /** Simple implementation of batch addition and deletion */ |
53 | 58 | class SimpleIndexModifier { |
54 | 59 | protected IndexId iid; |
55 | 60 | protected IndexReader reader; |
56 | 61 | protected IndexWriter writer; |
57 | | - protected boolean rewrite; |
58 | | - protected int maxFieldLength; |
| 62 | + protected boolean rewrite; |
59 | 63 | protected String langCode; |
60 | 64 | |
61 | 65 | protected HashSet<IndexUpdateRecord> nonDeleteDocuments; |
— | — | @@ -75,13 +79,8 @@ |
76 | 80 | this.iid = iid; |
77 | 81 | this.rewrite = rewrite; |
78 | 82 | this.langCode = langCode; |
79 | | - maxFieldLength = 0; |
80 | 83 | reportQueue = new Hashtable<IndexUpdateRecord,IndexReportCard>(); |
81 | 84 | } |
82 | | - |
83 | | - public void setMaxFieldLength(int maxFieldLength) { |
84 | | - this.maxFieldLength = maxFieldLength; |
85 | | - } |
86 | 85 | |
87 | 86 | protected IndexReportCard getReportCard(IndexUpdateRecord rec){ |
88 | 87 | if(!rec.isReportBack()) |
— | — | @@ -168,8 +167,7 @@ |
169 | 168 | writer.setMergeFactor(mergeFactor); |
170 | 169 | writer.setMaxBufferedDocs(maxBufDocs); |
171 | 170 | writer.setUseCompoundFile(true); |
172 | | - if(maxFieldLength!=0) |
173 | | - writer.setMaxFieldLength(maxFieldLength); |
| 171 | + writer.setMaxFieldLength(MAX_FIELD_LENGTH); |
174 | 172 | |
175 | 173 | FilterFactory filters = new FilterFactory(langCode); |
176 | 174 | |
— | — | @@ -179,6 +177,7 @@ |
180 | 178 | continue; // don't add if delete/add are paired operations |
181 | 179 | if(!checkPreconditions(rec)) |
182 | 180 | 	continue; // article shouldn't be added for some (heuristic) reason 
| 181 | +	transformArticleForIndexing(rec.getArticle()); // transform record so that unnecessary stuff is deleted, e.g. some redirects 
183 | 182 | IndexReportCard card = getReportCard(rec); |
184 | 183 | Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters); |
185 | 184 | Document doc = (Document) ret[0]; |
— | — | @@ -210,7 +209,7 @@ |
211 | 210 | } |
212 | 211 | return succ; |
213 | 212 | } |
214 | | - |
| 213 | + |
215 | 214 | public boolean checkPreconditions(IndexUpdateRecord rec){ |
216 | 215 | return checkAddPreconditions(rec.getArticle(),langCode); |
217 | 216 | } |
— | — | @@ -226,15 +225,41 @@ |
227 | 226 | public static boolean checkAddPreconditions(Article ar, String langCode){ |
228 | 227 | if(ar.getNamespace().equals("0")){ |
229 | 228 | String redirect = Localization.getRedirectTarget(ar.getContents(),langCode); |
230 | | - if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){ |
| 229 | + if(redirect != null) |
| 230 | + return false; // don't add redirects |
| 231 | + /*if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){ |
231 | 232 | log.debug("Not adding "+ar+" into index: "+ar.getContents()); |
232 | 233 | return false; |
233 | | - } |
| 234 | + } */ |
234 | 235 | } |
235 | 236 | return true; |
236 | 237 | } |
237 | 238 | |
238 | 239 | /** |
| 240 | +	 * Changes the article so that things we don't want to index are deleted, 
| 241 | +	 * e.g. redirects from a non-main namespace to an article in the main namespace 
| 242 | +	 *  
| 243 | +	 * @param ar 
| 244 | + */ |
| 245 | + public static void transformArticleForIndexing(Article ar) { |
| 246 | + ArrayList<String> redirects = ar.getRedirects(); |
| 247 | + String ns = ar.getNamespace()+":"; |
| 248 | + if(redirects != null){ |
| 249 | + ArrayList<String> filtered = new ArrayList<String>(); |
| 250 | + // index only redirects from the same namespace |
| 251 | + // to avoid a lot of unusable redirects from/to |
| 252 | +			// user namespace, but always index redirects FROM main 
| 253 | + for(String r : redirects){ |
| 254 | + if(r.startsWith(ns) || r.startsWith("0:")) |
| 255 | + filtered.add(r.split(":",2)[1]); |
| 256 | + //else |
| 257 | + //log.info("Ignoring redirect "+r+" to "+ar); |
| 258 | + } |
| 259 | + ar.setRedirects(filtered); |
| 260 | + } |
| 261 | + } |
| 262 | + |
| 263 | + /** |
239 | 264 | * Create necessary directories for index |
240 | 265 | * @param dbname |
241 | 266 | * @return relative path (to document root) of db within filesystem |
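A rough illustration of the filtering performed by transformArticleForIndexing, using the Article constructor added in this changeset; the titles are hypothetical:

    // Article in namespace 4 with three incoming redirects ("ns:title" format)
    ArrayList<String> redirects = new ArrayList<String>(Arrays.asList(
        "4:Old project page",      // same namespace -> kept
        "0:Main-space shortcut",   // from main namespace -> always kept
        "2:User sandbox"));        // user namespace -> dropped
    Article ar = new Article(1L, 4, "Project page", "some text", false, 3, redirects);
    WikiIndexModifier.transformArticleForIndexing(ar);
    // ar.getRedirects() is now ["Old project page", "Main-space shortcut"],
    // with the "ns:" prefixes stripped by the split(":", 2)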
— | — | @@ -347,6 +372,7 @@ |
348 | 373 | */ |
349 | 374 | public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){ |
350 | 375 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
| 376 | + WikiTokenizer tokenizer = null; |
351 | 377 | Document doc = new Document(); |
352 | 378 | |
353 | 379 | // This will be used to look up and replace entries on index updates. |
— | — | @@ -357,10 +383,22 @@ |
358 | 384 | |
359 | 385 | // boost document title with it's article rank |
360 | 386 | Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED); |
361 | | - log.debug(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()); |
362 | | - title.setBoost(calculateArticleRank(article.getRank())); |
| 387 | + //log.debug(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
| 388 | + float rankBoost = calculateArticleRank(article.getRank()); |
| 389 | + title.setBoost(rankBoost); |
363 | 390 | doc.add(title); |
364 | 391 | |
| 392 | + // add titles of redirects, generated from analyzer |
| 393 | + Field redirect = new Field("redirect", "", |
| 394 | + Field.Store.NO, Field.Index.TOKENIZED); |
| 395 | + redirect.setBoost(rankBoost); |
| 396 | + doc.add(redirect); |
| 397 | + |
| 398 | +		// most significant words in the text get extra score, from analyzer 
| 399 | + Field keyword = new Field("keyword", "", |
| 400 | + Field.Store.NO, Field.Index.TOKENIZED); |
| 401 | + doc.add(keyword); |
| 402 | + |
365 | 403 | // the next fields are generated using wikitokenizer |
366 | 404 | doc.add(new Field("contents", "", |
367 | 405 | Field.Store.NO, Field.Index.TOKENIZED)); |
— | — | @@ -372,9 +410,13 @@ |
373 | 411 | String text = article.getContents(); |
374 | 412 | if(article.isRedirect()) |
375 | 413 | text=""; // for redirects index only the title |
| 414 | + Object[] ret = Analyzers.getIndexerAnalyzer(text,filters,article.getRedirects()); |
| 415 | + perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0]; |
376 | 416 | |
377 | | - perFieldAnalyzer = Analyzers.getIndexerAnalyzer(text,filters); |
378 | | - |
| 417 | + // set boost for keyword field |
| 418 | + tokenizer = (WikiTokenizer) ret[1]; |
| 419 | + keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size())); |
| 420 | + |
379 | 421 | return new Object[] { doc, perFieldAnalyzer }; |
380 | 422 | } |
381 | 423 | |
— | — | @@ -392,5 +434,19 @@ |
393 | 435 | else |
394 | 436 | return (float) (1 + rank/15.0); |
395 | 437 | } |
| 438 | + |
| 439 | + /** |
| 440 | +	 * We don't want whole stub articles fetched as keywords, so we penalize 
| 441 | +	 * articles that are too short for keyword extraction. 
| 442 | + * |
| 443 | + * @param numTokens |
| 444 | + * @return |
| 445 | + */ |
| 446 | + public static float calculateKeywordsBoost(int numTokens){ |
| 447 | + if(numTokens > 2 * FastWikiTokenizerEngine.KEYWORD_TOKEN_LIMIT) |
| 448 | + return 1; |
| 449 | + else |
| 450 | + return ((float)numTokens)/FastWikiTokenizerEngine.KEYWORD_TOKEN_LIMIT/2; |
| 451 | + } |
396 | 452 | |
397 | 453 | } |
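To make the boost formula above concrete: with KEYWORD_TOKEN_LIMIT = 250, any article longer than 500 tokens gets the full keyword boost of 1, while shorter articles are scaled down linearly. A few worked values:

    WikiIndexModifier.calculateKeywordsBoost(600);  // > 2*250 tokens -> 1.0
    WikiIndexModifier.calculateKeywordsBoost(250);  // 250/250/2     -> 0.5
    WikiIndexModifier.calculateKeywordsBoost(100);  // 100/250/2     -> 0.2 (stub)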
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java |
— | — | @@ -20,6 +20,9 @@ |
21 | 21 | * |
22 | 22 | * For titles: |
23 | 23 | * * 1/sqrt(term^3) |
| 24 | + * |
| 25 | + * For redirect: |
| 26 | + * * no length norm |
24 | 27 | * |
25 | 28 | */ |
26 | 29 | @Override |
— | — | @@ -36,6 +39,8 @@ |
37 | 40 | float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
38 | 41 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
39 | 42 | return f; |
| 43 | + } else if(fieldName.equals("redirect") || fieldName.equals("keyword")){ |
| 44 | + return 1; |
40 | 45 | } else |
41 | 46 | return super.lengthNorm(fieldName,numTokens); |
42 | 47 | |
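The length norms above can be read as: titles are normalized by numTokens^-1.5, a steeper penalty than Lucene's default 1/sqrt(numTokens), while the redirect and keyword fields get no length penalty at all, since their token counts are dominated by the artificial token gaps. A quick numeric check, in plain Java:

    int numTokens = 4;
    float titleNorm = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); // 0.125
    float defaultNorm = (float) (1.0 / Math.sqrt(numTokens));             // 0.5
    float redirectNorm = 1f;  // "redirect" and "keyword": constant norm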
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -0,0 +1,113 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.Reader; |
| 6 | +import java.util.ArrayList; |
| 7 | +import java.util.HashSet; |
| 8 | + |
| 9 | +import org.apache.log4j.Logger; |
| 10 | +import org.apache.lucene.analysis.Analyzer; |
| 11 | +import org.apache.lucene.analysis.Token; |
| 12 | +import org.apache.lucene.analysis.TokenStream; |
| 13 | + |
| 14 | +/** |
| 15 | + * Analyzer that builds a field with an array of keywords, 
| 16 | + * each separated by a large token gap, so it's 
| 17 | + * convenient to run SpanNearQueries on the field. Keywords 
| 18 | + * themselves are tokenized. E.g. |
| 19 | + * |
| 20 | + * ("something different", "other") -> |
| 21 | + * "something" +1 "different" +201 "other" |
| 22 | + * |
| 23 | + * Currently used for fields "redirect" and "keyword" |
| 24 | + * |
| 25 | + * @author rainman |
| 26 | + * |
| 27 | + */ |
| 28 | +public class KeywordsAnalyzer extends Analyzer{ |
| 29 | + static Logger log = Logger.getLogger(KeywordsAnalyzer.class); |
| 30 | + protected ArrayList<String> keywords; |
| 31 | + protected FilterFactory filters; |
| 32 | + protected KeywordsTokenStream tokens; |
| 33 | + |
| 34 | + public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters){ |
| 35 | + ArrayList<String> k = new ArrayList<String>(); |
| 36 | + k.addAll(keywords); |
| 37 | + tokens = new KeywordsTokenStream(k,filters); |
| 38 | + } |
| 39 | + |
| 40 | + public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters){ |
| 41 | + tokens = new KeywordsTokenStream(keywords,filters); |
| 42 | + } |
| 43 | + /** positional increment between different redirects */ |
| 44 | + public static final int tokenGap = 201; |
| 45 | + |
| 46 | + @Override |
| 47 | + public TokenStream tokenStream(String fieldName, Reader reader) { |
| 48 | + return tokens; |
| 49 | + } |
| 50 | + @Override |
| 51 | + public TokenStream tokenStream(String fieldName, String text) { |
| 52 | + return tokens; |
| 53 | + } |
| 54 | + |
| 55 | + class KeywordsTokenStream extends TokenStream { |
| 56 | + protected Analyzer analyzer; |
| 57 | + protected ArrayList<String> keywords; |
| 58 | + protected int index; |
| 59 | + protected String keyword; |
| 60 | + protected TokenStream tokens; |
| 61 | + |
| 62 | + public KeywordsTokenStream(ArrayList<String> keywords, FilterFactory filters){ |
| 63 | + this.analyzer = new QueryLanguageAnalyzer(filters); |
| 64 | + this.keywords = keywords; |
| 65 | + this.index = 0; |
| 66 | + this.keyword = null; |
| 67 | + this.tokens = null; |
| 68 | + } |
| 69 | + @Override |
| 70 | + public Token next() throws IOException { |
| 71 | + if(keywords == null) |
| 72 | + return null; // nothing to do |
| 73 | + Token t; |
| 74 | + if(keyword == null){ |
| 75 | + t = openNext(); |
| 76 | + return t; |
| 77 | + } |
| 78 | + if(keyword != null && tokens!=null){ |
| 79 | + t = tokens.next(); |
| 80 | + if(t == null){ |
| 81 | + t = openNext(); |
| 82 | + if(t != null) |
| 83 | + t.setPositionIncrement(tokenGap); |
| 84 | + } |
| 85 | + return t; |
| 86 | + } else{ |
| 87 | + log.warn("Inconsistent state: key="+keyword+", tokens="+tokens); |
| 88 | + } |
| 89 | + return null; |
| 90 | + } |
| 91 | + |
| 92 | + protected Token openNext() throws IOException { |
| 93 | + Token t; |
| 94 | + if(index >= keywords.size()) |
| 95 | + return null; // processed all keywords |
| 96 | +			// try subsequent keyword titles until we find one 
| 97 | +			// whose title can be tokenized 
| 98 | + do{ |
| 99 | + // next keyword title |
| 100 | + keyword = keywords.get(index++); |
| 101 | + tokens = analyzer.tokenStream("",keyword); |
| 102 | + // try to tokenize |
| 103 | + t = tokens.next(); |
| 104 | + if(t == null && index == keywords.size()) |
| 105 | + return null; // last token |
| 106 | + else if(t!=null) |
| 107 | + return t; |
| 108 | +			} while(t == null); // keep trying until some keyword tokenizes 
| 109 | + return null; |
| 110 | + } |
| 111 | + |
| 112 | + } |
| 113 | + |
| 114 | +} |
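A minimal sketch of how the token gap plays out, assuming FilterFactory can be constructed from a language code as elsewhere in this changeset; positions are illustrative:

    // keywords = ["something different", "other"], tokenGap = 201
    KeywordsAnalyzer ka = new KeywordsAnalyzer(
        new ArrayList<String>(Arrays.asList("something different", "other")),
        new FilterFactory("en"));
    TokenStream ts = ka.tokenStream("redirect", "");
    for (Token t = ts.next(); t != null; t = ts.next())
        System.out.println(t.termText() + " +" + t.getPositionIncrement());
    // something +1, different +1, other +201
    // A SpanNearQuery with slop (201-1)/2 = 100 can match inside one keyword
    // but can never bridge two different keywords.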
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java |
— | — | @@ -4,6 +4,7 @@ |
5 | 5 | import java.io.Reader; |
6 | 6 | import java.util.ArrayList; |
7 | 7 | import java.util.HashMap; |
| 8 | +import java.util.HashSet; |
8 | 9 | import java.util.Iterator; |
9 | 10 | |
10 | 11 | import org.apache.log4j.Logger; |
— | — | @@ -17,6 +18,7 @@ |
18 | 19 | protected Iterator<Token> tokenIt = null; |
19 | 20 | protected ArrayList<String> categories = null; |
20 | 21 | protected HashMap<String,String> interwikis = null; |
| 22 | + protected HashSet<String> keywords = null; |
21 | 23 | |
22 | 24 | /** Use <code>WikiTokenizer(String)</code> constructor */ |
23 | 25 | @Deprecated |
— | — | @@ -52,6 +54,7 @@ |
53 | 55 | tokenIt = tokens.iterator(); |
54 | 56 | categories = parser.getCategories(); |
55 | 57 | interwikis = parser.getInterwikis(); |
| 58 | + keywords = parser.getKeywords(); |
56 | 59 | } |
57 | 60 | } |
58 | 61 | |
— | — | @@ -84,7 +87,12 @@ |
85 | 88 | public ArrayList<Token> getTokens() { |
86 | 89 | return tokens; |
87 | 90 | } |
| 91 | + |
| 92 | + public HashSet<String> getKeywords() { |
| 93 | + return keywords; |
| 94 | + } |
88 | 95 | |
89 | 96 | |
| 97 | + |
90 | 98 | |
91 | 99 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -7,6 +7,7 @@ |
8 | 8 | import java.util.HashSet; |
9 | 9 | import java.util.Hashtable; |
10 | 10 | |
| 11 | +import org.apache.commons.lang.WordUtils; |
11 | 12 | import org.apache.lucene.analysis.Token; |
12 | 13 | import org.wikimedia.lsearch.util.Localization; |
13 | 14 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
— | — | @@ -35,6 +36,7 @@ |
36 | 37 | private ArrayList<Token> tokens; |
37 | 38 | protected ArrayList<String> categories; |
38 | 39 | protected HashMap<String,String> interwikis; |
| 40 | + protected HashSet<String> keywords; |
39 | 41 | private int length = 0; // length of token |
40 | 42 | private int start = 0; // start position of token |
41 | 43 | private int cur = 0; // current position in the input string |
— | — | @@ -45,11 +47,17 @@ |
46 | 48 | private int decompi; |
47 | 49 | private char cl; // lowercased character |
48 | 50 | private boolean numberToken; // if the buffer holds a number token |
| 51 | + private int headings = 0; // how many headings did we see |
49 | 52 | |
50 | 53 | private int prefixLen = 0; |
51 | 54 | private final char[] prefixBuf = new char[MAX_WORD_LEN]; |
52 | 55 | private int semicolonInx = -1; |
| 56 | + private final char[] keywordBuf = new char[MAX_WORD_LEN]; |
| 57 | + private int keywordLen = 0; |
53 | 58 | |
| 59 | +	/** This many tokens from the beginning of text are eligible for keywords */ 
| 60 | + public static final int KEYWORD_TOKEN_LIMIT = 250; |
| 61 | + |
54 | 62 | /** language code */ |
55 | 63 | private String language; |
56 | 64 | /** language code -> set (image namespace names) */ |
— | — | @@ -60,12 +68,12 @@ |
61 | 69 | |
62 | 70 | private UnicodeDecomposer decomposer; |
63 | 71 | |
64 | | - enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, |
| 72 | + enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD, |
65 | 73 | LINK_FETCH, IGNORE, EXTERNAL_URL, EXTERNAL_WORDS, |
66 | 74 | TEMPLATE_BEGIN, TEMPLATE_WORDS, TEMPLATE_END, |
67 | 75 | TABLE_BEGIN}; |
68 | 76 | |
69 | | - enum FetchState { WORD, CATEGORY, INTERWIKI}; |
| 77 | + enum FetchState { WORD, CATEGORY, INTERWIKI, KEYWORD }; |
70 | 78 | |
71 | 79 | |
72 | 80 | private void init(){ |
— | — | @@ -73,6 +81,7 @@ |
74 | 82 | categories = new ArrayList<String>(); |
75 | 83 | interwikis = new HashMap<String,String>(); |
76 | 84 | decomposer = UnicodeDecomposer.getInstance(); |
| 85 | + keywords = new HashSet<String>(); |
77 | 86 | numberToken = false; |
78 | 87 | } |
79 | 88 | |
— | — | @@ -258,6 +267,50 @@ |
259 | 268 | return Localization.getRedirectTarget(textString,language)!=null; |
260 | 269 | } |
261 | 270 | |
| 271 | + /** |
| 272 | +	 * Decide if the link currently being processed should be appended to the list of keywords 
| 273 | +	 *  
| 274 | +	 * Criterion: link is within the first KEYWORD_TOKEN_LIMIT (250) tokens, and before the 
| 275 | +	 * first heading 
| 276 | + * |
| 277 | + */ |
| 278 | + protected boolean isGoodKeywordLink(){ |
| 279 | + return headings == 0 && tokens.size() <= KEYWORD_TOKEN_LIMIT; |
| 280 | + } |
| 281 | + |
| 282 | + /** When encountering '=' check if this line is actually a heading */ |
| 283 | + private void checkHeadings() { |
| 284 | +		// make sure = is at the beginning of a line 
| 285 | + if(cur == 0 || text[cur-1]=='\n' || text[cur-1]=='\r'){ |
| 286 | + int endOfLine; |
| 287 | + // find end of line/text |
| 288 | + for(endOfLine = cur ; endOfLine < textLength ; endOfLine++ ){ |
| 289 | + lc = text[endOfLine]; |
| 290 | + if(lc == '\n' || lc =='\r') |
| 291 | + break; |
| 292 | + } |
| 293 | +			int start=0, end=0; // number of ='s at beginning and end of line 
| 294 | + // find first sequence of = |
| 295 | + for(lookup = cur ; lookup < textLength && lookup < endOfLine ; lookup++ ){ |
| 296 | + if(text[lookup] == '=') |
| 297 | + start++; |
| 298 | + else |
| 299 | + break; |
| 300 | + } |
| 301 | +			// find the last sequence of = 
| 302 | + for(lookup = endOfLine-1 ; lookup > cur ; lookup-- ){ |
| 303 | + if(text[lookup] == '=') |
| 304 | + end++; |
| 305 | + else |
| 306 | + break; |
| 307 | + } |
| 308 | + // check |
| 309 | + if(start == end && start != 0 && start+end<endOfLine-cur && start>=2 && start<=4){ |
| 310 | + headings++; |
| 311 | + } |
| 312 | + } |
| 313 | + } |
| 314 | + |
262 | 315 | /** |
263 | 316 | * Parse Wiki text, and produce an arraylist of tokens. |
264 | 317 | * Also fills the lists categories and interwikis. |
— | — | @@ -281,6 +334,9 @@ |
282 | 335 | switch(state){ |
283 | 336 | case WORD: |
284 | 337 | switch(c){ |
| 338 | + case '=': |
| 339 | + checkHeadings(); |
| 340 | + break; |
285 | 341 | case '<': |
286 | 342 | addToken(); |
287 | 343 | state = ParserState.IGNORE; |
— | — | @@ -369,12 +425,18 @@ |
370 | 426 | fetch = FetchState.INTERWIKI; |
371 | 427 | state = ParserState.LINK_FETCH; |
372 | 428 | continue; |
373 | | - } else{ |
374 | | - // unrecognized, ignore |
375 | | - cur--; |
376 | | - continue; |
377 | 429 | } |
378 | 430 | } |
| 431 | + // add this link to keywords? |
| 432 | + if(isGoodKeywordLink()){ |
| 433 | + fetch = FetchState.KEYWORD; |
| 434 | + state = ParserState.LINK_KEYWORD; |
| 435 | + if(pipeInx != -1) |
| 436 | + cur = pipeInx; // ignore up to pipe |
| 437 | + else |
| 438 | + cur--; // return the first character of link |
| 439 | + continue; |
| 440 | + } |
379 | 441 | |
380 | 442 | // no semicolon, search for pipe: |
381 | 443 | if(pipeInx != -1){ |
— | — | @@ -384,6 +446,11 @@ |
385 | 447 | addLetter(); |
386 | 448 | continue; |
387 | 449 | } |
| 450 | + case LINK_KEYWORD: |
| 451 | + if(keywordLen < keywordBuf.length && c!=']'){ |
| 452 | + keywordBuf[keywordLen++] = c; |
| 453 | + } |
| 454 | + // fall-thru |
388 | 455 | case LINK_WORDS: |
389 | 456 | if(c == ']'){ |
390 | 457 | state = ParserState.LINK_END; |
— | — | @@ -419,7 +486,7 @@ |
420 | 487 | |
421 | 488 | if(length<buffer.length) |
422 | 489 | buffer[length++] = c; |
423 | | - continue; |
| 490 | + continue; |
424 | 491 | case LINK_END: |
425 | 492 | if(c == ']'){ // good link ending |
426 | 493 | state = ParserState.WORD; |
— | — | @@ -439,6 +506,11 @@ |
440 | 507 | length = 0; |
441 | 508 | fetch = FetchState.WORD; |
442 | 509 | continue; |
| 510 | + case KEYWORD: |
| 511 | + keywords.add(new String(keywordBuf,0,keywordLen)); |
| 512 | + keywordLen = 0; |
| 513 | + fetch = FetchState.WORD; |
| 514 | + continue; |
443 | 515 | } |
444 | 516 | } else{ |
445 | 517 | // bad syntax, ignore any categories, etc.. |
— | — | @@ -478,7 +550,7 @@ |
479 | 551 | addToken(); |
480 | 552 | return tokens; |
481 | 553 | } |
482 | | - |
| 554 | + |
483 | 555 | /** Check if this is an "image" keyword using localization */ |
484 | 556 | private final boolean isImage(String prefix){ |
485 | 557 | prefix = prefix.toLowerCase(); |
— | — | @@ -530,4 +602,10 @@ |
531 | 603 | public ArrayList<Token> getTokens() { |
532 | 604 | return tokens; |
533 | 605 | } |
| 606 | + |
| 607 | + public HashSet<String> getKeywords() { |
| 608 | + return keywords; |
| 609 | + } |
| 610 | + |
| 611 | + |
534 | 612 | } |
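The heading check above only counts balanced runs of 2-4 '=' characters at both ends of a line, which matches wikitext section syntax. Some illustrative lines and how the start/end counts come out:

    // "== History =="      -> start=2, end=2, balanced  -> counted as heading
    // "=== Early life ===" -> start=3, end=3, balanced  -> counted as heading
    // "=x="                -> start=1                   -> rejected (needs >= 2)
    // "== dangling"        -> start=2, end=0, unbalanced -> rejected
    // "====="              -> start=5 > 4               -> rejected (no title text)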
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -18,6 +18,9 @@ |
19 | 19 | import org.apache.lucene.search.Query; |
20 | 20 | import org.apache.lucene.search.TermQuery; |
21 | 21 | import org.apache.lucene.search.WildcardQuery; |
| 22 | +import org.apache.lucene.search.spans.SpanNearQuery; |
| 23 | +import org.apache.lucene.search.spans.SpanQuery; |
| 24 | +import org.apache.lucene.search.spans.SpanTermQuery; |
22 | 25 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
23 | 26 | import org.wikimedia.lsearch.search.NamespaceFilter; |
24 | 27 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
— | — | @@ -69,7 +72,9 @@ |
70 | 73 | /** boost for alias words from analyzer */ |
71 | 74 | public final float ALIAS_BOOST = 0.5f; |
72 | 75 | /** boost for title field */ |
73 | | - public static float TITLE_BOOST = 4; |
| 76 | + public static float TITLE_BOOST = 4; |
| 77 | + public static float REDIRECT_BOOST = 0.5f; |
| 78 | + public static float KEYWORD_BOOST = 1; |
74 | 79 | |
75 | 80 | /** Policies in treating field names: |
76 | 81 | * |
— | — | @@ -90,6 +95,7 @@ |
91 | 96 | private Query namespaceRewriteQuery; |
92 | 97 | private NamespacePolicy namespacePolicy; |
93 | 98 | protected NamespaceFilter defaultNamespaceFilter; |
| 99 | + protected static GlobalConfiguration global=null; |
94 | 100 | |
95 | 101 | /** default value for boolean queries */ |
96 | 102 | public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST; |
— | — | @@ -102,7 +108,8 @@ |
103 | 109 | protected void initNamespaces(){ |
104 | 110 | if(namespaceQueries != null) |
105 | 111 | return; |
106 | | - GlobalConfiguration global = GlobalConfiguration.getInstance(); |
| 112 | + if(global == null) |
| 113 | + global = GlobalConfiguration.getInstance(); |
107 | 114 | namespaceAllKeyword = global.getNamespacePrefixAll(); |
108 | 115 | namespaceQueries = new Hashtable<String,Query>(); |
109 | 116 | namespacePrefixes = new Hashtable<NamespaceFilter,String>(); |
— | — | @@ -891,16 +898,146 @@ |
892 | 899 | return query; |
893 | 900 | } |
894 | 901 | |
| 902 | + protected boolean isNamespaceQuery(Query q){ |
| 903 | + if(q instanceof TermQuery) |
| 904 | + return ((TermQuery)q).getTerm().field().equals("namespace"); |
| 905 | + else if(q instanceof BooleanQuery){ |
| 906 | + for(BooleanClause cl : ((BooleanQuery)q).getClauses()){ |
| 907 | +			Query cq = cl.getQuery(); 
| 908 | +			if(!(cq instanceof TermQuery && 
| 909 | +					((TermQuery)cq).getTerm().field().equals("namespace"))) 
| 910 | +				return false; 
| 911 | + } |
| 912 | + return true; |
| 913 | + } |
| 914 | + return false; |
| 915 | + } |
| 916 | + |
895 | 917 | /** |
896 | | - * Parse the query according to policy. Instead of rewrite phrase, simply pass |
897 | | - * twice the query with different default fields. |
| 918 | +	 * Doing some very simple analysis, extract span queries to use for the 
| 919 | +	 * redirect field. Currently extracts only if all boolean clauses are 
| 920 | +	 * required or if it's a phrase query. This is because making span 
| 921 | +	 * queries is non-trivial in other cases. :( 
898 | 922 | * |
| 923 | +	 * The function depends heavily on the format of the parser's output, 
| 924 | +	 * especially for rewrites. 
| 925 | + * |
| 926 | + * @param query |
| 927 | + * @param level - recursion level |
| 928 | + * @return |
| 929 | + */ |
| 930 | + protected Query extractSpans(Query query, int level, String fieldName, float boost) { |
| 931 | + // phrase, or termquery just rewrite field name |
| 932 | + if(query instanceof TermQuery){ |
| 933 | + TermQuery tq = (TermQuery)query; |
| 934 | + TermQuery ret = new TermQuery(new Term(fieldName,tq.getTerm().text())); |
| 935 | + ret.setBoost(boost); |
| 936 | + return ret; |
| 937 | + } else if(query instanceof PhraseQuery){ |
| 938 | + PhraseQuery phrase = new PhraseQuery(); |
| 939 | + for(Term term : ((PhraseQuery)query).getTerms()){ |
| 940 | + phrase.add(new Term(fieldName,term.text())); |
| 941 | + } |
| 942 | + phrase.setBoost(boost); |
| 943 | + return phrase; |
| 944 | + } else if(query instanceof BooleanQuery){ |
| 945 | + BooleanQuery bq = (BooleanQuery)query; |
| 946 | + // check for rewritten queries, TODO: parse complex multi-part rewrites |
| 947 | + if(level==0 && namespacePolicy != null && namespacePolicy == NamespacePolicy.REWRITE){ |
| 948 | + if(bq.getClauses().length == 2 && isNamespaceQuery(bq.getClauses()[0].getQuery())){ |
| 949 | + BooleanQuery ret = new BooleanQuery(); |
| 950 | + ret.add(bq.getClauses()[0]); |
| 951 | + // the second clause is always the query |
| 952 | + ret.add(extractSpans(bq.getClauses()[1].getQuery(),level+1,fieldName,boost),BooleanClause.Occur.MUST); |
| 953 | + return ret; |
| 954 | + } else |
| 955 | + return null; |
| 956 | + } |
| 957 | + // we can parse if all clauses are required |
| 958 | + boolean canTransform = true; |
| 959 | + for(BooleanClause cl : bq.getClauses()){ |
| 960 | + if(cl.getOccur() != BooleanClause.Occur.MUST){ |
| 961 | + canTransform = false; |
| 962 | + break; |
| 963 | + } |
| 964 | + } |
| 965 | + if(!canTransform) |
| 966 | + return null; |
| 967 | + // rewrite into span queries + categories |
| 968 | + ArrayList<SpanQuery> spans = new ArrayList<SpanQuery>(); |
| 969 | + ArrayList<Query> categories = new ArrayList<Query>(); |
| 970 | + for(BooleanClause cl : bq.getClauses()){ |
| 971 | + Query q = cl.getQuery(); |
| 972 | + if(q instanceof TermQuery){ // -> SpanTermQuery |
| 973 | + TermQuery tq = (TermQuery)q; |
| 974 | + Term t = tq.getTerm(); |
| 975 | + if(t.field().equals("category")){ |
| 976 | + categories.add(q); |
| 977 | + } else { |
| 978 | + SpanTermQuery stq = new SpanTermQuery(new Term(fieldName,t.text())); |
| 979 | + stq.setBoost(boost); |
| 980 | + spans.add(stq); |
| 981 | + } |
| 982 | + } else if(q instanceof PhraseQuery){ // -> SpanNearQuery(slop=0,inOrder=true) |
| 983 | + PhraseQuery pq = (PhraseQuery)q; |
| 984 | + Term[] terms = pq.getTerms(); |
| 985 | + if(terms[0].field().equals("category")){ |
| 986 | + categories.add(q); |
| 987 | + } else{ |
| 988 | + SpanTermQuery[] spanTerms = new SpanTermQuery[terms.length]; |
| 989 | + for(int i=0; i<terms.length; i++ ){ |
| 990 | + spanTerms[i] = new SpanTermQuery(new Term(fieldName,terms[i].text())); |
| 991 | + } |
| 992 | + SpanNearQuery snq = new SpanNearQuery(spanTerms,0,true); |
| 993 | + snq.setBoost(boost); |
| 994 | + spans.add(snq); |
| 995 | + } |
| 996 | + } |
| 997 | + } |
| 998 | + // create the queries |
| 999 | + Query cat = null; |
| 1000 | + SpanQuery span = null; |
| 1001 | + if(categories.size() != 0){ |
| 1002 | + if(categories.size() == 1) |
| 1003 | + cat = categories.get(0); |
| 1004 | + else{ |
| 1005 | + BooleanQuery b = new BooleanQuery(); |
| 1006 | + for(Query q : categories) |
| 1007 | + b.add(q,BooleanClause.Occur.MUST); |
| 1008 | + cat = b; // intersection of categories, bool query |
| 1009 | + } |
| 1010 | + } |
| 1011 | + if(spans.size() != 0){ |
| 1012 | + if(spans.size() == 1) |
| 1013 | + span = spans.get(0); |
| 1014 | + else{ |
| 1015 | + // make a span-near query that has a slop 1/2 of tokenGap |
| 1016 | + span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.tokenGap-1)/2,false); |
| 1017 | + } |
| 1018 | + } |
| 1019 | + if(cat != null && span != null){ |
| 1020 | + BooleanQuery ret = new BooleanQuery(); |
| 1021 | + ret.add(span,BooleanClause.Occur.MUST); |
| 1022 | + ret.add(cat,BooleanClause.Occur.MUST); |
| 1023 | + return ret; |
| 1024 | + } else if(span != null) |
| 1025 | + return span; |
| 1026 | + else // we don't want categories only |
| 1027 | + return null; |
| 1028 | + |
| 1029 | + } |
| 1030 | + return null; |
| 1031 | + } |
| 1032 | + |
| 1033 | + /** |
| 1034 | + * Main function for multi-pass parsing. |
| 1035 | + * |
899 | 1036 | * @param queryText |
900 | 1037 | * @param policy |
| 1038 | + * @param makeRedirect |
901 | 1039 | * @return |
902 | | - * @throws ParseException |
903 | 1040 | */ |
904 | | - public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{ |
| 1041 | + protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){ |
905 | 1042 | if(policy != null) |
906 | 1043 | this.namespacePolicy = policy; |
907 | 1044 | float olfDefaultBoost = defaultBoost; |
— | — | @@ -914,15 +1051,66 @@ |
915 | 1052 | defaultField = contentField; |
916 | 1053 | defaultBoost = olfDefaultBoost; |
917 | 1054 | if(qc == null || qt == null) |
918 | | - return new BooleanQuery(); |
919 | | - |
| 1055 | + return new BooleanQuery(); |
920 | 1056 | if(qc.equals(qt)) |
921 | 1057 | return qc; // don't duplicate (probably a query for categories only) |
922 | 1058 | BooleanQuery bq = new BooleanQuery(); |
923 | 1059 | bq.add(qc,BooleanClause.Occur.SHOULD); |
924 | 1060 | bq.add(qt,BooleanClause.Occur.SHOULD); |
| 1061 | + |
| 1062 | + // redirect pass |
| 1063 | + if(makeRedirect){ |
| 1064 | + Query qr = extractSpans(qt,0,"redirect",REDIRECT_BOOST); |
| 1065 | + if(qr != null) |
| 1066 | + bq.add(qr,BooleanClause.Occur.SHOULD); |
| 1067 | + } |
| 1068 | + // keyword pass |
| 1069 | + if(makeKeywords){ |
| 1070 | + Query qk = extractSpans(qt,0,"keyword",KEYWORD_BOOST); |
| 1071 | + if(qk != null) |
| 1072 | + bq.add(qk,BooleanClause.Occur.SHOULD); |
| 1073 | + } |
| 1074 | + |
925 | 1075 | return bq; |
| 1076 | + |
926 | 1077 | } |
| 1078 | + |
| 1079 | + /** |
| 1080 | +	 * Three parse passes: contents, title, redirect 
| 1081 | + * |
| 1082 | + * @param queryText |
| 1083 | + * @param policy |
| 1084 | + * @return |
| 1085 | + * @throws ParseException |
| 1086 | + */ |
| 1087 | + public Query parseThreePass(String queryText, NamespacePolicy policy) throws ParseException{ |
| 1088 | + return parseMultiPass(queryText,policy,true,false); |
| 1089 | + } |
| 1090 | + |
| 1091 | + /** |
| 1092 | + * Depending on settings for db, do all 4 passes of parsing: |
| 1093 | + * 1) contents |
| 1094 | + * 2) titles |
| 1095 | + * 3) redirects |
| 1096 | + * 4) keywords |
| 1097 | + */ |
| 1098 | + public Query parseFourPass(String queryText, NamespacePolicy policy, String dbname) throws ParseException{ |
| 1099 | + boolean makeKeywords = global.useKeywordScoring(dbname); |
| 1100 | + return parseMultiPass(queryText,policy,true,makeKeywords); |
| 1101 | + } |
| 1102 | + |
| 1103 | + /** |
| 1104 | + * Parse the query according to policy. Instead of rewrite phrase, simply pass |
| 1105 | + * twice the query with different default fields. |
| 1106 | + * |
| 1107 | + * @param queryText |
| 1108 | + * @param policy |
| 1109 | + * @return |
| 1110 | + * @throws ParseException |
| 1111 | + */ |
| 1112 | + public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{ |
| 1113 | + return parseMultiPass(queryText,policy,false,false); |
| 1114 | + } |
927 | 1115 | |
928 | 1116 | |
929 | 1117 | } |
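For callers, the switch from parseTwoPass to parseFourPass (as in SearchEngine and Warmup above) looks like this; a sketch assuming an IndexId iid is in scope:

    WikiQueryParser parser = new WikiQueryParser("contents", "0",
        Analyzers.getSearcherAnalyzer(iid), WikiQueryParser.NamespacePolicy.IGNORE);
    Query q = parser.parseFourPass("douglas adams",
        WikiQueryParser.NamespacePolicy.IGNORE, iid.getDBname());
    // q is a BooleanQuery of SHOULD clauses: one against "contents", one against
    // "title", plus (when the query is all-required or a phrase) span clauses
    // against "redirect" and, if KeywordScoring is enabled for the db, "keyword".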
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -47,9 +47,9 @@ |
48 | 48 | * |
49 | 49 | * @param text text to be tokenized |
50 | 50 | * @param languageAnalyzer language filter class (e.g. PorterStemFilter) |
51 | | - * @return |
| 51 | + * @return {PerFieldAnalyzerWrapper,WikiTokenizer} |
52 | 52 | */ |
53 | | - public static PerFieldAnalyzerWrapper getIndexerAnalyzer(String text, FilterFactory filters) { |
| 53 | + public static Object[] getIndexerAnalyzer(String text, FilterFactory filters, ArrayList<String> redirects) { |
54 | 54 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
55 | 55 | // parse wiki-text to get categories |
56 | 56 | WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage()); |
— | — | @@ -63,8 +63,11 @@ |
64 | 64 | new CategoryAnalyzer(categories)); |
65 | 65 | perFieldAnalyzer.addAnalyzer("title", |
66 | 66 | getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
67 | | - |
68 | | - return perFieldAnalyzer; |
| 67 | + perFieldAnalyzer.addAnalyzer("redirect", |
| 68 | + new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory())); |
| 69 | + perFieldAnalyzer.addAnalyzer("keyword", |
| 70 | + new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory())); |
| 71 | + return new Object[] {perFieldAnalyzer,tokenizer}; |
69 | 72 | } |
70 | 73 | |
71 | 74 | public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){ |
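Since getIndexerAnalyzer now returns an Object[] pair instead of just the analyzer, callers unpack it as WikiIndexModifier does above:

    Object[] ret = Analyzers.getIndexerAnalyzer(text, filters, article.getRedirects());
    PerFieldAnalyzerWrapper analyzer = (PerFieldAnalyzerWrapper) ret[0];
    WikiTokenizer tokenizer = (WikiTokenizer) ret[1];
    // the tokenizer's token count is then used to boost the "keyword" field:
    keyword.setBoost(WikiIndexModifier.calculateKeywordsBoost(tokenizer.getTokens().size()));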
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -25,6 +25,8 @@ |
26 | 26 | package org.wikimedia.lsearch.beans; |
27 | 27 | |
28 | 28 | import java.io.Serializable; |
| 29 | +import java.util.ArrayList; |
| 30 | +import java.util.Collection; |
29 | 31 | |
30 | 32 | /** |
31 | 33 | * Wiki article. |
— | — | @@ -37,6 +39,8 @@ |
38 | 40 | private boolean redirect; |
39 | 41 | private long pageId; |
40 | 42 | private int rank; |
| 43 | + /** all redirects in format: ns:title */ |
| 44 | + private ArrayList<String> redirects; // pages that redirect to this page |
41 | 45 | |
42 | 46 | public Article(){ |
43 | 47 | namespace=""; |
— | — | @@ -44,7 +48,8 @@ |
45 | 49 | contents=""; |
46 | 50 | pageId = 0; |
47 | 51 | redirect=false; |
48 | | - rank=0; |
| 52 | + rank = 0; |
| 53 | + redirects=new ArrayList<String>(); |
49 | 54 | } |
50 | 55 | |
51 | 56 | public Article(long pageId, Title title, String text, boolean redirect, int rank) { |
— | — | @@ -54,6 +59,7 @@ |
55 | 60 | this.pageId = pageId; |
56 | 61 | this.redirect = redirect; |
57 | 62 | this.rank = rank; |
| 63 | + this.redirects = new ArrayList<String>(); |
58 | 64 | } |
59 | 65 | |
60 | 66 | public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int rank) { |
— | — | @@ -63,8 +69,19 @@ |
64 | 70 | this.redirect = redirect; |
65 | 71 | this.pageId = pageId; |
66 | 72 | this.rank = rank; |
| 73 | + this.redirects = new ArrayList<String>(); |
67 | 74 | } |
68 | 75 | |
| 76 | + public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int rank, ArrayList<String> redirects) { |
| 77 | + this.namespace = Integer.toString(namespace); |
| 78 | + this.title = titleText; |
| 79 | + contents = text; |
| 80 | + this.redirect = redirect; |
| 81 | + this.pageId = pageId; |
| 82 | + this.rank = rank; |
| 83 | + this.redirects = redirects; |
| 84 | + } |
| 85 | + |
69 | 86 | public boolean isRedirect() { |
70 | 87 | return redirect; |
71 | 88 | } |
— | — | @@ -111,8 +128,29 @@ |
112 | 129 | return "(" + namespace + ",\"" + title + "\")"; |
113 | 130 | } |
114 | 131 | |
| 132 | + /** Get how many articles link to this article */ |
115 | 133 | public int getRank() { |
116 | 134 | return rank; |
117 | 135 | } |
118 | 136 | |
| 137 | + /** Register a redirect to this article */ |
| 138 | + public void addRedirect(String linkingArticle){ |
| 139 | + redirects.add(linkingArticle); |
| 140 | + } |
| 141 | + |
| 142 | + /** Register a list of redirects to this article */ |
| 143 | + public void addRedirects(Collection<String> linkingArticles){ |
| 144 | + redirects.addAll(linkingArticles); |
| 145 | + } |
| 146 | + |
| 147 | + /** Get list of articles that redirect to this article */ |
| 148 | + public ArrayList<String> getRedirects() { |
| 149 | + return redirects; |
| 150 | + } |
| 151 | + |
| 152 | + public void setRedirects(ArrayList<String> redirects) { |
| 153 | + this.redirects = redirects; |
| 154 | + } |
| 155 | + |
| 156 | + |
119 | 157 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Rank.java |
— | — | @@ -1,19 +1,53 @@ |
2 | 2 | package org.wikimedia.lsearch.beans; |
3 | 3 | |
| 4 | +import java.util.ArrayList; |
| 5 | + |
4 | 6 | public class Rank { |
5 | 7 | /** Number of linking articles */ |
6 | 8 | public int links; |
7 | 9 | /** if this is redirect, point to the target title */ |
8 | | - public String redirect; |
| 10 | + public Rank redirectsTo; |
| 11 | + /** all the pages that get redirected here */ |
| 12 | + public ArrayList<String> redirected; |
9 | 13 | |
10 | 14 | public Rank(int links) { |
11 | 15 | this.links = links; |
12 | | - redirect = null; |
| 16 | + redirectsTo = null; |
13 | 17 | } |
14 | 18 | |
15 | | - public Rank(int links, String redirect) { |
| 19 | + public Rank(int links, Rank redirect) { |
16 | 20 | this.links = links; |
17 | | - this.redirect = redirect; |
| 21 | + this.redirectsTo = redirect; |
18 | 22 | } |
| 23 | + |
| 24 | + @Override |
| 25 | + public int hashCode() { |
| 26 | + final int PRIME = 31; |
| 27 | + int result = 1; |
| 28 | + result = PRIME * result + links; |
| 29 | + result = PRIME * result + 0; |
| 30 | + return result; |
| 31 | + } |
| 32 | + |
| 33 | + @Override |
| 34 | + public boolean equals(Object obj) { |
| 35 | + if (this == obj) |
| 36 | + return true; |
| 37 | + if (obj == null) |
| 38 | + return false; |
| 39 | + if (getClass() != obj.getClass()) |
| 40 | + return false; |
| 41 | + final Rank other = (Rank) obj; |
| 42 | + if (links != other.links) |
| 43 | + return false; |
| 44 | + if (redirectsTo == null) { |
| 45 | + if (other.redirectsTo != null) |
| 46 | + return false; |
| 47 | +		} else if (!redirectsTo.equals(other.redirectsTo)) 
| 48 | + return false; |
| 49 | + return true; |
| 50 | + } |
19 | 51 | |
| 52 | + |
| 53 | + |
20 | 54 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -18,12 +18,17 @@ |
19 | 19 | protected Revision revision; |
20 | 20 | protected ArrayList<IndexUpdateRecord> records = new ArrayList<IndexUpdateRecord>(); |
21 | 21 | protected IndexId iid; |
22 | | - protected int references; |
| 22 | + protected int references = 0; |
| 23 | + protected ArrayList<String> redirects = new ArrayList<String>(); |
23 | 24 | |
24 | 25 | public IndexUpdatesCollector(IndexId iid){ |
25 | 26 | this.iid = iid; |
26 | 27 | } |
27 | 28 | |
| 29 | + public void addRedirect(String redirectTitle, int references) { |
| 30 | + redirects.add(redirectTitle); |
| 31 | + addReferences(references); |
| 32 | + } |
28 | 33 | public void addDeletion(long pageId){ |
29 | 34 | // pageId is enough for page deletion |
30 | 35 | Article article = new Article(pageId,-1,"","",false,1); |
— | — | @@ -42,10 +47,12 @@ |
43 | 48 | this.page = page; |
44 | 49 | } |
45 | 50 | public void writeEndPage() throws IOException { |
46 | | - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references); |
47 | | - log.info("Collected "+article+" with rank "+references); |
| 51 | + Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects); |
| 52 | + //log.info("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); |
48 | 53 | records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE)); |
49 | 54 | log.debug(iid+": Update for "+article); |
| 55 | + references = 0; |
| 56 | + redirects.clear(); |
50 | 57 | } |
51 | 58 | |
52 | 59 | public void close() throws IOException { |
— | — | @@ -64,10 +71,12 @@ |
65 | 72 | return references; |
66 | 73 | } |
67 | 74 | |
68 | | - public void setReferences(int references) { |
69 | | - this.references = references; |
| 75 | + public void addReferences(int references) { |
| 76 | + this.references += references; |
70 | 77 | } |
71 | 78 | |
| 79 | + |
| 80 | + |
72 | 81 | |
73 | 82 | |
74 | 83 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIParser.java |
— | — | @@ -8,7 +8,6 @@ |
9 | 9 | import javax.xml.parsers.SAXParser; |
10 | 10 | import javax.xml.parsers.SAXParserFactory; |
11 | 11 | |
12 | | -import org.mediawiki.importer.DumpWriter; |
13 | 12 | import org.mediawiki.importer.XmlDumpReader; |
14 | 13 | import org.xml.sax.Attributes; |
15 | 14 | import org.xml.sax.SAXException; |
— | — | @@ -24,6 +23,8 @@ |
25 | 24 |  * appears as a continuous stream. For this stream 
26 | 25 | * calls to sax parser methods are delegated to XmlDumpReader. |
27 | 26 | * |
| 27 | + * Note: implementation is very lazy and messy :( |
| 28 | + * |
28 | 29 | * @author rainman |
29 | 30 | * |
30 | 31 | */ |
— | — | @@ -37,8 +38,8 @@ |
38 | 39 | protected String oaiId,pageId,resumptionToken,responseDate; |
39 | 40 | protected boolean beginMW; // beginning of mediawiki stream |
40 | 41 | protected String mwUri, mwLocalName, mwQName; |
41 | | - protected boolean isDeleted, inReferences; |
42 | | - protected String references; |
| 42 | + protected boolean isDeleted, inReferences, inRedirect, inRedirectTitle, inRedirectRef; |
| 43 | + protected String references, redirectTitle, redirectRef; |
43 | 44 | |
44 | 45 | |
45 | 46 | public OAIParser(InputStream in, IndexUpdatesCollector collector){ |
— | — | @@ -50,6 +51,8 @@ |
51 | 52 | inResponseDate = false; inReferences = false; |
52 | 53 | oaiId = ""; resumptionToken = ""; responseDate = ""; |
53 | 54 | beginMW = true; references = ""; |
| 55 | + inRedirect = false; inRedirectTitle= false; inRedirectRef = false; |
| 56 | + redirectTitle = ""; redirectRef = ""; |
54 | 57 | } |
55 | 58 | |
56 | 59 | public void parse() throws IOException{ |
— | — | @@ -74,8 +77,17 @@ |
75 | 78 | inDump = false; // lsearch syntax |
76 | 79 | inReferences = true; |
77 | 80 | references = ""; |
| 81 | + } else if(inDump && qName.equals("redirect")){ |
| 82 | + inDump = false; |
| 83 | + inRedirect = true; |
| 84 | + redirectTitle = ""; |
| 85 | + redirectRef = ""; |
78 | 86 | } else if(inDump) |
79 | 87 | dumpReader.startElement(uri, localName, qName, attributes); |
| 88 | + else if(inRedirect && qName.equals("title")) |
| 89 | + inRedirectTitle = true; |
| 90 | + else if(inRedirect && qName.equals("references")) |
| 91 | + inRedirectRef = true; |
80 | 92 | else if(qName.equals("record")) |
81 | 93 | inRecord = true; |
82 | 94 | else if(qName.equals("header") && inRecord){ |
— | — | @@ -85,8 +97,7 @@ |
86 | 98 | isDeleted = true; |
87 | 99 | else |
88 | 100 | isDeleted = false; |
89 | | - } |
90 | | - else if(qName.equals("identifier") && inHeader){ |
| 101 | + } else if(qName.equals("identifier") && inHeader){ |
91 | 102 | oaiId = ""; |
92 | 103 | inIdentifier = true; |
93 | 104 | } else if(qName.equals("metadata")) |
— | — | @@ -115,10 +126,23 @@ |
116 | 127 | dumpReader.endElement(uri, localName, qName); |
117 | 128 | else if(qName.equals("upload")) |
118 | 129 | inDump = true; // we ignored upload tag / parsed references, we can now resume |
119 | | - else if(qName.equals("references")){ |
| 130 | + else if(!inRedirect && qName.equals("references")){ |
120 | 131 | inDump = true; |
121 | | - collector.setReferences(Integer.parseInt(references)); |
122 | | - } else if(qName.equals("record")) |
| 132 | + inReferences = false; |
| 133 | + if(!references.equals("")) |
| 134 | + collector.addReferences(Integer.parseInt(references)); |
| 135 | +		} else if(qName.equals("redirect")){ 
| 136 | + inDump = true; |
| 137 | + int ref = 0; |
| 138 | + if(!redirectRef.equals("")) |
| 139 | + ref = Integer.parseInt(redirectRef); |
| 140 | + collector.addRedirect(redirectTitle,ref); |
| 141 | + inRedirect = false; |
| 142 | + } else if(inRedirect && qName.equals("title")) |
| 143 | + inRedirectTitle = false; |
| 144 | + else if(inRedirect && qName.equals("references")) |
| 145 | + inRedirectRef = false; |
| 146 | + else if(qName.equals("record")) |
123 | 147 | inRecord = false; |
124 | 148 | else if(qName.equals("header")) |
125 | 149 | inHeader = false; |
— | — | @@ -153,6 +177,10 @@ |
154 | 178 | responseDate += new String(ch,start,length); |
155 | 179 | } else if(inReferences){ |
156 | 180 | references += new String(ch,start,length); |
| 181 | + } else if(inRedirectTitle){ |
| 182 | + redirectTitle += new String(ch,start,length); |
| 183 | + } else if(inRedirectRef){ |
| 184 | + redirectRef += new String(ch,start,length); |
157 | 185 | } |
158 | 186 | } |
159 | 187 | |