r22539 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r22538 | r22539 | r22540 >
Date:17:58, 29 May 2007
Author:rainman
Status:old
Tags:
Comment:
Updated ranking which is now three-fold, using:
* number of references to page
* redirect names
* keywords from the beginning of the article
Modified paths:
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/lsearch.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/apache/commons (added) (history)
  • /trunk/lucene-search-2.0/src/org/apache/commons/lang (added) (history)
  • /trunk/lucene-search-2.0/src/org/apache/commons/lang/WordUtils.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Rank.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/RankReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /trunk/lucene-search-2.0/test-data/mwsearch-global.test (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -31,6 +31,14 @@
3232 [Index-Path]
3333 <default> : /mwsearch
3434
 35+# Global properties
 36+[Properties]
 37+# suffixes to database name, the rest is assumed to be language code
 38+Database.suffix=wiki wiktionary
 39+
 40+# dbnames that end with the suffix will use additional keywords scores
 41+KeywordScoring.suffix=wiki wikilucene wikidev
 42+
3543 # Put here your custom namespace prefixes
3644 # Syntax: <prefix_name> : <comma separated list of namespaces>
3745 # <all> is a special keyword meaning all namespaces
@@ -54,4 +62,3 @@
5563 [14] : 14
5664 [15] : 15
5765
58 -
Index: trunk/lucene-search-2.0/lsearch.conf
@@ -57,9 +57,6 @@
5858 # URL to message files, {0} is replaced with language code, i.e. En
5959 Localization.url=file:///var/www/html/wiki-lucene/phase3/languages/messages/Messages{0}.php
6060
61 -# suffixes to database name, the rest is assumed to be language code
62 -Database.suffix=test
63 -
6461 # Pattern for OAI repo. {0} is replaced with dbname, {1} with language
6562 OAI.repo=http://localhost/wiki-lucene/phase3/index.php/Special:OAIRepository
6663
Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test
@@ -12,6 +12,7 @@
1313 entest : (ngram), (aspell,en)
1414 detest,rutest : (single,true,2,10)
1515 frtest : (split,3) (part1) (part2) (part3)
 16+srwiki : (single)
1617
1718 # Search nodes
1819 # host : db1.role, db2.role
@@ -34,6 +35,7 @@
3536 192.168.0.5 : detest, rutest, frtest
3637 192.168.0.2 : entest.ngram
3738 192.168.0.2 : frtest.part1, frtest.part2, frtest.part3
 39+192.168.0.10 : srwiki
3840
3941 # Path where indexes are on hosts, after default value put hosts where
4042 # the location differs
@@ -41,6 +43,13 @@
4244 <default> : /mwsearch
4345 192.168.0.5 : mwsearch2
4446
 47+[Properties]
 48+# suffixes to database name, the rest is assumed to be language code
 49+Database.suffix=wiki wiktionary test
 50+
 51+# dbnames that end with the suffix will use additional keywords scores
 52+KeywordScoring.suffix=wiki rutest
 53+
4554 # databases can be written as {file}, where file contains list of dbs
4655
4756 # Put here your custom namespace prefixes
Index: trunk/lucene-search-2.0/src/org/apache/commons/lang/WordUtils.java
@@ -0,0 +1,584 @@
 2+/*
 3+ * Licensed to the Apache Software Foundation (ASF) under one or more
 4+ * contributor license agreements. See the NOTICE file distributed with
 5+ * this work for additional information regarding copyright ownership.
 6+ * The ASF licenses this file to You under the Apache License, Version 2.0
 7+ * (the "License"); you may not use this file except in compliance with
 8+ * the License. You may obtain a copy of the License at
 9+ *
 10+ * http://www.apache.org/licenses/LICENSE-2.0
 11+ *
 12+ * Unless required by applicable law or agreed to in writing, software
 13+ * distributed under the License is distributed on an "AS IS" BASIS,
 14+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15+ * See the License for the specific language governing permissions and
 16+ * limitations under the License.
 17+ */
 18+package org.apache.commons.lang;
 19+
 20+/**
 21+ * <p>Operations on Strings that contain words.</p>
 22+ *
 23+ * <p>This class tries to handle <code>null</code> input gracefully.
 24+ * An exception will not be thrown for a <code>null</code> input.
 25+ * Each method documents its behaviour in more detail.</p>
 26+ *
 27+ * @author Apache Jakarta Velocity
 28+ * @author Stephen Colebourne
 29+ * @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
 30+ * @author Gary Gregory
 31+ * @since 2.0
 32+ * @version $Id: WordUtils.java 471626 2006-11-06 04:02:09Z bayard $
 33+ */
 34+public class WordUtils {
 35+
 36+ /**
 37+ * <p><code>WordUtils</code> instances should NOT be constructed in
 38+ * standard programming. Instead, the class should be used as
 39+ * <code>WordUtils.wrap("foo bar", 20);</code>.</p>
 40+ *
 41+ * <p>This constructor is public to permit tools that require a JavaBean
 42+ * instance to operate.</p>
 43+ */
 44+ public WordUtils() {
 45+ super();
 46+ }
 47+
 48+ // Wrapping
 49+ //--------------------------------------------------------------------------
 50+// /**
 51+// * <p>Wraps a block of text to a specified line length using '\n' as
 52+// * a newline.</p>
 53+// *
 54+// * <p>This method takes a block of text, which might have long lines in it
 55+// * and wraps the long lines based on the supplied lineLength parameter.</p>
 56+// *
 57+// * <p>If a single word is longer than the line length (eg. a URL), it will
 58+// * not be broken, and will display beyond the expected width.</p>
 59+// *
 60+// * <p>If there are tabs in inString, you are going to get results that are
 61+// * a bit strange. Tabs are a single character but are displayed as 4 or 8
 62+// * spaces. Remove the tabs.</p>
 63+// *
 64+// * @param str text which is in need of word-wrapping, may be null
 65+// * @param lineLength the column to wrap the words at
 66+// * @return the text with all the long lines word-wrapped
 67+// * <code>null</code> if null string input
 68+// */
 69+// public static String wrapText(String str, int lineLength) {
 70+// return wrap(str, null, lineLength);
 71+// }
 72+
 73+// /**
 74+// * <p>Wraps a block of text to a specified line length.</p>
 75+// *
 76+// * <p>This method takes a block of text, which might have long lines in it
 77+// * and wraps the long lines based on the supplied lineLength parameter.</p>
 78+// *
 79+// * <p>If a single word is longer than the wrapColumn (eg. a URL), it will
 80+// * not be broken, and will display beyond the expected width.</p>
 81+// *
 82+// * <p>If there are tabs in inString, you are going to get results that are
 83+// * a bit strange. Tabs are a single character but are displayed as 4 or 8
 84+// * spaces. Remove the tabs.</p>
 85+// *
 86+// * @param str text which is in need of word-wrapping, may be null
 87+// * @param newLineChars the characters that define a newline, null treated as \n
 88+// * @param lineLength the column to wrap the words at
 89+// * @return the text with all the long lines word-wrapped
 90+// * <code>null</code> if null string input
 91+// */
 92+// public static String wrapText(String str, String newLineChars, int lineLength) {
 93+// if (str == null) {
 94+// return null;
 95+// }
 96+// if (newLineChars == null) {
 97+// newLineChars = "\n";
 98+// }
 99+// StringTokenizer lineTokenizer = new StringTokenizer(str, newLineChars, true);
 100+// StringBuffer stringBuffer = new StringBuffer();
 101+//
 102+// while (lineTokenizer.hasMoreTokens()) {
 103+// try {
 104+// String nextLine = lineTokenizer.nextToken();
 105+//
 106+// if (nextLine.length() > lineLength) {
 107+// // This line is long enough to be wrapped.
 108+// nextLine = wrapLine(nextLine, null, lineLength, false);
 109+// }
 110+//
 111+// stringBuffer.append(nextLine);
 112+//
 113+// } catch (NoSuchElementException nsee) {
 114+// // thrown by nextToken(), but I don't know why it would
 115+// break;
 116+// }
 117+// }
 118+//
 119+// return stringBuffer.toString();
 120+// }
 121+
 122+ // Wrapping
 123+ //-----------------------------------------------------------------------
 124+ /**
 125+ * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p>
 126+ *
 127+ * <p>New lines will be separated by the system property line separator.
 128+ * Very long words, such as URLs will <i>not</i> be wrapped.</p>
 129+ *
 130+ * <p>Leading spaces on a new line are stripped.
 131+ * Trailing spaces are not stripped.</p>
 132+ *
 133+ * <pre>
 134+ * WordUtils.wrap(null, *) = null
 135+ * WordUtils.wrap("", *) = ""
 136+ * </pre>
 137+ *
 138+ * @param str the String to be word wrapped, may be null
 139+ * @param wrapLength the column to wrap the words at, less than 1 is treated as 1
 140+ * @return a line with newlines inserted, <code>null</code> if null input
 141+ */
 142+ public static String wrap(String str, int wrapLength) {
 143+ return wrap(str, wrapLength, null, false);
 144+ }
 145+
 146+ /**
 147+ * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p>
 148+ *
 149+ * <p>Leading spaces on a new line are stripped.
 150+ * Trailing spaces are not stripped.</p>
 151+ *
 152+ * <pre>
 153+ * WordUtils.wrap(null, *, *, *) = null
 154+ * WordUtils.wrap("", *, *, *) = ""
 155+ * </pre>
 156+ *
 157+ * @param str the String to be word wrapped, may be null
 158+ * @param wrapLength the column to wrap the words at, less than 1 is treated as 1
 159+ * @param newLineStr the string to insert for a new line,
 160+ * <code>null</code> uses the system property line separator
 161+ * @param wrapLongWords true if long words (such as URLs) should be wrapped
 162+ * @return a line with newlines inserted, <code>null</code> if null input
 163+ */
 164+ public static String wrap(String str, int wrapLength, String newLineStr, boolean wrapLongWords) {
 165+ if (str == null) {
 166+ return null;
 167+ }
 168+ if (newLineStr == null) {
 169+ newLineStr = System.getProperty("line.separator");
 170+ }
 171+ if (wrapLength < 1) {
 172+ wrapLength = 1;
 173+ }
 174+ int inputLineLength = str.length();
 175+ int offset = 0;
 176+ StringBuffer wrappedLine = new StringBuffer(inputLineLength + 32);
 177+
 178+ while ((inputLineLength - offset) > wrapLength) {
 179+ if (str.charAt(offset) == ' ') {
 180+ offset++;
 181+ continue;
 182+ }
 183+ int spaceToWrapAt = str.lastIndexOf(' ', wrapLength + offset);
 184+
 185+ if (spaceToWrapAt >= offset) {
 186+ // normal case
 187+ wrappedLine.append(str.substring(offset, spaceToWrapAt));
 188+ wrappedLine.append(newLineStr);
 189+ offset = spaceToWrapAt + 1;
 190+
 191+ } else {
 192+ // really long word or URL
 193+ if (wrapLongWords) {
 194+ // wrap really long word one line at a time
 195+ wrappedLine.append(str.substring(offset, wrapLength + offset));
 196+ wrappedLine.append(newLineStr);
 197+ offset += wrapLength;
 198+ } else {
 199+ // do not wrap really long word, just extend beyond limit
 200+ spaceToWrapAt = str.indexOf(' ', wrapLength + offset);
 201+ if (spaceToWrapAt >= 0) {
 202+ wrappedLine.append(str.substring(offset, spaceToWrapAt));
 203+ wrappedLine.append(newLineStr);
 204+ offset = spaceToWrapAt + 1;
 205+ } else {
 206+ wrappedLine.append(str.substring(offset));
 207+ offset = inputLineLength;
 208+ }
 209+ }
 210+ }
 211+ }
 212+
 213+ // Whatever is left in line is short enough to just pass through
 214+ wrappedLine.append(str.substring(offset));
 215+
 216+ return wrappedLine.toString();
 217+ }
 218+
 219+ // Capitalizing
 220+ //-----------------------------------------------------------------------
 221+ /**
 222+ * <p>Capitalizes all the whitespace separated words in a String.
 223+ * Only the first letter of each word is changed. To convert the
 224+ * rest of each word to lowercase at the same time,
 225+ * use {@link #capitalizeFully(String)}.</p>
 226+ *
 227+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 228+ * A <code>null</code> input String returns <code>null</code>.
 229+ * Capitalization uses the unicode title case, normally equivalent to
 230+ * upper case.</p>
 231+ *
 232+ * <pre>
 233+ * WordUtils.capitalize(null) = null
 234+ * WordUtils.capitalize("") = ""
 235+ * WordUtils.capitalize("i am FINE") = "I Am FINE"
 236+ * </pre>
 237+ *
 238+ * @param str the String to capitalize, may be null
 239+ * @return capitalized String, <code>null</code> if null String input
 240+ * @see #uncapitalize(String)
 241+ * @see #capitalizeFully(String)
 242+ */
 243+ public static String capitalize(String str) {
 244+ return capitalize(str, null);
 245+ }
 246+
 247+ /**
 248+ * <p>Capitalizes all the delimiter separated words in a String.
 249+ * Only the first letter of each word is changed. To convert the
 250+ * rest of each word to lowercase at the same time,
 251+ * use {@link #capitalizeFully(String, char[])}.</p>
 252+ *
 253+ * <p>The delimiters represent a set of characters understood to separate words.
 254+ * The first string character and the first non-delimiter character after a
 255+ * delimiter will be capitalized. </p>
 256+ *
 257+ * <p>A <code>null</code> input String returns <code>null</code>.
 258+ * Capitalization uses the unicode title case, normally equivalent to
 259+ * upper case.</p>
 260+ *
 261+ * <pre>
 262+ * WordUtils.capitalize(null, *) = null
 263+ * WordUtils.capitalize("", *) = ""
 264+ * WordUtils.capitalize(*, new char[0]) = *
 265+ * WordUtils.capitalize("i am fine", null) = "I Am Fine"
 266+ * WordUtils.capitalize("i aM.fine", {'.'}) = "I aM.Fine"
 267+ * </pre>
 268+ *
 269+ * @param str the String to capitalize, may be null
 270+ * @param delimiters set of characters to determine capitalization, null means whitespace
 271+ * @return capitalized String, <code>null</code> if null String input
 272+ * @see #uncapitalize(String)
 273+ * @see #capitalizeFully(String)
 274+ * @since 2.1
 275+ */
 276+ public static String capitalize(String str, char[] delimiters) {
 277+ int delimLen = (delimiters == null ? -1 : delimiters.length);
 278+ if (str == null || str.length() == 0 || delimLen == 0) {
 279+ return str;
 280+ }
 281+ int strLen = str.length();
 282+ StringBuffer buffer = new StringBuffer(strLen);
 283+ boolean capitalizeNext = true;
 284+ for (int i = 0; i < strLen; i++) {
 285+ char ch = str.charAt(i);
 286+
 287+ if (isDelimiter(ch, delimiters)) {
 288+ buffer.append(ch);
 289+ capitalizeNext = true;
 290+ } else if (capitalizeNext) {
 291+ buffer.append(Character.toTitleCase(ch));
 292+ capitalizeNext = false;
 293+ } else {
 294+ buffer.append(ch);
 295+ }
 296+ }
 297+ return buffer.toString();
 298+ }
 299+
 300+ //-----------------------------------------------------------------------
 301+ /**
 302+ * <p>Converts all the whitespace separated words in a String into capitalized words,
 303+ * that is each word is made up of a titlecase character and then a series of
 304+ * lowercase characters. </p>
 305+ *
 306+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 307+ * A <code>null</code> input String returns <code>null</code>.
 308+ * Capitalization uses the unicode title case, normally equivalent to
 309+ * upper case.</p>
 310+ *
 311+ * <pre>
 312+ * WordUtils.capitalizeFully(null) = null
 313+ * WordUtils.capitalizeFully("") = ""
 314+ * WordUtils.capitalizeFully("i am FINE") = "I Am Fine"
 315+ * </pre>
 316+ *
 317+ * @param str the String to capitalize, may be null
 318+ * @return capitalized String, <code>null</code> if null String input
 319+ */
 320+ public static String capitalizeFully(String str) {
 321+ return capitalizeFully(str, null);
 322+ }
 323+
 324+ /**
 325+ * <p>Converts all the delimiter separated words in a String into capitalized words,
 326+ * that is each word is made up of a titlecase character and then a series of
 327+ * lowercase characters. </p>
 328+ *
 329+ * <p>The delimiters represent a set of characters understood to separate words.
 330+ * The first string character and the first non-delimiter character after a
 331+ * delimiter will be capitalized. </p>
 332+ *
 333+ * <p>A <code>null</code> input String returns <code>null</code>.
 334+ * Capitalization uses the unicode title case, normally equivalent to
 335+ * upper case.</p>
 336+ *
 337+ * <pre>
 338+ * WordUtils.capitalizeFully(null, *) = null
 339+ * WordUtils.capitalizeFully("", *) = ""
 340+ * WordUtils.capitalizeFully(*, null) = *
 341+ * WordUtils.capitalizeFully(*, new char[0]) = *
 342+ * WordUtils.capitalizeFully("i aM.fine", {'.'}) = "I am.Fine"
 343+ * </pre>
 344+ *
 345+ * @param str the String to capitalize, may be null
 346+ * @param delimiters set of characters to determine capitalization, null means whitespace
 347+ * @return capitalized String, <code>null</code> if null String input
 348+ * @since 2.1
 349+ */
 350+ public static String capitalizeFully(String str, char[] delimiters) {
 351+ int delimLen = (delimiters == null ? -1 : delimiters.length);
 352+ if (str == null || str.length() == 0 || delimLen == 0) {
 353+ return str;
 354+ }
 355+ str = str.toLowerCase();
 356+ return capitalize(str, delimiters);
 357+ }
 358+
 359+ //-----------------------------------------------------------------------
 360+ /**
 361+ * <p>Uncapitalizes all the whitespace separated words in a String.
 362+ * Only the first letter of each word is changed.</p>
 363+ *
 364+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 365+ * A <code>null</code> input String returns <code>null</code>.</p>
 366+ *
 367+ * <pre>
 368+ * WordUtils.uncapitalize(null) = null
 369+ * WordUtils.uncapitalize("") = ""
 370+ * WordUtils.uncapitalize("I Am FINE") = "i am fINE"
 371+ * </pre>
 372+ *
 373+ * @param str the String to uncapitalize, may be null
 374+ * @return uncapitalized String, <code>null</code> if null String input
 375+ * @see #capitalize(String)
 376+ */
 377+ public static String uncapitalize(String str) {
 378+ return uncapitalize(str, null);
 379+ }
 380+
 381+ /**
 382+ * <p>Uncapitalizes all the whitespace separated words in a String.
 383+ * Only the first letter of each word is changed.</p>
 384+ *
 385+ * <p>The delimiters represent a set of characters understood to separate words.
 386+ * The first string character and the first non-delimiter character after a
 387+ * delimiter will be uncapitalized. </p>
 388+ *
 389+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 390+ * A <code>null</code> input String returns <code>null</code>.</p>
 391+ *
 392+ * <pre>
 393+ * WordUtils.uncapitalize(null, *) = null
 394+ * WordUtils.uncapitalize("", *) = ""
 395+ * WordUtils.uncapitalize(*, null) = *
 396+ * WordUtils.uncapitalize(*, new char[0]) = *
 397+ * WordUtils.uncapitalize("I AM.FINE", {'.'}) = "i AM.fINE"
 398+ * </pre>
 399+ *
 400+ * @param str the String to uncapitalize, may be null
 401+ * @param delimiters set of characters to determine uncapitalization, null means whitespace
 402+ * @return uncapitalized String, <code>null</code> if null String input
 403+ * @see #capitalize(String)
 404+ * @since 2.1
 405+ */
 406+ public static String uncapitalize(String str, char[] delimiters) {
 407+ int delimLen = (delimiters == null ? -1 : delimiters.length);
 408+ if (str == null || str.length() == 0 || delimLen == 0) {
 409+ return str;
 410+ }
 411+ int strLen = str.length();
 412+ StringBuffer buffer = new StringBuffer(strLen);
 413+ boolean uncapitalizeNext = true;
 414+ for (int i = 0; i < strLen; i++) {
 415+ char ch = str.charAt(i);
 416+
 417+ if (isDelimiter(ch, delimiters)) {
 418+ buffer.append(ch);
 419+ uncapitalizeNext = true;
 420+ } else if (uncapitalizeNext) {
 421+ buffer.append(Character.toLowerCase(ch));
 422+ uncapitalizeNext = false;
 423+ } else {
 424+ buffer.append(ch);
 425+ }
 426+ }
 427+ return buffer.toString();
 428+ }
 429+
 430+ //-----------------------------------------------------------------------
 431+ /**
 432+ * <p>Swaps the case of a String using a word based algorithm.</p>
 433+ *
 434+ * <ul>
 435+ * <li>Upper case character converts to Lower case</li>
 436+ * <li>Title case character converts to Lower case</li>
 437+ * <li>Lower case character after Whitespace or at start converts to Title case</li>
 438+ * <li>Other Lower case character converts to Upper case</li>
 439+ * </ul>
 440+ *
 441+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 442+ * A <code>null</code> input String returns <code>null</code>.</p>
 443+ *
 444+ * <pre>
 445+ * StringUtils.swapCase(null) = null
 446+ * StringUtils.swapCase("") = ""
 447+ * StringUtils.swapCase("The dog has a BONE") = "tHE DOG HAS A bone"
 448+ * </pre>
 449+ *
 450+ * @param str the String to swap case, may be null
 451+ * @return the changed String, <code>null</code> if null String input
 452+ */
 453+ public static String swapCase(String str) {
 454+ int strLen;
 455+ if (str == null || (strLen = str.length()) == 0) {
 456+ return str;
 457+ }
 458+ StringBuffer buffer = new StringBuffer(strLen);
 459+
 460+ boolean whitespace = true;
 461+ char ch = 0;
 462+ char tmp = 0;
 463+
 464+ for (int i = 0; i < strLen; i++) {
 465+ ch = str.charAt(i);
 466+ if (Character.isUpperCase(ch)) {
 467+ tmp = Character.toLowerCase(ch);
 468+ } else if (Character.isTitleCase(ch)) {
 469+ tmp = Character.toLowerCase(ch);
 470+ } else if (Character.isLowerCase(ch)) {
 471+ if (whitespace) {
 472+ tmp = Character.toTitleCase(ch);
 473+ } else {
 474+ tmp = Character.toUpperCase(ch);
 475+ }
 476+ } else {
 477+ tmp = ch;
 478+ }
 479+ buffer.append(tmp);
 480+ whitespace = Character.isWhitespace(ch);
 481+ }
 482+ return buffer.toString();
 483+ }
 484+
 485+ //-----------------------------------------------------------------------
 486+ /**
 487+ * <p>Extracts the initial letters from each word in the String.</p>
 488+ *
 489+ * <p>The first letter of the string and all first letters after
 490+ * whitespace are returned as a new string.
 491+ * Their case is not changed.</p>
 492+ *
 493+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 494+ * A <code>null</code> input String returns <code>null</code>.</p>
 495+ *
 496+ * <pre>
 497+ * WordUtils.initials(null) = null
 498+ * WordUtils.initials("") = ""
 499+ * WordUtils.initials("Ben John Lee") = "BJL"
 500+ * WordUtils.initials("Ben J.Lee") = "BJ"
 501+ * </pre>
 502+ *
 503+ * @param str the String to get initials from, may be null
 504+ * @return String of initial letters, <code>null</code> if null String input
 505+ * @see #initials(String,char[])
 506+ * @since 2.2
 507+ */
 508+ public static String initials(String str) {
 509+ return initials(str, null);
 510+ }
 511+
 512+ /**
 513+ * <p>Extracts the initial letters from each word in the String.</p>
 514+ *
 515+ * <p>The first letter of the string and all first letters after the
 516+ * defined delimiters are returned as a new string.
 517+ * Their case is not changed.</p>
 518+ *
 519+ * <p>If the delimiters array is null, then Whitespace is used.
 520+ * Whitespace is defined by {@link Character#isWhitespace(char)}.
 521+ * A <code>null</code> input String returns <code>null</code>.
 522+ * An empty delimiter array returns an empty String.</p>
 523+ *
 524+ * <pre>
 525+ * WordUtils.initials(null, *) = null
 526+ * WordUtils.initials("", *) = ""
 527+ * WordUtils.initials("Ben John Lee", null) = "BJL"
 528+ * WordUtils.initials("Ben J.Lee", null) = "BJ"
 529+ * WordUtils.initials("Ben J.Lee", [' ','.']) = "BJL"
 530+ * WordUtils.initials(*, new char[0]) = ""
 531+ * </pre>
 532+ *
 533+ * @param str the String to get initials from, may be null
 534+ * @param delimiters set of characters to determine words, null means whitespace
 535+ * @return String of initial letters, <code>null</code> if null String input
 536+ * @see #initials(String)
 537+ * @since 2.2
 538+ */
 539+ public static String initials(String str, char[] delimiters) {
 540+ if (str == null || str.length() == 0) {
 541+ return str;
 542+ }
 543+ if (delimiters != null && delimiters.length == 0) {
 544+ return "";
 545+ }
 546+ int strLen = str.length();
 547+ char[] buf = new char[strLen / 2 + 1];
 548+ int count = 0;
 549+ boolean lastWasGap = true;
 550+ for (int i = 0; i < strLen; i++) {
 551+ char ch = str.charAt(i);
 552+
 553+ if (isDelimiter(ch, delimiters)) {
 554+ lastWasGap = true;
 555+ } else if (lastWasGap) {
 556+ buf[count++] = ch;
 557+ lastWasGap = false;
 558+ } else {
 559+ // ignore ch
 560+ }
 561+ }
 562+ return new String(buf, 0, count);
 563+ }
 564+
 565+ //-----------------------------------------------------------------------
 566+ /**
 567+ * Is the character a delimiter.
 568+ *
 569+ * @param ch the character to check
 570+ * @param delimiters the delimiters
 571+ * @return true if it is a delimiter
 572+ */
 573+ private static boolean isDelimiter(char ch, char[] delimiters) {
 574+ if (delimiters == null) {
 575+ return Character.isWhitespace(ch);
 576+ }
 577+ for (int i = 0, isize = delimiters.length; i < isize; i++) {
 578+ if (ch == delimiters[i]) {
 579+ return true;
 580+ }
 581+ }
 582+ return false;
 583+ }
 584+
 585+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -273,8 +273,33 @@
274274 q = parser.parseTwoPass("[1,12]:beans",NamespacePolicy.REWRITE);
275275 assertEquals("(+(namespace:1 namespace:12) +(contents:beans contents:bean^0.5)) (+(namespace:1 namespace:12) +title:beans^2.0)",q.toString());
276276
 277+ q = parser.parseTwoPass("[1,12]:beans and others incategory:food",NamespacePolicy.REWRITE);
 278+ assertEquals("(+(namespace:1 namespace:12) +(+(contents:beans contents:bean^0.5) +contents:and +(contents:others contents:other^0.5) +category:food)) (+(namespace:1 namespace:12) +(+title:beans^2.0 +title:and^2.0 +title:others^2.0 +category:food))",q.toString());
 279+
277280 q = parser.parseTwoPass("[1,a12]:beans",NamespacePolicy.IGNORE);
278281 assertEquals("(+contents:1 +contents:a12 +(contents:beans contents:bean^0.5)) (+title:1^2.0 +title:a12^2.0 +title:beans^2.0)",q.toString());
 282+
 283+ // Redirect third pass tests
 284+ q = parser.parseThreePass("beans",NamespacePolicy.IGNORE);
 285+ assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 redirect:beans^2.0",q.toString());
 286+
 287+ q = parser.parseThreePass("beans everyone",NamespacePolicy.IGNORE);
 288+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false)",q.toString());
 289+
 290+ q = parser.parseThreePass("beans everyone incategory:mouse",NamespacePolicy.IGNORE);
 291+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) (+spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false) +category:mouse)",q.toString());
 292+
 293+ q = parser.parseThreePass("beans OR everyone",NamespacePolicy.IGNORE);
 294+ assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0)",q.toString());
 295+
 296+ q = parser.parseThreePass("beans -everyone",NamespacePolicy.IGNORE);
 297+ assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0)",q.toString());
 298+
 299+ q = parser.parseThreePass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE);
 300+ assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false))",q.toString());
 301+
 302+ q = parser.parseThreePass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE);
 303+ assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0))",q.toString());
279304
280305 // Test field extraction
281306 HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -11,9 +11,11 @@
1212 import java.net.URL;
1313 import java.util.ArrayList;
1414 import java.util.Hashtable;
 15+import java.util.Properties;
1516
1617 import org.wikimedia.lsearch.config.GlobalConfiguration;
1718 import org.wikimedia.lsearch.config.IndexId;
 19+import org.wikimedia.lsearch.search.NamespaceFilter;
1820
1921 import junit.framework.TestCase;
2022
@@ -57,7 +59,11 @@
5860 return searchGroup;
5961 }
6062
 63+ public Properties getGlobalProps(){
 64+ return globalProperties;
 65+ }
6166
 67+
6268 }
6369
6470 public static GlobalConfigurationTest.TestGC testgc = null;
@@ -80,7 +86,7 @@
8187 String testurl = "file://"+System.getProperty("user.dir")+"/test-data/mwsearch-global.test";
8288 try {
8389 URL url = new URL(testurl);
84 - testgc.readFromURL(url,"/usr/local/var/mwsearch","",null);
 90+ testgc.readFromURL(url,"/usr/local/var/mwsearch","");
8591
8692 // database
8793 Hashtable database = testgc.getDatabase();
@@ -147,6 +153,23 @@
148154 String hostName = host.getHostName();
149155 System.out.println("Verify internet IP: "+hostAddr+", and hostname: "+hostName);
150156
 157+ // test prefixes
 158+ Hashtable<String,NamespaceFilter> p = testgc.getNamespacePrefixes();
 159+ assertEquals(17,p.size());
 160+
 161+ // check global properties
 162+ Properties prop = testgc.getGlobalProps();
 163+ assertEquals("wiki wiktionary test",prop.get("Database.suffix"));
 164+ assertEquals("wiki rutest",prop.get("KeywordScoring.suffix"));
 165+
 166+ // check languages and keyword stuff
 167+ assertEquals("en",testgc.getLanguage("entest"));
 168+ assertEquals("sr",testgc.getLanguage("srwiki"));
 169+ assertFalse(testgc.useKeywordScoring("frtest"));
 170+ assertTrue(testgc.useKeywordScoring("srwiki"));
 171+ assertTrue(testgc.useKeywordScoring("rutest"));
 172+
 173+
151174 } catch (MalformedURLException e) {
152175 e.printStackTrace();
153176 } catch (IOException e) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -3,6 +3,7 @@
44 import java.io.StringReader;
55 import java.util.ArrayList;
66 import java.util.HashMap;
 7+import java.util.HashSet;
78 import java.util.Map.Entry;
89
910 import org.apache.lucene.analysis.Analyzer;
@@ -38,6 +39,16 @@
3940 System.out.print("["+t.getKey()+"] => ["+t.getValue()+"] ");
4041 }
4142 if(iw.size()!=0) System.out.println();
 43+
 44+ HashSet<String> keywords = parser.getKeywords();
 45+ if(keywords.size()!=0){
 46+ System.out.print("KEYWORDS: ");
 47+ }
 48+ for(String t : keywords){
 49+ System.out.print("["+t+"] ");
 50+ }
 51+ if(keywords.size()!=0) System.out.println();
 52+
4253 System.out.println();
4354 }
4455
@@ -75,6 +86,10 @@
7687 showTokens(text);
7788 text = "{{IPstack|name = Hundai}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages.";
7889 showTokens(text);
 90+ text = "[[First link]]\n== Some caption ==\n[[Other link]]";
 91+ showTokens(text);
 92+ text = "[[First]] second third fourth and so on goes the ... [[last link]]";
 93+ showTokens(text);
7994
8095 ArticlesParser ap = new ArticlesParser("./test-data/indexing-articles.test");
8196 ArrayList<TestArticle> articles = ap.getArticles();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java
@@ -29,7 +29,7 @@
3030 this.page = page;
3131 }
3232 public void writeEndPage() throws IOException {
33 - String key = page.Title.Namespace+":"+page.Title.Text.toLowerCase();
 33+ String key = page.Title.Namespace+":"+page.Title.Text;
3434 titles.put(key,new Rank(0));
3535 }
3636 public HashMap<String,Rank> getTitles() {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -1,7 +1,9 @@
22 package org.wikimedia.lsearch.importer;
33
44 import java.io.IOException;
 5+import java.util.ArrayList;
56 import java.util.HashMap;
 7+import java.util.Map.Entry;
68 import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
79 import java.util.regex.Matcher;
810 import java.util.regex.Pattern;
@@ -43,21 +45,17 @@
4446 }
4547 public void writeEndPage() throws IOException {
4648 // get rank
47 - String key = page.Title.Namespace+":"+page.Title.Text.toLowerCase();
 49+ String key = page.Title.Namespace+":"+page.Title.Text;
4850 Rank r = ranks.get(key);
4951 int rank;
50 - boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode)!=null;
 52+ boolean isRedirect = r.redirectsTo != null;
5153 if(r == null){
5254 rank = 0;
53 - log.error("Rank for "+(page.Title.Namespace+":"+page.Title.Text.toLowerCase())+" is undefined, which should never happen.");
54 - } else{
55 - if(r.redirect != null && key.equals(r.redirect) && isRedirect){
56 - rank = 0;
57 - } else
58 - rank = r.links;
59 - }
 55+ log.error("Rank for "+key+" is undefined, which should never happen.");
 56+ } else
 57+ rank = r.links;
6058 // make article
61 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,rank);
 59+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,rank,r.redirected);
6260 writer.addArticle(article);
6361 count++;
6462 if(limit >= 0 && count > limit)
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/RankReader.java
@@ -6,6 +6,7 @@
77 import java.util.regex.Matcher;
88 import java.util.regex.Pattern;
99
 10+import org.apache.commons.lang.WordUtils;
1011 import org.apache.log4j.Logger;
1112 import org.mediawiki.importer.DumpWriter;
1213 import org.mediawiki.importer.Page;
@@ -49,26 +50,53 @@
5051 this.page = page;
5152 }
5253 public void writeEndPage() throws IOException {
53 - Rank r = ranks.get(page.Title.Namespace+":"+page.Title.Text.toLowerCase());
 54+ Rank r = ranks.get(page.Title.Namespace+":"+page.Title.Text);
5455 // register redirect
5556 String redirect = Localization.getRedirectTarget(revision.Text,langCode);
5657 if( redirect !=null ){
57 - redirect = redirect.toLowerCase();
5858 int ns = 0;
5959 String title = redirect;
6060 String[] parts = redirect.split(":",2);
6161 if(parts.length == 2 && parts[0].length()>1){
62 - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1));
 62+ Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
6363 if(inx != null){
6464 ns = inx;
6565 title = parts[1];
6666 }
6767 }
68 - r.redirect = ns+":"+title;
 68+ r.redirectsTo = findRank(ns,title);
6969 } else // process links
7070 processRanks(revision.Text,page.Title.Namespace);
7171 }
7272
 73+ /** Find the rank object for the ns:title */
 74+ protected Rank findRank(int ns, String title){
 75+ String key;
 76+ Rank rank;
 77+ // try exact match
 78+ key = ns+":"+title;
 79+ rank = ranks.get(key);
 80+ if(rank != null)
 81+ return rank;
 82+ // try lowercase
 83+ key = ns+":"+title.toLowerCase();
 84+ rank = ranks.get(key);
 85+ if(rank != null)
 86+ return rank;
 87+ // try title case
 88+ key = ns+":"+WordUtils.capitalize(title);
 89+ rank = ranks.get(key);
 90+ if(rank != null)
 91+ return rank;
 92+ // try capitalizing at word breaks
 93+ key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
 94+ rank = ranks.get(key);
 95+ if(rank != null)
 96+ return rank;
 97+
 98+ return null;
 99+ }
 100+
73101 /** Extract all links from this page, and increment ranks for linked pages */
74102 protected void processRanks(String text, int namespace) {
75103 Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
@@ -76,14 +104,12 @@
77105 int ns; String title;
78106 boolean escaped;
79107
80 - HashSet<String> links = new HashSet<String>();
 108+ HashSet<Rank> links = new HashSet<Rank>();
81109 while(matcher.find()){
82 - String link = matcher.group(1).toLowerCase();
 110+ String link = matcher.group(1);
83111 int fragment = link.lastIndexOf('#');
84112 if(fragment != -1)
85113 link = link.substring(0,fragment);
86 - if(link.length() > 100)
87 - continue; // probably an error
88114 //System.out.println("Got link "+link);
89115 if(link.startsWith(":")){
90116 escaped = true;
@@ -94,7 +120,7 @@
95121 // check for ns:title syntax
96122 String[] parts = link.split(":",2);
97123 if(parts.length == 2 && parts[0].length() > 1){
98 - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1));
 124+ Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
99125 if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14)))
100126 continue; // categories, ignore
101127 if(inx!=null && inx < 0)
@@ -108,17 +134,17 @@
109135 if(interwiki.contains(parts[0]))
110136 continue;
111137 }
 138+ if(ns == 0 && namespace!=0)
 139+ continue; // skip links from other namespaces into the main namespace
 140+
112141 // register as link
113 - String key = ns+":"+title;
114 - links.add(key);
 142+ Rank target = findRank(ns,title);
 143+ if(target != null)
 144+ links.add(target);
115145 }
116146 // increment page ranks
117 - for(String t : links){
118 - if(t.startsWith("0:") && namespace!=0)
119 - continue; // skip links from other namespaces into the main namespace
120 - Rank rank = ranks.get(t);
121 - if(rank != null)
122 - rank.links++;
 147+ for(Rank rank : links){
 148+ rank.links++;
123149 }
124150 }
125151 public void writeSiteinfo(Siteinfo info) throws IOException {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -86,6 +86,7 @@
8787 else
8888 writer.setMaxBufferedDocs(glMaxBufDocs);
8989 writer.setUseCompoundFile(true);
 90+ writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
9091
9192 return writer;
9293 }
@@ -94,7 +95,7 @@
9596 public void addArticle(Article a){
9697 if(!WikiIndexModifier.checkAddPreconditions(a,langCode))
9798 return; // don't add if preconditions are not met
98 -
 99+ WikiIndexModifier.transformArticleForIndexing(a);
99100 IndexId target;
100101 if(iid.isSingle())
101102 target = iid;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -5,6 +5,7 @@
66 import java.util.ArrayList;
77 import java.util.HashMap;
88 import java.util.HashSet;
 9+import java.util.Map.Entry;
910
1011 import org.apache.log4j.Logger;
1112 import org.mediawiki.dumper.ProgressFilter;
@@ -95,17 +96,19 @@
9697 long start = System.currentTimeMillis();
9798
9899 HashMap<String,Rank> ranks = processRanks(inputfile,getTitles(inputfile),langCode);
99 -
 100+
100101 // add-up ranks of redirects to pages where they redirect to
101 - for(Rank r : ranks.values()){
102 - if(r.redirect != null){
103 - Rank dest = ranks.get(r.redirect);
104 - if(dest != null && dest != r){
105 - dest.links += r.links;
106 - r.links = 0;
107 - }
 102+ for(Entry<String,Rank> e : ranks.entrySet()){
 103+ Rank r = e.getValue();
 104+ if(r.redirectsTo != null && r != r.redirectsTo){
 105+ r.redirectsTo.links += r.links;
 106+ r.links = 0;
 107+ if(r.redirectsTo.redirected == null)
 108+ r.redirectsTo.redirected = new ArrayList<String>();
 109+ r.redirectsTo.redirected.add(e.getKey());
108110 }
109111 }
 112+
110113 log.info("Third pass, indexing articles...");
111114
112115 // open
@@ -119,7 +122,7 @@
120123
121124 // read
122125 DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,ranks,langCode);
123 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
 126+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000));
124127 try {
125128 reader.readDump();
126129 } catch (IOException e) {
@@ -168,7 +171,7 @@
169172 }
170173 // calculate ranks
171174 RankReader rr = new RankReader(ranks,langCode);
172 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 100));
 175+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
173176 try {
174177 reader.readDump();
175178 } catch (IOException e) {
@@ -189,7 +192,7 @@
190193 }
191194 // first pass, get titles
192195 TitleReader tr = new TitleReader();
193 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 100));
 196+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
194197 try {
195198 reader.readDump();
196199 input.close();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java
@@ -106,7 +106,6 @@
107107 String globalurl = getString("MWConfig","global");
108108 String indexpath = getString("Indexes","path");
109109 String oairepo = getString("OAI","repo");
110 - String[] dbsuffixes = getArray("Database","suffix");
111110 if(globalurl==null){
112111 System.out.println("FATAL: Need to define global configuration url in local config file.");
113112 System.exit(1);
@@ -115,7 +114,7 @@
116115 System.exit(1);
117116 }
118117 try {
119 - global.readFromURL(new URL(globalurl),indexpath,oairepo,dbsuffixes);
 118+ global.readFromURL(new URL(globalurl),indexpath,oairepo);
120119 } catch (MalformedURLException e) {
121120 System.out.println("Malformed URL "+globalurl+" cannot read global configuration (check MWConfig.global in "+CONF_FILE_NAME+"), exiting...");
122121 System.exit(1);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -5,8 +5,10 @@
66 package org.wikimedia.lsearch.config;
77
88 import java.io.BufferedReader;
 9+import java.io.ByteArrayInputStream;
910 import java.io.IOException;
1011 import java.io.InputStreamReader;
 12+import java.io.StringReader;
1113 import java.net.Inet4Address;
1214 import java.net.InetAddress;
1315 import java.net.MalformedURLException;
@@ -19,6 +21,7 @@
2022 import java.util.HashSet;
2123 import java.util.Hashtable;
2224 import java.util.Iterator;
 25+import java.util.Properties;
2326 import java.util.Set;
2427 import java.util.regex.Matcher;
2528 import java.util.regex.Pattern;
@@ -63,8 +66,12 @@
6467 /** OAI repo pattern from lsearch2.conf */
6568 protected String OAIRepoPattern;
6669 /** Database suffix if dbname, the rest is supposed to be language, e.g srwiki => (suffix wiki) => sr */
67 - protected String[] databaseSuffixes;
 70+ protected String[] databaseSuffixes = null;
 71+ /** Databases ending in suffix will use additional keyword scores */
 72+ protected String[] keywordScoringSuffixes = null;
6873
 74+ protected Properties globalProperties = null;
 75+
6976 /** All identifiers of all indexes (dbrole -> IndexId) */
7077 protected static Hashtable<String,IndexId> indexIdPool = new Hashtable<String,IndexId>();
7178
@@ -192,13 +199,13 @@
193200 * @param url
194201 * @throws IOException
195202 */
196 - public void readFromURL(URL url, String indexpath, String oaiRepo, String[] dbsuffixes) throws IOException{
 203+ public void readFromURL(URL url, String indexpath, String oaiRepo) throws IOException{
197204 BufferedReader in;
198205 try {
199206 in = new BufferedReader(
200207 new InputStreamReader(
201208 url.openStream()));
202 - read(in,indexpath,oaiRepo,dbsuffixes);
 209+ read(in,indexpath,oaiRepo);
203210 } catch (IOException e) {
204211 System.out.println("I/O Error in opening or reading global config at url "+url);
205212 throw e;
@@ -221,6 +228,13 @@
222229 namespacePrefixAll = "all"; // default
223230 }
224231
 232+ protected String[] getArrayProperty(String name){
 233+ String s = globalProperties.getProperty(name);
 234+ if (s != null)
 235+ return s.split(" ");
 236+ return null;
 237+ }
 238+
225239 /**
226240 * Reads a config file from a bufferedreader, will
227241 * close the reader when done.
@@ -228,7 +242,7 @@
229243 * @param in opened reader
230244 * @throws IOException
231245 */
232 - protected void read(BufferedReader in, String indexpath, String oaiRepo, String[] dbsuffixes) throws IOException{
 246+ protected void read(BufferedReader in, String indexpath, String oaiRepo) throws IOException{
233247 String line="";
234248 int section = -1;
235249 Pattern roleRegexp = Pattern.compile("\\((.*?)\\)");
@@ -245,7 +259,6 @@
246260 init();
247261 this.indexPath = indexpath;
248262 this.OAIRepoPattern = oaiRepo == null? "" : oaiRepo;
249 - this.databaseSuffixes = dbsuffixes;
250263
251264 while((line = in.readLine()) != null){
252265 lineNum ++;
@@ -260,6 +273,27 @@
261274 if(line.startsWith("[") && line.length()>2 && !Character.isDigit(line.charAt(1))){ // section
262275 int last = line.indexOf("]");
263276 String s = line.substring(1,last);
 277+
 278+ if(s.equalsIgnoreCase("properties")){
 279+ globalProperties = new Properties();
 280+ StringBuilder prop = new StringBuilder(line+"\n");
 281+ while((line = in.readLine()) != null){
 282+ if(line.startsWith("[") && line.length()>2 && !Character.isDigit(line.charAt(1)))
 283+ break;
 284+ prop.append(line);
 285+ prop.append("\n");
 286+ }
 287+ globalProperties.load(new ByteArrayInputStream(prop.toString().getBytes("utf-8")));
 288+ // get some predefined global properties
 289+ this.databaseSuffixes = getArrayProperty("Database.suffix");
 290+ this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix");
 291+ if(line == null)
 292+ break;
 293+ // else: line points to beginning of next section
 294+ last = line.indexOf("]");
 295+ s = line.substring(1,last);
 296+ }
 297+
264298 if(s.equalsIgnoreCase("database"))
265299 section = DATABASE;
266300 else if(s.equalsIgnoreCase("index"))
@@ -314,8 +348,7 @@
315349 if(filter.equalsIgnoreCase("<all>"))
316350 namespacePrefixAll = prefix;
317351 else
318 - namespacePrefix.put(prefix,new NamespaceFilter(filter));
319 -
 352+ namespacePrefix.put(prefix,new NamespaceFilter(filter));
320353 }
321354 }
322355 if( !checkIntegrity() ){
@@ -769,6 +802,24 @@
770803 return namespacePrefixAll;
771804 }
772805
 806+ /** Returns whether keyword scoring should be used for this db, using
 807+ * the suffixes from the global configuration
 808+ *
 809+ * @param dbname
 810+ * @return
 811+ */
 812+ public boolean useKeywordScoring(String dbname){
 813+ if(keywordScoringSuffixes == null)
 814+ return false;
 815+ else{
 816+ for (String suffix : keywordScoringSuffixes) {
 817+ if (dbname.endsWith(suffix))
 818+ return true;
 819+ }
 820+ }
 821+ return false;
 822+ }
773823
 824+
774825
775826 }
\ No newline at end of file
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -188,6 +188,7 @@
189189 }
190190 long startTime = System.currentTimeMillis();
191191 // rsync
 192+ log.info("Starting rsync of "+iid);
192193 String snapshotpath = iid.getRsyncSnapshotPath()+"/"+li.timestamp;
193194 command = "/usr/bin/rsync -W --delete -r rsync://"+iid.getIndexHost()+":"+snapshotpath+" "+iid.getUpdatePath();
194195 log.debug("Running shell command: "+command);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -135,10 +135,10 @@
136136
137137 try {
138138 if(nsfw == null){
139 - q = parser.parseTwoPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE);
 139+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
140140 }
141141 else{
142 - q = parser.parseTwoPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE);
 142+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
143143 log.info("Using NamespaceFilterWrapper "+nsfw);
144144 }
145145
@@ -250,10 +250,13 @@
251251 // fetch documents
252252 Document[] docs = s.docs(docids);
253253 int j=0;
 254+ float maxScore = 1;
 255+ if(numhits>0)
 256+ maxScore = hits.score(0);
254257 for(Document doc : docs){
255258 String namespace = doc.get("namespace");
256259 String title = doc.get("title");
257 - float score = transformScore(scores[j]);
 260+ float score = transformScore(scores[j]/maxScore);
258261 ResultSet rs = new ResultSet(score,namespace,title);
259262 if(explain)
260263 rs.setExplanation(((IndexSearcherMul)s).explain(q,docids[j]));
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java
@@ -65,7 +65,7 @@
6666
6767 try{
6868 for(int i=0; i < count ; i++){
69 - Query q = parser.parseTwoPass(terms.next(),WikiQueryParser.NamespacePolicy.IGNORE);
 69+ Query q = parser.parseFourPass(terms.next(),WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
7070 Hits hits = is.search(q);
7171 for(int j =0; j<20 && j<hits.length(); j++)
7272 hits.doc(j); // retrieve some documents
@@ -117,7 +117,7 @@
118118 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
119119 try{
120120 WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
121 - Query q = parser.parseTwoPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE);
 121+ Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
122122 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
123123 } catch (IOException e) {
124124 log.error("Error warming up local IndexSearcherMul for "+iid);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -6,6 +6,8 @@
77
88 import java.io.File;
99 import java.io.IOException;
 10+import java.util.ArrayList;
 11+import java.util.Arrays;
1012 import java.util.Collection;
1113 import java.util.Collections;
1214 import java.util.HashSet;
@@ -24,7 +26,9 @@
2527 import org.apache.lucene.store.Directory;
2628 import org.apache.lucene.store.FSDirectory;
2729 import org.wikimedia.lsearch.analyzers.Analyzers;
 30+import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
2831 import org.wikimedia.lsearch.analyzers.FilterFactory;
 32+import org.wikimedia.lsearch.analyzers.WikiTokenizer;
2933 import org.wikimedia.lsearch.beans.Article;
3034 import org.wikimedia.lsearch.beans.IndexReportCard;
3135 import org.wikimedia.lsearch.config.GlobalConfiguration;
@@ -48,13 +52,13 @@
4953 }
5054 }
5155
 56+ static public final int MAX_FIELD_LENGTH = 100000;
5257 /** Simple implementation of batch addition and deletion */
5358 class SimpleIndexModifier {
5459 protected IndexId iid;
5560 protected IndexReader reader;
5661 protected IndexWriter writer;
57 - protected boolean rewrite;
58 - protected int maxFieldLength;
 62+ protected boolean rewrite;
5963 protected String langCode;
6064
6165 protected HashSet<IndexUpdateRecord> nonDeleteDocuments;
@@ -75,13 +79,8 @@
7680 this.iid = iid;
7781 this.rewrite = rewrite;
7882 this.langCode = langCode;
79 - maxFieldLength = 0;
8083 reportQueue = new Hashtable<IndexUpdateRecord,IndexReportCard>();
8184 }
82 -
83 - public void setMaxFieldLength(int maxFieldLength) {
84 - this.maxFieldLength = maxFieldLength;
85 - }
8685
8786 protected IndexReportCard getReportCard(IndexUpdateRecord rec){
8887 if(!rec.isReportBack())
@@ -168,8 +167,7 @@
169168 writer.setMergeFactor(mergeFactor);
170169 writer.setMaxBufferedDocs(maxBufDocs);
171170 writer.setUseCompoundFile(true);
172 - if(maxFieldLength!=0)
173 - writer.setMaxFieldLength(maxFieldLength);
 171+ writer.setMaxFieldLength(MAX_FIELD_LENGTH);
174172
175173 FilterFactory filters = new FilterFactory(langCode);
176174
@@ -179,6 +177,7 @@
180178 continue; // don't add if delete/add are paired operations
181179 if(!checkPreconditions(rec))
182180 continue; // article shouldn't be added for some (heuristic) reason
 181+ transformArticleForIndexing(rec.getArticle()); // transform record so that unnecessary stuff is deleted, e.g. some redirects
183182 IndexReportCard card = getReportCard(rec);
184183 Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters);
185184 Document doc = (Document) ret[0];
@@ -210,7 +209,7 @@
211210 }
212211 return succ;
213212 }
214 -
 213+
215214 public boolean checkPreconditions(IndexUpdateRecord rec){
216215 return checkAddPreconditions(rec.getArticle(),langCode);
217216 }
@@ -226,15 +225,41 @@
227226 public static boolean checkAddPreconditions(Article ar, String langCode){
228227 if(ar.getNamespace().equals("0")){
229228 String redirect = Localization.getRedirectTarget(ar.getContents(),langCode);
230 - if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
 229+ if(redirect != null)
 230+ return false; // don't add redirects
 231+ /*if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
231232 log.debug("Not adding "+ar+" into index: "+ar.getContents());
232233 return false;
233 - }
 234+ } */
234235 }
235236 return true;
236237 }
237238
238239 /**
 240+ * Changes the article, so that things we don't want to index are deleted,
 241+ * e.g. it deletes redirects from nonmain namespace to article in main namespace
 242+ *
 243+ * @param rec
 244+ */
 245+ public static void transformArticleForIndexing(Article ar) {
 246+ ArrayList<String> redirects = ar.getRedirects();
 247+ String ns = ar.getNamespace()+":";
 248+ if(redirects != null){
 249+ ArrayList<String> filtered = new ArrayList<String>();
 250+ // index only redirects from the same namespace
 251+ // to avoid a lot of unusable redirects from/to
 252+ // user namespace, but always index redirect FROM main
 253+ for(String r : redirects){
 254+ if(r.startsWith(ns) || r.startsWith("0:"))
 255+ filtered.add(r.split(":",2)[1]);
 256+ //else
 257+ //log.info("Ignoring redirect "+r+" to "+ar);
 258+ }
 259+ ar.setRedirects(filtered);
 260+ }
 261+ }
 262+
 263+ /**
239264 * Create necessary directories for index
240265 * @param dbname
241266 * @return relative path (to document root) of db within filesystem
@@ -347,6 +372,7 @@
348373 */
349374 public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){
350375 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
 376+ WikiTokenizer tokenizer = null;
351377 Document doc = new Document();
352378
353379 // This will be used to look up and replace entries on index updates.
@@ -357,10 +383,22 @@
358384
359385 // boost document title with it's article rank
360386 Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
361 - log.debug(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank());
362 - title.setBoost(calculateArticleRank(article.getRank()));
 387+ //log.debug(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
 388+ float rankBoost = calculateArticleRank(article.getRank());
 389+ title.setBoost(rankBoost);
363390 doc.add(title);
364391
 392+ // add titles of redirects, generated from analyzer
 393+ Field redirect = new Field("redirect", "",
 394+ Field.Store.NO, Field.Index.TOKENIZED);
 395+ redirect.setBoost(rankBoost);
 396+ doc.add(redirect);
 397+
 398+ // most significant words in the text get extra score, from analyzer
 399+ Field keyword = new Field("keyword", "",
 400+ Field.Store.NO, Field.Index.TOKENIZED);
 401+ doc.add(keyword);
 402+
365403 // the next fields are generated using wikitokenizer
366404 doc.add(new Field("contents", "",
367405 Field.Store.NO, Field.Index.TOKENIZED));
@@ -372,9 +410,13 @@
373411 String text = article.getContents();
374412 if(article.isRedirect())
375413 text=""; // for redirects index only the title
 414+ Object[] ret = Analyzers.getIndexerAnalyzer(text,filters,article.getRedirects());
 415+ perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0];
376416
377 - perFieldAnalyzer = Analyzers.getIndexerAnalyzer(text,filters);
378 -
 417+ // set boost for keyword field
 418+ tokenizer = (WikiTokenizer) ret[1];
 419+ keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size()));
 420+
379421 return new Object[] { doc, perFieldAnalyzer };
380422 }
381423
@@ -392,5 +434,19 @@
393435 else
394436 return (float) (1 + rank/15.0);
395437 }
 438+
 439+ /**
 440+ * We don't want whole stub articles fetched as keywords, so we penalize if
 441+ * the article is too short for keyword extraction.
 442+ *
 443+ * @param numTokens
 444+ * @return
 445+ */
 446+ public static float calculateKeywordsBoost(int numTokens){
 447+ if(numTokens > 2 * FastWikiTokenizerEngine.KEYWORD_TOKEN_LIMIT)
 448+ return 1;
 449+ else
 450+ return ((float)numTokens)/FastWikiTokenizerEngine.KEYWORD_TOKEN_LIMIT/2;
 451+ }
396452
397453 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -20,6 +20,9 @@
2121 *
2222 * For titles:
2323 * * 1/sqrt(term^3)
 24+ *
 25+ * For redirect:
 26+ * * no length norm
2427 *
2528 */
2629 @Override
@@ -36,6 +39,8 @@
3740 float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
3841 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
3942 return f;
 43+ } else if(fieldName.equals("redirect") || fieldName.equals("keyword")){
 44+ return 1;
4045 } else
4146 return super.lengthNorm(fieldName,numTokens);
4247
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -0,0 +1,113 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+import java.util.ArrayList;
 7+import java.util.HashSet;
 8+
 9+import org.apache.log4j.Logger;
 10+import org.apache.lucene.analysis.Analyzer;
 11+import org.apache.lucene.analysis.Token;
 12+import org.apache.lucene.analysis.TokenStream;
 13+
 14+/**
 15+ * Analyzer that builds a field with an array of keywords,
 16+ * each keyword is separated by a large token gap, so it's
 17+ * convenient to run SpanNearQueries on the field. Keywords
 18+ * themselves are tokenized. E.g.
 19+ *
 20+ * ("something different", "other") ->
 21+ * "something" +1 "different" +201 "other"
 22+ *
 23+ * Currently used for fields "redirect" and "keyword"
 24+ *
 25+ * @author rainman
 26+ *
 27+ */
 28+public class KeywordsAnalyzer extends Analyzer{
 29+ static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
 30+ protected ArrayList<String> keywords;
 31+ protected FilterFactory filters;
 32+ protected KeywordsTokenStream tokens;
 33+
 34+ public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters){
 35+ ArrayList<String> k = new ArrayList<String>();
 36+ k.addAll(keywords);
 37+ tokens = new KeywordsTokenStream(k,filters);
 38+ }
 39+
 40+ public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters){
 41+ tokens = new KeywordsTokenStream(keywords,filters);
 42+ }
 43+ /** positional increment between different redirects */
 44+ public static final int tokenGap = 201;
 45+
 46+ @Override
 47+ public TokenStream tokenStream(String fieldName, Reader reader) {
 48+ return tokens;
 49+ }
 50+ @Override
 51+ public TokenStream tokenStream(String fieldName, String text) {
 52+ return tokens;
 53+ }
 54+
 55+ class KeywordsTokenStream extends TokenStream {
 56+ protected Analyzer analyzer;
 57+ protected ArrayList<String> keywords;
 58+ protected int index;
 59+ protected String keyword;
 60+ protected TokenStream tokens;
 61+
 62+ public KeywordsTokenStream(ArrayList<String> keywords, FilterFactory filters){
 63+ this.analyzer = new QueryLanguageAnalyzer(filters);
 64+ this.keywords = keywords;
 65+ this.index = 0;
 66+ this.keyword = null;
 67+ this.tokens = null;
 68+ }
 69+ @Override
 70+ public Token next() throws IOException {
 71+ if(keywords == null)
 72+ return null; // nothing to do
 73+ Token t;
 74+ if(keyword == null){
 75+ t = openNext();
 76+ return t;
 77+ }
 78+ if(keyword != null && tokens!=null){
 79+ t = tokens.next();
 80+ if(t == null){
 81+ t = openNext();
 82+ if(t != null)
 83+ t.setPositionIncrement(tokenGap);
 84+ }
 85+ return t;
 86+ } else{
 87+ log.warn("Inconsistent state: key="+keyword+", tokens="+tokens);
 88+ }
 89+ return null;
 90+ }
 91+
 92+ protected Token openNext() throws IOException {
 93+ Token t;
 94+ if(index >= keywords.size())
 95+ return null; // processed all keywords
 96+ // try subsequent keyword titles until we find one with
 97+ // title that can be tokenized
 98+ do{
 99+ // next keyword title
 100+ keyword = keywords.get(index++);
 101+ tokens = analyzer.tokenStream("",keyword);
 102+ // try to tokenize
 103+ t = tokens.next();
 104+ if(t == null && index == keywords.size())
 105+ return null; // last token
 106+ else if(t!=null)
 107+ return t;
 108+ } while(keyword == null);
 109+ return null;
 110+ }
 111+
 112+ }
 113+
 114+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java
@@ -4,6 +4,7 @@
55 import java.io.Reader;
66 import java.util.ArrayList;
77 import java.util.HashMap;
 8+import java.util.HashSet;
89 import java.util.Iterator;
910
1011 import org.apache.log4j.Logger;
@@ -17,6 +18,7 @@
1819 protected Iterator<Token> tokenIt = null;
1920 protected ArrayList<String> categories = null;
2021 protected HashMap<String,String> interwikis = null;
 22+ protected HashSet<String> keywords = null;
2123
2224 /** Use <code>WikiTokenizer(String)</code> constructor */
2325 @Deprecated
@@ -52,6 +54,7 @@
5355 tokenIt = tokens.iterator();
5456 categories = parser.getCategories();
5557 interwikis = parser.getInterwikis();
 58+ keywords = parser.getKeywords();
5659 }
5760 }
5861
@@ -84,7 +87,12 @@
8588 public ArrayList<Token> getTokens() {
8689 return tokens;
8790 }
 91+
 92+ public HashSet<String> getKeywords() {
 93+ return keywords;
 94+ }
8895
8996
 97+
9098
9199 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -7,6 +7,7 @@
88 import java.util.HashSet;
99 import java.util.Hashtable;
1010
 11+import org.apache.commons.lang.WordUtils;
1112 import org.apache.lucene.analysis.Token;
1213 import org.wikimedia.lsearch.util.Localization;
1314 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -35,6 +36,7 @@
3637 private ArrayList<Token> tokens;
3738 protected ArrayList<String> categories;
3839 protected HashMap<String,String> interwikis;
 40+ protected HashSet<String> keywords;
3941 private int length = 0; // length of token
4042 private int start = 0; // start position of token
4143 private int cur = 0; // current position in the input string
@@ -45,11 +47,17 @@
4648 private int decompi;
4749 private char cl; // lowercased character
4850 private boolean numberToken; // if the buffer holds a number token
 51+ private int headings = 0; // how many headings did we see
4952
5053 private int prefixLen = 0;
5154 private final char[] prefixBuf = new char[MAX_WORD_LEN];
5255 private int semicolonInx = -1;
 56+ private final char[] keywordBuf = new char[MAX_WORD_LEN];
 57+ private int keywordLen = 0;
5358
 59+ /** This many tokens from the beginning of the text are eligible for keywords */
 60+ public static final int KEYWORD_TOKEN_LIMIT = 250;
 61+
5462 /** language code */
5563 private String language;
5664 /** language code -> set (image namespace names) */
@@ -60,12 +68,12 @@
6169
6270 private UnicodeDecomposer decomposer;
6371
64 - enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END,
 72+ enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD,
6573 LINK_FETCH, IGNORE, EXTERNAL_URL, EXTERNAL_WORDS,
6674 TEMPLATE_BEGIN, TEMPLATE_WORDS, TEMPLATE_END,
6775 TABLE_BEGIN};
6876
69 - enum FetchState { WORD, CATEGORY, INTERWIKI};
 77+ enum FetchState { WORD, CATEGORY, INTERWIKI, KEYWORD };
7078
7179
7280 private void init(){
@@ -73,6 +81,7 @@
7482 categories = new ArrayList<String>();
7583 interwikis = new HashMap<String,String>();
7684 decomposer = UnicodeDecomposer.getInstance();
 85+ keywords = new HashSet<String>();
7786 numberToken = false;
7887 }
7988
@@ -258,6 +267,50 @@
259268 return Localization.getRedirectTarget(textString,language)!=null;
260269 }
261270
 271+ /**
 272+ * Decide whether the link currently being processed should be appended to the list of keywords.
 273+ *
 274+ * Criterion: the link appears within the first 250 tokens (KEYWORD_TOKEN_LIMIT)
 275+ * and before the first heading.
 276+ *
 277+ */
 278+ protected boolean isGoodKeywordLink(){
 279+ return headings == 0 && tokens.size() <= KEYWORD_TOKEN_LIMIT;
 280+ }
 281+
 282+ /** When encountering '=' check if this line is actually a heading */
 283+ private void checkHeadings() {
 284+ // make sure = is at the beginning of a line
 285+ if(cur == 0 || text[cur-1]=='\n' || text[cur-1]=='\r'){
 286+ int endOfLine;
 287+ // find end of line/text
 288+ for(endOfLine = cur ; endOfLine < textLength ; endOfLine++ ){
 289+ lc = text[endOfLine];
 290+ if(lc == '\n' || lc =='\r')
 291+ break;
 292+ }
 293+ int start=0, end=0; // number of ='s at beginning and end of line
 294+ // find first sequence of =
 295+ for(lookup = cur ; lookup < textLength && lookup < endOfLine ; lookup++ ){
 296+ if(text[lookup] == '=')
 297+ start++;
 298+ else
 299+ break;
 300+ }
 301+ // find the last sequence of =
 302+ for(lookup = endOfLine-1 ; lookup > cur ; lookup-- ){
 303+ if(text[lookup] == '=')
 304+ end++;
 305+ else
 306+ break;
 307+ }
 308+ // check
 309+ if(start == end && start != 0 && start+end<endOfLine-cur && start>=2 && start<=4){
 310+ headings++;
 311+ }
 312+ }
 313+ }
 314+
262315 /**
263316 * Parse Wiki text, and produce an arraylist of tokens.
264317 * Also fills the lists categories and interwikis.
@@ -281,6 +334,9 @@
282335 switch(state){
283336 case WORD:
284337 switch(c){
 338+ case '=':
 339+ checkHeadings();
 340+ break;
285341 case '<':
286342 addToken();
287343 state = ParserState.IGNORE;
@@ -369,12 +425,18 @@
370426 fetch = FetchState.INTERWIKI;
371427 state = ParserState.LINK_FETCH;
372428 continue;
373 - } else{
374 - // unrecognized, ignore
375 - cur--;
376 - continue;
377429 }
378430 }
 431+ // add this link to keywords?
 432+ if(isGoodKeywordLink()){
 433+ fetch = FetchState.KEYWORD;
 434+ state = ParserState.LINK_KEYWORD;
 435+ if(pipeInx != -1)
 436+ cur = pipeInx; // ignore up to pipe
 437+ else
 438+ cur--; // return the first character of link
 439+ continue;
 440+ }
379441
380442 // no semicolon, search for pipe:
381443 if(pipeInx != -1){
@@ -384,6 +446,11 @@
385447 addLetter();
386448 continue;
387449 }
 450+ case LINK_KEYWORD:
 451+ if(keywordLen < keywordBuf.length && c!=']'){
 452+ keywordBuf[keywordLen++] = c;
 453+ }
 454+ // fall-thru
388455 case LINK_WORDS:
389456 if(c == ']'){
390457 state = ParserState.LINK_END;
@@ -419,7 +486,7 @@
420487
421488 if(length<buffer.length)
422489 buffer[length++] = c;
423 - continue;
 490+ continue;
424491 case LINK_END:
425492 if(c == ']'){ // good link ending
426493 state = ParserState.WORD;
@@ -439,6 +506,11 @@
440507 length = 0;
441508 fetch = FetchState.WORD;
442509 continue;
 510+ case KEYWORD:
 511+ keywords.add(new String(keywordBuf,0,keywordLen));
 512+ keywordLen = 0;
 513+ fetch = FetchState.WORD;
 514+ continue;
443515 }
444516 } else{
445517 // bad syntax, ignore any categories, etc..
@@ -478,7 +550,7 @@
479551 addToken();
480552 return tokens;
481553 }
482 -
 554+
483555 /** Check if this is an "image" keyword using localization */
484556 private final boolean isImage(String prefix){
485557 prefix = prefix.toLowerCase();
@@ -530,4 +602,10 @@
531603 public ArrayList<Token> getTokens() {
532604 return tokens;
533605 }
 606+
 607+ public HashSet<String> getKeywords() {
 608+ return keywords;
 609+ }
 610+
 611+
534612 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -18,6 +18,9 @@
1919 import org.apache.lucene.search.Query;
2020 import org.apache.lucene.search.TermQuery;
2121 import org.apache.lucene.search.WildcardQuery;
 22+import org.apache.lucene.search.spans.SpanNearQuery;
 23+import org.apache.lucene.search.spans.SpanQuery;
 24+import org.apache.lucene.search.spans.SpanTermQuery;
2225 import org.wikimedia.lsearch.config.GlobalConfiguration;
2326 import org.wikimedia.lsearch.search.NamespaceFilter;
2427 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -69,7 +72,9 @@
7073 /** boost for alias words from analyzer */
7174 public final float ALIAS_BOOST = 0.5f;
7275 /** boost for title field */
73 - public static float TITLE_BOOST = 4;
 76+ public static float TITLE_BOOST = 4;
 77+ public static float REDIRECT_BOOST = 0.5f;
 78+ public static float KEYWORD_BOOST = 1;
7479
7580 /** Policies in treating field names:
7681 *
@@ -90,6 +95,7 @@
9196 private Query namespaceRewriteQuery;
9297 private NamespacePolicy namespacePolicy;
9398 protected NamespaceFilter defaultNamespaceFilter;
 99+ protected static GlobalConfiguration global=null;
94100
95101 /** default value for boolean queries */
96102 public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST;
@@ -102,7 +108,8 @@
103109 protected void initNamespaces(){
104110 if(namespaceQueries != null)
105111 return;
106 - GlobalConfiguration global = GlobalConfiguration.getInstance();
 112+ if(global == null)
 113+ global = GlobalConfiguration.getInstance();
107114 namespaceAllKeyword = global.getNamespacePrefixAll();
108115 namespaceQueries = new Hashtable<String,Query>();
109116 namespacePrefixes = new Hashtable<NamespaceFilter,String>();
@@ -891,16 +898,146 @@
892899 return query;
893900 }
894901
 902+ protected boolean isNamespaceQuery(Query q){
 903+ if(q instanceof TermQuery)
 904+ return ((TermQuery)q).getTerm().field().equals("namespace");
 905+ else if(q instanceof BooleanQuery){
 906+ for(BooleanClause cl : ((BooleanQuery)q).getClauses()){
 907+ if(cl.getQuery() instanceof TermQuery &&
 908+ ((TermQuery)cl.getQuery()).getTerm().field().equals("namespace"));
 909+ else
 910+ return false;
 911+ }
 912+ return true;
 913+ }
 914+ return false;
 915+ }
 916+
895917 /**
896 - * Parse the query according to policy. Instead of rewrite phrase, simply pass
897 - * twice the query with different default fields.
 918+ * Using some very simple analysis, extract span queries to use for the
 919+ * redirect field. Currently extracts only if all boolean clauses are
 920+ * required or if it's a phrase query, since making span
 921+ * queries is non-trivial in other cases. :(
 923+ * The function heavily depends on the format of output of parser,
 924+ * especially for rewrite.
 925+ *
 926+ * @param query
 927+ * @param level - recursion level
 928+ * @return
 929+ */
 930+ protected Query extractSpans(Query query, int level, String fieldName, float boost) {
 931+ // phrase, or termquery just rewrite field name
 932+ if(query instanceof TermQuery){
 933+ TermQuery tq = (TermQuery)query;
 934+ TermQuery ret = new TermQuery(new Term(fieldName,tq.getTerm().text()));
 935+ ret.setBoost(boost);
 936+ return ret;
 937+ } else if(query instanceof PhraseQuery){
 938+ PhraseQuery phrase = new PhraseQuery();
 939+ for(Term term : ((PhraseQuery)query).getTerms()){
 940+ phrase.add(new Term(fieldName,term.text()));
 941+ }
 942+ phrase.setBoost(boost);
 943+ return phrase;
 944+ } else if(query instanceof BooleanQuery){
 945+ BooleanQuery bq = (BooleanQuery)query;
 946+ // check for rewritten queries, TODO: parse complex multi-part rewrites
 947+ if(level==0 && namespacePolicy != null && namespacePolicy == NamespacePolicy.REWRITE){
 948+ if(bq.getClauses().length == 2 && isNamespaceQuery(bq.getClauses()[0].getQuery())){
 949+ BooleanQuery ret = new BooleanQuery();
 950+ ret.add(bq.getClauses()[0]);
 951+ // the second clause is always the query
 952+ ret.add(extractSpans(bq.getClauses()[1].getQuery(),level+1,fieldName,boost),BooleanClause.Occur.MUST);
 953+ return ret;
 954+ } else
 955+ return null;
 956+ }
 957+ // we can parse if all clauses are required
 958+ boolean canTransform = true;
 959+ for(BooleanClause cl : bq.getClauses()){
 960+ if(cl.getOccur() != BooleanClause.Occur.MUST){
 961+ canTransform = false;
 962+ break;
 963+ }
 964+ }
 965+ if(!canTransform)
 966+ return null;
 967+ // rewrite into span queries + categories
 968+ ArrayList<SpanQuery> spans = new ArrayList<SpanQuery>();
 969+ ArrayList<Query> categories = new ArrayList<Query>();
 970+ for(BooleanClause cl : bq.getClauses()){
 971+ Query q = cl.getQuery();
 972+ if(q instanceof TermQuery){ // -> SpanTermQuery
 973+ TermQuery tq = (TermQuery)q;
 974+ Term t = tq.getTerm();
 975+ if(t.field().equals("category")){
 976+ categories.add(q);
 977+ } else {
 978+ SpanTermQuery stq = new SpanTermQuery(new Term(fieldName,t.text()));
 979+ stq.setBoost(boost);
 980+ spans.add(stq);
 981+ }
 982+ } else if(q instanceof PhraseQuery){ // -> SpanNearQuery(slop=0,inOrder=true)
 983+ PhraseQuery pq = (PhraseQuery)q;
 984+ Term[] terms = pq.getTerms();
 985+ if(terms[0].field().equals("category")){
 986+ categories.add(q);
 987+ } else{
 988+ SpanTermQuery[] spanTerms = new SpanTermQuery[terms.length];
 989+ for(int i=0; i<terms.length; i++ ){
 990+ spanTerms[i] = new SpanTermQuery(new Term(fieldName,terms[i].text()));
 991+ }
 992+ SpanNearQuery snq = new SpanNearQuery(spanTerms,0,true);
 993+ snq.setBoost(boost);
 994+ spans.add(snq);
 995+ }
 996+ }
 997+ }
 998+ // create the queries
 999+ Query cat = null;
 1000+ SpanQuery span = null;
 1001+ if(categories.size() != 0){
 1002+ if(categories.size() == 1)
 1003+ cat = categories.get(0);
 1004+ else{
 1005+ BooleanQuery b = new BooleanQuery();
 1006+ for(Query q : categories)
 1007+ b.add(q,BooleanClause.Occur.MUST);
 1008+ cat = b; // intersection of categories, bool query
 1009+ }
 1010+ }
 1011+ if(spans.size() != 0){
 1012+ if(spans.size() == 1)
 1013+ span = spans.get(0);
 1014+ else{
 1015+ // make a span-near query that has a slop 1/2 of tokenGap
 1016+ span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.tokenGap-1)/2,false);
 1017+ }
 1018+ }
 1019+ if(cat != null && span != null){
 1020+ BooleanQuery ret = new BooleanQuery();
 1021+ ret.add(span,BooleanClause.Occur.MUST);
 1022+ ret.add(cat,BooleanClause.Occur.MUST);
 1023+ return ret;
 1024+ } else if(span != null)
 1025+ return span;
 1026+ else // we don't want categories only
 1027+ return null;
 1028+
 1029+ }
 1030+ return null;
 1031+ }
 1032+
 1033+ /**
 1034+ * Main function for multi-pass parsing.
 1035+ *
8991036 * @param queryText
9001037 * @param policy
 1038+ * @param makeRedirect
9011039 * @return
902 - * @throws ParseException
9031040 */
904 - public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{
 1041+ protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){
9051042 if(policy != null)
9061043 this.namespacePolicy = policy;
9071044 float olfDefaultBoost = defaultBoost;
@@ -914,15 +1051,66 @@
9151052 defaultField = contentField;
9161053 defaultBoost = olfDefaultBoost;
9171054 if(qc == null || qt == null)
918 - return new BooleanQuery();
919 -
 1055+ return new BooleanQuery();
9201056 if(qc.equals(qt))
9211057 return qc; // don't duplicate (probably a query for categories only)
9221058 BooleanQuery bq = new BooleanQuery();
9231059 bq.add(qc,BooleanClause.Occur.SHOULD);
9241060 bq.add(qt,BooleanClause.Occur.SHOULD);
 1061+
 1062+ // redirect pass
 1063+ if(makeRedirect){
 1064+ Query qr = extractSpans(qt,0,"redirect",REDIRECT_BOOST);
 1065+ if(qr != null)
 1066+ bq.add(qr,BooleanClause.Occur.SHOULD);
 1067+ }
 1068+ // keyword pass
 1069+ if(makeKeywords){
 1070+ Query qk = extractSpans(qt,0,"keyword",KEYWORD_BOOST);
 1071+ if(qk != null)
 1072+ bq.add(qk,BooleanClause.Occur.SHOULD);
 1073+ }
 1074+
9251075 return bq;
 1076+
9261077 }
 1078+
 1079+ /**
 1080+ * Three parse pases: contents, title, redirect
 1081+ *
 1082+ * @param queryText
 1083+ * @param policy
 1084+ * @return
 1085+ * @throws ParseException
 1086+ */
 1087+ public Query parseThreePass(String queryText, NamespacePolicy policy) throws ParseException{
 1088+ return parseMultiPass(queryText,policy,true,false);
 1089+ }
 1090+
 1091+ /**
 1092+ * Depending on settings for db, do all 4 passes of parsing:
 1093+ * 1) contents
 1094+ * 2) titles
 1095+ * 3) redirects
 1096+ * 4) keywords
 1097+ */
 1098+ public Query parseFourPass(String queryText, NamespacePolicy policy, String dbname) throws ParseException{
 1099+ boolean makeKeywords = global.useKeywordScoring(dbname);
 1100+ return parseMultiPass(queryText,policy,true,makeKeywords);
 1101+ }
 1102+
 1103+ /**
 1104+ * Parse the query according to policy. Instead of rewrite phrase, simply pass
 1105+ * twice the query with different default fields.
 1106+ *
 1107+ * @param queryText
 1108+ * @param policy
 1109+ * @return
 1110+ * @throws ParseException
 1111+ */
 1112+ public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{
 1113+ return parseMultiPass(queryText,policy,false,false);
 1114+ }
9271115
9281116
9291117 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -47,9 +47,9 @@
4848 *
4949 * @param text text to be tokenized
5050 * @param languageAnalyzer language filter class (e.g. PorterStemFilter)
51 - * @return
 51+ * @return {PerFieldAnalyzerWrapper,WikiTokenizer}
5252 */
53 - public static PerFieldAnalyzerWrapper getIndexerAnalyzer(String text, FilterFactory filters) {
 53+ public static Object[] getIndexerAnalyzer(String text, FilterFactory filters, ArrayList<String> redirects) {
5454 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
5555 // parse wiki-text to get categories
5656 WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage());
@@ -63,8 +63,11 @@
6464 new CategoryAnalyzer(categories));
6565 perFieldAnalyzer.addAnalyzer("title",
6666 getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
67 -
68 - return perFieldAnalyzer;
 67+ perFieldAnalyzer.addAnalyzer("redirect",
 68+ new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory()));
 69+ perFieldAnalyzer.addAnalyzer("keyword",
 70+ new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory()));
 71+ return new Object[] {perFieldAnalyzer,tokenizer};
6972 }
7073
7174 public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java
@@ -25,6 +25,8 @@
2626 package org.wikimedia.lsearch.beans;
2727
2828 import java.io.Serializable;
 29+import java.util.ArrayList;
 30+import java.util.Collection;
2931
3032 /**
3133 * Wiki article.
@@ -37,6 +39,8 @@
3840 private boolean redirect;
3941 private long pageId;
4042 private int rank;
 43+ /** all redirects in format: ns:title */
 44+ private ArrayList<String> redirects; // pages that redirect to this page
4145
4246 public Article(){
4347 namespace="";
@@ -44,7 +48,8 @@
4549 contents="";
4650 pageId = 0;
4751 redirect=false;
48 - rank=0;
 52+ rank = 0;
 53+ redirects=new ArrayList<String>();
4954 }
5055
5156 public Article(long pageId, Title title, String text, boolean redirect, int rank) {
@@ -54,6 +59,7 @@
5560 this.pageId = pageId;
5661 this.redirect = redirect;
5762 this.rank = rank;
 63+ this.redirects = new ArrayList<String>();
5864 }
5965
6066 public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int rank) {
@@ -63,8 +69,19 @@
6470 this.redirect = redirect;
6571 this.pageId = pageId;
6672 this.rank = rank;
 73+ this.redirects = new ArrayList<String>();
6774 }
6875
 76+ public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int rank, ArrayList<String> redirects) {
 77+ this.namespace = Integer.toString(namespace);
 78+ this.title = titleText;
 79+ contents = text;
 80+ this.redirect = redirect;
 81+ this.pageId = pageId;
 82+ this.rank = rank;
 83+ this.redirects = redirects;
 84+ }
 85+
6986 public boolean isRedirect() {
7087 return redirect;
7188 }
@@ -111,8 +128,29 @@
112129 return "(" + namespace + ",\"" + title + "\")";
113130 }
114131
 132+ /** Get how many articles link to this article */
115133 public int getRank() {
116134 return rank;
117135 }
118136
 137+ /** Register a redirect to this article */
 138+ public void addRedirect(String linkingArticle){
 139+ redirects.add(linkingArticle);
 140+ }
 141+
 142+ /** Register a list of redirects to this article */
 143+ public void addRedirects(Collection<String> linkingArticles){
 144+ redirects.addAll(linkingArticles);
 145+ }
 146+
 147+ /** Get list of articles that redirect to this article */
 148+ public ArrayList<String> getRedirects() {
 149+ return redirects;
 150+ }
 151+
 152+ public void setRedirects(ArrayList<String> redirects) {
 153+ this.redirects = redirects;
 154+ }
 155+
 156+
119157 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Rank.java
@@ -1,19 +1,53 @@
22 package org.wikimedia.lsearch.beans;
33
 4+import java.util.ArrayList;
 5+
46 public class Rank {
57 /** Number of linking articles */
68 public int links;
79 /** if this is redirect, point to the target title */
8 - public String redirect;
 10+ public Rank redirectsTo;
 11+ /** all the pages that get redirected here */
 12+ public ArrayList<String> redirected;
913
1014 public Rank(int links) {
1115 this.links = links;
12 - redirect = null;
 16+ redirectsTo = null;
1317 }
1418
15 - public Rank(int links, String redirect) {
 19+ public Rank(int links, Rank redirect) {
1620 this.links = links;
17 - this.redirect = redirect;
 21+ this.redirectsTo = redirect;
1822 }
 23+
 24+ @Override
 25+ public int hashCode() {
 26+ final int PRIME = 31;
 27+ int result = 1;
 28+ result = PRIME * result + links;
 29+ result = PRIME * result + 0;
 30+ return result;
 31+ }
 32+
 33+ @Override
 34+ public boolean equals(Object obj) {
 35+ if (this == obj)
 36+ return true;
 37+ if (obj == null)
 38+ return false;
 39+ if (getClass() != obj.getClass())
 40+ return false;
 41+ final Rank other = (Rank) obj;
 42+ if (links != other.links)
 43+ return false;
 44+ if (redirectsTo == null) {
 45+ if (other.redirectsTo != null)
 46+ return false;
 47+ } else if (redirectsTo != other.redirectsTo)
 48+ return false;
 49+ return true;
 50+ }
1951
 52+
 53+
2054 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -18,12 +18,17 @@
1919 protected Revision revision;
2020 protected ArrayList<IndexUpdateRecord> records = new ArrayList<IndexUpdateRecord>();
2121 protected IndexId iid;
22 - protected int references;
 22+ protected int references = 0;
 23+ protected ArrayList<String> redirects = new ArrayList<String>();
2324
2425 public IndexUpdatesCollector(IndexId iid){
2526 this.iid = iid;
2627 }
2728
 29+ public void addRedirect(String redirectTitle, int references) {
 30+ redirects.add(redirectTitle);
 31+ addReferences(references);
 32+ }
2833 public void addDeletion(long pageId){
2934 // pageId is enough for page deletion
3035 Article article = new Article(pageId,-1,"","",false,1);
@@ -42,10 +47,12 @@
4348 this.page = page;
4449 }
4550 public void writeEndPage() throws IOException {
46 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references);
47 - log.info("Collected "+article+" with rank "+references);
 51+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects);
 52+ //log.info("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects);
4853 records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE));
4954 log.debug(iid+": Update for "+article);
 55+ references = 0;
 56+ redirects.clear();
5057 }
5158
5259 public void close() throws IOException {
@@ -64,10 +71,12 @@
6572 return references;
6673 }
6774
68 - public void setReferences(int references) {
69 - this.references = references;
 75+ public void addReferences(int references) {
 76+ this.references += references;
7077 }
7178
 79+
 80+
7281
7382
7483 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIParser.java
@@ -8,7 +8,6 @@
99 import javax.xml.parsers.SAXParser;
1010 import javax.xml.parsers.SAXParserFactory;
1111
12 -import org.mediawiki.importer.DumpWriter;
1312 import org.mediawiki.importer.XmlDumpReader;
1413 import org.xml.sax.Attributes;
1514 import org.xml.sax.SAXException;
@@ -24,6 +23,8 @@
2524 * appears as a continuous stream. For this stream
2625 * calls to sax parser methods are delegated to XmlDumpReader.
2726 *
 27+ * Note: implementation is very lazy and messy :(
 28+ *
2829 * @author rainman
2930 *
3031 */
@@ -37,8 +38,8 @@
3839 protected String oaiId,pageId,resumptionToken,responseDate;
3940 protected boolean beginMW; // beginning of mediawiki stream
4041 protected String mwUri, mwLocalName, mwQName;
41 - protected boolean isDeleted, inReferences;
42 - protected String references;
 42+ protected boolean isDeleted, inReferences, inRedirect, inRedirectTitle, inRedirectRef;
 43+ protected String references, redirectTitle, redirectRef;
4344
4445
4546 public OAIParser(InputStream in, IndexUpdatesCollector collector){
@@ -50,6 +51,8 @@
5152 inResponseDate = false; inReferences = false;
5253 oaiId = ""; resumptionToken = ""; responseDate = "";
5354 beginMW = true; references = "";
 55+ inRedirect = false; inRedirectTitle= false; inRedirectRef = false;
 56+ redirectTitle = ""; redirectRef = "";
5457 }
5558
5659 public void parse() throws IOException{
@@ -74,8 +77,17 @@
7578 inDump = false; // lsearch syntax
7679 inReferences = true;
7780 references = "";
 81+ } else if(inDump && qName.equals("redirect")){
 82+ inDump = false;
 83+ inRedirect = true;
 84+ redirectTitle = "";
 85+ redirectRef = "";
7886 } else if(inDump)
7987 dumpReader.startElement(uri, localName, qName, attributes);
 88+ else if(inRedirect && qName.equals("title"))
 89+ inRedirectTitle = true;
 90+ else if(inRedirect && qName.equals("references"))
 91+ inRedirectRef = true;
8092 else if(qName.equals("record"))
8193 inRecord = true;
8294 else if(qName.equals("header") && inRecord){
@@ -85,8 +97,7 @@
8698 isDeleted = true;
8799 else
88100 isDeleted = false;
89 - }
90 - else if(qName.equals("identifier") && inHeader){
 101+ } else if(qName.equals("identifier") && inHeader){
91102 oaiId = "";
92103 inIdentifier = true;
93104 } else if(qName.equals("metadata"))
@@ -115,10 +126,23 @@
116127 dumpReader.endElement(uri, localName, qName);
117128 else if(qName.equals("upload"))
118129 inDump = true; // we ignored upload tag / parsed references, we can now resume
119 - else if(qName.equals("references")){
 130+ else if(!inRedirect && qName.equals("references")){
120131 inDump = true;
121 - collector.setReferences(Integer.parseInt(references));
122 - } else if(qName.equals("record"))
 132+ inReferences = false;
 133+ if(!references.equals(""))
 134+ collector.addReferences(Integer.parseInt(references));
 135+ } if(qName.equals("redirect")){
 136+ inDump = true;
 137+ int ref = 0;
 138+ if(!redirectRef.equals(""))
 139+ ref = Integer.parseInt(redirectRef);
 140+ collector.addRedirect(redirectTitle,ref);
 141+ inRedirect = false;
 142+ } else if(inRedirect && qName.equals("title"))
 143+ inRedirectTitle = false;
 144+ else if(inRedirect && qName.equals("references"))
 145+ inRedirectRef = false;
 146+ else if(qName.equals("record"))
123147 inRecord = false;
124148 else if(qName.equals("header"))
125149 inHeader = false;
@@ -153,6 +177,10 @@
154178 responseDate += new String(ch,start,length);
155179 } else if(inReferences){
156180 references += new String(ch,start,length);
 181+ } else if(inRedirectTitle){
 182+ redirectTitle += new String(ch,start,length);
 183+ } else if(inRedirectRef){
 184+ redirectRef += new String(ch,start,length);
157185 }
158186 }
159187