r22539 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r22538 | r22539 | r22540 >
Date:17:58, 29 May 2007
Author:rainman
Status:old
Tags:
Comment:
Updated ranking which is now three-fold, using:
* number of references to page
* redirect names
* keywords from the beginning of the article
Modified paths:
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/lsearch.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/apache/commons (added) (history)
  • /trunk/lucene-search-2.0/src/org/apache/commons/lang (added) (history)
  • /trunk/lucene-search-2.0/src/org/apache/commons/lang/WordUtils.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Rank.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/RankReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /trunk/lucene-search-2.0/test-data/mwsearch-global.test (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -31,6 +31,14 @@
3232 [Index-Path]
3333 <default> : /mwsearch
3434
 35+# Global properties
 36+[Properties]
 37+# suffixes to database name, the rest is assumed to be language code
 38+Database.suffix=wiki wiktionary
 39+
 40+# dbnames that end with the suffix will use additional keywords scores
 41+KeywordScoring.suffix=wiki wikilucene wikidev
 42+
3543 # Put here your custom namespace prefixes
3644 # Syntax: <prefix_name> : <comma separated list of namespaces>
3745 # <all> is a special keyword meaning all namespaces
@@ -54,4 +62,3 @@
5563 [14] : 14
5664 [15] : 15
5765
58 -
Index: trunk/lucene-search-2.0/lsearch.conf
@@ -57,9 +57,6 @@
5858 # URL to message files, {0} is replaced with language code, i.e. En
5959 Localization.url=file:///var/www/html/wiki-lucene/phase3/languages/messages/Messages{0}.php
6060
61 -# suffixes to database name, the rest is assumed to be language code
62 -Database.suffix=test
63 -
6461 # Pattern for OAI repo. {0} is replaced with dbname, {1} with language
6562 OAI.repo=http://localhost/wiki-lucene/phase3/index.php/Special:OAIRepository
6663
Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test
@@ -12,6 +12,7 @@
1313 entest : (ngram), (aspell,en)
1414 detest,rutest : (single,true,2,10)
1515 frtest : (split,3) (part1) (part2) (part3)
 16+srwiki : (single)
1617
1718 # Search nodes
1819 # host : db1.role, db2.role
@@ -34,6 +35,7 @@
3536 192.168.0.5 : detest, rutest, frtest
3637 192.168.0.2 : entest.ngram
3738 192.168.0.2 : frtest.part1, frtest.part2, frtest.part3
 39+192.168.0.10 : srwiki
3840
3941 # Path where indexes are on hosts, after default value put hosts where
4042 # the location differs
@@ -41,6 +43,13 @@
4244 <default> : /mwsearch
4345 192.168.0.5 : mwsearch2
4446
 47+[Properties]
 48+# suffixes to database name, the rest is assumed to be language code
 49+Database.suffix=wiki wiktionary test
 50+
 51+# dbnames that end with the suffix will use additional keywords scores
 52+KeywordScoring.suffix=wiki rutest
 53+
4554 # databases can be written as {file}, where file contains list of dbs
4655
4756 # Put here your custom namespace prefixes
Index: trunk/lucene-search-2.0/src/org/apache/commons/lang/WordUtils.java
@@ -0,0 +1,584 @@
 2+/*
 3+ * Licensed to the Apache Software Foundation (ASF) under one or more
 4+ * contributor license agreements. See the NOTICE file distributed with
 5+ * this work for additional information regarding copyright ownership.
 6+ * The ASF licenses this file to You under the Apache License, Version 2.0
 7+ * (the "License"); you may not use this file except in compliance with
 8+ * the License. You may obtain a copy of the License at
 9+ *
 10+ * http://www.apache.org/licenses/LICENSE-2.0
 11+ *
 12+ * Unless required by applicable law or agreed to in writing, software
 13+ * distributed under the License is distributed on an "AS IS" BASIS,
 14+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15+ * See the License for the specific language governing permissions and
 16+ * limitations under the License.
 17+ */
 18+package org.apache.commons.lang;
 19+
 20+/**
 21+ * <p>Operations on Strings that contain words.</p>
 22+ *
 23+ * <p>This class tries to handle <code>null</code> input gracefully.
 24+ * An exception will not be thrown for a <code>null</code> input.
 25+ * Each method documents its behaviour in more detail.</p>
 26+ *
 27+ * @author Apache Jakarta Velocity
 28+ * @author Stephen Colebourne
 29+ * @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
 30+ * @author Gary Gregory
 31+ * @since 2.0
 32+ * @version $Id: WordUtils.java 471626 2006-11-06 04:02:09Z bayard $
 33+ */
 34+public class WordUtils {
 35+
 36+ /**
 37+ * <p><code>WordUtils</code> instances should NOT be constructed in
 38+ * standard programming. Instead, the class should be used as
 39+ * <code>WordUtils.wrap("foo bar", 20);</code>.</p>
 40+ *
 41+ * <p>This constructor is public to permit tools that require a JavaBean
 42+ * instance to operate.</p>
 43+ */
 44+ public WordUtils() {
 45+ super();
 46+ }
 47+
 48+ // Wrapping
 49+ //--------------------------------------------------------------------------
 50+// /**
 51+// * <p>Wraps a block of text to a specified line length using '\n' as
 52+// * a newline.</p>
 53+// *
 54+// * <p>This method takes a block of text, which might have long lines in it
 55+// * and wraps the long lines based on the supplied lineLength parameter.</p>
 56+// *
 57+// * <p>If a single word is longer than the line length (eg. a URL), it will
 58+// * not be broken, and will display beyond the expected width.</p>
 59+// *
 60+// * <p>If there are tabs in inString, you are going to get results that are
 61+// * a bit strange. Tabs are a single character but are displayed as 4 or 8
 62+// * spaces. Remove the tabs.</p>
 63+// *
 64+// * @param str text which is in need of word-wrapping, may be null
 65+// * @param lineLength the column to wrap the words at
 66+// * @return the text with all the long lines word-wrapped
 67+// * <code>null</code> if null string input
 68+// */
 69+// public static String wrapText(String str, int lineLength) {
 70+// return wrap(str, null, lineLength);
 71+// }
 72+
 73+// /**
 74+// * <p>Wraps a block of text to a specified line length.</p>
 75+// *
 76+// * <p>This method takes a block of text, which might have long lines in it
 77+// * and wraps the long lines based on the supplied lineLength parameter.</p>
 78+// *
 79+// * <p>If a single word is longer than the wrapColumn (eg. a URL), it will
 80+// * not be broken, and will display beyond the expected width.</p>
 81+// *
 82+// * <p>If there are tabs in inString, you are going to get results that are
 83+// * a bit strange. Tabs are a single character but are displayed as 4 or 8
 84+// * spaces. Remove the tabs.</p>
 85+// *
 86+// * @param str text which is in need of word-wrapping, may be null
 87+// * @param newLineChars the characters that define a newline, null treated as \n
 88+// * @param lineLength the column to wrap the words at
 89+// * @return the text with all the long lines word-wrapped
 90+// * <code>null</code> if null string input
 91+// */
 92+// public static String wrapText(String str, String newLineChars, int lineLength) {
 93+// if (str == null) {
 94+// return null;
 95+// }
 96+// if (newLineChars == null) {
 97+// newLineChars = "\n";
 98+// }
 99+// StringTokenizer lineTokenizer = new StringTokenizer(str, newLineChars, true);
 100+// StringBuffer stringBuffer = new StringBuffer();
 101+//
 102+// while (lineTokenizer.hasMoreTokens()) {
 103+// try {
 104+// String nextLine = lineTokenizer.nextToken();
 105+//
 106+// if (nextLine.length() > lineLength) {
 107+// // This line is long enough to be wrapped.
 108+// nextLine = wrapLine(nextLine, null, lineLength, false);
 109+// }
 110+//
 111+// stringBuffer.append(nextLine);
 112+//
 113+// } catch (NoSuchElementException nsee) {
 114+// // thrown by nextToken(), but I don't know why it would
 115+// break;
 116+// }
 117+// }
 118+//
 119+// return stringBuffer.toString();
 120+// }
 121+
 122+ // Wrapping
 123+ //-----------------------------------------------------------------------
 124+ /**
 125+ * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p>
 126+ *
 127+ * <p>New lines will be separated by the system property line separator.
 128+ * Very long words, such as URLs will <i>not</i> be wrapped.</p>
 129+ *
 130+ * <p>Leading spaces on a new line are stripped.
 131+ * Trailing spaces are not stripped.</p>
 132+ *
 133+ * <pre>
 134+ * WordUtils.wrap(null, *) = null
 135+ * WordUtils.wrap("", *) = ""
 136+ * </pre>
 137+ *
 138+ * @param str the String to be word wrapped, may be null
 139+ * @param wrapLength the column to wrap the words at, less than 1 is treated as 1
 140+ * @return a line with newlines inserted, <code>null</code> if null input
 141+ */
 142+ public static String wrap(String str, int wrapLength) {
 143+ return wrap(str, wrapLength, null, false);
 144+ }
 145+
 146+ /**
 147+ * <p>Wraps a single line of text, identifying words by <code>' '</code>.</p>
 148+ *
 149+ * <p>Leading spaces on a new line are stripped.
 150+ * Trailing spaces are not stripped.</p>
 151+ *
 152+ * <pre>
 153+ * WordUtils.wrap(null, *, *, *) = null
 154+ * WordUtils.wrap("", *, *, *) = ""
 155+ * </pre>
 156+ *
 157+ * @param str the String to be word wrapped, may be null
 158+ * @param wrapLength the column to wrap the words at, less than 1 is treated as 1
 159+ * @param newLineStr the string to insert for a new line,
 160+ * <code>null</code> uses the system property line separator
 161+ * @param wrapLongWords true if long words (such as URLs) should be wrapped
 162+ * @return a line with newlines inserted, <code>null</code> if null input
 163+ */
 164+ public static String wrap(String str, int wrapLength, String newLineStr, boolean wrapLongWords) {
 165+ if (str == null) {
 166+ return null;
 167+ }
 168+ if (newLineStr == null) {
 169+ newLineStr = System.getProperty("line.separator");
 170+ }
 171+ if (wrapLength < 1) {
 172+ wrapLength = 1;
 173+ }
 174+ int inputLineLength = str.length();
 175+ int offset = 0;
 176+ StringBuffer wrappedLine = new StringBuffer(inputLineLength + 32);
 177+
 178+ while ((inputLineLength - offset) > wrapLength) {
 179+ if (str.charAt(offset) == ' ') {
 180+ offset++;
 181+ continue;
 182+ }
 183+ int spaceToWrapAt = str.lastIndexOf(' ', wrapLength + offset);
 184+
 185+ if (spaceToWrapAt >= offset) {
 186+ // normal case
 187+ wrappedLine.append(str.substring(offset, spaceToWrapAt));
 188+ wrappedLine.append(newLineStr);
 189+ offset = spaceToWrapAt + 1;
 190+
 191+ } else {
 192+ // really long word or URL
 193+ if (wrapLongWords) {
 194+ // wrap really long word one line at a time
 195+ wrappedLine.append(str.substring(offset, wrapLength + offset));
 196+ wrappedLine.append(newLineStr);
 197+ offset += wrapLength;
 198+ } else {
 199+ // do not wrap really long word, just extend beyond limit
 200+ spaceToWrapAt = str.indexOf(' ', wrapLength + offset);
 201+ if (spaceToWrapAt >= 0) {
 202+ wrappedLine.append(str.substring(offset, spaceToWrapAt));
 203+ wrappedLine.append(newLineStr);
 204+ offset = spaceToWrapAt + 1;
 205+ } else {
 206+ wrappedLine.append(str.substring(offset));
 207+ offset = inputLineLength;
 208+ }
 209+ }
 210+ }
 211+ }
 212+
 213+ // Whatever is left in line is short enough to just pass through
 214+ wrappedLine.append(str.substring(offset));
 215+
 216+ return wrappedLine.toString();
 217+ }
 218+
 219+ // Capitalizing
 220+ //-----------------------------------------------------------------------
 221+ /**
 222+ * <p>Capitalizes all the whitespace separated words in a String.
 223+ * Only the first letter of each word is changed. To convert the
 224+ * rest of each word to lowercase at the same time,
 225+ * use {@link #capitalizeFully(String)}.</p>
 226+ *
 227+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 228+ * A <code>null</code> input String returns <code>null</code>.
 229+ * Capitalization uses the unicode title case, normally equivalent to
 230+ * upper case.</p>
 231+ *
 232+ * <pre>
 233+ * WordUtils.capitalize(null) = null
 234+ * WordUtils.capitalize("") = ""
 235+ * WordUtils.capitalize("i am FINE") = "I Am FINE"
 236+ * </pre>
 237+ *
 238+ * @param str the String to capitalize, may be null
 239+ * @return capitalized String, <code>null</code> if null String input
 240+ * @see #uncapitalize(String)
 241+ * @see #capitalizeFully(String)
 242+ */
 243+ public static String capitalize(String str) {
 244+ return capitalize(str, null);
 245+ }
 246+
 247+ /**
 248+ * <p>Capitalizes all the delimiter separated words in a String.
 249+ * Only the first letter of each word is changed. To convert the
 250+ * rest of each word to lowercase at the same time,
 251+ * use {@link #capitalizeFully(String, char[])}.</p>
 252+ *
 253+ * <p>The delimiters represent a set of characters understood to separate words.
 254+ * The first string character and the first non-delimiter character after a
 255+ * delimiter will be capitalized. </p>
 256+ *
 257+ * <p>A <code>null</code> input String returns <code>null</code>.
 258+ * Capitalization uses the unicode title case, normally equivalent to
 259+ * upper case.</p>
 260+ *
 261+ * <pre>
 262+ * WordUtils.capitalize(null, *) = null
 263+ * WordUtils.capitalize("", *) = ""
 264+ * WordUtils.capitalize(*, new char[0]) = *
 265+ * WordUtils.capitalize("i am fine", null) = "I Am Fine"
 266+ * WordUtils.capitalize("i aM.fine", {'.'}) = "I aM.Fine"
 267+ * </pre>
 268+ *
 269+ * @param str the String to capitalize, may be null
 270+ * @param delimiters set of characters to determine capitalization, null means whitespace
 271+ * @return capitalized String, <code>null</code> if null String input
 272+ * @see #uncapitalize(String)
 273+ * @see #capitalizeFully(String)
 274+ * @since 2.1
 275+ */
 276+ public static String capitalize(String str, char[] delimiters) {
 277+ int delimLen = (delimiters == null ? -1 : delimiters.length);
 278+ if (str == null || str.length() == 0 || delimLen == 0) {
 279+ return str;
 280+ }
 281+ int strLen = str.length();
 282+ StringBuffer buffer = new StringBuffer(strLen);
 283+ boolean capitalizeNext = true;
 284+ for (int i = 0; i < strLen; i++) {
 285+ char ch = str.charAt(i);
 286+
 287+ if (isDelimiter(ch, delimiters)) {
 288+ buffer.append(ch);
 289+ capitalizeNext = true;
 290+ } else if (capitalizeNext) {
 291+ buffer.append(Character.toTitleCase(ch));
 292+ capitalizeNext = false;
 293+ } else {
 294+ buffer.append(ch);
 295+ }
 296+ }
 297+ return buffer.toString();
 298+ }
 299+
 300+ //-----------------------------------------------------------------------
 301+ /**
 302+ * <p>Converts all the whitespace separated words in a String into capitalized words,
 303+ * that is each word is made up of a titlecase character and then a series of
 304+ * lowercase characters. </p>
 305+ *
 306+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 307+ * A <code>null</code> input String returns <code>null</code>.
 308+ * Capitalization uses the unicode title case, normally equivalent to
 309+ * upper case.</p>
 310+ *
 311+ * <pre>
 312+ * WordUtils.capitalizeFully(null) = null
 313+ * WordUtils.capitalizeFully("") = ""
 314+ * WordUtils.capitalizeFully("i am FINE") = "I Am Fine"
 315+ * </pre>
 316+ *
 317+ * @param str the String to capitalize, may be null
 318+ * @return capitalized String, <code>null</code> if null String input
 319+ */
 320+ public static String capitalizeFully(String str) {
 321+ return capitalizeFully(str, null);
 322+ }
 323+
 324+ /**
 325+ * <p>Converts all the delimiter separated words in a String into capitalized words,
 326+ * that is each word is made up of a titlecase character and then a series of
 327+ * lowercase characters. </p>
 328+ *
 329+ * <p>The delimiters represent a set of characters understood to separate words.
 330+ * The first string character and the first non-delimiter character after a
 331+ * delimiter will be capitalized. </p>
 332+ *
 333+ * <p>A <code>null</code> input String returns <code>null</code>.
 334+ * Capitalization uses the unicode title case, normally equivalent to
 335+ * upper case.</p>
 336+ *
 337+ * <pre>
 338+ * WordUtils.capitalizeFully(null, *) = null
 339+ * WordUtils.capitalizeFully("", *) = ""
 340+ * WordUtils.capitalizeFully(*, null) = *
 341+ * WordUtils.capitalizeFully(*, new char[0]) = *
 342+ * WordUtils.capitalizeFully("i aM.fine", {'.'}) = "I am.Fine"
 343+ * </pre>
 344+ *
 345+ * @param str the String to capitalize, may be null
 346+ * @param delimiters set of characters to determine capitalization, null means whitespace
 347+ * @return capitalized String, <code>null</code> if null String input
 348+ * @since 2.1
 349+ */
 350+ public static String capitalizeFully(String str, char[] delimiters) {
 351+ int delimLen = (delimiters == null ? -1 : delimiters.length);
 352+ if (str == null || str.length() == 0 || delimLen == 0) {
 353+ return str;
 354+ }
 355+ str = str.toLowerCase();
 356+ return capitalize(str, delimiters);
 357+ }
 358+
 359+ //-----------------------------------------------------------------------
 360+ /**
 361+ * <p>Uncapitalizes all the whitespace separated words in a String.
 362+ * Only the first letter of each word is changed.</p>
 363+ *
 364+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 365+ * A <code>null</code> input String returns <code>null</code>.</p>
 366+ *
 367+ * <pre>
 368+ * WordUtils.uncapitalize(null) = null
 369+ * WordUtils.uncapitalize("") = ""
 370+ * WordUtils.uncapitalize("I Am FINE") = "i am fINE"
 371+ * </pre>
 372+ *
 373+ * @param str the String to uncapitalize, may be null
 374+ * @return uncapitalized String, <code>null</code> if null String input
 375+ * @see #capitalize(String)
 376+ */
 377+ public static String uncapitalize(String str) {
 378+ return uncapitalize(str, null);
 379+ }
 380+
 381+ /**
 382+ * <p>Uncapitalizes all the whitespace separated words in a String.
 383+ * Only the first letter of each word is changed.</p>
 384+ *
 385+ * <p>The delimiters represent a set of characters understood to separate words.
 386+ * The first string character and the first non-delimiter character after a
 387+ * delimiter will be uncapitalized. </p>
 388+ *
 389+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 390+ * A <code>null</code> input String returns <code>null</code>.</p>
 391+ *
 392+ * <pre>
 393+ * WordUtils.uncapitalize(null, *) = null
 394+ * WordUtils.uncapitalize("", *) = ""
 395+ * WordUtils.uncapitalize(*, null) = *
 396+ * WordUtils.uncapitalize(*, new char[0]) = *
 397+ * WordUtils.uncapitalize("I AM.FINE", {'.'}) = "i AM.fINE"
 398+ * </pre>
 399+ *
 400+ * @param str the String to uncapitalize, may be null
 401+ * @param delimiters set of characters to determine uncapitalization, null means whitespace
 402+ * @return uncapitalized String, <code>null</code> if null String input
 403+ * @see #capitalize(String)
 404+ * @since 2.1
 405+ */
 406+ public static String uncapitalize(String str, char[] delimiters) {
 407+ int delimLen = (delimiters == null ? -1 : delimiters.length);
 408+ if (str == null || str.length() == 0 || delimLen == 0) {
 409+ return str;
 410+ }
 411+ int strLen = str.length();
 412+ StringBuffer buffer = new StringBuffer(strLen);
 413+ boolean uncapitalizeNext = true;
 414+ for (int i = 0; i < strLen; i++) {
 415+ char ch = str.charAt(i);
 416+
 417+ if (isDelimiter(ch, delimiters)) {
 418+ buffer.append(ch);
 419+ uncapitalizeNext = true;
 420+ } else if (uncapitalizeNext) {
 421+ buffer.append(Character.toLowerCase(ch));
 422+ uncapitalizeNext = false;
 423+ } else {
 424+ buffer.append(ch);
 425+ }
 426+ }
 427+ return buffer.toString();
 428+ }
 429+
 430+ //-----------------------------------------------------------------------
 431+ /**
 432+ * <p>Swaps the case of a String using a word based algorithm.</p>
 433+ *
 434+ * <ul>
 435+ * <li>Upper case character converts to Lower case</li>
 436+ * <li>Title case character converts to Lower case</li>
 437+ * <li>Lower case character after Whitespace or at start converts to Title case</li>
 438+ * <li>Other Lower case character converts to Upper case</li>
 439+ * </ul>
 440+ *
 441+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 442+ * A <code>null</code> input String returns <code>null</code>.</p>
 443+ *
 444+ * <pre>
 445+ * StringUtils.swapCase(null) = null
 446+ * StringUtils.swapCase("") = ""
 447+ * StringUtils.swapCase("The dog has a BONE") = "tHE DOG HAS A bone"
 448+ * </pre>
 449+ *
 450+ * @param str the String to swap case, may be null
 451+ * @return the changed String, <code>null</code> if null String input
 452+ */
 453+ public static String swapCase(String str) {
 454+ int strLen;
 455+ if (str == null || (strLen = str.length()) == 0) {
 456+ return str;
 457+ }
 458+ StringBuffer buffer = new StringBuffer(strLen);
 459+
 460+ boolean whitespace = true;
 461+ char ch = 0;
 462+ char tmp = 0;
 463+
 464+ for (int i = 0; i < strLen; i++) {
 465+ ch = str.charAt(i);
 466+ if (Character.isUpperCase(ch)) {
 467+ tmp = Character.toLowerCase(ch);
 468+ } else if (Character.isTitleCase(ch)) {
 469+ tmp = Character.toLowerCase(ch);
 470+ } else if (Character.isLowerCase(ch)) {
 471+ if (whitespace) {
 472+ tmp = Character.toTitleCase(ch);
 473+ } else {
 474+ tmp = Character.toUpperCase(ch);
 475+ }
 476+ } else {
 477+ tmp = ch;
 478+ }
 479+ buffer.append(tmp);
 480+ whitespace = Character.isWhitespace(ch);
 481+ }
 482+ return buffer.toString();
 483+ }
 484+
 485+ //-----------------------------------------------------------------------
 486+ /**
 487+ * <p>Extracts the initial letters from each word in the String.</p>
 488+ *
 489+ * <p>The first letter of the string and all first letters after
 490+ * whitespace are returned as a new string.
 491+ * Their case is not changed.</p>
 492+ *
 493+ * <p>Whitespace is defined by {@link Character#isWhitespace(char)}.
 494+ * A <code>null</code> input String returns <code>null</code>.</p>
 495+ *
 496+ * <pre>
 497+ * WordUtils.initials(null) = null
 498+ * WordUtils.initials("") = ""
 499+ * WordUtils.initials("Ben John Lee") = "BJL"
 500+ * WordUtils.initials("Ben J.Lee") = "BJ"
 501+ * </pre>
 502+ *
 503+ * @param str the String to get initials from, may be null
 504+ * @return String of initial letters, <code>null</code> if null String input
 505+ * @see #initials(String,char[])
 506+ * @since 2.2
 507+ */
 508+ public static String initials(String str) {
 509+ return initials(str, null);
 510+ }
 511+
 512+ /**
 513+ * <p>Extracts the initial letters from each word in the String.</p>
 514+ *
 515+ * <p>The first letter of the string and all first letters after the
 516+ * defined delimiters are returned as a new string.
 517+ * Their case is not changed.</p>
 518+ *
 519+ * <p>If the delimiters array is null, then Whitespace is used.
 520+ * Whitespace is defined by {@link Character#isWhitespace(char)}.
 521+ * A <code>null</code> input String returns <code>null</code>.
 522+ * An empty delimiter array returns an empty String.</p>
 523+ *
 524+ * <pre>
 525+ * WordUtils.initials(null, *) = null
 526+ * WordUtils.initials("", *) = ""
 527+ * WordUtils.initials("Ben John Lee", null) = "BJL"
 528+ * WordUtils.initials("Ben J.Lee", null) = "BJ"
 529+ * WordUtils.initials("Ben J.Lee", [' ','.']) = "BJL"
 530+ * WordUtils.initials(*, new char[0]) = ""
 531+ * </pre>
 532+ *
 533+ * @param str the String to get initials from, may be null
 534+ * @param delimiters set of characters to determine words, null means whitespace
 535+ * @return String of initial letters, <code>null</code> if null String input
 536+ * @see #initials(String)
 537+ * @since 2.2
 538+ */
 539+ public static String initials(String str, char[] delimiters) {
 540+ if (str == null || str.length() == 0) {
 541+ return str;
 542+ }
 543+ if (delimiters != null && delimiters.length == 0) {
 544+ return "";
 545+ }
 546+ int strLen = str.length();
 547+ char[] buf = new char[strLen / 2 + 1];
 548+ int count = 0;
 549+ boolean lastWasGap = true;
 550+ for (int i = 0; i < strLen; i++) {
 551+ char ch = str.charAt(i);
 552+
 553+ if (isDelimiter(ch, delimiters)) {
 554+ lastWasGap = true;
 555+ } else if (lastWasGap) {
 556+ buf[count++] = ch;
 557+ lastWasGap = false;
 558+ } else {
 559+ // ignore ch
 560+ }
 561+ }
 562+ return new String(buf, 0, count);
 563+ }
 564+
 565+ //-----------------------------------------------------------------------
 566+ /**
 567+ * Is the character a delimiter.
 568+ *
 569+ * @param ch the character to check
 570+ * @param delimiters the delimiters
 571+ * @return true if it is a delimiter
 572+ */
 573+ private static boolean isDelimiter(char ch, char[] delimiters) {
 574+ if (delimiters == null) {
 575+ return Character.isWhitespace(ch);
 576+ }
 577+ for (int i = 0, isize = delimiters.length; i < isize; i++) {
 578+ if (ch == delimiters[i]) {
 579+ return true;
 580+ }
 581+ }
 582+ return false;
 583+ }
 584+
 585+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -273,8 +273,33 @@
274274 q = parser.parseTwoPass("[1,12]:beans",NamespacePolicy.REWRITE);
275275 assertEquals("(+(namespace:1 namespace:12) +(contents:beans contents:bean^0.5)) (+(namespace:1 namespace:12) +title:beans^2.0)",q.toString());
276276
 277+ q = parser.parseTwoPass("[1,12]:beans and others incategory:food",NamespacePolicy.REWRITE);
 278+ assertEquals("(+(namespace:1 namespace:12) +(+(contents:beans contents:bean^0.5) +contents:and +(contents:others contents:other^0.5) +category:food)) (+(namespace:1 namespace:12) +(+title:beans^2.0 +title:and^2.0 +title:others^2.0 +category:food))",q.toString());
 279+
277280 q = parser.parseTwoPass("[1,a12]:beans",NamespacePolicy.IGNORE);
278281 assertEquals("(+contents:1 +contents:a12 +(contents:beans contents:bean^0.5)) (+title:1^2.0 +title:a12^2.0 +title:beans^2.0)",q.toString());
 282+
 283+ // Redirect third pass tests
 284+ q = parser.parseThreePass("beans",NamespacePolicy.IGNORE);
 285+ assertEquals("(contents:beans contents:bean^0.5) title:beans^2.0 redirect:beans^2.0",q.toString());
 286+
 287+ q = parser.parseThreePass("beans everyone",NamespacePolicy.IGNORE);
 288+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0) spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false)",q.toString());
 289+
 290+ q = parser.parseThreePass("beans everyone incategory:mouse",NamespacePolicy.IGNORE);
 291+ assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:mouse) (+title:beans^2.0 +title:everyone^2.0 +category:mouse) (+spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false) +category:mouse)",q.toString());
 292+
 293+ q = parser.parseThreePass("beans OR everyone",NamespacePolicy.IGNORE);
 294+ assertEquals("((contents:beans contents:bean^0.5) (contents:everyone contents:everyon^0.5)) (title:beans^2.0 title:everyone^2.0)",q.toString());
 295+
 296+ q = parser.parseThreePass("beans -everyone",NamespacePolicy.IGNORE);
 297+ assertEquals("(+(contents:beans contents:bean^0.5) -(contents:everyone)) (+title:beans^2.0 -title:everyone^2.0)",q.toString());
 298+
 299+ q = parser.parseThreePass("[0,1,2]:beans everyone",NamespacePolicy.REWRITE);
 300+ assertEquals("(+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+(namespace:0 namespace:1 namespace:2) +spanNear([redirect:beans^2.0, redirect:everyone^2.0], 52, false))",q.toString());
 301+
 302+ q = parser.parseThreePass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE);
 303+ assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0))",q.toString());
279304
280305 // Test field extraction
281306 HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -11,9 +11,11 @@
1212 import java.net.URL;
1313 import java.util.ArrayList;
1414 import java.util.Hashtable;
 15+import java.util.Properties;
1516
1617 import org.wikimedia.lsearch.config.GlobalConfiguration;
1718 import org.wikimedia.lsearch.config.IndexId;
 19+import org.wikimedia.lsearch.search.NamespaceFilter;
1820
1921 import junit.framework.TestCase;
2022
@@ -57,7 +59,11 @@
5860 return searchGroup;
5961 }
6062
 63+ public Properties getGlobalProps(){
 64+ return globalProperties;
 65+ }
6166
 67+
6268 }
6369
6470 public static GlobalConfigurationTest.TestGC testgc = null;
@@ -80,7 +86,7 @@
8187 String testurl = "file://"+System.getProperty("user.dir")+"/test-data/mwsearch-global.test";
8288 try {
8389 URL url = new URL(testurl);
84 - testgc.readFromURL(url,"/usr/local/var/mwsearch","",null);
 90+ testgc.readFromURL(url,"/usr/local/var/mwsearch","");
8591
8692 // database
8793 Hashtable database = testgc.getDatabase();
@@ -147,6 +153,23 @@
148154 String hostName = host.getHostName();
149155 System.out.println("Verify internet IP: "+hostAddr+", and hostname: "+hostName);
150156
 157+ // test prefixes
 158+ Hashtable<String,NamespaceFilter> p = testgc.getNamespacePrefixes();
 159+ assertEquals(17,p.size());
 160+
 161+ // check global properties
 162+ Properties prop = testgc.getGlobalProps();
 163+ assertEquals("wiki wiktionary test",prop.get("Database.suffix"));
 164+ assertEquals("wiki rutest",prop.get("KeywordScoring.suffix"));
 165+
 166+ // check languages and keyword stuff
 167+ assertEquals("en",testgc.getLanguage("entest"));
 168+ assertEquals("sr",testgc.getLanguage("srwiki"));
 169+ assertFalse(testgc.useKeywordScoring("frtest"));
 170+ assertTrue(testgc.useKeywordScoring("srwiki"));
 171+ assertTrue(testgc.useKeywordScoring("rutest"));
 172+
 173+
151174 } catch (MalformedURLException e) {
152175 e.printStackTrace();
153176 } catch (IOException e) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -3,6 +3,7 @@
44 import java.io.StringReader;
55 import java.util.ArrayList;
66 import java.util.HashMap;
 7+import java.util.HashSet;
78 import java.util.Map.Entry;
89
910 import org.apache.lucene.analysis.Analyzer;
@@ -38,6 +39,16 @@
3940 System.out.print("["+t.getKey()+"] => ["+t.getValue()+"] ");
4041 }
4142 if(iw.size()!=0) System.out.println();
 43+
 44+ HashSet<String> keywords = parser.getKeywords();
 45+ if(keywords.size()!=0){
 46+ System.out.print("KEYWORDS: ");
 47+ }
 48+ for(String t : keywords){
 49+ System.out.print("["+t+"] ");
 50+ }
 51+ if(keywords.size()!=0) System.out.println();
 52+
4253 System.out.println();
4354 }
4455
@@ -75,6 +86,10 @@
7687 showTokens(text);
7788 text = "{{IPstack|name = Hundai}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages.";
7889 showTokens(text);
 90+ text = "[[First link]]\n== Some caption ==\n[[Other link]]";
 91+ showTokens(text);
 92+ text = "[[First]] second third fourth and so on goes the ... [[last link]]";
 93+ showTokens(text);
7994
8095 ArticlesParser ap = new ArticlesParser("./test-data/indexing-articles.test");
8196 ArrayList<TestArticle> articles = ap.getArticles();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java
@@ -29,7 +29,7 @@
3030 this.page = page;
3131 }
3232 public void writeEndPage() throws IOException {
33 - String key = page.Title.Namespace+":"+page.Title.Text.toLowerCase();
 33+ String key = page.Title.Namespace+":"+page.Title.Text;
3434 titles.put(key,new Rank(0));
3535 }
3636 public HashMap<String,Rank> getTitles() {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -1,7 +1,9 @@
22 package org.wikimedia.lsearch.importer;
33
44 import java.io.IOException;
 5+import java.util.ArrayList;
56 import java.util.HashMap;
 7+import java.util.Map.Entry;
68 import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
79 import java.util.regex.Matcher;
810 import java.util.regex.Pattern;
@@ -43,21 +45,17 @@
4446 }
4547 public void writeEndPage() throws IOException {
4648 // get rank
47 - String key = page.Title.Namespace+":"+page.Title.Text.toLowerCase();
 49+ String key = page.Title.Namespace+":"+page.Title.Text;
4850 Rank r = ranks.get(key);
4951 int rank;
50 - boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode)!=null;
 52+ boolean isRedirect = r.redirectsTo != null;
5153 if(r == null){
5254 rank = 0;
53 - log.error("Rank for "+(page.Title.Namespace+":"+page.Title.Text.toLowerCase())+" is undefined, which should never happen.");
54 - } else{
55 - if(r.redirect != null && key.equals(r.redirect) && isRedirect){
56 - rank = 0;
57 - } else
58 - rank = r.links;
59 - }
 55+ log.error("Rank for "+key+" is undefined, which should never happen.");
 56+ } else
 57+ rank = r.links;
6058 // make article
61 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,rank);
 59+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,rank,r.redirected);
6260 writer.addArticle(article);
6361 count++;
6462 if(limit >= 0 && count > limit)
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/RankReader.java
@@ -6,6 +6,7 @@
77 import java.util.regex.Matcher;
88 import java.util.regex.Pattern;
99
 10+import org.apache.commons.lang.WordUtils;
1011 import org.apache.log4j.Logger;
1112 import org.mediawiki.importer.DumpWriter;
1213 import org.mediawiki.importer.Page;
@@ -49,26 +50,53 @@
5051 this.page = page;
5152 }
5253 public void writeEndPage() throws IOException {
53 - Rank r = ranks.get(page.Title.Namespace+":"+page.Title.Text.toLowerCase());
 54+ Rank r = ranks.get(page.Title.Namespace+":"+page.Title.Text);
5455 // register redirect
5556 String redirect = Localization.getRedirectTarget(revision.Text,langCode);
5657 if( redirect !=null ){
57 - redirect = redirect.toLowerCase();
5858 int ns = 0;
5959 String title = redirect;
6060 String[] parts = redirect.split(":",2);
6161 if(parts.length == 2 && parts[0].length()>1){
62 - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1));
 62+ Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
6363 if(inx != null){
6464 ns = inx;
6565 title = parts[1];
6666 }
6767 }
68 - r.redirect = ns+":"+title;
 68+ r.redirectsTo = findRank(ns,title);
6969 } else // process links
7070 processRanks(revision.Text,page.Title.Namespace);
7171 }
7272
 73+ /** Find the rank object for the ns:title */
 74+ protected Rank findRank(int ns, String title){
 75+ String key;
 76+ Rank rank;
 77+ // try exact match
 78+ key = ns+":"+title;
 79+ rank = ranks.get(key);
 80+ if(rank != null)
 81+ return rank;
 82+ // try lowercase
 83+ key = ns+":"+title.toLowerCase();
 84+ rank = ranks.get(key);
 85+ if(rank != null)
 86+ return rank;
 87+ // try title case
 88+ key = ns+":"+WordUtils.capitalize(title);
 89+ rank = ranks.get(key);
 90+ if(rank != null)
 91+ return rank;
 92+ // try capitalizing at word breaks
 93+ key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
 94+ rank = ranks.get(key);
 95+ if(rank != null)
 96+ return rank;
 97+
 98+ return null;
 99+ }
 100+
73101 /** Extract all links from this page, and increment ranks for linked pages */
74102 protected void processRanks(String text, int namespace) {
75103 Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
@@ -76,14 +104,12 @@
77105 int ns; String title;
78106 boolean escaped;
79107
80 - HashSet<String> links = new HashSet<String>();
 108+ HashSet<Rank> links = new HashSet<Rank>();
81109 while(matcher.find()){
82 - String link = matcher.group(1).toLowerCase();
 110+ String link = matcher.group(1);
83111 int fragment = link.lastIndexOf('#');
84112 if(fragment != -1)
85113 link = link.substring(0,fragment);
86 - if(link.length() > 100)
87 - continue; // probably an error
88114 //System.out.println("Got link "+link);
89115 if(link.startsWith(":")){
90116 escaped = true;
@@ -94,7 +120,7 @@
95121 // check for ns:title syntax
96122 String[] parts = link.split(":",2);
97123 if(parts.length == 2 && parts[0].length() > 1){
98 - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1));
 124+ Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
99125 if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14)))
100126 continue; // categories, ignore
101127 if(inx!=null && inx < 0)
@@ -108,17 +134,17 @@
109135 if(interwiki.contains(parts[0]))
110136 continue;
111137 }
 138+ if(ns == 0 && namespace!=0)
 139+ continue; // skip links from other namespaces into the main namespace
 140+
112141 // register as link
113 - String key = ns+":"+title;
114 - links.add(key);
 142+ Rank target = findRank(ns,title);
 143+ if(target != null)
 144+ links.add(target);
115145 }
116146 // increment page ranks
117 - for(String t : links){
118 - if(t.startsWith("0:") && namespace!=0)
119 - continue; // skip links from other namespaces into the main namespace
120 - Rank rank = ranks.get(t);
121 - if(rank != null)
122 - rank.links++;
 147+ for(Rank rank : links){
 148+ rank.links++;
123149 }
124150 }
125151 public void writeSiteinfo(Siteinfo info) throws IOException {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -86,6 +86,7 @@
8787 else
8888 writer.setMaxBufferedDocs(glMaxBufDocs);
8989 writer.setUseCompoundFile(true);
 90+ writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
9091
9192 return writer;
9293 }
@@ -94,7 +95,7 @@
9596 public void addArticle(Article a){
9697 if(!WikiIndexModifier.checkAddPreconditions(a,langCode))
9798 return; // don't add if preconditions are not met
98 -
 99+ WikiIndexModifier.transformArticleForIndexing(a);
99100 IndexId target;
100101 if(iid.isSingle())
101102 target = iid;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -5,6 +5,7 @@
66 import java.util.ArrayList;
77 import java.util.HashMap;
88 import java.util.HashSet;
 9+import java.util.Map.Entry;
910
1011 import org.apache.log4j.Logger;
1112 import org.mediawiki.dumper.ProgressFilter;
@@ -95,17 +96,19 @@
9697 long start = System.currentTimeMillis();
9798
9899 HashMap<String,Rank> ranks = processRanks(inputfile,getTitles(inputfile),langCode);
99 -
 100+
100101 // add-up ranks of redirects to pages where they redirect to
101 - for(Rank r : ranks.values()){
102 - if(r.redirect != null){
103 - Rank dest = ranks.get(r.redirect);
104 - if(dest != null && dest != r){
105 - dest.links += r.links;
106 - r.links = 0;
107 - }
 102+ for(Entry<String,Rank> e : ranks.entrySet()){
 103+ Rank r = e.getValue();
 104+ if(r.redirectsTo != null && r != r.redirectsTo){
 105+ r.redirectsTo.links += r.links;
 106+ r.links = 0;
 107+ if(r.redirectsTo.redirected == null)
 108+ r.redirectsTo.redirected = new ArrayList<String>();
 109+ r.redirectsTo.redirected.add(e.getKey());
108110 }
109111 }
 112+
110113 log.info("Third pass, indexing articles...");
111114
112115 // open
@@ -119,7 +122,7 @@
120123
121124 // read
122125 DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,ranks,langCode);
123 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
 126+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000));
124127 try {
125128 reader.readDump();
126129 } catch (IOException e) {
@@ -168,7 +171,7 @@
169172 }
170173 // calculate ranks
171174 RankReader rr = new RankReader(ranks,langCode);
172 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 100));
 175+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
173176 try {
174177 reader.readDump();
175178 } catch (IOException e) {
@@ -189,7 +192,7 @@
190193 }
191194 // first pass, get titles
192195 TitleReader tr = new TitleReader();
193 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 100));
 196+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
194197 try {
195198 reader.readDump();
196199 input.close();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java
@@ -106,7 +106,6 @@
107107 String globalurl = getString("MWConfig","global");
108108 String indexpath = getString("Indexes","path");
109109 String oairepo = getString("OAI","repo");
110 - String[] dbsuffixes = getArray("Database","suffix");
111110 if(globalurl==null){
112111 System.out.println("FATAL: Need to define global configuration url in local config file.");
113112 System.exit(1);
@@ -115,7 +114,7 @@
116115 System.exit(1);
117116 }
118117 try {
119 - global.readFromURL(new URL(globalurl),indexpath,oairepo,dbsuffixes);
 118+ global.readFromURL(new URL(globalurl),indexpath,oairepo);
120119 } catch (MalformedURLException e) {
121120 System.out.println("Malformed URL "+globalurl+" cannot read global configuration (check MWConfig.global in "+CONF_FILE_NAME+"), exiting...");
122121 System.exit(1);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -5,8 +5,10 @@
66 package org.wikimedia.lsearch.config;
77
88 import java.io.BufferedReader;
 9+import java.io.ByteArrayInputStream;
910 import java.io.IOException;
1011 import java.io.InputStreamReader;
 12+import java.io.StringReader;
1113 import java.net.Inet4Address;
1214 import java.net.InetAddress;
1315 import java.net.MalformedURLException;
@@ -19,6 +21,7 @@
2022 import java.util.HashSet;
2123 import java.util.Hashtable;
2224 import java.util.Iterator;
 25+import java.util.Properties;
2326 import java.util.Set;
2427 import java.util.regex.Matcher;
2528 import java.util.regex.Pattern;
@@ -63,8 +66,12 @@
6467 /** OAI repo pattern from lsearch2.conf */
6568 protected String OAIRepoPattern;
6669 /** Database suffix if dbname, the rest is supposed to be language, e.g srwiki => (suffix wiki) => sr */
67 - protected String[] databaseSuffixes;
 70+ protected String[] databaseSuffixes = null;
 71+ /** Databases ending in suffix will use additional keyword scores */
 72+ protected String[] keywordScoringSuffixes = null;
6873
 74+ protected Properties globalProperties = null;
 75+
6976 /** All identifiers of all indexes (dbrole -> IndexId) */
7077 protected static Hashtable<String,IndexId> indexIdPool = new Hashtable<String,IndexId>();
7178
@@ -192,13 +199,13 @@
193200 * @param url
194201 * @throws IOException
195202 */
196 - public void readFromURL(URL url, String indexpath, String oaiRepo, String[] dbsuffixes) throws IOException{
 203+ public void readFromURL(URL url, String indexpath, String oaiRepo) throws IOException{
197204 BufferedReader in;
198205 try {
199206 in = new BufferedReader(
200207 new InputStreamReader(
201208 url.openStream()));
202 - read(in,indexpath,oaiRepo,dbsuffixes);
 209+ read(in,indexpath,oaiRepo);
203210 } catch (IOException e) {
204211 System.out.println("I/O Error in opening or reading global config at url "+url);
205212 throw e;
@@ -221,6 +228,13 @@
222229 namespacePrefixAll = "all"; // default
223230 }
224231
 232+ protected String[] getArrayProperty(String name){
 233+ String s = globalProperties.getProperty(name);
 234+ if (s != null)
 235+ return s.split(" ");
 236+ return null;
 237+ }
 238+
225239 /**
226240 * Reads a config file from a bufferedreader, will
227241 * close the reader when done.
@@ -228,7 +242,7 @@
229243 * @param in opened reader
230244 * @throws IOException
231245 */
232 - protected void read(BufferedReader in, String indexpath, String oaiRepo, String[] dbsuffixes) throws IOException{
 246+ protected void read(BufferedReader in, String indexpath, String oaiRepo) throws IOException{
233247 String line="";
234248 int section = -1;
235249 Pattern roleRegexp = Pattern.compile("\\((.*?)\\)");
@@ -245,7 +259,6 @@
246260 init();
247261 this.indexPath = indexpath;
248262 this.OAIRepoPattern = oaiRepo == null? "" : oaiRepo;
249 - this.databaseSuffixes = dbsuffixes;
250263
251264 while((line = in.readLine()) != null){
252265 lineNum ++;
@@ -260,6 +273,27 @@
261274 if(line.startsWith("[") && line.length()>2 && !Character.isDigit(line.charAt(1))){ // section
262275 int last = line.indexOf("]");
263276 String s = line.substring(1,last);
 277+
 278+ if(s.equalsIgnoreCase("properties")){
 279+ globalProperties = new Properties();
 280+ StringBuilder prop = new StringBuilder(line+"\n");
 281+ while((line = in.readLine()) != null){
 282+ if(line.startsWith("[") && line.length()>2 && !Character.isDigit(line.charAt(1)))
 283+ break;
 284+ prop.append(line);
 285+ prop.append("\n");
 286+ }
 287+ globalProperties.load(new ByteArrayInputStream(prop.toString().getBytes("utf-8")));
 288+ // get some predefined global properties
 289+ this.databaseSuffixes = getArrayProperty("Database.suffix");
 290+ this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix");
 291+ if(line == null)
 292+ break;
 293+ // else: line points to beginning of next section
 294+ last = line.indexOf("]");
 295+ s = line.substring(1,last);
 296+ }
 297+
264298 if(s.equalsIgnoreCase("database"))
265299 section = DATABASE;
266300 else if(s.equalsIgnoreCase("index"))
@@ -314,8 +348,7 @@
315349 if(filter.equalsIgnoreCase("<all>"))
316350 namespacePrefixAll = prefix;
317351 else
318 - namespacePrefix.put(prefix,new NamespaceFilter(filter));
319 -
 352+ namespacePrefix.put(prefix,new NamespaceFilter(filter));
320353 }
321354 }
322355 if( !checkIntegrity() ){
@@ -769,6 +802,24 @@
770803 return namespacePrefixAll;
771804 }
772805
 806+ /** Returns whether keyword scoring should be used for this db, using
 807+ * the suffixes from the global configuration
 808+ *
 809+ * @param dbname
 810+ * @return
 811+ */
 812+ public boolean useKeywordScoring(String dbname){
 813+ if(keywordScoringSuffixes == null)
 814+ return false;
 815+ else{
 816+ for (String suffix : keywordScoringSuffixes) {
 817+ if (dbname.endsWith(suffix))
 818+ return true;
 819+ }
 820+ }
 821+ return false;
 822+ }
773823
 824+
774825
775826 }
\ No newline at end of file
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -188,6 +188,7 @@
189189 }
190190 long startTime = System.currentTimeMillis();
191191 // rsync
 192+ log.info("Starting rsync of "+iid);
192193 String snapshotpath = iid.getRsyncSnapshotPath()+"/"+li.timestamp;
193194 command = "/usr/bin/rsync -W --delete -r rsync://"+iid.getIndexHost()+":"+snapshotpath+" "+iid.getUpdatePath();
194195 log.debug("Running shell command: "+command);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -135,10 +135,10 @@
136136
137137 try {
138138 if(nsfw == null){
139 - q = parser.parseTwoPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE);
 139+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
140140 }
141141 else{
142 - q = parser.parseTwoPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE);
 142+ q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
143143 log.info("Using NamespaceFilterWrapper "+nsfw);
144144 }
145145
@@ -250,10 +250,13 @@
251251 // fetch documents
252252 Document[] docs = s.docs(docids);
253253 int j=0;
 254+ float maxScore = 1;
 255+ if(numhits>0)
 256+ maxScore = hits.score(0);
254257 for(Document doc : docs){
255258 String namespace = doc.get("namespace");
256259 String title = doc.get("title");
257 - float score = transformScore(scores[j]);
 260+ float score = transformScore(scores[j]/maxScore);
258261 ResultSet rs = new ResultSet(score,namespace,title);
259262 if(explain)
260263 rs.setExplanation(((IndexSearcherMul)s).explain(q,docids[j]));
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java
@@ -65,7 +65,7 @@
6666
6767 try{
6868 for(int i=0; i < count ; i++){
69 - Query q = parser.parseTwoPass(terms.next(),WikiQueryParser.NamespacePolicy.IGNORE);
 69+ Query q = parser.parseFourPass(terms.next(),WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
7070 Hits hits = is.search(q);
7171 for(int j =0; j<20 && j<hits.length(); j++)
7272 hits.doc(j); // retrieve some documents
@@ -117,7 +117,7 @@
118118 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
119119 try{
120120 WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
121 - Query q = parser.parseTwoPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE);
 121+ Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
122122 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
123123 } catch (IOException e) {
124124 log.error("Error warming up local IndexSearcherMul for "+iid);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -6,6 +6,8 @@
77
88 import java.io.File;
99 import java.io.IOException;
 10+import java.util.ArrayList;
 11+import java.util.Arrays;
1012 import java.util.Collection;
1113 import java.util.Collections;
1214 import java.util.HashSet;
@@ -24,7 +26,9 @@
2527 import org.apache.lucene.store.Directory;
2628 import org.apache.lucene.store.FSDirectory;
2729 import org.wikimedia.lsearch.analyzers.Analyzers;
 30+import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
2831 import org.wikimedia.lsearch.analyzers.FilterFactory;
 32+import org.wikimedia.lsearch.analyzers.WikiTokenizer;
2933 import org.wikimedia.lsearch.beans.Article;
3034 import org.wikimedia.lsearch.beans.IndexReportCard;
3135 import org.wikimedia.lsearch.config.GlobalConfiguration;
@@ -48,13 +52,13 @@
4953 }
5054 }
5155
 56+ static public final int MAX_FIELD_LENGTH = 100000;
5257 /** Simple implementation of batch addition and deletion */
5358 class SimpleIndexModifier {
5459 protected IndexId iid;
5560 protected IndexReader reader;
5661 protected IndexWriter writer;
57 - protected boolean rewrite;
58 - protected int maxFieldLength;
 62+ protected boolean rewrite;
5963 protected String langCode;
6064
6165 protected HashSet<IndexUpdateRecord> nonDeleteDocuments;
@@ -75,13 +79,8 @@
7680 this.iid = iid;
7781 this.rewrite = rewrite;
7882 this.langCode = langCode;
79 - maxFieldLength = 0;
8083 reportQueue = new Hashtable<IndexUpdateRecord,IndexReportCard>();
8184 }
82 -
83 - public void setMaxFieldLength(int maxFieldLength) {
84 - this.maxFieldLength = maxFieldLength;
85 - }
8685
8786 protected IndexReportCard getReportCard(IndexUpdateRecord rec){
8887 if(!rec.isReportBack())
@@ -168,8 +167,7 @@
169168 writer.setMergeFactor(mergeFactor);
170169 writer.setMaxBufferedDocs(maxBufDocs);
171170 writer.setUseCompoundFile(true);
172 - if(maxFieldLength!=0)
173 - writer.setMaxFieldLength(maxFieldLength);
 171+ writer.setMaxFieldLength(MAX_FIELD_LENGTH);
174172
175173 FilterFactory filters = new FilterFactory(langCode);
176174
@@ -179,6 +177,7 @@
180178 continue; // don't add if delete/add are paired operations
181179 if(!checkPreconditions(rec))
182180 continue; // article shouldn't be added for some (heuristic) reason
 181+ transformArticleForIndexing(rec.getArticle()); // transform record so that unnecessary stuff is deleted, e.g. some redirects
183182 IndexReportCard card = getReportCard(rec);
184183 Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters);
185184 Document doc = (Document) ret[0];
@@ -210,7 +209,7 @@
211210 }
212211 return succ;
213212 }
214 -
 213+
215214 public boolean checkPreconditions(IndexUpdateRecord rec){
216215 return checkAddPreconditions(rec.getArticle(),langCode);
217216 }
@@ -226,15 +225,41 @@
227226 public static boolean checkAddPreconditions(Article ar, String langCode){
228227 if(ar.getNamespace().equals("0")){
229228 String redirect = Localization.getRedirectTarget(ar.getContents(),langCode);
230 - if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
 229+ if(redirect != null)
 230+ return false; // don't add redirects
 231+ /*if(redirect != null && redirect.toLowerCase().equals(ar.getTitle().toLowerCase())){
231232 log.debug("Not adding "+ar+" into index: "+ar.getContents());
232233 return false;
233 - }
 234+ } */
234235 }
235236 return true;
236237 }
237238
238239 /**
 240+ * Changes the article, so that things we don't want to index are deleted,
 241+ * e.g. it deletes redirects from nonmain namespace to article in main namespace
 242+ *
 243+ * @param rec
 244+ */
 245+ public static void transformArticleForIndexing(Article ar) {
 246+ ArrayList<String> redirects = ar.getRedirects();
 247+ String ns = ar.getNamespace()+":";
 248+ if(redirects != null){
 249+ ArrayList<String> filtered = new ArrayList<String>();
 250+ // index only redirects from the same namespace
 251+ // to avoid a lot of unusable redirects from/to
 252+ // user namespace, but always index redirect FROM main
 253+ for(String r : redirects){
 254+ if(r.startsWith(ns) || r.startsWith("0:"))
 255+ filtered.add(r.split(":",2)[1]);
 256+ //else
 257+ //log.info("Ignoring redirect "+r+" to "+ar);
 258+ }
 259+ ar.setRedirects(filtered);
 260+ }
 261+ }
 262+
 263+ /**
239264 * Create necessary directories for index
240265 * @param dbname
241266 * @return relative path (to document root) of db within filesystem
@@ -347,6 +372,7 @@
348373 */
349374 public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){
350375 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
 376+ WikiTokenizer tokenizer = null;
351377 Document doc = new Document();
352378
353379 // This will be used to look up and replace entries on index updates.
@@ -357,10 +383,22 @@
358384
359385 // boost document title with it's article rank
360386 Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
361 - log.debug(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank());
362 - title.setBoost(calculateArticleRank(article.getRank()));
 387+ //log.debug(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
 388+ float rankBoost = calculateArticleRank(article.getRank());
 389+ title.setBoost(rankBoost);
363390 doc.add(title);
364391
 392+ // add titles of redirects, generated from analyzer
 393+ Field redirect = new Field("redirect", "",
 394+ Field.Store.NO, Field.Index.TOKENIZED);
 395+ redirect.setBoost(rankBoost);
 396+ doc.add(redirect);
 397+
 398+ // most significant words in the text get extra score, from analyzer
 399+ Field keyword = new Field("keyword", "",
 400+ Field.Store.NO, Field.Index.TOKENIZED);
 401+ doc.add(keyword);
 402+
365403 // the next fields are generated using wikitokenizer
366404 doc.add(new Field("contents", "",
367405 Field.Store.NO, Field.Index.TOKENIZED));
@@ -372,9 +410,13 @@
373411 String text = article.getContents();
374412 if(article.isRedirect())
375413 text=""; // for redirects index only the title
 414+ Object[] ret = Analyzers.getIndexerAnalyzer(text,filters,article.getRedirects());
 415+ perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0];
376416
377 - perFieldAnalyzer = Analyzers.getIndexerAnalyzer(text,filters);
378 -
 417+ // set boost for keyword field
 418+ tokenizer = (WikiTokenizer) ret[1];
 419+ keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size()));
 420+
379421 return new Object[] { doc, perFieldAnalyzer };
380422 }
381423
@@ -392,5 +434,19 @@
393435 else
394436 return (float) (1 + rank/15.0);
395437 }
 438+
 439+ /**
 440+ * We don't want whole stub articles fetched as keywords, so we penalize if
 441+ * the article is too short for keyword extraction.
 442+ *
 443+ * @param numTokens
 444+ * @return
 445+ */
 446+ public static float calculateKeywordsBoost(int numTokens){
 447+ if(numTokens > 2 * FastWikiTokenizerEngine.KEYWORD_TOKEN_LIMIT)
 448+ return 1;
 449+ else
 450+ return ((float)numTokens)/FastWikiTokenizerEngine.KEYWORD_TOKEN_LIMIT/2;
 451+ }
396452
397453 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -20,6 +20,9 @@
2121 *
2222 * For titles:
2323 * * 1/sqrt(term^3)
 24+ *
 25+ * For redirect:
 26+ * * no length norm
2427 *
2528 */
2629 @Override
@@ -36,6 +39,8 @@
3740 float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
3841 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
3942 return f;
 43+ } else if(fieldName.equals("redirect") || fieldName.equals("keyword")){
 44+ return 1;
4045 } else
4146 return super.lengthNorm(fieldName,numTokens);
4247
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -0,0 +1,113 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.io.Reader;
 6+import java.util.ArrayList;
 7+import java.util.HashSet;
 8+
 9+import org.apache.log4j.Logger;
 10+import org.apache.lucene.analysis.Analyzer;
 11+import org.apache.lucene.analysis.Token;
 12+import org.apache.lucene.analysis.TokenStream;
 13+
 14+/**
 15+ * Analyzer that builds a field with an array of keywords,
 16+ * each keyword is separated by a large token gap, so it's
 17+ * convenient to run SpanNearQueries on the field. Keywords
 18+ * themselves are tokenized. E.g.
 19+ *
 20+ * ("something different", "other") ->
 21+ * "something" +1 "different" +201 "other"
 22+ *
 23+ * Currently used for fields "redirect" and "keyword"
 24+ *
 25+ * @author rainman
 26+ *
 27+ */
 28+public class KeywordsAnalyzer extends Analyzer{
 29+ static Logger log = Logger.getLogger(KeywordsAnalyzer.class);
 30+ protected ArrayList<String> keywords;
 31+ protected FilterFactory filters;
 32+ protected KeywordsTokenStream tokens;
 33+
 34+ public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters){
 35+ ArrayList<String> k = new ArrayList<String>();
 36+ k.addAll(keywords);
 37+ tokens = new KeywordsTokenStream(k,filters);
 38+ }
 39+
 40+ public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters){
 41+ tokens = new KeywordsTokenStream(keywords,filters);
 42+ }
 43+ /** positional increment between different redirects */
 44+ public static final int tokenGap = 201;
 45+
 46+ @Override
 47+ public TokenStream tokenStream(String fieldName, Reader reader) {
 48+ return tokens;
 49+ }
 50+ @Override
 51+ public TokenStream tokenStream(String fieldName, String text) {
 52+ return tokens;
 53+ }
 54+
 55+ class KeywordsTokenStream extends TokenStream {
 56+ protected Analyzer analyzer;
 57+ protected ArrayList<String> keywords;
 58+ protected int index;
 59+ protected String keyword;
 60+ protected TokenStream tokens;
 61+
 62+ public KeywordsTokenStream(ArrayList<String> keywords, FilterFactory filters){
 63+ this.analyzer = new QueryLanguageAnalyzer(filters);
 64+ this.keywords = keywords;
 65+ this.index = 0;
 66+ this.keyword = null;
 67+ this.tokens = null;
 68+ }
 69+ @Override
 70+ public Token next() throws IOException {
 71+ if(keywords == null)
 72+ return null; // nothing to do
 73+ Token t;
 74+ if(keyword == null){
 75+ t = openNext();
 76+ return t;
 77+ }
 78+ if(keyword != null && tokens!=null){
 79+ t = tokens.next();
 80+ if(t == null){
 81+ t = openNext();
 82+ if(t != null)
 83+ t.setPositionIncrement(tokenGap);
 84+ }
 85+ return t;
 86+ } else{
 87+ log.warn("Inconsistent state: key="+keyword+", tokens="+tokens);
 88+ }
 89+ return null;
 90+ }
 91+
 92+ protected Token openNext() throws IOException {
 93+ Token t;
 94+ if(index >= keywords.size())
 95+ return null; // processed all keywords
 96+ // try subsequent keyword titles until we find one with
 97+ // title that can be tokenized
 98+ do{
 99+ // next keyword title
 100+ keyword = keywords.get(index++);
 101+ tokens = analyzer.tokenStream("",keyword);
 102+ // try to tokenize
 103+ t = tokens.next();
 104+ if(t == null && index == keywords.size())
 105+ return null; // last token
 106+ else if(t!=null)
 107+ return t;
 108+ } while(keyword == null);
 109+ return null;
 110+ }
 111+
 112+ }
 113+
 114+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java
@@ -4,6 +4,7 @@
55 import java.io.Reader;
66 import java.util.ArrayList;
77 import java.util.HashMap;
 8+import java.util.HashSet;
89 import java.util.Iterator;
910
1011 import org.apache.log4j.Logger;
@@ -17,6 +18,7 @@
1819 protected Iterator<Token> tokenIt = null;
1920 protected ArrayList<String> categories = null;
2021 protected HashMap<String,String> interwikis = null;
 22+ protected HashSet<String> keywords = null;
2123
2224 /** Use <code>WikiTokenizer(String)</code> constructor */
2325 @Deprecated
@@ -52,6 +54,7 @@
5355 tokenIt = tokens.iterator();
5456 categories = parser.getCategories();
5557 interwikis = parser.getInterwikis();
 58+ keywords = parser.getKeywords();
5659 }
5760 }
5861
@@ -84,7 +87,12 @@
8588 public ArrayList<Token> getTokens() {
8689 return tokens;
8790 }
 91+
 92+ public HashSet<String> getKeywords() {
 93+ return keywords;
 94+ }
8895
8996
 97+
9098
9199 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -7,6 +7,7 @@
88 import java.util.HashSet;
99 import java.util.Hashtable;
1010
 11+import org.apache.commons.lang.WordUtils;
1112 import org.apache.lucene.analysis.Token;
1213 import org.wikimedia.lsearch.util.Localization;
1314 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -35,6 +36,7 @@
3637 private ArrayList<Token> tokens;
3738 protected ArrayList<String> categories;
3839 protected HashMap<String,String> interwikis;
 40+ protected HashSet<String> keywords;
3941 private int length = 0; // length of token
4042 private int start = 0; // start position of token
4143 private int cur = 0; // current position in the input string
@@ -45,11 +47,17 @@
4648 private int decompi;
4749 private char cl; // lowercased character
4850 private boolean numberToken; // if the buffer holds a number token
 51+ private int headings = 0; // how many headings did we see
4952
5053 private int prefixLen = 0;
5154 private final char[] prefixBuf = new char[MAX_WORD_LEN];
5255 private int semicolonInx = -1;
 56+ private final char[] keywordBuf = new char[MAX_WORD_LEN];
 57+ private int keywordLen = 0;
5358
 59+ /** This many tokens from the beginning of the text are eligible for keywords */
 60+ public static final int KEYWORD_TOKEN_LIMIT = 250;
 61+
5462 /** language code */
5563 private String language;
5664 /** language code -> set (image namespace names) */
@@ -60,12 +68,12 @@
6169
6270 private UnicodeDecomposer decomposer;
6371
64 - enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END,
 72+ enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD,
6573 LINK_FETCH, IGNORE, EXTERNAL_URL, EXTERNAL_WORDS,
6674 TEMPLATE_BEGIN, TEMPLATE_WORDS, TEMPLATE_END,
6775 TABLE_BEGIN};
6876
69 - enum FetchState { WORD, CATEGORY, INTERWIKI};
 77+ enum FetchState { WORD, CATEGORY, INTERWIKI, KEYWORD };
7078
7179
7280 private void init(){
@@ -73,6 +81,7 @@
7482 categories = new ArrayList<String>();
7583 interwikis = new HashMap<String,String>();
7684 decomposer = UnicodeDecomposer.getInstance();
 85+ keywords = new HashSet<String>();
7786 numberToken = false;
7887 }
7988
@@ -258,6 +267,50 @@
259268 return Localization.getRedirectTarget(textString,language)!=null;
260269 }
261270
 271+ /**
 272+ * Decide whether the link currently being processed should be appended to the list of keywords.
 273+ *
 274+ * Criterion: the link appears within the first 250 tokens (KEYWORD_TOKEN_LIMIT)
 275+ * and before the first heading.
 276+ *
 277+ */
 278+ protected boolean isGoodKeywordLink(){
 279+ return headings == 0 && tokens.size() <= KEYWORD_TOKEN_LIMIT;
 280+ }
 281+
 282+ /** When encountering '=' check if this line is actually a heading */
 283+ private void checkHeadings() {
 284+ // make sure = is at the beginning of a line
 285+ if(cur == 0 || text[cur-1]=='\n' || text[cur-1]=='\r'){
 286+ int endOfLine;
 287+ // find end of line/text
 288+ for(endOfLine = cur ; endOfLine < textLength ; endOfLine++ ){
 289+ lc = text[endOfLine];
 290+ if(lc == '\n' || lc =='\r')
 291+ break;
 292+ }
 293+ int start=0, end=0; // number of ='s at beginning and end of line
 294+ // find first sequence of =
 295+ for(lookup = cur ; lookup < textLength && lookup < endOfLine ; lookup++ ){
 296+ if(text[lookup] == '=')
 297+ start++;
 298+ else
 299+ break;
 300+ }
 301+ // find the last sequence of =
 302+ for(lookup = endOfLine-1 ; lookup > cur ; lookup-- ){
 303+ if(text[lookup] == '=')
 304+ end++;
 305+ else
 306+ break;
 307+ }
 308+ // check
 309+ if(start == end && start != 0 && start+end<endOfLine-cur && start>=2 && start<=4){
 310+ headings++;
 311+ }
 312+ }
 313+ }
 314+
262315 /**
263316 * Parse Wiki text, and produce an arraylist of tokens.
264317 * Also fills the lists categories and interwikis.
@@ -281,6 +334,9 @@
282335 switch(state){
283336 case WORD:
284337 switch(c){
 338+ case '=':
 339+ checkHeadings();
 340+ break;
285341 case '<':
286342 addToken();
287343 state = ParserState.IGNORE;
@@ -369,12 +425,18 @@
370426 fetch = FetchState.INTERWIKI;
371427 state = ParserState.LINK_FETCH;
372428 continue;
373 - } else{
374 - // unrecognized, ignore
375 - cur--;
376 - continue;
377429 }
378430 }
 431+ // add this link to keywords?
 432+ if(isGoodKeywordLink()){
 433+ fetch = FetchState.KEYWORD;
 434+ state = ParserState.LINK_KEYWORD;
 435+ if(pipeInx != -1)
 436+ cur = pipeInx; // ignore up to pipe
 437+ else
 438+ cur--; // return the first character of link
 439+ continue;
 440+ }
379441
380442 // no semicolon, search for pipe:
381443 if(pipeInx != -1){
@@ -384,6 +446,11 @@
385447 addLetter();
386448 continue;
387449 }
 450+ case LINK_KEYWORD:
 451+ if(keywordLen < keywordBuf.length && c!=']'){
 452+ keywordBuf[keywordLen++] = c;
 453+ }
 454+ // fall-thru
388455 case LINK_WORDS:
389456 if(c == ']'){
390457 state = ParserState.LINK_END;
@@ -419,7 +486,7 @@
420487
421488 if(length<buffer.length)
422489 buffer[length++] = c;
423 - continue;
 490+ continue;
424491 case LINK_END:
425492 if(c == ']'){ // good link ending
426493 state = ParserState.WORD;
@@ -439,6 +506,11 @@
440507 length = 0;
441508 fetch = FetchState.WORD;
442509 continue;
 510+ case KEYWORD:
 511+ keywords.add(new String(keywordBuf,0,keywordLen));
 512+ keywordLen = 0;
 513+ fetch = FetchState.WORD;
 514+ continue;
443515 }
444516 } else{
445517 // bad syntax, ignore any categories, etc..
@@ -478,7 +550,7 @@
479551 addToken();
480552 return tokens;
481553 }
482 -
 554+
483555 /** Check if this is an "image" keyword using localization */
484556 private final boolean isImage(String prefix){
485557 prefix = prefix.toLowerCase();
@@ -530,4 +602,10 @@
531603 public ArrayList<Token> getTokens() {
532604 return tokens;
533605 }
 606+
 607+ public HashSet<String> getKeywords() {
 608+ return keywords;
 609+ }
 610+
 611+
534612 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -18,6 +18,9 @@
1919 import org.apache.lucene.search.Query;
2020 import org.apache.lucene.search.TermQuery;
2121 import org.apache.lucene.search.WildcardQuery;
 22+import org.apache.lucene.search.spans.SpanNearQuery;
 23+import org.apache.lucene.search.spans.SpanQuery;
 24+import org.apache.lucene.search.spans.SpanTermQuery;
2225 import org.wikimedia.lsearch.config.GlobalConfiguration;
2326 import org.wikimedia.lsearch.search.NamespaceFilter;
2427 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -69,7 +72,9 @@
7073 /** boost for alias words from analyzer */
7174 public final float ALIAS_BOOST = 0.5f;
7275 /** boost for title field */
73 - public static float TITLE_BOOST = 4;
 76+ public static float TITLE_BOOST = 4;
 77+ public static float REDIRECT_BOOST = 0.5f;
 78+ public static float KEYWORD_BOOST = 1;
7479
7580 /** Policies in treating field names:
7681 *
@@ -90,6 +95,7 @@
9196 private Query namespaceRewriteQuery;
9297 private NamespacePolicy namespacePolicy;
9398 protected NamespaceFilter defaultNamespaceFilter;
 99+ protected static GlobalConfiguration global=null;
94100
95101 /** default value for boolean queries */
96102 public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST;
@@ -102,7 +108,8 @@
103109 protected void initNamespaces(){
104110 if(namespaceQueries != null)
105111 return;
106 - GlobalConfiguration global = GlobalConfiguration.getInstance();
 112+ if(global == null)
 113+ global = GlobalConfiguration.getInstance();
107114 namespaceAllKeyword = global.getNamespacePrefixAll();
108115 namespaceQueries = new Hashtable<String,Query>();
109116 namespacePrefixes = new Hashtable<NamespaceFilter,String>();
@@ -891,16 +898,146 @@
892899 return query;
893900 }
894901
 902+ protected boolean isNamespaceQuery(Query q){
 903+ if(q instanceof TermQuery)
 904+ return ((TermQuery)q).getTerm().field().equals("namespace");
 905+ else if(q instanceof BooleanQuery){
 906+ for(BooleanClause cl : ((BooleanQuery)q).getClauses()){
 907+ if(cl.getQuery() instanceof TermQuery &&
 908+ ((TermQuery)cl.getQuery()).getTerm().field().equals("namespace"));
 909+ else
 910+ return false;
 911+ }
 912+ return true;
 913+ }
 914+ return false;
 915+ }
 916+
895917 /**
896 - * Parse the query according to policy. Instead of rewrite phrase, simply pass
897 - * twice the query with different default fields.
 918+ * Using some very simple analysis, extract span queries to use for the
 919+ * redirect field. Currently extracts only if all boolean clauses are
 920+ * required or if it's a phrase query, since making span
 921+ * queries is non-trivial in other cases. :(
 923+ * The function heavily depends on the format of output of parser,
 924+ * especially for rewrite.
 925+ *
 926+ * @param query
 927+ * @param level - recursion level
 928+ * @return
 929+ */
 930+ protected Query extractSpans(Query query, int level, String fieldName, float boost) {
 931+ // phrase, or termquery just rewrite field name
 932+ if(query instanceof TermQuery){
 933+ TermQuery tq = (TermQuery)query;
 934+ TermQuery ret = new TermQuery(new Term(fieldName,tq.getTerm().text()));
 935+ ret.setBoost(boost);
 936+ return ret;
 937+ } else if(query instanceof PhraseQuery){
 938+ PhraseQuery phrase = new PhraseQuery();
 939+ for(Term term : ((PhraseQuery)query).getTerms()){
 940+ phrase.add(new Term(fieldName,term.text()));
 941+ }
 942+ phrase.setBoost(boost);
 943+ return phrase;
 944+ } else if(query instanceof BooleanQuery){
 945+ BooleanQuery bq = (BooleanQuery)query;
 946+ // check for rewritten queries, TODO: parse complex multi-part rewrites
 947+ if(level==0 && namespacePolicy != null && namespacePolicy == NamespacePolicy.REWRITE){
 948+ if(bq.getClauses().length == 2 && isNamespaceQuery(bq.getClauses()[0].getQuery())){
 949+ BooleanQuery ret = new BooleanQuery();
 950+ ret.add(bq.getClauses()[0]);
 951+ // the second clause is always the query
 952+ ret.add(extractSpans(bq.getClauses()[1].getQuery(),level+1,fieldName,boost),BooleanClause.Occur.MUST);
 953+ return ret;
 954+ } else
 955+ return null;
 956+ }
 957+ // we can parse if all clauses are required
 958+ boolean canTransform = true;
 959+ for(BooleanClause cl : bq.getClauses()){
 960+ if(cl.getOccur() != BooleanClause.Occur.MUST){
 961+ canTransform = false;
 962+ break;
 963+ }
 964+ }
 965+ if(!canTransform)
 966+ return null;
 967+ // rewrite into span queries + categories
 968+ ArrayList<SpanQuery> spans = new ArrayList<SpanQuery>();
 969+ ArrayList<Query> categories = new ArrayList<Query>();
 970+ for(BooleanClause cl : bq.getClauses()){
 971+ Query q = cl.getQuery();
 972+ if(q instanceof TermQuery){ // -> SpanTermQuery
 973+ TermQuery tq = (TermQuery)q;
 974+ Term t = tq.getTerm();
 975+ if(t.field().equals("category")){
 976+ categories.add(q);
 977+ } else {
 978+ SpanTermQuery stq = new SpanTermQuery(new Term(fieldName,t.text()));
 979+ stq.setBoost(boost);
 980+ spans.add(stq);
 981+ }
 982+ } else if(q instanceof PhraseQuery){ // -> SpanNearQuery(slop=0,inOrder=true)
 983+ PhraseQuery pq = (PhraseQuery)q;
 984+ Term[] terms = pq.getTerms();
 985+ if(terms[0].field().equals("category")){
 986+ categories.add(q);
 987+ } else{
 988+ SpanTermQuery[] spanTerms = new SpanTermQuery[terms.length];
 989+ for(int i=0; i<terms.length; i++ ){
 990+ spanTerms[i] = new SpanTermQuery(new Term(fieldName,terms[i].text()));
 991+ }
 992+ SpanNearQuery snq = new SpanNearQuery(spanTerms,0,true);
 993+ snq.setBoost(boost);
 994+ spans.add(snq);
 995+ }
 996+ }
 997+ }
 998+ // create the queries
 999+ Query cat = null;
 1000+ SpanQuery span = null;
 1001+ if(categories.size() != 0){
 1002+ if(categories.size() == 1)
 1003+ cat = categories.get(0);
 1004+ else{
 1005+ BooleanQuery b = new BooleanQuery();
 1006+ for(Query q : categories)
 1007+ b.add(q,BooleanClause.Occur.MUST);
 1008+ cat = b; // intersection of categories, bool query
 1009+ }
 1010+ }
 1011+ if(spans.size() != 0){
 1012+ if(spans.size() == 1)
 1013+ span = spans.get(0);
 1014+ else{
 1015+ // make a span-near query that has a slop 1/2 of tokenGap
 1016+ span = new SpanNearQuery(spans.toArray(new SpanQuery[] {}),(KeywordsAnalyzer.tokenGap-1)/2,false);
 1017+ }
 1018+ }
 1019+ if(cat != null && span != null){
 1020+ BooleanQuery ret = new BooleanQuery();
 1021+ ret.add(span,BooleanClause.Occur.MUST);
 1022+ ret.add(cat,BooleanClause.Occur.MUST);
 1023+ return ret;
 1024+ } else if(span != null)
 1025+ return span;
 1026+ else // we don't want categories only
 1027+ return null;
 1028+
 1029+ }
 1030+ return null;
 1031+ }
 1032+
 1033+ /**
 1034+ * Main function for multi-pass parsing.
 1035+ *
8991036 * @param queryText
9001037 * @param policy
 1038+ * @param makeRedirect
9011039 * @return
902 - * @throws ParseException
9031040 */
904 - public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{
 1041+ protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){
9051042 if(policy != null)
9061043 this.namespacePolicy = policy;
9071044 float olfDefaultBoost = defaultBoost;
@@ -914,15 +1051,66 @@
9151052 defaultField = contentField;
9161053 defaultBoost = olfDefaultBoost;
9171054 if(qc == null || qt == null)
918 - return new BooleanQuery();
919 -
 1055+ return new BooleanQuery();
9201056 if(qc.equals(qt))
9211057 return qc; // don't duplicate (probably a query for categories only)
9221058 BooleanQuery bq = new BooleanQuery();
9231059 bq.add(qc,BooleanClause.Occur.SHOULD);
9241060 bq.add(qt,BooleanClause.Occur.SHOULD);
 1061+
 1062+ // redirect pass
 1063+ if(makeRedirect){
 1064+ Query qr = extractSpans(qt,0,"redirect",REDIRECT_BOOST);
 1065+ if(qr != null)
 1066+ bq.add(qr,BooleanClause.Occur.SHOULD);
 1067+ }
 1068+ // keyword pass
 1069+ if(makeKeywords){
 1070+ Query qk = extractSpans(qt,0,"keyword",KEYWORD_BOOST);
 1071+ if(qk != null)
 1072+ bq.add(qk,BooleanClause.Occur.SHOULD);
 1073+ }
 1074+
9251075 return bq;
 1076+
9261077 }
 1078+
 1079+ /**
 1080+ * Three parse pases: contents, title, redirect
 1081+ *
 1082+ * @param queryText
 1083+ * @param policy
 1084+ * @return
 1085+ * @throws ParseException
 1086+ */
 1087+ public Query parseThreePass(String queryText, NamespacePolicy policy) throws ParseException{
 1088+ return parseMultiPass(queryText,policy,true,false);
 1089+ }
 1090+
 1091+ /**
 1092+ * Depending on settings for db, do all 4 passes of parsing:
 1093+ * 1) contents
 1094+ * 2) titles
 1095+ * 3) redirects
 1096+ * 4) keywords
 1097+ */
 1098+ public Query parseFourPass(String queryText, NamespacePolicy policy, String dbname) throws ParseException{
 1099+ boolean makeKeywords = global.useKeywordScoring(dbname);
 1100+ return parseMultiPass(queryText,policy,true,makeKeywords);
 1101+ }
 1102+
 1103+ /**
 1104+ * Parse the query according to policy. Instead of rewrite phrase, simply pass
 1105+ * twice the query with different default fields.
 1106+ *
 1107+ * @param queryText
 1108+ * @param policy
 1109+ * @return
 1110+ * @throws ParseException
 1111+ */
 1112+ public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{
 1113+ return parseMultiPass(queryText,policy,false,false);
 1114+ }
9271115
9281116
9291117 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -47,9 +47,9 @@
4848 *
4949 * @param text text to be tokenized
5050 * @param languageAnalyzer language filter class (e.g. PorterStemFilter)
51 - * @return
 51+ * @return {PerFieldAnalyzerWrapper,WikiTokenizer}
5252 */
53 - public static PerFieldAnalyzerWrapper getIndexerAnalyzer(String text, FilterFactory filters) {
 53+ public static Object[] getIndexerAnalyzer(String text, FilterFactory filters, ArrayList<String> redirects) {
5454 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
5555 // parse wiki-text to get categories
5656 WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage());
@@ -63,8 +63,11 @@
6464 new CategoryAnalyzer(categories));
6565 perFieldAnalyzer.addAnalyzer("title",
6666 getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
67 -
68 - return perFieldAnalyzer;
 67+ perFieldAnalyzer.addAnalyzer("redirect",
 68+ new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory()));
 69+ perFieldAnalyzer.addAnalyzer("keyword",
 70+ new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory()));
 71+ return new Object[] {perFieldAnalyzer,tokenizer};
6972 }
7073
7174 public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java
@@ -25,6 +25,8 @@
2626 package org.wikimedia.lsearch.beans;
2727
2828 import java.io.Serializable;
 29+import java.util.ArrayList;
 30+import java.util.Collection;
2931
3032 /**
3133 * Wiki article.
@@ -37,6 +39,8 @@
3840 private boolean redirect;
3941 private long pageId;
4042 private int rank;
 43+ /** all redirects in format: ns:title */
 44+ private ArrayList<String> redirects; // pages that redirect to this page
4145
4246 public Article(){
4347 namespace="";
@@ -44,7 +48,8 @@
4549 contents="";
4650 pageId = 0;
4751 redirect=false;
48 - rank=0;
 52+ rank = 0;
 53+ redirects=new ArrayList<String>();
4954 }
5055
5156 public Article(long pageId, Title title, String text, boolean redirect, int rank) {
@@ -54,6 +59,7 @@
5560 this.pageId = pageId;
5661 this.redirect = redirect;
5762 this.rank = rank;
 63+ this.redirects = new ArrayList<String>();
5864 }
5965
6066 public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int rank) {
@@ -63,8 +69,19 @@
6470 this.redirect = redirect;
6571 this.pageId = pageId;
6672 this.rank = rank;
 73+ this.redirects = new ArrayList<String>();
6774 }
6875
 76+ public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int rank, ArrayList<String> redirects) {
 77+ this.namespace = Integer.toString(namespace);
 78+ this.title = titleText;
 79+ contents = text;
 80+ this.redirect = redirect;
 81+ this.pageId = pageId;
 82+ this.rank = rank;
 83+ this.redirects = redirects;
 84+ }
 85+
6986 public boolean isRedirect() {
7087 return redirect;
7188 }
@@ -111,8 +128,29 @@
112129 return "(" + namespace + ",\"" + title + "\")";
113130 }
114131
 132+ /** Get how many articles link to this article */
115133 public int getRank() {
116134 return rank;
117135 }
118136
 137+ /** Register a redirect to this article */
 138+ public void addRedirect(String linkingArticle){
 139+ redirects.add(linkingArticle);
 140+ }
 141+
 142+ /** Register a list of redirects to this article */
 143+ public void addRedirects(Collection<String> linkingArticles){
 144+ redirects.addAll(linkingArticles);
 145+ }
 146+
 147+ /** Get list of articles that redirect to this article */
 148+ public ArrayList<String> getRedirects() {
 149+ return redirects;
 150+ }
 151+
 152+ public void setRedirects(ArrayList<String> redirects) {
 153+ this.redirects = redirects;
 154+ }
 155+
 156+
119157 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Rank.java
@@ -1,19 +1,53 @@
22 package org.wikimedia.lsearch.beans;
33
 4+import java.util.ArrayList;
 5+
46 public class Rank {
57 /** Number of linking articles */
68 public int links;
79 /** if this is redirect, point to the target title */
8 - public String redirect;
 10+ public Rank redirectsTo;
 11+ /** all the pages that get redirected here */
 12+ public ArrayList<String> redirected;
913
1014 public Rank(int links) {
1115 this.links = links;
12 - redirect = null;
 16+ redirectsTo = null;
1317 }
1418
15 - public Rank(int links, String redirect) {
 19+ public Rank(int links, Rank redirect) {
1620 this.links = links;
17 - this.redirect = redirect;
 21+ this.redirectsTo = redirect;
1822 }
 23+
 24+ @Override
 25+ public int hashCode() {
 26+ final int PRIME = 31;
 27+ int result = 1;
 28+ result = PRIME * result + links;
 29+ result = PRIME * result + 0;
 30+ return result;
 31+ }
 32+
 33+ @Override
 34+ public boolean equals(Object obj) {
 35+ if (this == obj)
 36+ return true;
 37+ if (obj == null)
 38+ return false;
 39+ if (getClass() != obj.getClass())
 40+ return false;
 41+ final Rank other = (Rank) obj;
 42+ if (links != other.links)
 43+ return false;
 44+ if (redirectsTo == null) {
 45+ if (other.redirectsTo != null)
 46+ return false;
 47+ } else if (redirectsTo != other.redirectsTo)
 48+ return false;
 49+ return true;
 50+ }
1951
 52+
 53+
2054 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -18,12 +18,17 @@
1919 protected Revision revision;
2020 protected ArrayList<IndexUpdateRecord> records = new ArrayList<IndexUpdateRecord>();
2121 protected IndexId iid;
22 - protected int references;
 22+ protected int references = 0;
 23+ protected ArrayList<String> redirects = new ArrayList<String>();
2324
2425 public IndexUpdatesCollector(IndexId iid){
2526 this.iid = iid;
2627 }
2728
 29+ public void addRedirect(String redirectTitle, int references) {
 30+ redirects.add(redirectTitle);
 31+ addReferences(references);
 32+ }
2833 public void addDeletion(long pageId){
2934 // pageId is enough for page deletion
3035 Article article = new Article(pageId,-1,"","",false,1);
@@ -42,10 +47,12 @@
4348 this.page = page;
4449 }
4550 public void writeEndPage() throws IOException {
46 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references);
47 - log.info("Collected "+article+" with rank "+references);
 51+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects);
 52+ //log.info("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects);
4853 records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE));
4954 log.debug(iid+": Update for "+article);
 55+ references = 0;
 56+ redirects.clear();
5057 }
5158
5259 public void close() throws IOException {
@@ -64,10 +71,12 @@
6572 return references;
6673 }
6774
68 - public void setReferences(int references) {
69 - this.references = references;
 75+ public void addReferences(int references) {
 76+ this.references += references;
7077 }
7178
 79+
 80+
7281
7382
7483 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIParser.java
@@ -8,7 +8,6 @@
99 import javax.xml.parsers.SAXParser;
1010 import javax.xml.parsers.SAXParserFactory;
1111
12 -import org.mediawiki.importer.DumpWriter;
1312 import org.mediawiki.importer.XmlDumpReader;
1413 import org.xml.sax.Attributes;
1514 import org.xml.sax.SAXException;
@@ -24,6 +23,8 @@
2524 * appears as a continuous stream. For this stream
2625 * calls to sax parser methods are delegated to XmlDumpReader.
2726 *
 27+ * Note: implementation is very lazy and messy :(
 28+ *
2829 * @author rainman
2930 *
3031 */
@@ -37,8 +38,8 @@
3839 protected String oaiId,pageId,resumptionToken,responseDate;
3940 protected boolean beginMW; // beginning of mediawiki stream
4041 protected String mwUri, mwLocalName, mwQName;
41 - protected boolean isDeleted, inReferences;
42 - protected String references;
 42+ protected boolean isDeleted, inReferences, inRedirect, inRedirectTitle, inRedirectRef;
 43+ protected String references, redirectTitle, redirectRef;
4344
4445
4546 public OAIParser(InputStream in, IndexUpdatesCollector collector){
@@ -50,6 +51,8 @@
5152 inResponseDate = false; inReferences = false;
5253 oaiId = ""; resumptionToken = ""; responseDate = "";
5354 beginMW = true; references = "";
 55+ inRedirect = false; inRedirectTitle= false; inRedirectRef = false;
 56+ redirectTitle = ""; redirectRef = "";
5457 }
5558
5659 public void parse() throws IOException{
@@ -74,8 +77,17 @@
7578 inDump = false; // lsearch syntax
7679 inReferences = true;
7780 references = "";
 81+ } else if(inDump && qName.equals("redirect")){
 82+ inDump = false;
 83+ inRedirect = true;
 84+ redirectTitle = "";
 85+ redirectRef = "";
7886 } else if(inDump)
7987 dumpReader.startElement(uri, localName, qName, attributes);
 88+ else if(inRedirect && qName.equals("title"))
 89+ inRedirectTitle = true;
 90+ else if(inRedirect && qName.equals("references"))
 91+ inRedirectRef = true;
8092 else if(qName.equals("record"))
8193 inRecord = true;
8294 else if(qName.equals("header") && inRecord){
@@ -85,8 +97,7 @@
8698 isDeleted = true;
8799 else
88100 isDeleted = false;
89 - }
90 - else if(qName.equals("identifier") && inHeader){
 101+ } else if(qName.equals("identifier") && inHeader){
91102 oaiId = "";
92103 inIdentifier = true;
93104 } else if(qName.equals("metadata"))
@@ -115,10 +126,23 @@
116127 dumpReader.endElement(uri, localName, qName);
117128 else if(qName.equals("upload"))
118129 inDump = true; // we ignored upload tag / parsed references, we can now resume
119 - else if(qName.equals("references")){
 130+ else if(!inRedirect && qName.equals("references")){
120131 inDump = true;
121 - collector.setReferences(Integer.parseInt(references));
122 - } else if(qName.equals("record"))
 132+ inReferences = false;
 133+ if(!references.equals(""))
 134+ collector.addReferences(Integer.parseInt(references));
 135+ } if(qName.equals("redirect")){
 136+ inDump = true;
 137+ int ref = 0;
 138+ if(!redirectRef.equals(""))
 139+ ref = Integer.parseInt(redirectRef);
 140+ collector.addRedirect(redirectTitle,ref);
 141+ inRedirect = false;
 142+ } else if(inRedirect && qName.equals("title"))
 143+ inRedirectTitle = false;
 144+ else if(inRedirect && qName.equals("references"))
 145+ inRedirectRef = false;
 146+ else if(qName.equals("record"))
123147 inRecord = false;
124148 else if(qName.equals("header"))
125149 inHeader = false;
@@ -153,6 +177,10 @@
154178 responseDate += new String(ch,start,length);
155179 } else if(inReferences){
156180 references += new String(ch,start,length);
 181+ } else if(inRedirectTitle){
 182+ redirectTitle += new String(ch,start,length);
 183+ } else if(inRedirectRef){
 184+ redirectRef += new String(ch,start,length);
157185 }
158186 }
159187