r32149 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r32148 | r32149 | r32150 >
Date:00:49, 19 March 2008
Author:rainman
Status:old
Tags:
Comment:
Query parser:
* wildcards/fuzzy can now be within phrases
Daemon:
* compatibility modes for previous versions (version param)
* warmup for new index types
* related search can now be distributed
Suggest:
* fixed split/joins to work with other namespaces
* phrase frequencies on other namespaces
Incremental updates:
* link information fetching delayed to index update
* page_id added into links so we can do deletions
* page_id keys for precursor indexes
* updates on various index types (untested)
Modified paths:
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/StopWords.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WordNet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/SearchResults.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/StartupManager.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Fuzzy.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/HighlightTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/webinterface/lsweb.py (modified) (history)

Diff

Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java
@@ -37,6 +37,8 @@
3838 protected float completeBoost = 1;
3939 /** use complete number of tokens (with completeBoost) only for scoring */
4040 protected boolean useCompleteOnly = false;
 41+ /** act exactly as a phrase query, without positional or similar optimizations */
 42+ protected boolean phraseQueryFallback = false;
4143
4244
4345 /** Options specific for phrases in contents */
@@ -148,6 +150,12 @@
149151 //wholeBoost = 8;
150152 }
151153 }
 154+ /** Fall back to phrase-query-type behaviour, no positional info */
 155+ public static class PhraseQueryFallback extends PositionalOptions {
 156+ public PhraseQueryFallback(){
 157+ phraseQueryFallback = true;
 158+ }
 159+ }
152160
153161 public abstract static class NamespaceBoost implements Serializable {
154162 public abstract float getBoost(int namespace);
@@ -162,6 +170,7 @@
163171 }
164172 }
165173 }
 174+
166175
167176
168177 @Override
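
The PhraseQueryFallback subclass above just flips the phraseQueryFallback flag, letting a caller opt out of positional scoring per query. A minimal usage sketch (the surrounding query-construction code is illustrative, not part of this revision):

    // Score this phrase like a plain Lucene PhraseQuery, skipping the
    // positional optimizations in PositionalScorer.
    PositionalOptions opts = new PositionalOptions.PhraseQueryFallback();
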
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java
@@ -195,6 +195,8 @@
196196 * @throws IOException
197197 */
198198 public float freqScore(int start, int distance) throws IOException{
 199+ if(options.phraseQueryFallback)
 200+ return getSimilarity().sloppyFreq(distance);
199201 //System.out.println("freqScore at start="+start+", dist="+distance);
200202 int offset = start + distance;
201203 float begin = 1;
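
With the fallback set, freqScore() defers to Lucene's Similarity.sloppyFreq() instead of its own positional logic. Assuming the stock DefaultSimilarity, that amounts to a harmonic decay in slop distance:

    // DefaultSimilarity.sloppyFreq(): an exact phrase (distance 0) scores 1.0,
    // looser matches decay as 1/(distance+1): distance 1 -> 0.5, distance 3 -> 0.25.
    float sloppyFreq(int distance) {
        return 1.0f / (distance + 1);
    }
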
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java
@@ -156,7 +156,7 @@
157157 *
158158 * @return Returns unique id.
159159 */
160 - public String getKey() {
 160+ public String getIndexKey() {
161161 return Long.toString(pageId);
162162 }
163163
@@ -270,9 +270,15 @@
271271
272272 public void setDate(Date date) {
273273 this.date = date;
 274+ }
 275+
 276+ public void setRedirectTo(String redirectTo) {
 277+ this.redirectTo = redirectTo;
274278 }
275279
276280
277281
278282
 283+
 284+
279285 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/SearchResults.java
@@ -32,6 +32,8 @@
3333 protected boolean foundAllInTitle = false;
3434 /** threshold for filtering suggestions */
3535 protected int firstHitRank = 0;
 36+ /** Words found in titles */
 37+ protected HashSet<String> foundInTitles = new HashSet<String>();
3638
3739 public SearchResults(){
3840 success = false;
@@ -130,6 +132,12 @@
131133 public void addToFirstHitRank(int rank){
132134 firstHitRank += rank;
133135 }
 136+ public HashSet<String> getFoundInTitles() {
 137+ return foundInTitles;
 138+ }
 139+ public void setFoundInTitles(HashSet<String> foundInTitles) {
 140+ this.foundInTitles = foundInTitles;
 141+ }
134142
135143 @Override
136144 public String toString() {
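
The new foundInTitles set is threaded through the rest of this revision: highlighting fills it per result set, and SearchEngine forwards it to the suggester. A condensed sketch of that flow, pieced together from the SearchEngine and Highlight hunks below:

    // Highlighting merges title words into the search results ...
    res.getFoundInTitles().addAll(rs.foundInTitles);
    // ... and the suggester receives them packed in Suggest.ExtraInfo.
    Suggest.ExtraInfo info = new Suggest.ExtraInfo(res.getPhrases(),
        res.getFoundInContext(), res.getFoundInTitles(), res.getFirstHitRank());
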
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -52,11 +52,12 @@
5353 return new PrefixIndexBuilder(iid,Links.openStandalone(iid),null);
5454 }
5555 /** Builder for incremental updates to precursor index */
56 - static public PrefixIndexBuilder forPrecursorModification(IndexId iid, Links links) throws IOException{
 56+ static public PrefixIndexBuilder forPrecursorModification(IndexId iid) throws IOException{
 57+ iid = iid.getPrefix();
5758 IndexWriter writer = WikiIndexModifier.openForWrite(iid.getPrecursor().getIndexPath(),false,new PrefixAnalyzer());
5859 writer.setMergeFactor(20);
5960 writer.setMaxBufferedDocs(500);
60 - return new PrefixIndexBuilder(iid,links,writer);
 61+ return new PrefixIndexBuilder(iid,null,writer);
6162 }
6263
6364 private PrefixIndexBuilder(IndexId iid, Links links, IndexWriter writer) throws IOException {
@@ -177,15 +178,17 @@
178179 else return -1;
179180 }
180181 });
181 - HashSet<String> selectedRedirects = new HashSet<String>();
 182+ // hash set of selected articles and places they redirect to
 183+ HashSet<String> selectedWithRedirects = new HashSet<String>();
182184 ArrayList<String> selected = new ArrayList<String>();
183185 for(int i=0;i<perPrefix && i<sorted.size();i++){
184186 String key = sorted.get(i).getKey();
185187 String redirect = redirects.get(key);
186 - if(redirect == null || !selectedRedirects.contains(redirect)){
 188+ if((redirect == null || !selectedWithRedirects.contains(redirect))
 189+ && !selectedWithRedirects.contains(key)){
187190 selected.add(key);
188 - selectedRedirects.add(redirect);
189 - selectedRedirects.add(key);
 191+ selectedWithRedirects.add(key);
 192+ selectedWithRedirects.add(redirect);
190193 }
191194 }
192195 Document d = new Document();
@@ -213,7 +216,7 @@
214217 writer.optimize();
215218 writer.close();
216219
217 - IndexThread.makeIndexSnapshot(prefixIid,path);
 220+ IndexThread.makeIndexSnapshot(prefixIid,prefixIid.getImportPath());
218221 }
219222
220223 public static String strip(String s){
@@ -230,15 +233,23 @@
231234 private static double lengthCoeff(String key, String prefix) {
232235 return 1;
233236 }
234 - /** Modify a precursor index entry */
235 - protected void modifyPrecursor(String key) throws IOException{
236 - writer.deleteDocuments(new Term("key",key));
237 - addToPrecursor(key);
238 - }
 237+
 238+
239239 /** Add a new precursor index entry */
240240 protected void addToPrecursor(String key) throws IOException{
241241 int ref = links.getNumInLinks(key);
242242 String redirect = links.getRedirectTarget(key);
 243+ String pageid = links.getPageId(key);
 244+ addToPrecursor(key,ref,redirect,pageid);
 245+ }
 246+
 247+ /** Delete a precursor index entry */
 248+ public void deleteFromPrecursor(String pageId) throws IOException{
 249+ writer.deleteDocuments(new Term("pageid",pageId));
 250+ }
 251+
 252+ /** Add a new precursor index entry */
 253+ public void addToPrecursor(String key, int ref, String redirect, String pageId) throws IOException{
243254 String strippedKey = strip(key);
244255 String strippedTarget = redirect==null? null : strip(redirect);
245256 if(redirect == null);
@@ -248,6 +259,7 @@
249260 return; // ignore redirects like byzantine -> byzantine empire
250261 // add to index
251262 Document d = new Document();
 263+ d.add(new Field("pageid",pageId,Field.Store.NO,Field.Index.UN_TOKENIZED));
252264 d.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
253265 ArrayList<Token> canonized = canonize(key,iid,filters);
254266 for(Token t : canonized){
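
With page_id now stored in the precursor index, an incremental update becomes a delete-then-add pair keyed by page_id. A minimal sketch of applying one page change (iid, pageId, title, inLinks and redirectTarget are illustrative placeholders):

    PrefixIndexBuilder builder = PrefixIndexBuilder.forPrecursorModification(iid);
    builder.deleteFromPrecursor(pageId);   // drop the stale entry by page_id
    builder.addToPrecursor(title, inLinks, redirectTarget, pageId); // re-add fresh data
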
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -62,7 +62,7 @@
6363 if(original != null)
6464 this.suffix = original.getTitlesSuffix();
6565 GlobalConfiguration global = GlobalConfiguration.getInstance();
66 - langCode = global.getLanguage(iid.getDBname());
 66+ langCode = iid.getLangCode();
6767 FieldBuilder.Case dCase = (global.exactCaseIndex(iid.getDBname()))? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
6868 builder = new FieldBuilder(iid,dCase);
6969 indexes = new HashMap<String,IndexWriter>();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java
@@ -143,7 +143,7 @@
144144 if(makeIndex){
145145 if(!useOldRelated){
146146 try {
147 - RelatedBuilder.rebuildFromLinksNew(iid);
 147+ RelatedBuilder.rebuildFromLinks(iid);
148148 } catch (IOException e) {
149149 log.fatal("Cannot make related mapping: "+e.getMessage());
150150 return;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -731,6 +731,10 @@
732732 return GlobalConfiguration.getIndexId(dbname+".spell") != null;
733733 }
734734
 735+ public boolean hasPrefix(){
 736+ return GlobalConfiguration.getIndexId(dbname+".prefix") != null;
 737+ }
 738+
735739 /** Get the corresponding spell words iid */
736740 public IndexId getSpell() {
737741 return get(dbname+".spell");
@@ -787,7 +791,7 @@
788792 }
789793
790794 /** Get whether this index doesn't capitalize first letters of articles */
791 - public boolean getExactCase(){
 795+ public boolean isExactCase(){
792796 if(exactCase == null)
793797 exactCase = GlobalConfiguration.getInstance().exactCaseIndex(dbname);
794798 return exactCase;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -1158,10 +1158,6 @@
11591159 public boolean isMyHost(String host) {
11601160 return host.equalsIgnoreCase(hostAddr) || host.equalsIgnoreCase(hostName);
11611161 }
1162 -
1163 - public String getLanguage(IndexId iid){
1164 - return getLanguage(iid.getDBname());
1165 - }
11661162
11671163 /** Get language for a dbname */
11681164 public String getLanguage(String dbname) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/StartupManager.java
@@ -41,9 +41,9 @@
4242 // preload localizations
4343 HashSet<String> langCodes = new HashSet<String>();
4444 for(IndexId iid : global.getMyIndex())
45 - langCodes.add(global.getLanguage(iid.getDBname()));
 45+ langCodes.add(iid.getLangCode());
4646 for(IndexId iid : global.getMySearch())
47 - langCodes.add(global.getLanguage(iid.getDBname()));
 47+ langCodes.add(iid.getLangCode());
4848 Localization.readLocalizations(langCodes);
4949 Localization.loadInterwiki();
5050 // preload the unicode decomposer
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -250,7 +250,12 @@
251251 protected void updateCache(SearcherCache.SearcherPool pool, LocalIndex li){
252252 // do some typical queries to preload some lucene caches, pages into memory, etc..
253253 for(IndexSearcherMul is : pool.searchers){
254 - Warmup.warmupIndexSearcher(is,li.iid,true);
 254+ try{
 255+ Warmup.warmupIndexSearcher(is,li.iid,true);
 256+ } catch(IOException e){
 257+ e.printStackTrace();
 258+ log.warn("Error warming up "+li+" : "+e.getMessage());
 259+ }
255260 }
256261 // add to cache
257262 cache.invalidateLocalSearcher(li.iid,pool);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -3,9 +3,12 @@
44 import java.io.IOException;
55 import java.util.ArrayList;
66 import java.util.Collection;
 7+import java.util.HashSet;
78 import java.util.Hashtable;
89
910 import org.apache.log4j.Logger;
 11+import org.apache.lucene.analysis.SimpleAnalyzer;
 12+import org.apache.lucene.index.IndexReader;
1013 import org.apache.lucene.index.Term;
1114 import org.apache.lucene.search.Hits;
1215 import org.apache.lucene.search.Query;
@@ -13,6 +16,7 @@
1417 import org.wikimedia.lsearch.analyzers.Analyzers;
1518 import org.wikimedia.lsearch.analyzers.FieldBuilder;
1619 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
 20+import org.wikimedia.lsearch.analyzers.StopWords;
1721 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
1822 import org.wikimedia.lsearch.benchmark.SampleTerms;
1923 import org.wikimedia.lsearch.benchmark.Terms;
@@ -20,6 +24,7 @@
2125 import org.wikimedia.lsearch.config.Configuration;
2226 import org.wikimedia.lsearch.config.GlobalConfiguration;
2327 import org.wikimedia.lsearch.config.IndexId;
 28+import org.wikimedia.lsearch.spell.Suggest;
2429
2530 /**
2631 * Methods to warm up index and preload caches.
@@ -33,41 +38,57 @@
3439 protected static Hashtable<String,Terms> langTerms = new Hashtable<String,Terms>();
3540
3641 /** Runs some typical queries on a local index searcher to preload caches, pages into memory, etc .. */
37 - public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay){
 42+ public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay) throws IOException {
3843 if(iid.isLinks() || iid.isPrecursor())
3944 return; // no warmup for these
4045 log.info("Warming up index "+iid+" ...");
4146 long start = System.currentTimeMillis();
 47+ IndexReader reader = is.getIndexReader();
4248
4349 if(global == null)
4450 global = GlobalConfiguration.getInstance();
4551
4652 Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup");
47 - if(iid.isSpell() || iid.isPrefix()); // no warmup for spell-chekers and prefixes (for now)
48 - else if(warmup == null){
49 - makeNamespaceFilters(is,iid);
50 - simpleWarmup(is,iid);
51 - log.info("Warmed up "+iid);
52 - }
53 - else{
54 - int count;
55 - try{
56 - count = Integer.parseInt(warmup.get("count"));
57 - } catch(Exception e){
58 - log.warn("Wrong parameters for warmup of database "+iid+" in global settings");
59 - simpleWarmup(is,iid);
60 - return;
 53+ int count = warmup!=null? Integer.parseInt(warmup.get("count")) : 0;
 54+ if(iid.isSpell() && count > 0){
 55+ Terms terms = getTermsForLang(iid.getLangCode());
 56+ Suggest sug = new Suggest(iid,is,false);
 57+ WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid));
 58+ for(int i=0;i<count;i++){
 59+ String searchterm = terms.next();
 60+ sug.suggest(searchterm,parser.tokenizeBareText(searchterm),new Suggest.ExtraInfo(),new NamespaceFilter());
6161 }
62 - makeNamespaceFilters(is,iid);
63 - warmupSearchTerms(is,iid,count,useDelay);
64 - long delta = System.currentTimeMillis() - start;
65 - log.info("Warmed up "+iid+" in "+delta+" ms");
66 - }
 62+ } else if((iid.isPrefix() || iid.isHighlight() || iid.isRelated()) && count > 0 && !iid.isTitlesBySuffix()){
 63+ // NOTE: this might not warm up all caches, but should read stuff into memory buffers
 64+ for(int i=0;i<count;i++){
 65+ int docid = (int)(Math.random()*is.maxDoc());
 66+ reader.document(docid).get("key");
 67+ }
 68+ } else{
 69+ // normal indexes
 70+ if(count == 0){
 71+ makeNamespaceFilters(is,iid);
 72+ simpleWarmup(is,iid);
 73+ } else{
 74+ makeNamespaceFilters(is,iid);
 75+ warmupWithSearchTerms(is,iid,count,useDelay);
 76+ }
 77+ // wait for aggregate fields to be cached
 78+ while(AggregateMetaField.isBeingCached(reader)){
 79+ try {
 80+ Thread.sleep(100);
 81+ } catch (InterruptedException e) {
 82+ e.printStackTrace();
 83+ }
 84+ }
 85+ }
 86+ long delta = System.currentTimeMillis() - start;
 87+ log.info("Warmed up "+iid+" in "+delta+" ms");
6788 }
6889
6990 /** Warmup index using some number of simple searches */
70 - protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
71 - String lang = global.getLanguage(iid.getDBname());
 91+ protected static void warmupWithSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
 92+ String lang = iid.getLangCode();
7293 FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
7394 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
7495 Terms terms = getTermsForLang(lang);
@@ -79,7 +100,7 @@
80101 for(int j =0; j<20 && j<hits.length(); j++)
81102 hits.doc(j); // retrieve some documents
82103 if(useDelay){
83 - if(i<1000)
 104+ if(i<1000)
84105 Thread.sleep(100);
85106 else
86107 Thread.sleep(50);
@@ -126,7 +147,6 @@
127148 /** Just run one complex query and rebuild the main namespace filter */
128149 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
129150 try{
130 - String lang = global.getLanguage(iid.getDBname());
131151 FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
132152 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
133153 Query q = parser.parse("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java
@@ -73,6 +73,21 @@
7474 return makeQueryFromTerms(terms,field);
7575 }
7676
 77+ /** Make terms array for phrases */
 78+ public Term[] makeTerms(String wildcard, String field){
 79+ HashSet<String> terms = getCached(wildcard);
 80+ if(terms.size() == 0)
 81+ return null; // no match or error
 82+
 83+ trimTerms(terms);
 84+ Term[] ret = new Term[terms.size()];
 85+ int i = 0;
 86+ for(String w : terms)
 87+ ret[i++] = new Term(field,w);
 88+ return ret;
 89+
 90+ }
 91+
7792 protected HashSet<String> getCached(String wildcard){
7893 if(client == null)
7994 client = new RMIMessengerClient();
@@ -99,6 +114,16 @@
100115
101116 /** Construct DisjunctionMaxQuery from terms */
102117 protected Query makeQueryFromTerms(HashSet<String> terms, String field){
 118+ trimTerms(terms);
 119+
 120+ DisjunctionMaxQuery q = new DisjunctionMaxQuery(0);
 121+ for(String t : terms){
 122+ q.add(new TermQuery(new Term(field,t)));
 123+ }
 124+ return q;
 125+ }
 126+
 127+ private void trimTerms(HashSet<String> terms) {
103128 if(terms.size() > MAX_TERMS){
104129 HashSet<String> temp = new HashSet<String>();
105130 int count = 0;
@@ -110,13 +135,8 @@
111136 }
112137 terms = temp;
113138 }
114 - DisjunctionMaxQuery q = new DisjunctionMaxQuery(0);
115 - for(String t : terms){
116 - q.add(new TermQuery(new Term(field,t)));
117 - }
118 - return q;
119139 }
120 -
 140+
121141 public boolean hasWildcards(){
122142 return wildcardCache.size() > 0;
123143 }
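
makeTerms() is what lets wildcards appear inside phrases (per the commit comment): each wildcard position can be expanded to a term array and fed to a multi-position phrase query. A hedged sketch, assuming Lucene's MultiPhraseQuery is the consumer (the parser wiring is not shown in this hunk):

    Term[] expansions = wildcards.makeTerms("byz*", "contents");
    if (expansions != null) {
        MultiPhraseQuery phrase = new MultiPhraseQuery();
        phrase.add(expansions);                      // all matches for byz* at position 0
        phrase.add(new Term("contents", "empire"));  // literal token at position 1
    }
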
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Fuzzy.java
@@ -54,6 +54,14 @@
5555
5656 }
5757
 58+ public ArrayList<Float> getBoosts(String word, NamespaceFilter nsf, Term[] tt){
 59+ ArrayList<Float> boost = new ArrayList<Float>();
 60+ HashMap<String,Float> terms = getCached(word,nsf);
 61+ for(Term t : tt)
 62+ boost.add(terms.get(t.text()));
 63+ return boost;
 64+ }
 65+
5866 public ArrayList<Float> getBoosts(String word, NamespaceFilter nsf, ArrayList<String> words){
5967 ArrayList<Float> boost = new ArrayList<Float>();
6068 HashMap<String,Float> terms = getCached(word,nsf);
@@ -76,6 +84,20 @@
7785 // actually make query
7886 return makeQueryFromTerms(terms, field);
7987 }
 88+ /** Make a term array without boost */
 89+ public Term[] makeTerms(String word, String field, NamespaceFilter nsf){
 90+ if(client == null)
 91+ client = new RMIMessengerClient();
 92+ HashMap<String,Float> terms = getCached(word,nsf);
 93+ if(terms.size() == 0)
 94+ return null;
 95+
 96+ Term[] ret = new Term[terms.size()];
 97+ int i=0;
 98+ for(String w : terms.keySet())
 99+ ret[i++] = new Term(field,w);
 100+ return ret;
 101+ }
80102
81103 protected HashMap<String,Float> getCached(String word, NamespaceFilter nsf){
82104 String key = cacheKey(word,nsf);
@@ -99,7 +121,10 @@
100122 /** Calculate boost factor for suggest result - larger edit distance = smaller boost */
101123 protected float getBoost(SuggestResult r){
102124 int dist = r.getDist()+r.getDistMetaphone();
103 - return (float)(1.0/Math.pow(2,dist));
 125+ double d = r.getDist();
 126+ double l = r.getWord().length();
 127+ // 2^(-dist) * len_prop * 2^E(dist)
 128+ return (float)((1.0/Math.pow(2,dist))*((l-d)/l)*4);
104129 }
105130
106131 private Query makeQueryFromTerms(HashMap<String,Float> terms, String field) {
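
The reworked getBoost() now scales the old 2^(-dist) factor by the proportion of the word left intact and a constant 4 (the 2^E(dist) of the comment). A worked sketch of the arithmetic (parameter names are illustrative):

    // editDist=1, metaphoneDist=1 on an 8-letter word:
    // (1/2^2) * ((8-1)/8) * 4 = 0.875
    float boost(int editDist, int metaphoneDist, int wordLen) {
        int dist = editDist + metaphoneDist;
        double lenProp = (wordLen - editDist) / (double) wordLen;
        return (float)((1.0 / Math.pow(2, dist)) * lenProp * 4);
    }
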
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -41,10 +41,12 @@
4242 import org.wikimedia.lsearch.frontend.SearchServer;
4343 import org.wikimedia.lsearch.highlight.Highlight;
4444 import org.wikimedia.lsearch.highlight.HighlightResult;
 45+import org.wikimedia.lsearch.index.MessengerThread;
4546 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
4647 import org.wikimedia.lsearch.ranks.StringList;
4748 import org.wikimedia.lsearch.related.Related;
4849 import org.wikimedia.lsearch.related.RelatedTitle;
 50+import org.wikimedia.lsearch.spell.Suggest;
4951 import org.wikimedia.lsearch.spell.SuggestQuery;
5052 import org.wikimedia.lsearch.util.Localization;
5153
@@ -79,24 +81,28 @@
8082 }
8183
8284 /** Main search method, call this from the search frontend */
83 - public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) {
 85+ public SearchResults search(IndexId iid, String what, String searchterm, HashMap query, double version) {
8486
8587 if (what.equals("search") || what.equals("explain")) {
8688 int offset = 0, limit = 100; boolean exactCase = false;
87 - int iwlimit = 10;
 89+ int iwlimit = 10; int iwoffset = 0;
8890 boolean searchOnly = false;
8991 if (query.containsKey("offset"))
9092 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
9193 if (query.containsKey("limit"))
9294 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES);
 95+ if (query.containsKey("iwoffset"))
 96+ iwoffset = Math.max(Integer.parseInt((String)query.get("iwoffset")), 0);
9397 if (query.containsKey("iwlimit"))
9498 iwlimit = Math.min(Integer.parseInt((String)query.get("iwlimit")), MAXLINES);
9599 if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
96100 exactCase = true;
97101 if(query.containsKey("searchonly"))
98102 searchOnly = Boolean.parseBoolean((String)query.get("searchonly"));
 103+ if(version <= 2)
 104+ searchOnly = true;
99105 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
100 - SearchResults res = search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
 106+ SearchResults res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
101107 if(res!=null && res.isRetry()){
102108 int retries = 0;
103109 if(iid.isSplit() || iid.isNssplit()){
@@ -105,7 +111,7 @@
106112 retries = 1;
107113
108114 while(retries > 0 && res.isRetry()){
109 - res = search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
 115+ res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
110116 retries--;
111117 }
112118 if(res.isRetry())
@@ -114,24 +120,26 @@
115121 return res;
116122 } else if (what.equals("raw") || what.equals("rawexplain")) {
117123 int offset = 0, limit = 100; boolean exactCase = false;
118 - int iwlimit = 10;
 124+ int iwlimit = 10; int iwoffset = 0;
119125 if (query.containsKey("offset"))
120126 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
121127 if (query.containsKey("limit"))
122128 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES);
 129+ if (query.containsKey("iwoffset"))
 130+ iwoffset = Math.max(Integer.parseInt((String)query.get("iwoffset")), 0);
123131 if (query.containsKey("iwlimit"))
124132 iwlimit = Math.min(Integer.parseInt((String)query.get("iwlimit")), MAXLINES);
125133 if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
126134 exactCase = true;
127135 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
128 - return search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("rawexplain"), exactCase, true, true);
 136+ return search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("rawexplain"), exactCase, true, true);
129137 } else if (what.equals("titlematch")) {
130138 // TODO: return searchTitles(searchterm);
131139 } else if (what.equals("prefix")){
132140 int limit = MAXPREFIX;
133141 if (query.containsKey("limit"))
134142 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXPREFIX);
135 - SearchResults res = prefixSearch(iid, searchterm, limit);
 143+ SearchResults res = searchPrefix(iid, searchterm, limit);
136144 if(query.containsKey("format")){
137145 String format = (String)query.get("format");
138146 if(format.equalsIgnoreCase("json"))
@@ -146,7 +154,7 @@
147155 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
148156 if (query.containsKey("limit"))
149157 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES);
150 - return relatedSearch(iid, searchterm, offset, limit);
 158+ return searchRelated(iid, searchterm, offset, limit);
151159 } else {
152160 SearchResults res = new SearchResults();
153161 res.setErrorMsg("Unrecognized search type. Try one of: " +
@@ -184,43 +192,47 @@
185193 return "";
186194 }
187195
188 - protected SearchResults relatedSearch(IndexId iid, String searchterm, int offset, int limit) {
 196+ protected SearchResults searchRelated(IndexId iid, String searchterm, int offset, int limit) {
 197+ RMIMessengerClient messenger = new RMIMessengerClient();
 198+ String host = cache.getRandomHost(iid.getRelated());
 199+ return messenger.searchRelated(host,iid.toString(),searchterm,offset,limit);
 200+
 201+ }
 202+
 203+ /** Search on a local related index (called via RMI) */
 204+ public SearchResults searchRelatedLocal(IndexId iid, String searchterm, int offset, int limit) throws IOException {
189205 readLocalization(iid);
190206 IndexId rel = iid.getRelated();
191207 SearcherCache cache = SearcherCache.getInstance();
192208 SearchResults res = new SearchResults();
193 - try {
194 - IndexSearcherMul searcher = cache.getLocalSearcher(rel);
195 - IndexReader reader = searcher.getIndexReader();
196 - String key = getKey(searchterm,iid);
197 - TermDocs td = reader.termDocs(new Term("key",key));
198 - if(td.next()){
199 - ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection());
200 - res.setNumHits(col.size());
201 - res.setSuccess(true);
202 - for(int i=offset;i<offset+limit && i<col.size();i++){
203 - RelatedTitle rt = col.get(i);
204 - Title t = rt.getRelated();
205 - ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle());
206 - res.addResult(rs);
207 - }
208 - // highlight stuff
209 - Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
210 - NamespaceFilter nsDefault = new NamespaceFilter(key.substring(0,key.indexOf(':')));
211 - FieldBuilder.BuilderSet bs = new FieldBuilder(iid).getBuilder();
212 - HashSet<String> stopWords = StopWords.getPredefinedSet(iid);
213 - WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,NamespacePolicy.IGNORE,stopWords);
214 - Query q = parser.parse(key.substring(key.indexOf(':')+1),new WikiQueryParser.ParsingOptions(true));
215 - highlight(iid,q,parser.getWordsClean(),searcher,res,parser.hasPhrases());
216 - } else{
217 - res.setSuccess(true);
218 - res.setNumHits(0);
 209+
 210+ IndexSearcherMul searcher = cache.getLocalSearcher(rel);
 211+ IndexReader reader = searcher.getIndexReader();
 212+ String key = getKey(searchterm,iid);
 213+ TermDocs td = reader.termDocs(new Term("key",key));
 214+ if(td.next()){
 215+ ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection());
 216+ res.setNumHits(col.size());
 217+ res.setSuccess(true);
 218+ for(int i=offset;i<offset+limit && i<col.size();i++){
 219+ RelatedTitle rt = col.get(i);
 220+ Title t = rt.getRelated();
 221+ ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle());
 222+ res.addResult(rs);
219223 }
220 - } catch (IOException e) {
221 - e.printStackTrace();
222 - log.error("I/O error in relatedSearch on "+rel+" : "+e.getMessage());
223 - res.setErrorMsg("I/O Error processing index for "+rel);
 224+ // highlight stuff
 225+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
 226+ NamespaceFilter nsDefault = new NamespaceFilter(key.substring(0,key.indexOf(':')));
 227+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid).getBuilder();
 228+ HashSet<String> stopWords = StopWords.getPredefinedSet(iid);
 229+ WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,NamespacePolicy.IGNORE,stopWords);
 230+ Query q = parser.parse(key.substring(key.indexOf(':')+1),new WikiQueryParser.ParsingOptions(true));
 231+ highlight(iid,q,parser.getWordsClean(),searcher,res,true,true);
 232+ } else{
 233+ res.setSuccess(true);
 234+ res.setNumHits(0);
224235 }
 236+
225237 return res;
226238 }
227239
@@ -236,7 +248,7 @@
237249 }
238250 }
239251
240 - protected SearchResults prefixSearch(IndexId iid, String searchterm, int limit) {
 252+ protected SearchResults searchPrefix(IndexId iid, String searchterm, int limit) {
241253 readLocalization(iid);
242254 IndexId pre = iid.getPrefix();
243255 SearcherCache cache = SearcherCache.getInstance();
@@ -313,7 +325,7 @@
314326 // search
315327 SearchResults res = makeTitlesSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
316328 // highlight
317 - highlightTitles(iid,q,words,searcher,res,sortByPhrases);
 329+ highlightTitles(iid,q,words,searcher,res,sortByPhrases,false);
318330 return res;
319331 } catch (IOException e) {
320332 e.printStackTrace();
@@ -362,7 +374,8 @@
363375 * Search on iid, with query searchterm. View results from offset to offset+limit, using
364376 * the default namespaces filter
365377 */
366 - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, int iwlimit, NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw, boolean searchOnly){
 378+ public SearchResults search(IndexId iid, String searchterm, int offset, int limit, int iwoffset, int iwlimit,
 379+ NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw, boolean searchOnly){
367380 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
368381 if(nsDefault == null || nsDefault.cardinality() == 0)
369382 nsDefault = new NamespaceFilter("0"); // default to main namespace
@@ -429,8 +442,8 @@
430443 HighlightPack pack = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host);
431444 res = pack.res;
432445 if(!searchOnly){
433 - highlight(iid,q,parser.getWordsClean(),pack.terms,pack.dfs,pack.maxDoc,res,exactCase,null,parser.hasPhrases());
434 - fetchTitles(res,searchterm,nsfw,iid,parser,offset,0,iwlimit,explain);
 446+ highlight(iid,q,parser.getWordsClean(),pack.terms,pack.dfs,pack.maxDoc,res,exactCase,null,parser.hasPhrases(),false);
 447+ fetchTitles(res,searchterm,nsfw,iid,parser,offset,iwoffset,iwlimit,explain);
435448 suggest(iid,searchterm,parser,res,offset,nsfw);
436449 }
437450 return res;
@@ -459,8 +472,8 @@
460473 hits = searcher.search(q,nsfw,offset+limit);
461474 res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
462475 if(!searchOnly){
463 - highlight(iid,q,parser.getWordsClean(),searcher,parser.getHighlightTerms(),res,exactCase,parser.hasPhrases());
464 - fetchTitles(res,searchterm,nsfw,iid,parser,offset,0,iwlimit,explain);
 476+ highlight(iid,q,parser.getWordsClean(),searcher,parser.getHighlightTerms(),res,exactCase,parser.hasPhrases(),false);
 477+ fetchTitles(res,searchterm,nsfw,iid,parser,offset,iwoffset,iwlimit,explain);
465478 suggest(iid,searchterm,parser,res,offset,nsfw);
466479 }
467480 return res;
@@ -506,7 +519,8 @@
507520 RMIMessengerClient messenger = new RMIMessengerClient();
508521 // find host
509522 String host = cache.getRandomHost(iid.getSpell());
510 - SuggestQuery sq = messenger.suggest(host,iid.toString(),searchterm,tokens,res.getPhrases(),res.getFoundInContext(),res.getFirstHitRank(),nsfw.getFilter());
 523+ Suggest.ExtraInfo info = new Suggest.ExtraInfo(res.getPhrases(),res.getFoundInContext(),res.getFoundInTitles(),res.getFirstHitRank());
 524+ SuggestQuery sq = messenger.suggest(host,iid.toString(),searchterm,tokens,info,nsfw.getFilter());
511525 res.setSuggest(sq);
512526 }
513527 }
@@ -595,7 +609,7 @@
596610
597611 TopDocs hits = searcher.search(q,wrap,iwoffset+iwlimit);
598612 SearchResults r = makeTitlesSearchResults(searcher,hits,iwoffset,iwlimit,main,searchterm,q,searchStart,explain);
599 - highlightTitles(main,q,words,searcher,r,parser.hasWildcards());
 613+ highlightTitles(main,q,words,searcher,r,parser.hasWildcards(),false);
600614
601615 if(r.isSuccess()){
602616 res.setTitles(r.getResults());
@@ -697,38 +711,38 @@
698712 }
699713
700714 /** Highlight search results, and set the property in ResultSet */
701 - protected void highlight(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, Term[] terms, SearchResults res, boolean exactCase, boolean sortByPhrases) throws IOException{
 715+ protected void highlight(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, Term[] terms, SearchResults res, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
702716 int[] df = searcher.docFreqs(terms);
703717 int maxDoc = searcher.maxDoc();
704 - highlight(iid,q,words,terms,df,maxDoc,res,exactCase,null,sortByPhrases);
 718+ highlight(iid,q,words,terms,df,maxDoc,res,exactCase,null,sortByPhrases,alwaysIncludeFirst);
705719 }
706720
707721 /** Highlight search results, and set the property in ResultSet */
708 - protected void highlight(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases) throws IOException{
 722+ protected void highlight(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
709723 Term[] terms = getTerms(q,"contents");
710724 int[] df = searcher.docFreqs(terms);
711725 int maxDoc = searcher.maxDoc();
712 - highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases);
 726+ highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases,alwaysIncludeFirst);
713727 }
714728
715729 /** Highlight search results from titles index */
716 - protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases) throws IOException{
 730+ protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
717731 Term[] terms = getTerms(q,"alttitle");
718732 int[] df = searcher.docFreqs(terms);
719733 int maxDoc = searcher.maxDoc();
720 - highlight(iid,q,words,terms,df,maxDoc,res,false,searcher.getIndexReader(),sortByPhrases);
 734+ highlight(iid,q,words,terms,df,maxDoc,res,false,searcher.getIndexReader(),sortByPhrases,alwaysIncludeFirst);
721735 }
722736
723737 /** Highlight search results from titles index using a wikisearcher */
724 - protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, SearchResults res, boolean sortByPhrases) throws IOException{
 738+ protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
725739 Term[] terms = getTerms(q,"alttitle");
726740 int[] df = searcher.docFreqs(terms);
727741 int maxDoc = searcher.maxDoc();
728 - highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases);
 742+ highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases,alwaysIncludeFirst);
729743 }
730744
731745 /** Highlight article (don't call directly, use one of the interfaces above instead) */
732 - protected void highlight(IndexId iid, Query q, ArrayList<String> words, Term[] terms, int[] df, int maxDoc, SearchResults res, boolean exactCase, IndexReader reader, boolean sortByPhrases) throws IOException{
 746+ protected void highlight(IndexId iid, Query q, ArrayList<String> words, Term[] terms, int[] df, int maxDoc, SearchResults res, boolean exactCase, IndexReader reader, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
733747 // iid -> array of keys
734748 HashMap<IndexId,ArrayList<String>> map = new HashMap<IndexId,ArrayList<String>>();
735749 iid = iid.getHighlight();
@@ -755,17 +769,18 @@
756770 Highlight.ResultSet rs = null;
757771 if(reader != null){
758772 // we got a local reader, use it
759 - rs = Highlight.highlight(e.getValue(),hiid,terms,df,maxDoc,words,stopWords,exactCase,reader,sortByPhrases);
 773+ rs = Highlight.highlight(e.getValue(),hiid,terms,df,maxDoc,words,stopWords,exactCase,reader,sortByPhrases,alwaysIncludeFirst);
760774 } else{
761775 // remote call
762776 String host = cache.getRandomHost(hiid);
763 - rs = messenger.highlight(host,e.getValue(),hiid.toString(),terms,df,maxDoc,words,exactCase,sortByPhrases);
 777+ rs = messenger.highlight(host,e.getValue(),hiid.toString(),terms,df,maxDoc,words,exactCase,sortByPhrases,alwaysIncludeFirst);
764778 }
765779 results.putAll(rs.highlighted);
766780 res.getPhrases().addAll(rs.phrases);
767781 res.getFoundInContext().addAll(rs.foundInContext);
768782 if(rs.foundAllInTitle && words.size()>1)
769 - res.setFoundAllInTitle(true);
 783+ res.setFoundAllInTitle(true);
 784+ res.getFoundInTitles().addAll(rs.foundInTitles);
770785 }
771786 }
772787 res.addToFirstHitRank(res.getNumHits());
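
The new version parameter is the daemon's compatibility switch from the commit comment: clients reporting version <= 2 get searchOnly results, with highlighting, interwiki titles and suggestions skipped. A hedged call sketch (engine and iid are assumed to exist; the raw HashMap usage mirrors the code above, values are illustrative):

    HashMap query = new HashMap();
    query.put("offset", "0");
    query.put("limit", "20");
    // version 2.0 forces searchOnly=true for pre-2.1 clients
    SearchResults res = engine.search(iid, "search", "byzantine empire", query, 2.0);
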
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java
@@ -3,6 +3,9 @@
44 import java.io.IOException;
55 import java.util.Collection;
66 import java.util.HashMap;
 7+import java.util.HashSet;
 8+import java.util.Hashtable;
 9+import java.util.Set;
710 import java.util.StringTokenizer;
811 import java.util.WeakHashMap;
912
@@ -21,10 +24,16 @@
2225 *
2326 */
2427 public class AggregateMetaField {
25 - static Logger log = Logger.getLogger(RankField.class);
 28+ static Logger log = Logger.getLogger(AggregateMetaField.class);
2629 protected static WeakHashMap<IndexReader,HashMap<String,AggregateMetaFieldSource>> cache = new WeakHashMap<IndexReader,HashMap<String,AggregateMetaFieldSource>>();
2730 protected static Object lock = new Object();
 31+ protected static Hashtable<IndexReader,AggregateMetaFieldSource> cachingInProgress = new Hashtable<IndexReader,AggregateMetaFieldSource>();
2832
 33+ /** Check if background caching is currently in progress on a reader */
 34+ public static boolean isBeingCached(IndexReader reader){
 35+ return cachingInProgress.containsKey(reader);
 36+ }
 37+
2938 /** Get a cached field source
3039 * @throws IOException */
3140 public static AggregateMetaFieldSource getCachedSource(IndexReader reader, String field) throws IOException{
@@ -64,56 +73,63 @@
6574
6675 protected class CachingThread extends Thread {
6776 public void run(){
68 - log.info("Caching aggregate field "+field+" for "+reader.directory());
69 - int maxdoc = reader.maxDoc();
70 - index = new int[maxdoc];
71 - int count = 0;
72 - length = new byte[maxdoc]; // estimate maxdoc values
73 - lengthNoStopWords = new byte[maxdoc];
74 - lengthComplete = new byte[maxdoc];
75 - boost = new float[maxdoc];
76 - namespaces = new byte[maxdoc];
77 - for(int i=0;i<maxdoc;i++){
78 - byte[] stored = null;
79 - try{
80 - Document doc = reader.document(i);
81 - stored = doc.getBinaryValue(field);
82 - namespaces[i] = (byte)Integer.parseInt(doc.get("namespace"));
83 - index[i] = count;
84 - if(stored == null)
85 - continue;
86 - for(int j=0;j<stored.length/7;j++){
87 - if(count >= length.length){
88 - length = extendBytes(length);
89 - lengthNoStopWords = extendBytes(lengthNoStopWords);
90 - lengthComplete = extendBytes(lengthComplete);
91 - boost = extendFloats(boost);
92 - }
93 - length[count] = stored[j*7];
94 - if(length[count] == 0){
95 - log.debug("Broken length=0 for docid="+i+", at position "+j);
96 - }
97 - lengthNoStopWords[count] = stored[j*7+1];
98 - int boostInt = (((stored[j*7+2]&0xff) << 24) + ((stored[j*7+3]&0xff) << 16) + ((stored[j*7+4]&0xff) << 8) + ((stored[j*7+5]&0xff) << 0));
99 - boost[count] = Float.intBitsToFloat(boostInt);
100 - lengthComplete[count] = stored[j*7+6];
101 -
102 - count++;
103 - }
104 - } catch(Exception e){
105 - log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage());
106 - e.printStackTrace();
 77+ cachingInProgress.put(reader,AggregateMetaFieldSource.this);
 78+ try{
 79+ log.info("Caching aggregate field "+field+" for "+reader.directory());
 80+ int maxdoc = reader.maxDoc();
 81+ index = new int[maxdoc];
 82+ int count = 0;
 83+ length = new byte[maxdoc]; // estimate maxdoc values
 84+ lengthNoStopWords = new byte[maxdoc];
 85+ lengthComplete = new byte[maxdoc];
 86+ boost = new float[maxdoc];
 87+ namespaces = new byte[maxdoc];
 88+ for(int i=0;i<maxdoc;i++){
 89+ byte[] stored = null;
 90+ try{
 91+ Document doc = reader.document(i);
 92+ stored = doc.getBinaryValue(field);
 93+ namespaces[i] = (byte)Integer.parseInt(doc.get("namespace"));
 94+ index[i] = count;
 95+ if(stored == null)
 96+ continue;
 97+ for(int j=0;j<stored.length/7;j++){
 98+ if(count >= length.length){
 99+ length = extendBytes(length);
 100+ lengthNoStopWords = extendBytes(lengthNoStopWords);
 101+ lengthComplete = extendBytes(lengthComplete);
 102+ boost = extendFloats(boost);
 103+ }
 104+ length[count] = stored[j*7];
 105+ if(length[count] == 0){
 106+ log.debug("Broken length=0 for docid="+i+", at position "+j);
 107+ }
 108+ lengthNoStopWords[count] = stored[j*7+1];
 109+ int boostInt = (((stored[j*7+2]&0xff) << 24) + ((stored[j*7+3]&0xff) << 16) + ((stored[j*7+4]&0xff) << 8) + ((stored[j*7+5]&0xff) << 0));
 110+ boost[count] = Float.intBitsToFloat(boostInt);
 111+ lengthComplete[count] = stored[j*7+6];
 112+
 113+ count++;
 114+ }
 115+ } catch(Exception e){
 116+ log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage());
 117+ e.printStackTrace();
 118+ }
107119 }
 120+ // compact arrays
 121+ if(count < length.length - 1){
 122+ length = resizeBytes(length,count);
 123+ lengthNoStopWords = resizeBytes(lengthNoStopWords,count);
 124+ boost = resizeFloats(boost,count);
 125+ lengthComplete = resizeBytes(lengthComplete,count);
 126+ }
 127+ log.info("Finished caching aggregate "+field+" for "+reader.directory());
 128+ cachingFinished = true;
 129+ } catch(Exception e){
 130+ e.printStackTrace();
 131+ log.error("Whole caching failed on field="+field+", reader="+reader);
108132 }
109 - // compact arrays
110 - if(count < length.length - 1){
111 - length = resizeBytes(length,count);
112 - lengthNoStopWords = resizeBytes(lengthNoStopWords,count);
113 - boost = resizeFloats(boost,count);
114 - lengthComplete = resizeBytes(lengthComplete,count);
115 - }
116 - log.info("Finished caching aggregate "+field+" for "+reader.directory());
117 - cachingFinished = true;
 133+ cachingInProgress.remove(reader);
118134 }
119135 protected byte[] extendBytes(byte[] array){
120136 return resizeBytes(array,array.length*2);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java
@@ -39,22 +39,24 @@
4040 included = new BitSet(64);
4141 }
4242
 43+ /** "all" filter */
4344 public NamespaceFilter() {
4445 init();
4546 }
4647
 48+ /** filter namespaces */
4749 public NamespaceFilter(Collection<Integer> namespaces){
4850 init();
4951 for(Integer namespace : namespaces){
5052 included.set(namespace.intValue());
5153 }
5254 }
53 -
 55+ /** filter on one namespace */
5456 public NamespaceFilter(int namespace){
5557 init();
5658 included.set(namespace);
5759 }
58 -
 60+ /** filter a set of namespaces separated by commas, e.g. 0,2,10 */
5961 public NamespaceFilter(String namespaces) {
6062 init();
6163 if (namespaces != null && !namespaces.equals("")) {
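
The newly documented constructors in use (values illustrative):

    NamespaceFilter all   = new NamespaceFilter();          // match every namespace
    NamespaceFilter main  = new NamespaceFilter(0);         // single namespace
    NamespaceFilter multi = new NamespaceFilter("0,2,10");  // comma-separated list
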
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java
@@ -70,12 +70,15 @@
7171 public boolean foundAllInTitle;
7272 /** Rank of the first hit, used as title-suggestion threshold */
7373 public int firstHitRank = 0;
74 - public ResultSet(HashMap<String, HighlightResult> highlighted, HashSet<String> phrases, HashSet<String> foundInContext, boolean foundAllInTitle, int firstHitRank) {
 74+ /** Words found in titles */
 75+ public HashSet<String> foundInTitles;
 76+ public ResultSet(HashMap<String, HighlightResult> highlighted, HashSet<String> phrases, HashSet<String> foundInContext, boolean foundAllInTitle, int firstHitRank, HashSet<String> foundInTitles) {
7577 this.highlighted = highlighted;
7678 this.phrases = phrases;
7779 this.foundInContext = foundInContext;
7880 this.foundAllInTitle = foundAllInTitle;
7981 this.firstHitRank = firstHitRank;
 82+ this.foundInTitles = foundInTitles;
8083 }
8184 }
8285 /**
@@ -87,10 +90,12 @@
8891 * @param words - in order words (from main phrase)
8992 * @param exactCase - if these are results from exactCase search
9093 * @throws IOException
91 - * @returns map: key -> what to highlight
 94+ * @returns resultset
9295 */
9396 @SuppressWarnings("unchecked")
94 - public static ResultSet highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc, ArrayList<String> words, HashSet<String> stopWords, boolean exactCase, IndexReader reader, boolean sortByPhrases) throws IOException{
 97+ public static ResultSet highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc,
 98+ ArrayList<String> words, HashSet<String> stopWords, boolean exactCase, IndexReader reader,
 99+ boolean sortByPhrases, boolean alwaysIncludeFirstLine) throws IOException{
95100 if(cache == null)
96101 cache = SearcherCache.getInstance();
97102
@@ -101,6 +106,7 @@
102107 HashSet<String> inContext = new HashSet<String>();
103108 boolean foundAllInTitle = false;
104109 int firstHitRank = 0;
 110+ HashSet<String> inTitle = new HashSet<String>();
105111
106112 // terms weighted with idf
107113 HashMap<String,Double> weightTerm = new HashMap<String,Double>();
@@ -140,8 +146,8 @@
141147 firstHitRank = alttitles.getTitle().getRank();
142148
143149 HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles,wordIndex);
144 - ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases );
145 - ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,wordIndex,1,true,stopWords,false,phrases,inContext,false);
 150+ ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine );
 151+ ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,wordIndex,1,true,stopWords,false,phrases,inContext,false,false);
146152 int redirectAdditional = 0;
147153 if(titleSnippets.size()>0 &&
148154 ((titleSnippets.get(0).found.containsAll(words) && textTokenLength(titleSnippets.get(0).tokens) == words.size())
@@ -219,12 +225,14 @@
220226 hr.setTitle(titleSnippets.get(0).makeSnippet(256,true));
221227 if(titleSnippets.get(0).found.containsAll(words))
222228 foundAllInTitle = true;
 229+ inTitle.addAll(titleSnippets.get(0).found);
223230 }
224231
225232 if(redirectSnippets != null){
226233 hr.setRedirect(redirectSnippets.makeSnippet(MAX_CONTEXT,true));
227234 if(!foundAllInTitle && redirectSnippets.found.containsAll(words))
228235 foundAllInTitle = true;
 236+ inTitle.addAll(redirectSnippets.found);
229237 }
230238
231239 if(sectionSnippets != null){
@@ -240,7 +248,7 @@
241249 res.put(key,hr);
242250
243251 }
244 - return new ResultSet(res,phrases,inContext,foundAllInTitle,firstHitRank);
 252+ return new ResultSet(res,phrases,inContext,foundAllInTitle,firstHitRank,inTitle);
245253 }
246254
247255 /** Number of tokens excluding aliases and glue stuff */
@@ -423,7 +431,7 @@
424432 }
425433 }
426434 if((completeMatch && additional >= minAdditional) || additional > minAdditional || (additional != 0 && additional == notInTitle.size())){
427 - ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 1, false, stopWords, false, phrases, inContext, false);
 435+ ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false);
428436 if(snippets.size() > 0){
429437 RawSnippet snippet = snippets.get(0);
430438 snippet.setAlttitle(ainf);
@@ -498,7 +506,7 @@
499507 /** Highlight text */
500508 protected static ArrayList<RawSnippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms,
501509 HashMap<String,Integer> wordIndex, int maxSnippets, boolean ignoreBreaks, HashSet<String> stopWords, boolean showFirstIfNone,
502 - HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases) {
 510+ HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases, final boolean alwaysIncludeFirstLine) {
503511
504512 // pieces of text to be highlighted
505513 ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>();
@@ -680,6 +688,12 @@
681689 // find fragments with best score
682690 Collections.sort(fragments, new Comparator<FragmentScore>() {
683691 public int compare(FragmentScore o1, FragmentScore o2) {
 692+ if(alwaysIncludeFirstLine){
 693+ if(o1.isFirstSentence)
 694+ return -1;
 695+ if(o2.isFirstSentence)
 696+ return 1;
 697+ }
684698 // sort via longest phrase found
685699 int c = o2.bestCount - o1.bestCount;
686700 if(sortByPhrases && c != 0)
@@ -698,7 +712,7 @@
699713 HashSet<String> termsFound = new HashSet<String>();
700714 ArrayList<FragmentScore> resNoNew = new ArrayList<FragmentScore>();
701715 for(FragmentScore f : fragments){
702 - if(f.score == 0)
 716+ if(f.score == 0 && !(alwaysIncludeFirstLine && f.isFirstSentence))
703717 break;
704718 // check if the fragment has new terms
705719 boolean hasNew = false;
@@ -711,7 +725,7 @@
712726 }
713727 }
714728 }
715 - if(hasNew){
 729+ if(hasNew || (alwaysIncludeFirstLine && f.isFirstSentence)){
716730 if(f.found != null)
717731 termsFound.addAll(f.found);
718732 adjustBest(f,tokens,weightTerms,wordIndex,newTerms);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -437,11 +437,11 @@
438438 dbUpdates = new Hashtable<String,IndexUpdateRecord>();
439439 queuedUpdates.put(iid.toString(), dbUpdates);
440440 }
441 - IndexUpdateRecord oldr = dbUpdates.get(record.getKey());
 441+ IndexUpdateRecord oldr = dbUpdates.get(record.getIndexKey());
442442 // combine a previous delete with current add to form update
443443 if(oldr != null && oldr.doDelete() && record.doAdd())
444444 record.setAction(IndexUpdateRecord.Action.UPDATE);
445 - dbUpdates.put(record.getKey(),record);
 445+ dbUpdates.put(record.getIndexKey(),record);
446446 }
447447
448448 log.debug("Locally queued item: "+record);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java
@@ -149,16 +149,16 @@
150150 }
151151
152152 /**
153 - * @return Returns the page key -- page_id (via article)
 153+ * @return page index key -- page_id (via article)
154154 */
155 - public String getKey(){
156 - return article.getKey();
 155+ public String getIndexKey(){
 156+ return article.getIndexKey();
157157 }
158158
159159 /**
160 - * @return Highlight key -- ns:title
 160+ * @return ns:title key, used in links, highlight, prefix, etc.. indexes
161161 */
162 - public String getHighlightKey(){
 162+ public String getNsTitleKey(){
163163 return article.getTitleObject().getKey();
164164 }
165165
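
The rename makes the two key flavours explicit; WikiIndexModifier below picks between them per index type. Illustrative values only:

    String docKey = rec.getIndexKey();   // page_id key, e.g. "12345"
    String hlKey  = rec.getNsTitleKey(); // ns:title key, e.g. "0:Byzantine Empire"
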
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -59,11 +59,16 @@
6060 import org.wikimedia.lsearch.config.GlobalConfiguration;
6161 import org.wikimedia.lsearch.config.IndexId;
6262 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 63+import org.wikimedia.lsearch.prefix.PrefixIndexBuilder;
6364 import org.wikimedia.lsearch.ranks.Links;
6465 import org.wikimedia.lsearch.ranks.StringList;
 66+import org.wikimedia.lsearch.related.Related;
6567 import org.wikimedia.lsearch.related.RelatedTitle;
6668 import org.wikimedia.lsearch.search.NamespaceFilter;
 69+import org.wikimedia.lsearch.spell.CleanIndexImporter;
 70+import org.wikimedia.lsearch.spell.CleanIndexWriter;
6771 import org.wikimedia.lsearch.spell.api.SpellCheckIndexer;
 72+import org.wikimedia.lsearch.storage.RelatedStorage;
6873 import org.wikimedia.lsearch.util.Buffer;
6974 import org.wikimedia.lsearch.util.Localization;
7075 import org.wikimedia.lsearch.util.MathFunc;
@@ -155,9 +160,9 @@
156161 if(rec.doDelete()){
157162 int count = 0;
158163 if(iid.isHighlight())
159 - count = reader.deleteDocuments(new Term("key", rec.getHighlightKey()));
 164+ count = reader.deleteDocuments(new Term("key", rec.getNsTitleKey()));
160165 else // normal or titles index
161 - count = reader.deleteDocuments(new Term("key", rec.getKey()));
 166+ count = reader.deleteDocuments(new Term("key", rec.getIndexKey()));
162167 if(count == 0)
163168 nonDeleteDocuments.add(rec);
164169 IndexReportCard card = getReportCard(rec);
@@ -167,7 +172,7 @@
168173 else
169174 card.setSuccessfulDelete();
170175 }
171 - log.debug(iid+": Deleting document "+rec.getKey()+" "+rec.getArticle());
 176+ log.debug(iid+": Deleting document "+rec.getIndexKey()+" "+rec.getArticle());
172177 }
173178 }
174179 reader.close();
@@ -231,7 +236,7 @@
232237 writer.addDocument(doc,indexAnalyzer);
233238 }
234239
235 - log.debug(iid+": Adding document "+rec.getKey()+" "+rec.getArticle());
 240+ log.debug(iid+": Adding document "+rec.getIndexKey()+" "+rec.getArticle());
236241 if(card != null)
237242 card.setSuccessfulAdd();
238243 } catch (IOException e) {
@@ -241,7 +246,7 @@
242247 succ = false; // report unsucc, but still continue, to process all cards
243248 } catch(Exception e){
244249 e.printStackTrace();
245 - log.error("Error adding document "+rec.getKey()+" with message: "+e.getMessage());
 250+ log.error("Error adding document "+rec.getIndexKey()+" with message: "+e.getMessage());
246251 if(card != null)
247252 card.setFailedAdd();
248253 succ = false; // report unsucc, but still continue, to process all cards
@@ -410,16 +415,124 @@
411416 *
412417 * @param iid
413418 * @param updateRecords
 419+ * @return success
414420 */
415421 public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
416 - boolean index = updateDocumentsOn(iid,updateRecords,iid);
417 - boolean highlight = updateDocumentsOn(iid.getHighlight(),updateRecords,iid);
418 - boolean titles = true;
 422+ return updateLinks(iid,updateRecords)
 423+ && fetchLinksInfo(iid,updateRecords)
 424+ && updatePrefix(iid,updateRecords)
 425+ && updateSpell(iid,updateRecords)
 426+ && updateDocumentsOn(iid,updateRecords,iid)
 427+ && updateDocumentsOn(iid.getHighlight(),updateRecords,iid)
 428+ && updateTitles(iid,updateRecords);
 429+ }
 430+
 431+ public boolean updateTitles(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
419432 if(iid.hasTitlesIndex())
420 - titles = updateDocumentsOn(iid.getTitlesIndex(),updateRecords,iid);
421 - return index && highlight && titles;
 433+ return updateDocumentsOn(iid.getTitlesIndex(),updateRecords,iid);
 434+ return true;
422435 }
423436
 437+ /** Update articles with latest linking & related information */
 438+ public boolean fetchLinksInfo(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 439+ try{
 440+ Links links = Links.openForRead(iid,iid.getIndexPath());
 441+ RelatedStorage related = new RelatedStorage(iid);
 442+ for(IndexUpdateRecord rec : updateRecords){
 443+ if(rec.doAdd()){
 444+ String key = rec.getNsTitleKey();
 445+ Article article = rec.getArticle();
 446+ // references, redirect status
 447+ article.setReferences(links.getNumInLinks(key));
 448+ article.setRedirectTo(links.getRedirectTarget(key));
 449+ if(article.isRedirect())
 450+ article.setRedirectTargetNamespace(links.getRedirectTargetNamespace(key));
 451+ else
 452+ article.setRedirectTargetNamespace(-1);
 453+
 454+ // redirects
 455+ ArrayList<Redirect> redirects = new ArrayList<Redirect>();
 456+ for(String rk : links.getRedirectsTo(key)){
 457+ String[] parts = rk.toString().split(":",2);
 458+ int redirectRef = links.getNumInLinks(rk);
 459+ redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef));
 460+ }
 461+ article.setRedirects(redirects);
 462+ // related
 463+ if(related != null)
 464+ article.setRelated(related.getRelated(key));
 465+ }
 466+ }
 467+ return true;
 468+ } catch(IOException e){
 469+ e.printStackTrace();
 470+ log.error("Cannot fetch links info: "+e.getMessage());
 471+ return false;
 472+ }
 473+ }
 474+
 475+ public boolean updateLinks(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 476+ try{
 477+ Links links = Links.openForModification(iid);
 478+ for(IndexUpdateRecord rec : updateRecords){
 479+ // TODO: this might do some unnecessary additions/deletions on split index architecture
 480+ if(rec.doDelete()){
 481+ links.deleteArticleInfoByIndexKey(rec.getIndexKey());
 482+ } else if(rec.doAdd()){
 483+ Article a = rec.getArticle();
 484+ links.addArticleInfo(a.getContents(),a.getTitleObject(),iid.isExactCase(),a.getIndexKey());
 485+ }
 486+ }
 487+ links.close();
 488+ return true;
 489+ } catch(IOException e){
 490+ e.printStackTrace();
 491+ log.error("Cannot update links index: "+e.getMessage());
 492+ return false;
 493+ }
 494+ }
 495+
 496+ public boolean updatePrefix(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 497+ if(!iid.hasPrefix())
 498+ return true;
 499+ try{
 500+ PrefixIndexBuilder prefix = PrefixIndexBuilder.forPrecursorModification(iid);
 501+ for(IndexUpdateRecord rec : updateRecords){
 502+ if(rec.doDelete()){
 503+ prefix.deleteFromPrecursor(rec.getIndexKey());
 504+ } else if(rec.doAdd()){
 505+ Article a = rec.getArticle();
 506+ prefix.addToPrecursor(rec.getNsTitleKey(),a.getReferences(),a.getRedirectTarget(),rec.getIndexKey());
 507+ }
 508+ }
 509+ return true;
 510+ } catch(IOException e){
 511+ e.printStackTrace();
 512+ log.error("Cannot update prefix index: "+e.getMessage());
 513+ return false;
 514+ }
 515+ }
 516+
 517+ public boolean updateSpell(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 518+ if(!iid.hasSpell())
 519+ return true;
 520+ try{
 521+ CleanIndexWriter writer = CleanIndexWriter.newForModification(iid);
 522+ for(IndexUpdateRecord rec : updateRecords){
 523+ if(rec.doDelete()){
 524+ writer.deleteArticleInfo(rec.getIndexKey());
 525+ } else if(rec.doAdd()){
 526+ writer.addArticleInfo(rec.getArticle());
 527+ }
 528+ }
 529+ return true;
 530+ } catch(IOException e){
 531+ e.printStackTrace();
 532+ log.error("Cannot update spellcheck index: "+e.getMessage());
 533+ return false;
 534+ }
 535+ }
 536+
424537 /**
425538 * Update all documents in the collection. If needed the request
426539 * is forwarded to a remote object (i.e. if the part of the split
@@ -518,7 +631,7 @@
519632 transformArticleForIndexing(article);
520633
521634 // page_id from database, used to look up and replace entries on index updates
522 - doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
 635+ doc.add(new Field("key", article.getIndexKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
523636
524637 // namespace, returned with results
525638 doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED));
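Since the sub-updates in the new updateDocuments are chained with &&, Java's short-circuit evaluation stops the pipeline at the first stage that returns false, so downstream indexes are never built from data a failed stage did not produce. Restated as a sketch with the ordering rationale spelled out:

// Order matters: the link graph is written first because fetchLinksInfo
// reads it back to enrich articles, and the precursor indexes (prefix,
// spell) are fed before the main, highlight and titles indexes.
public boolean updateAll(IndexId iid, Collection<IndexUpdateRecord> recs) {
    return updateLinks(iid, recs)
        && fetchLinksInfo(iid, recs)   // reads what updateLinks just wrote
        && updatePrefix(iid, recs)
        && updateSpell(iid, recs)
        && updateDocumentsOn(iid, recs, iid)
        && updateDocumentsOn(iid.getHighlight(), recs, iid)
        && updateTitles(iid, recs);    // no-op unless iid has a titles index
}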
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WordNet.java
@@ -6,6 +6,7 @@
77 import java.util.ArrayList;
88 import java.util.Arrays;
99 import java.util.HashMap;
 10+import java.util.HashSet;
1011 import java.util.List;
1112 import java.util.zip.GZIPInputStream;
1213
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -564,7 +564,8 @@
565565 c = text[cur];
566566 else break;
567567 }
568 - cur--; // we moved to next legal char
 568+ if(!noTrailing)
 569+ cur--; // we moved to next legal char
569570 }
570571
571572 addToken(noTrailing);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/StopWords.java
@@ -88,11 +88,11 @@
8989 return ret;
9090 }
9191
92 	 - /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */
93 	 - public static HashSet<String> getPredefinedSet(IndexId iid){
 	92+ /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */
 	93+ public static HashSet<String> getPredefinedSet(String langCode){
9494 loadPredefined();
9595 HashSet<String> ret = new HashSet<String>();
96 - HashSet<String> cached = cachePredefined.get(iid.getLangCode());
 96+ HashSet<String> cached = cachePredefined.get(langCode);
9797 if(cached != null){
9898 synchronized(cached){
9999 ret.addAll(cached);
@@ -100,6 +100,9 @@
101101 }
102102 return ret;
103103 }
 104+ public static HashSet<String> getPredefinedSet(IndexId iid){
 105+ return getPredefinedSet(iid.getLangCode());
 106+ }
104107
105108 protected static void loadPredefined(){
106109 if(loadedPredefined)
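Both entry points above now share one cache keyed by language code. A hypothetical usage sketch:

// Same cached word list either way; the IndexId overload just resolves
// the language code first.
HashSet<String> direct = StopWords.getPredefinedSet("en");
HashSet<String> viaIid = StopWords.getPredefinedSet(IndexId.get("enwiki"));

// Each call returns a brand new HashSet, so callers can mutate their
// copy without corrupting the shared cache.
direct.add("customstop");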
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -20,6 +20,7 @@
2121 import org.apache.lucene.search.BooleanClause;
2222 import org.apache.lucene.search.BooleanQuery;
2323 import org.apache.lucene.search.LogTransformScore;
 24+import org.apache.lucene.search.MultiPhraseQuery;
2425 import org.apache.lucene.search.PhraseQuery;
2526 import org.apache.lucene.search.PositionalMultiQuery;
2627 import org.apache.lucene.search.PositionalOptions;
@@ -340,24 +341,10 @@
341342 c = text[cur];
342343 if(c == '"'){
343344 inPhrase = !inPhrase;
344 - if(inPhrase)
345 - length = 0;
346 - else{ // end of phrase
347 - int start = cur - length;
348 - analyzeBuffer();
349 - for(Token t : tokens){
350 - if(t.getPositionIncrement() > 0)
351 - ret.add(new Token(t.termText(),start+t.startOffset(),start+t.endOffset(),"phrase"));
352 - }
353 - }
354345 }
355346
356 - if(inPhrase){
357 - buffer[length++] = c;
358 - continue;
359 - }
360 -
361 - if(c == ')'){
 347+ if(inPhrase);
 348+ else if(c == ')'){
362349 level--;
363350 if(level < fieldLevel)
364351 fieldLevel = -1;
@@ -368,9 +355,9 @@
369356 } else if(fieldLevel != -1 && level>fieldLevel)
370357 continue;
371358
372 - if(Character.isLetterOrDigit(c) || c=='?' || c=='*' || c=='~'){
 359+ if(isTermChar(c)){
373360 int start = cur;
374 - tokenType = fetchToken();
 361+ tokenType = fetchToken(inPhrase);
375362 if(tokenType == TokenType.WORD && (start==0 || text[start-1]!='-')){
376363 String type = "word";
377364 if(bufferIsWildCard())
@@ -384,7 +371,7 @@
385372 }
386373 }
387374 }
388 - } else if(c == '['){
 375+ } else if(c == '[' && !inPhrase){
389376 fetchGenericPrefix();
390377 }
391378 }
@@ -420,12 +407,19 @@
421408 return defaultNamespaceFilter;
422409 }
423410
 411+ private final boolean isTermChar(char ch){
 412+ return !Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != ',' && ch != ';' && ch != '"';
 413+ }
 414+
424415 /**
425416 * Fetch token into <code>buffer</code> starting from current position (<code>cur</code>)
426417 *
427418 * @return type of the token in buffer
428419 */
429420 private TokenType fetchToken(){
 421+ return fetchToken(false);
 422+ }
 423+ private TokenType fetchToken(boolean termOnly){
430424 char ch;
431425 prev_cur = cur;
432426 for(length = 0; cur < queryLength; cur++){
@@ -434,7 +428,7 @@
435429 continue; // ignore whitespaces
436430
437431 			// pluses, minuses and underscores can be within words (to keep them from being misinterpreted); *,? are for wildcard queries
438 - if(!Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != ',' && ch != ';' && ch != '"'){
 432+ if(isTermChar(ch)){
439433 if(length<buffer.length)
440434 buffer[length++] = ch;
441435 } else{
@@ -445,6 +439,9 @@
446440 if(length == 0)
447441 return TokenType.EOF;
448442
 443+ if(termOnly)
 444+ return TokenType.WORD;
 445+
449446 // check for keywords
450447 if(length == 3 && buffer[0]=='A' && buffer[1]=='N' && buffer[2]=='D')
451448 return TokenType.AND;
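Factoring the character test into isTermChar() lets fetchToken() and the new in-phrase scanning agree on term boundaries, and the termOnly flag disables keyword detection so that a literal "AND" inside quotes stays an ordinary word. A standalone restatement of the same character class (a sketch, not the parser itself):

import java.util.ArrayList;
import java.util.List;

class PhraseScanSketch {
    static boolean isTermChar(char ch) {
        return !Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')'
            && ch != '[' && ch != ']' && ch != ',' && ch != ';' && ch != '"';
    }

    // Split a quoted phrase body into raw terms; wildcard/fuzzy markers
    // like '*' and '~' survive as part of the term, which is what allows
    // their expansion inside phrases further down.
    static List<String> phraseTerms(String phrase) {
        List<String> terms = new ArrayList<String>();
        StringBuilder b = new StringBuilder();
        for (int i = 0; i < phrase.length(); i++) {
            char ch = phrase.charAt(i);
            if (isTermChar(ch)) b.append(ch);
            else if (b.length() > 0) { terms.add(b.toString()); b.setLength(0); }
        }
        if (b.length() > 0) terms.add(b.toString());
        return terms;
    }
}
// phraseTerms("douglas adams~ guide*") -> [douglas, adams~, guide*]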
@@ -553,34 +550,80 @@
554551 *
555552 * @return a query, or null if the query is empty
556553 */
557 - private PhraseQuery parsePhrase(){
558 - PhraseQuery query = null;
559 -
560 - length = 0;
 554+ private Query parsePhrase(){
 555+ // special case for incategory
 556+ if(currentField!=null && currentField.equals("incategory")){
 557+ for(; cur < queryLength ; cur++ ){
 558+ if(text[cur] == '"')
 559+ break;
 560+ else if(length < buffer.length)
 561+ buffer[length++] = text[cur];
 562+ }
 563+ if(length > 0){
 564+ // no tokenization, we want whole category name
 565+ return new TermQuery(makeTerm());
 566+ }
 567+ return null;
 568+ }
 569+ //PositionalMultiQuery query = new PositionalMultiQuery(new PositionalOptions.PhraseQueryFallback());
 570+ MultiPhraseQuery query = new MultiPhraseQuery();
561571 for(; cur < queryLength ; cur++ ){
 572+ length = 0;
 573+ // fetch next word
 574+ while(cur<queryLength && isTermChar(text[cur]) && length<buffer.length){
 575+ buffer[length++] = text[cur++];
 576+ }
 577+
 578+ // add to phrase
 579+ if(length > 0){
 580+ boolean added = false;
 581+ if(bufferIsWildCard()){
 582+ Term term = makeTerm();
 583+ Term[] terms = wildcards.makeTerms(term.text(),term.field());
 584+ if(terms != null){
 585+ query.add(terms);
 586+ ArrayList<String> words = wildcards.getWords(term.text());
 587+ expandedWordsFromParser.add(words);
 588+ expandedTypesFromParser.add(ExpandedType.WILDCARD);
 589+ ArrayList<Float> boosts = new ArrayList<Float>();
 590+ for(int i=0;i<words.size();i++) boosts.add(1f);
 591+ expandedBoostFromParser.add(boosts);
 592+ added = true;
 593+ }
 594+ }
 595+ if(bufferIsFuzzy()){
 596+ Term term = makeTerm();
 597+ NamespaceFilter nsf = getNamespaceFilter(currentField);
 598+ Term[] terms = fuzzy.makeTerms(term.text(),term.field(),nsf);
 599+ if(terms != null){
 600+ //query.add(terms,fuzzy.getBoosts(term.text(),nsf,terms));
 601+ query.add(terms);
 602+ ArrayList<String> words = fuzzy.getWords(term.text(),nsf);
 603+ expandedWordsFromParser.add(words);
 604+ expandedTypesFromParser.add(ExpandedType.FUZZY);
 605+ expandedBoostFromParser.add(fuzzy.getBoosts(term.text(),nsf,words));
 606+ added = true;
 607+ }
 608+ }
 609+ if(!added){
 610+ // fallback to ordinary words
 611+ analyzeBuffer();
 612+ for(Token token : tokens){
 613+ if(token.getPositionIncrement()>0){ // ignore aliases and stemmed words
 614+ Term t = makeTerm(token);
 615+ addToWords(t.text(),1,ExpandedType.PHRASE);
 616+ query.add(t);
 617+ }
 618+ }
 619+ }
 620+ }
562621 // end of phrase query
563622 if(text[cur] == '"')
564623 break;
565 - else if(length < buffer.length)
566 - buffer[length++] = text[cur];
567624 }
568 - if(length != 0){
569 - query = new PhraseQuery();
570 - // if it's a category don't tokenize it, we want whole category name
571 - if(currentField!=null && currentField.equals("incategory"))
572 - query.add(makeTerm());
573 - else{
574 - analyzeBuffer();
575 - for(Token token : tokens){
576 - if(token.getPositionIncrement()>0){ // ignore aliases and stemmed words
577 - Term t = makeTerm(token);
578 - addToWords(t.text(),1,ExpandedType.PHRASE);
579 - query.add(t);
580 - }
581 - }
582 - query.setBoost(defaultBoost);
583 - }
584 - return query;
 625+ if(query.getPositions().length > 0){
 626+ query.setBoost(defaultBoost);
 627+ return query;
585628 } else
586629 return null;
587630 }
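This rewrite is what implements "wildcards/fuzzy can now be within phrases" from the commit summary: MultiPhraseQuery accepts several terms per position, so an expanded wildcard or fuzzy word occupies a single phrase slot. A minimal sketch against the stock Lucene API, with the expansions hardcoded instead of coming from the wildcards/fuzzy generators:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiPhraseQuery;

MultiPhraseQuery query = new MultiPhraseQuery();
// position 0: one ordinary term
query.add(new Term("contents", "douglas"));
// position 1: every expansion of "adam*" shares the slot
query.add(new Term[] {
    new Term("contents", "adams"),
    new Term("contents", "adamson"),
});
// matches "douglas adams ..." as well as "douglas adamson ..."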
@@ -1115,20 +1158,22 @@
11161159 full.add(additional,Occur.SHOULD);
11171160
11181161 // redirect match (when redirect is not contained in contents or title)
1119 - Query redirects = makeAlttitleForRedirects(words,20,1);
1120 - if(redirects != null)
1121 - full.add(redirects,Occur.SHOULD);
1122 - if(singularWords != null){
1123 - Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f);
1124 - if(redirectsSing != null)
1125 - full.add(redirectsSing,Occur.SHOULD);
1126 - }
11271162 if(hasWildcards() || hasFuzzy()){
11281163 Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f);
11291164 if(redirectsMulti != null)
11301165 full.add(redirectsMulti,Occur.SHOULD);
 1166+ } else{
 1167+ Query redirects = makeAlttitleForRedirects(words,20,1);
 1168+ if(redirects != null)
 1169+ full.add(redirects,Occur.SHOULD);
 1170+ if(singularWords != null){
 1171+ Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f);
 1172+ if(redirectsSing != null)
 1173+ full.add(redirectsSing,Occur.SHOULD);
 1174+ }
11311175 }
11321176
 1177+
11331178 BooleanQuery wrap = new BooleanQuery(true);
11341179 wrap.add(full,Occur.SHOULD);
11351180 wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD);
@@ -1324,7 +1369,7 @@
13251370 Query main = null;
13261371
13271372 // all words as entered into the query
1328 - PositionalQuery exact = makePositional(words,fields.contents(),new PositionalOptions.Exact(),0,1);
 1373+ Query exact = makePositionalMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,fields.contents(),new PositionalOptions.Exact(),0,1);
13291374 // words + stemmed + singulars + transliterations + wildcards + fuzzy - with slop factor
13301375 Query sloppy = makePositionalMulti(expandedWordsContents,expandedBoostContents,expandedTypes,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1,false);
13311376
@@ -1335,7 +1380,7 @@
13361381 ArrayList<ArrayList<String>> wordnet = WordNet.replaceOne(words,iid.getLangCode());
13371382
13381383 BooleanQuery combined = new BooleanQuery(true);
1339 - if(exact!=null && exact.getTerms().length > 0)
 1384+ if(exact!=null)
13401385 combined.add(exact,Occur.SHOULD);
13411386 // combined various queries into mainphrase
13421387 if(sloppy != null){
@@ -1343,7 +1388,8 @@
13441389 // wordnet
13451390 if(wordnet != null){
13461391 for(ArrayList<String> wnwords : wordnet){
1347 - combined.add(makePositional(wnwords,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1),Occur.SHOULD);
 1392+ if(!allStopWords(wnwords))
 1393+ combined.add(makePositional(wnwords,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1),Occur.SHOULD);
13481394 }
13491395 }
13501396 }
@@ -1367,7 +1413,8 @@
13681414 ArrayList<Query> altAdd = new ArrayList<Query>();
13691415 if(wordnet!=null)
13701416 for(ArrayList<String> wnwords : wordnet)
1371 - altAdd.add(makeAlttitleRelevance(wnwords,RELEVANCE_ALTTITLE_BOOST));
 1417+ if(!allStopWords(wnwords))
 1418+ altAdd.add(makeAlttitleRelevance(wnwords,RELEVANCE_ALTTITLE_BOOST));
13721419 alttitle = simplify(combine(alttitle,altAdd));
13731420
13741421 // relevance: related
@@ -1375,7 +1422,8 @@
13761423 ArrayList<Query> relAdd = new ArrayList<Query>();
13771424 if(wordnet!=null)
13781425 for(ArrayList<String> wnwords : wordnet)
1379 - relAdd.add(makeRelatedRelevance(wnwords,RELEVANCE_RELATED_BOOST));
 1426+ if(!allStopWords(wnwords))
 1427+ relAdd.add(makeRelatedRelevance(wnwords,RELEVANCE_RELATED_BOOST));
13801428 related = simplify(combine(related,relAdd));
13811429
13821430 BooleanQuery relevances = new BooleanQuery(true);
@@ -1546,8 +1594,11 @@
15471595 // add the whole-only query
15481596 if(whole != null)
15491597 bq.add(makePositional(words,field,whole,slop,1),Occur.SHOULD);
1550 - if(wholeSloppy != null)
1551 - bq.add(makePositional(words,field,wholeSloppy,slop,1,false),Occur.SHOULD);
 1598+ if(wholeSloppy != null){
 1599+ Query ws = makePositional(words,field,wholeSloppy,slop,1,false);
 1600+ if(ws != null)
 1601+ bq.add(ws,Occur.SHOULD);
 1602+ }
15521603 bq.setBoost(boost);
15531604
15541605 return bq;
@@ -1600,8 +1651,11 @@
16011652 // add the whole-only query
16021653 if(whole != null)
16031654 bq.add(makePositionalMulti(words,boosts,types,field,whole,slop,1),Occur.SHOULD);
1604 - if(wholeSloppy != null)
1605 - bq.add(makePositionalMulti(words,boosts,types,field,wholeSloppy,slop,0.5f,false),Occur.SHOULD);
 1655+ if(wholeSloppy != null){
 1656+ Query ws = makePositionalMulti(words,boosts,types,field,wholeSloppy,slop,0.5f,false);
 1657+ if(ws != null)
 1658+ bq.add(ws,Occur.SHOULD);
 1659+ }
16061660 bq.setBoost(boost);
16071661
16081662 return bq;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java
@@ -56,12 +56,11 @@
5757 }
5858 }
5959
60 - public static class HighlightOriginal extends TokenizerOptions {
 	60+	/** Used for titles, doesn't simplify glue and has no case detection */
 61+ public static class HighlightOriginal extends Highlight {
6162 public HighlightOriginal(){
62 - super(false);
63 - this.highlightParsing = true;
64 - this.relocationParsing = false;
6563 this.simplifyGlue = false;
 64+ this.noCaseDetection = true;
6665 }
6766 }
6867 /** Used to filter prefixes (up to FastWikiTokenizer.MAX_WORD_LEN chars) */
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java
@@ -37,7 +37,7 @@
3838 /** Invoke ListRecords from a certain timestamp */
3939 public ArrayList<IndexUpdateRecord> getRecords(String from){
4040 try{
41 - read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&from="+from));
 41+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=mediawiki&from="+from));
4242 return collector.getRecords();
4343 } catch(IOException e){
4444 log.warn("I/O exception listing records: "+e.getMessage());
@@ -58,7 +58,7 @@
5959 /** Invoke ListRecords using the last resumption token */
6060 public ArrayList<IndexUpdateRecord> getMoreRecords(){
6161 try{
62 - read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken));
 62+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=mediawiki&resumptionToken="+resumptionToken));
6363 return collector.getRecords();
6464 } catch(IOException e){
6565 log.warn("I/O exception listing records: "+e.getMessage());
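With the prefix change above the harvester requests the standard mediawiki metadata format instead of the old lsearch one; the two request shapes it issues are otherwise unchanged (urlbase comes from configuration, the timestamp value here is illustrative):

// first page of a harvest, bounded by a from-timestamp
URL first = new URL(urlbase + "&verb=ListRecords&metadataPrefix=mediawiki&from=2001-01-01");
// subsequent pages, continued via the server-issued resumption token
URL next = new URL(urlbase + "&verb=ListRecords&metadataPrefix=mediawiki&resumptionToken=" + resumptionToken);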
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -35,7 +35,7 @@
3636
3737 public IndexUpdatesCollector(IndexId iid){
3838 this.iid = iid;
39 - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 39+ this.langCode = iid.getLangCode();
4040 }
4141
4242 public void addRedirect(String redirectTitle, int references) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -93,7 +93,6 @@
9494 HashSet<String> excludeList = new HashSet<String>();
9595 HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass
9696 String defaultTimestamp = "2001-01-01";
97 - boolean fetchReferences = true;
9897 // args
9998 for(int i=0; i<args.length; i++){
10099 if(args[i].equals("-d"))
@@ -110,8 +109,6 @@
111110 excludeList.add(args[++i]);
112111 else if(args[i].equals("-n"))
113112 notification = true;
114 - else if(args[i].equals("--no-ranks"))
115 - fetchReferences = false;
116113 else if(args[i].equals("--help"))
117114 break;
118115 else if(args[i].startsWith("-")){
@@ -145,7 +142,6 @@
146143 System.out.println(" -f - dblist file, one dbname per line");
147144 System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")");
148145 System.out.println(" -e - exclude dbname from incremental updates (overrides -f)");
149 - System.out.println(" --no-ranks - don't try to fetch any article rank data");
150146 return;
151147 }
152148 // config
@@ -190,22 +186,8 @@
191187 ArrayList<IndexUpdateRecord> records = harvester.getRecords(from);
192188 if(records.size() == 0)
193189 continue;
194 - LinkAnalysisStorage las = new LinkAnalysisStorage(iid);
195 - RelatedStorage related = new RelatedStorage(iid);
196190 boolean hasMore = false;
197191 do{
198 - if(fetchReferences){
199 - try{
200 - // fetch references for records
201 - fetchReferencesAndRelated(records,las,related);
202 - } catch(IOException e){
203 - // FIXME: quick hack, if the table cannot be found (e.g. for new wikis) don't abort
204 - if(e.getMessage().contains("Base table or view not found")){
205 - log.warn("Continuing, but could not fetch references for "+iid+": "+e.getMessage());
206 - } else
207 - throw e;
208 - }
209 - }
210192 for(IndexUpdateRecord rec : records){
211193 Article ar = rec.getArticle();
212194 log.info("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects());
@@ -287,62 +269,5 @@
288270 }
289271 } while(daemon);
290272 }
291 -
292 - protected static void fetchReferencesAndRelated(ArrayList<IndexUpdateRecord> records, LinkAnalysisStorage las, RelatedStorage related) throws IOException {
293 - ArrayList<Title> titles = new ArrayList<Title>();
294 - for(IndexUpdateRecord rec : records){
295 - if(rec.isDelete())
296 - continue;
297 - Article ar = rec.getArticle();
298 - titles.add(ar.makeTitle());
299 - if(ar.getRedirects() != null){
300 - for(Redirect r : ar.getRedirects()){
301 - titles.add(r.makeTitle());
302 - }
303 - }
304 - }
305 - // fetch
306 - //OldLinks links = new OldLinks(store.getPageReferences(titles,dbname));
307 - //HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname);
308 - // update
309 - // FIXME: wow, this is BCE ...
310 - for(IndexUpdateRecord rec : records){
311 - if(rec.isDelete())
312 - continue;
313 - Article ar = rec.getArticle();
314 - Title t = ar.makeTitle();
315 - ArticleAnalytics aa = las.getAnaliticsForArticle(t.getKey());
316 - ArrayList<String> anchors = new ArrayList<String>();
317 - anchors.addAll(aa.getAnchorText());
318 - // set references
319 - ar.setReferences(aa.getReferences());
320 - //ar.setRedirect(aa.isRedirect());
321 - if(aa.isRedirect())
322 - ar.setRedirectTargetNamespace(aa.getRedirectTargetNamespace());
323 - if(ar.getRedirects() != null){
324 - for(Redirect r : ar.getRedirects()){
325 - ArticleAnalytics raa = las.getAnaliticsForReferences(r.makeTitle().getKey());
326 - r.setReferences(raa.getReferences());
327 - anchors.addAll(raa.getAnchorText());
328 - }
329 - }
330 - // set anchors
331 - ar.setAnchorText(anchors);
332 - // set related
333 - if(related.canRead())
334 - ar.setRelated(related.getRelated(t.getKey()));
335 - /*ArrayList<RelatedTitle> rt = rel.get(t.getKey());
336 - if(rt != null){
337 - Collections.sort(rt,new Comparator<RelatedTitle>() {
338 - public int compare(RelatedTitle o1, RelatedTitle o2){
339 - double d = o2.getScore()-o1.getScore();
340 - if(d == 0) return 0;
341 - else if(d > 0) return 1;
342 - else return -1;
343 - }
344 - });
345 - ar.setRelated(rt);
346 - }*/
347 - }
348 - }
349 -}
 273+
 274+}
\ No newline at end of file
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java
@@ -46,7 +46,7 @@
4747 langCode = "en";
4848 this.langCode = langCode;
4949 this.iid = iid;
50 - this.exactCase = iid.getExactCase();
 50+ this.exactCase = iid.isExactCase();
5151 interwiki = Localization.getInterwiki();
5252 }
5353 public void writeRevision(Revision revision) throws IOException {
@@ -58,7 +58,7 @@
5959 public void writeEndPage() throws IOException {
6060 Title t = new Title(page.Title.Namespace,page.Title.Text);
6161 try{
62 - links.addArticleInfo(revision.Text,t,exactCase);
 62+ links.addArticleInfo(revision.Text,t,exactCase,Integer.toString(page.Id));
6363 } catch(Exception e){
6464 log.error("Error adding article "+t+" : "+e.getMessage());
6565 e.printStackTrace();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -70,16 +70,16 @@
7171 protected Directory directory = null;
7272 protected NamespaceFilter nsf; // default search
7373 protected ObjectCache cache;
74 - //protected ObjectCache refCache;
7574 protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly;
7675 protected boolean optimized = false;
 76+ protected boolean autoOptimize = false;
7777
78 - private Links(IndexId iid, String path, IndexWriter writer) throws CorruptIndexException, IOException{
 78+ private Links(IndexId iid, String path, IndexWriter writer, boolean autoOptimize) throws CorruptIndexException, IOException{
7979 this.writer = writer;
8080 this.path = path;
8181 this.iid = iid;
82 - GlobalConfiguration global = GlobalConfiguration.getInstance();
83 - this.langCode = global.getLanguage(iid);
 82+ this.autoOptimize = autoOptimize;
 83+ this.langCode = iid.getLangCode();
8484 String dbname = iid.getDBname();
8585 nsmap = Localization.getLocalizedNamespaces(langCode,dbname);
8686 interwiki = Localization.getInterwiki();
@@ -87,13 +87,7 @@
8888 imageLocalized = Localization.getLocalizedImage(langCode,dbname);
8989 state = State.FLUSHED;
9090 initWriter(writer);
91 - //reader = IndexReader.open(path);
92 - nsf = global.getDefaultNamespace(iid);
93 - cache = new ObjectCache(10000);
94 - // init cache manager
95 - /*CacheManager manager = CacheManager.create();
96 - cache = new Cache("links", 5000, false, false, 5, 2);
97 - manager.addCache(cache); */
 91+ nsf = iid.getDefaultNamespace();
9892 keyOnly = makeSelector("article_key");
9993 redirectOnly = makeSelector("redirect");
10094 contextOnly = makeSelector("context");
@@ -122,7 +116,7 @@
123117 String path = iid.getIndexPath();
124118 log.info("Using index at "+path);
125119 IndexWriter writer = WikiIndexModifier.openForWrite(path,false);
126 - return new Links(iid,path,writer);
 120+ return new Links(iid,path,writer,false);
127121 }
128122
129123 public static Links openStandalone(IndexId iid) throws IOException {
@@ -138,7 +132,7 @@
139133 public static Links openForRead(IndexId iid, String path) throws IOException {
140134 iid = iid.getLinks();
141135 log.info("Opening for read "+path);
142 - return new Links(iid,path,null);
 136+ return new Links(iid,path,null,true);
143137 }
144138
145139 /** Create new in the import path */
@@ -147,7 +141,7 @@
148142 String path = iid.getImportPath();
149143 log.info("Making index at "+path);
150144 IndexWriter writer = WikiIndexModifier.openForWrite(path,true);
151 - Links links = new Links(iid,path,writer);
 145+ Links links = new Links(iid,path,writer,true);
152146 return links;
153147 }
154148
@@ -156,7 +150,7 @@
157151 iid = iid.getLinks();
158152 log.info("Making index in memory");
159153 IndexWriter writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true);
160 - Links links = new Links(iid,null,writer);
 154+ Links links = new Links(iid,null,writer,true);
161155 return links;
162156 }
163157
@@ -172,7 +166,7 @@
173167 nsmap.put(namespace.toLowerCase(),index);
174168 }
175169
176 - /** Write all changes, optimize/close everything
 170+ /** Write all changes, optimize if in autoOptimize mode
177171 * @throws IOException */
178172 public void flush() throws IOException{
179173 // close & optimize
@@ -181,7 +175,8 @@
182176 if(reader != null)
183177 reader.close();
184178 if(writer != null){
185 - writer.optimize();
 179+ if(autoOptimize)
 180+ writer.optimize();
186181 writer.close();
187182 }
188183 state = State.FLUSHED;
@@ -193,15 +188,7 @@
194189 * @throws IOException
195190 */
196191 protected void flushForRead() throws IOException{
197 - // close & optimize
198 - if(searcher != null)
199 - searcher.close();
200 - if(reader != null)
201 - reader.close();
202 - if(writer != null){
203 - writer.optimize();
204 - writer.close();
205 - }
 192+ flush();
206193 log.debug("Opening index reader");
207194 // reopen
208195 reader = IndexReader.open(path);
@@ -238,27 +225,28 @@
239226 openForWrite();
240227 }
241228
242 - /** Modify existing article links info */
243 - public void modifyArticleInfo(String text, Title t, boolean exactCase) throws IOException{
 229+ /** Delete article info connected to title t */
 230+ public void deleteArticleInfo(Title t) throws IOException {
244231 ensureWrite();
245232 writer.deleteDocuments(new Term("article_key",t.getKey()));
246 - addArticleInfo(text,t,exactCase);
247233 }
 234+ /** Delete by page_id, not ns:title key */
 235+ public void deleteArticleInfoByIndexKey(String key) throws IOException {
 236+ ensureWrite();
 237+ writer.deleteDocuments(new Term("article_pageid",key));
 238+ }
248239
249240 /** Add links and other info from article
250241 * @throws IOException */
251 - public void addArticleInfo(String text, Title t, boolean exactCase) throws IOException{
 242+ public void addArticleInfo(String text, Title t, boolean exactCase, String pageId) throws IOException{
252243 ensureWrite();
253244 Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
254245 int namespace = t.getNamespace();
255246 Matcher matcher = linkPat.matcher(text);
256247 int ns; String title;
257248 boolean escaped;
258 - //PrefixAnalyzer prefixAnalyzer = new PrefixAnalyzer();
259249
260250 ArrayList<String> pagelinks = new ArrayList<String>();
261 - // article link -> contexts
262 - //HashMap<String,ArrayList<String>> contextMap = new HashMap<String,ArrayList<String>>();
263251
264252 // use context only for namespace in default search
265253 boolean useContext = nsf.contains(t.getNamespace());
@@ -339,6 +327,7 @@
340328 StringList lk = new StringList(pagelinks);
341329 Analyzer an = new SplitAnalyzer(1,true);
342330 Document doc = new Document();
 331+ doc.add(new Field("article_pageid",pageId,Field.Store.YES,Field.Index.UN_TOKENIZED));
343332 // ns:title
344333 doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
345334 if(redirectsTo != null)
@@ -348,8 +337,6 @@
349338 // a list of all links
350339 doc.add(new Field("links",lk.toString(),Field.Store.NO,Field.Index.TOKENIZED));
351340 }
352 - // key split up into prefixes (for prefix index)
353 - // doc.add(new Field("prefix",prefixAnalyzer.tokenStream("prefix",t.getKey())));
354341
355342 writer.addDocument(doc,an);
356343 state = State.MODIFIED;
@@ -430,18 +417,17 @@
431418 }
432419 return false;
433420 }
434 -
435 - @Deprecated
436 - /** If article is redirect, get target, else null */
437 - public String getRedirectTargetOld(String key) throws IOException{
 421+
 422+ /** Get page_id for ns:title */
 423+ public String getPageId(String key) throws IOException {
438424 ensureRead();
439425 TermDocs td = reader.termDocs(new Term("article_key",key));
440426 if(td.next()){
441 - return reader.document(td.doc(),redirectOnly).get("redirect");
 427+ return reader.document(td.doc()).get("article_pageid");
442428 }
443429 return null;
444430 }
445 -
 431+
446432 /** If article is redirect, get target key, else null */
447433 public String getRedirectTarget(String key) throws IOException{
448434 ensureRead();
@@ -637,19 +623,16 @@
638624 writer.close();
639625 if(reader != null)
640626 reader.close();
641 - if(directory != null)
642 - directory.close();
 627+ //if(directory != null)
 628+ // directory.close();
643629 }
644630
645631 public ObjectCache getCache() {
646632 return cache;
647633 }
648634
649 - /*public ObjectCache getRefCache() {
650 - return refCache;
651 - } */
652 -
653 -
654 -
655 -
 635+ public boolean isAutoOptimize() {
 636+ return autoOptimize;
 637+ }
 638+
656639 }
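The new article_pageid field is what makes incremental links updates possible: deletions go by page_id, which survives renames, while each added document stores both keys. One update cycle, sketched using only methods visible in this diff (the page_id value is illustrative):

Links links = Links.openForModification(iid);
// drop stale link info for the edited or deleted page
links.deleteArticleInfoByIndexKey("12345");
// re-extract links from the new revision text
links.addArticleInfo(text, title, iid.isExactCase(), "12345");
links.close();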
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -72,8 +72,10 @@
7373 Configuration.open();
7474 String text = "bre! (ant) and some. it's stupid it's something and 5\"6' or more, links abacus";
7575 showTokens(text);
76 - text = "bre! u.s. {{template|text}} {{template|text2|text3}} [http://ls2.wiki link]";
 76+ text = "This, is a '''list of [[African]] countries and dependencies by [[population]]'''.\n\n{| border=\"1\" cellpadding=\"2\" cellspacing=\"0\" style=\"border-collapse:collapse; text-align:right;\"\n|- style=\"text-align:center; background:#efefef\"\n!Pos !! Country !! Population\n|-\n| align=\"left\" |-\n| align=\"left\" |'''Africa''' || 934,283,426\n|-\n";
7777 showTokens(text);
 78+ text = "u.s. {{template|text}} {{template|text2|text3}} [http://ls2.wiki link]";
 79+ showTokens(text);
7880 text = "Good-Thomas C# C++ and so on.. ";
7981 showTokens(text);
8082 text = "[[Image:Argishti monument.JPG|thumb|King Argishti of Urartu riding a chariot with two horses in Yerevan, Armenia in front of the Erebuni Museum.]]'''Urartu''' (Assyrian ''Urarṭu'', [[Urartian language|Urartian]] ''Biainili'') was an ancient [[kingdom (politics)|kingdom]] of [[Armenia]]&lt;ref&gt;&quot;Urartu.&quot; Columbia Electronic Encyclopedia. Columbia University Press.&lt;/ref&gt; located in the mountainous plateau between [[Asia Minor]], [[Mesopotamia]], and [[Caucasus mountains]], later known as the [[Armenian Highland]], and it centered around [[Lake Van]] (present-day eastern [[Turkey]]). The kingdom existed from ca. [[860s BC|860 BC]], emerging from Late Bronze Age [[Nairi]] polities, until [[585 BC]]. The name corresponds to the [[Bible|Biblical]] '''[[Mount Ararat|Ararat]]'''.";
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java
@@ -113,6 +113,7 @@
114114 {"ommmmmmmmiteed", "omitted"},
115115 {"ommmmmmmmitted", "omitted"},
116116 {"a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",""},
 117+ {"Douglas Adams's Guide to The Hitch-Hiker's Guide to the Galaxy",""},
117118
118119 };
119120
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java
@@ -48,7 +48,7 @@
4949 && res.get(1).getWord().equals(m[1]))
5050 good++;
5151 else if(r.getDist() > 1){
52 - SuggestResult split = sc.suggestSplit(m[0],0);
 52+ SuggestResult split = sc.suggestSplit(m[0],null);
5353 if(split!=null && m[1].equals(split.getWord()))
5454 good++;
5555 else{
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java
@@ -104,10 +104,10 @@
105105 Analyzer analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("enwiki"));
106106 //Analyzer analyzer = Analyzers.getHighlightAnalyzer(IndexId.get("enwiki"));
107107 Analyzer old = new EnglishAnalyzer();
108 - String text = "a-b compatibly compatible Gödel; The who is a band. The who is Pascal's earliest work was in the natural and applied sciences where he made important contributions to the construction of mechanical calculators, the study of fluids, and clarified the concepts of pressure and vacuum by generalizing the work of Evangelista Torricelli. Pascal also wrote powerfully in defense of the scientific method.";
 108+ String text = "Pokémons a-b compatibly compatible Gödel; The who is a band. The who is Pascal's earliest work was in the natural and applied sciences where he made important contributions to the construction of mechanical calculators, the study of fluids, and clarified the concepts of pressure and vacuum by generalizing the work of Evangelista Torricelli. Pascal also wrote powerfully in defense of the scientific method.";
109109 displayTokens(analyzer,text);
110110 displayTokens(old,text);
111 - text = "links abacus something aries douglas adams boxes bands working s and Frame semantics (linguistics)";
 111+ text = "Pokémons links abacus something aries douglas adams boxes bands working s and Frame semantics (linguistics)";
112112 displayTokens(analyzer,text);
113113 text = "Thomas c# c++ good-thomas Good-Thomas rats RATS Frame semantics (linguistics) 16th century sixteenth .fr web.fr other";
114114 displayTokens(analyzer,text);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -318,7 +318,7 @@
319319 assertTrue(ents1.isTitlesBySuffix());
320320 assertEquals("w",ents1.getInterwikiBySuffix("wiki"));
321321 assertEquals(ents1,IndexId.get("enwiki").getTitlesIndex());
322 - assertEquals("en",testgc.getLanguage(ents1));
 322+ assertEquals("en",ents1.getLangCode());
323323 assertEquals("{wiki=enwiki}",ents1.getSuffixToDbname().toString());
324324 IndexId ents2 = IndexId.get("en-titles.tspart2");
325325 assertEquals("{wikisource=enwikisource, wiktionary=enwiktionary, test=entest}",ents2.getSuffixToDbname().toString());
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/HighlightTest.java
@@ -49,7 +49,7 @@
5050 Term[] terms = termSet.toArray(new Term[] {});
5151 IndexSearcher searcher = SearcherCache.getInstance().getLocalSearcher(iid);
5252 int[] df = searcher.docFreqs(terms);
53 - Highlight.highlight(hits,iid,terms,df,searcher.maxDoc(),parser.getWordsClean(),StopWords.getPredefinedSet(iid),false,null,false);
 53+ Highlight.highlight(hits,iid,terms,df,searcher.maxDoc(),parser.getWordsClean(),StopWords.getPredefinedSet(iid),false,null,false,false);
5454 }
5555
5656 public static void timeTest(String dbname, String dbnameSrc) throws Exception {
@@ -86,7 +86,7 @@
8787 Document doc = reader.document(docid);
8888 hits.add(doc.get("namespace")+":"+doc.get("title"));
8989 }
90 - Highlight.ResultSet rs = Highlight.highlight(hits,iid,terms,df,maxDoc,words,stopWords,false,null,false);
 90+ Highlight.ResultSet rs = Highlight.highlight(hits,iid,terms,df,maxDoc,words,stopWords,false,null,false,false);
9191 HashMap<String,HighlightResult> res = rs.highlighted;
9292 count += res.size();
9393 if(i!=0 && i % 200 == 0){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java
@@ -221,7 +221,7 @@
222222 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"+
223223 "<head>\n<title>Error: " + code + " " + message + "</title>\n"+
224224 "</head>\n<body>\n<h1>" + code + " " + message + "</h1>\n"+
225 - "<p>" + detail + "</p>\n<hr />\n<p><i>MWSearch on localhost" +
 225+ "<div>" + detail + "</div>\n<hr />\n<p><i>LSearch daemon on localhost" +
226226 "</i></p>\n</body>\n</html>");
227227 }
228228
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -36,6 +36,8 @@
3737 String what;
3838 /** Client-supplied database we should operate on */
3939 String dbname;
 40+
 41+ public static final double CURRENT_VERSION = 2.1;
4042
4143 public SearchDaemon(Socket sock) {
4244 super(sock);
@@ -68,7 +70,8 @@
6971 try{
7072 SearchEngine engine = new SearchEngine();
7173 HashMap query = new QueryStringMap(uri);
72 - SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query);
 74+ double version = getVersion(query);
 75+ SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query,version);
7376 contentType = "text/plain";
7477 // format:
7578 // <num of hits>
@@ -85,45 +88,49 @@
8689 }
8790 } else{
8891 sendOutputLine(Integer.toString(res.getNumHits()));
89 - SuggestQuery sq = res.getSuggest();
90 - if(sq != null && sq.hasSuggestion()){
91 - sendOutputLine("#suggest ["+sq.getRangesSerialized()+"] "+encode(sq.getSearchterm()));
92 - } else
93 - sendOutputLine("#no suggestion");
94 - if(res.getTitles() != null){
95 - sendOutputLine("#interwiki "+res.getTitles().size());
96 - for(ResultSet rs : res.getTitles()){
97 - sendOutputLine(rs.getScore()+" "+encode(rs.getInterwiki())+" "+rs.getNamespace()+" "+encodeTitle(rs.getTitle()));
 92+ if(version>=2.1){
 93+ SuggestQuery sq = res.getSuggest();
 94+ if(sq != null && sq.hasSuggestion()){
 95+ sendOutputLine("#suggest ["+sq.getRangesSerialized()+"] "+encode(sq.getSearchterm()));
 96+ } else
 97+ sendOutputLine("#no suggestion");
 98+ if(res.getTitles() != null){
 99+ sendOutputLine("#interwiki "+res.getTitles().size());
 100+ for(ResultSet rs : res.getTitles()){
 101+ sendOutputLine(rs.getScore()+" "+encode(rs.getInterwiki())+" "+rs.getNamespace()+" "+encodeTitle(rs.getTitle()));
 102+ if(rs.getExplanation() != null)
 103+ sendOutputLine(rs.getExplanation().toString());
 104+ if(rs.getHighlight() != null){
 105+ HighlightResult hr = rs.getHighlight();
 106+ sendHighlight("title",hr.getTitle());
 107+ sendHighlightWithTitle("redirect",hr.getRedirect());
 108+ }
 109+ }
 110+ } else
 111+ sendOutputLine("#interwiki 0");
 112+ sendOutputLine("#results");
 113+ }
 114+ for(ResultSet rs : res.getResults()){
 115+ sendResultLine(rs.score, rs.namespace, rs.title);
 116+ if(version>=2.1){
 117+ if(rs.getContext() != null){
 118+ for(String c : rs.getContext())
 119+ sendOutputLine("#context "+c);
 120+ }
98121 if(rs.getExplanation() != null)
99122 sendOutputLine(rs.getExplanation().toString());
100123 if(rs.getHighlight() != null){
101124 HighlightResult hr = rs.getHighlight();
102 - sendHighlight("title",hr.getTitle());
 125+ sendHighlight("title",hr.getTitle());
 126+ for(Snippet sn : hr.getText())
 127+ sendHighlight("text",sn);
103128 sendHighlightWithTitle("redirect",hr.getRedirect());
 129+ sendHighlightWithFragment("section",hr.getSection());
 130+ if(hr.getDate() != null)
 131+ sendHighlight("date",hr.getDate());
 132+ sendHighlight("wordcount",Integer.toString(hr.getWordCount()));
104133 }
105134 }
106 - } else
107 - sendOutputLine("#interwiki 0");
108 - sendOutputLine("#results");
109 - for(ResultSet rs : res.getResults()){
110 - sendResultLine(rs.score, rs.namespace, rs.title);
111 - if(rs.getContext() != null){
112 - for(String c : rs.getContext())
113 - sendOutputLine("#context "+c);
114 - }
115 - if(rs.getExplanation() != null)
116 - sendOutputLine(rs.getExplanation().toString());
117 - if(rs.getHighlight() != null){
118 - HighlightResult hr = rs.getHighlight();
119 - sendHighlight("title",hr.getTitle());
120 - for(Snippet sn : hr.getText())
121 - sendHighlight("text",sn);
122 - sendHighlightWithTitle("redirect",hr.getRedirect());
123 - sendHighlightWithFragment("section",hr.getSection());
124 - if(hr.getDate() != null)
125 - sendHighlight("date",hr.getDate());
126 - sendHighlight("wordcount",Integer.toString(hr.getWordCount()));
127 - }
128135 }
129136 }
130137 } else if(res.getFormat() == Format.JSON){
@@ -170,7 +177,17 @@
171178 }
172179 }
173180
174 -
 181+
 182+ private double getVersion(HashMap query) {
 183+ String v = (String)query.get("version");
 184+ if(v == null)
 185+ v = (String)query.get("ver");
 186+ if(v != null)
 187+ return Double.parseDouble(v);
 188+ return CURRENT_VERSION;
 189+ }
 190+
 191+
175192 private String makeHighlight(String type, Snippet snippet){
176193 if(snippet == null)
177194 return null;
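getVersion() above is the hook for the "compatibility modes for previous versions" item in the commit summary: a missing parameter means a current (2.1) client, while older clients pass version (or ver) and get the flat legacy output. The gating pattern, sketched:

// version < 2.1: plain hit count plus bare result lines, as before;
// version >= 2.1: additionally #suggest/#interwiki/#results blocks,
// per-result context lines and highlight snippets.
double version = getVersion(query);
sendOutputLine(Integer.toString(res.getNumHits()));
if (version >= 2.1) {
    // ... suggestion, interwiki titles, "#results" marker ...
}
for (ResultSet rs : res.getResults()) {
    sendResultLine(rs.score, rs.namespace, rs.title);
    if (version >= 2.1) {
        // ... context, explanation, highlights ...
    }
}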
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -1,6 +1,7 @@
22 package org.wikimedia.lsearch.spell;
33
44 import java.io.IOException;
 5+import java.io.Serializable;
56 import java.util.ArrayList;
67 import java.util.Collection;
78 import java.util.Collections;
@@ -56,6 +57,7 @@
5758 protected NamespaceFilter defaultNs;
5859 protected HashMap<String,Boolean> wordExistCache = new HashMap<String,Boolean>();
5960 protected enum Filtering { STRONG, WEAK };
 61+ protected boolean useLogging = true;
6062
6264 	/** Distance and metaphone metrics */
6264 static public class Metric {
@@ -123,6 +125,8 @@
124126
125127 protected static class Namespaces {
126128 HashSet<Integer> namespaces = new HashSet<Integer>();
 	129+		/** If true, these namespaces are additional to the default namespaces;
 	130+		 * if false, they do not intersect the default namespaces */
127131 boolean additional = false;
128132 String prefix = "ns_";
129133 public Namespaces(HashSet<Integer> namespaces, boolean additional) {
@@ -155,19 +159,26 @@
156160 /** Number of results to fetch for titles */
157161 public static final int POOL_TITLE = 100;
158162 /** Number of results to fetch for fuzzy word matches */
159 - public static final int POOL_FUZZY = 1000;
 163+ public static final int POOL_FUZZY = 500;
160164 /** Number of words to return for fuzzy queries */
161165 public static final int MAX_FUZZY = 50;
162166
163167 /** Lower limit to hit rate for joining */
164168 public static final int JOIN_FREQ = 1;
165169
166 - public Suggest(IndexId iid) throws IOException{
 170+ public Suggest(IndexId iid) throws IOException {
 171+ this(iid,null,true);
 172+ }
 173+
 174+ public Suggest(IndexId iid, IndexSearcher searcher, boolean useLogging) throws IOException{
167175 SearcherCache cache = SearcherCache.getInstance();
168176 this.iid = iid;
169 - this.searcher = cache.getLocalSearcher(iid.getSpell());
 177+ if(searcher == null)
 178+ searcher = cache.getLocalSearcher(iid.getSpell());
 179+ this.searcher = searcher;
170180 this.reader = searcher.getIndexReader();
171181 this.defaultNs = iid.getDefaultNamespace();
 182+ this.useLogging = useLogging;
172183
173184 synchronized(stopWordsIndexes){
174185 if(!stopWordsIndexes.containsKey(searcher)){
@@ -209,18 +220,43 @@
210221 }
211222
212223 /**
 224+ * Extra information that will help disambiguate some suggest cases,
 225+ * e.g. words from titles found in search, phrases found in text, ...
 226+ * @author rainman
 227+ *
 228+ */
 229+ public static class ExtraInfo implements Serializable {
 230+ protected HashSet<String> phrases;
 231+ protected HashSet<String> foundInContext;
 232+ protected HashSet<String> foundInTitles;
 233+ protected int firstRank;
 234+
 235+ public ExtraInfo(HashSet<String> phrases, HashSet<String> foundInContext, HashSet<String> foundInTitles, int firstRank) {
 236+ this.phrases = phrases;
 237+ this.foundInContext = foundInContext;
 238+ this.foundInTitles = foundInTitles;
 239+ this.firstRank = firstRank;
 240+ }
 241+
 242+ public ExtraInfo(){
 243+ this(new HashSet<String>(),new HashSet<String>(),new HashSet<String>(),0);
 244+ }
 245+
 246+
 247+ }
 248+
 249+ /**
213250 * Make a suggestion for a query
214251 *
215252 * @throws IOException
216253 */
217254 @SuppressWarnings("unchecked")
218 - public SuggestQuery suggest(String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext,
219 - int firstRank, NamespaceFilter nsf) throws IOException{
 255+ public SuggestQuery suggest(String searchterm, ArrayList<Token> tokens, ExtraInfo info, NamespaceFilter nsf) throws IOException{
220256 FilterFactory filters = new FilterFactory(iid);
221257 wordExistCache.clear();
222258 long start = System.currentTimeMillis();
223259
224 - System.out.println("tokens: "+tokens+" inContext:"+foundInContext+" phrases:"+phrases);
 260+ // System.out.println("tokens: "+tokens+" inContext:"+info.foundInContext+" phrases:"+info.phrases+", inTitles="+info.foundInTitles);
225261
226262 if(tokens.size() > 30){
227263 logRequest(searchterm,"too many words to spellcheck ("+tokens.size()+")",start);
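Callers of the reworked suggest() bundle the highlighter-derived hints into one serializable ExtraInfo object instead of four loose parameters. A hypothetical call site:

// phrases, foundInContext and foundInTitles come from highlighting,
// firstRank from the top search hit; together they veto "corrections"
// to words the wiki demonstrably contains.
Suggest.ExtraInfo info = new Suggest.ExtraInfo(phrases, foundInContext, foundInTitles, firstRank);
SuggestQuery sq = suggest.suggest(searchterm, tokens, info, nsf);

// the no-argument form is the conservative default: no hints, rank 0
SuggestQuery sq0 = suggest.suggest(searchterm, tokens, new Suggest.ExtraInfo(), nsf);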
@@ -262,7 +298,6 @@
263299 }
264300
265301 // init suggestions
266 - int minFreq = 0;
267302 ArrayList<Change> suggestions = new ArrayList<Change>();
268303 ArrayList<Change> suggestionsTitle = new ArrayList<Change>();
269304 HashMap<String,HashSet<String>> contextCache = new HashMap<String,HashSet<String>>();
@@ -272,7 +307,7 @@
273308 String redirectTarget = followRedirect(joinTokens,ns);
274309 if(redirectTarget != null){
275310 EditDistance ed = new EditDistance(joinTokens);
276 - if(ed.getDistance(redirectTarget) <= 2 && betterRank(titleRank(redirectTarget,ns),firstRank)){
 311+ if(ed.getDistance(redirectTarget) <= 2 && betterRank(titleRank(redirectTarget,ns),info.firstRank)){
277312 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,redirectTarget,tokens);
278313 if(changes != null){
279314 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,new HashSet<Integer>(),ns);
@@ -291,7 +326,7 @@
292327 logRequest(searchterm,"CORRECT (exact title match)",start);
293328 return new SuggestQuery(searchterm,new ArrayList<Integer>());
294329 }
295 - if(betterRank(r.frequency,firstRank)){
 330+ if(betterRank(r.frequency,info.firstRank)){
296331 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,r.word,tokens);
297332 if(changes != null){
298333 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,changes.keySet(),ns);
@@ -312,7 +347,7 @@
313348 if(r.isExactMatch()){
314349 logRequest(searchterm,"CORRECT (by single word index)",start);
315350 return new SuggestQuery(searchterm,new ArrayList<Integer>());
316 - } else if(r.dist == 1 && betterRank(r.frequency,firstRank)){
 351+ } else if(r.dist == 1 && betterRank(r.frequency,info.firstRank)){
317352 HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>();
318353 proposedChanges.put(0,r.word);
319354 SuggestQuery sq = makeSuggestedQuery(tokens,proposedChanges,searchterm,filters,new HashSet<Integer>(),ns);
@@ -323,13 +358,13 @@
324359 }
325360
326361 // check if all words are found within phrases during highlighting
327 - if(tokens.size() > 1 && tokens.size() == phrases.size() + 1){
 362+ if(tokens.size() > 1 && tokens.size() == info.phrases.size() + 1){
328363 logRequest(searchterm,"CORRECT (by highlight phrases)",start);
329364 return new SuggestQuery(searchterm,new ArrayList<Integer>());
330365 }
331366
332367 // indexes of words in found during highlighting in phrases
333 - HashSet<Integer> inPhrases = new HashSet<Integer>();
 368+ //HashSet<Integer> inPhrases = new HashSet<Integer>();
334369 // words that might spellcheck to stop words
335370 ArrayList<SuggestResult> possibleStopWords = new ArrayList<SuggestResult>();
336371 // word suggestions
@@ -377,7 +412,7 @@
378413 possibleStopWords.add(null);
379414 }
380415 // suggest split
381 - SuggestResult split = suggestSplit(w,minFreq);
 416+ SuggestResult split = suggestSplit(w,ns);
382417 if(split != null){
383418 Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT);
384419 sc.substitutes.put(i,split.word.replace("_"," "));
@@ -388,7 +423,7 @@
389424 if(i-1 >= 0
390425 && (wordSug.get(i-1)==null || !wordSug.get(i-1).get(0).isExactMatch())
391426 && (wordSug.get(i)==null || !wordSug.get(i).get(0).isExactMatch())){
392 - SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,minFreq);
 427+ SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,ns);
393428 if(join != null){
394429 Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);
395430 sc.substitutes.put(i-1,"");
@@ -459,27 +494,29 @@
460495 int freq = (Integer)ret[0];
461496 boolean inTitle = (Boolean)ret[1];
462497
463 - // log.info("Checking "+phrase);
 498+ //log.debug("Checking "+phrase);
464499 boolean inContext = inContext(s1.word,s2.word,contextCache,allWords,ns) || inContext(s2.word,s1.word,contextCache,allWords,ns);
465500 if(freq > 0 || inContext){
466501 					// number of characters added/subtracted
467502 int diff1 = Math.abs(s1.word.length()-w1.length());
468503 int diff2 = Math.abs(s2.word.length()-w2.length());
469 - log.info("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle);
 504+ log.debug("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle);
470505 int dist = s1.dist + s2.dist + distOffset;
471506 boolean accept = true;
472507 Change c = new Change(dist,freq,Change.Type.PHRASE);
473508 // register changes
474509 if(s1.word.equals(w1))
475510 c.preserves.put(i,w1);
476 - else if(!good1 || ((inTitle||inContext) && diff1 <=2 && !foundInContext.contains(w1)) )
 511+ else if((!good1 && !info.foundInTitles.contains(w1))
 512+ || ((inTitle||inContext) && diff1 <=2 && !info.foundInContext.contains(w1)) )
477513 c.substitutes.put(i,s1.word);
478514 else
479515 accept = false;
480516
481517 if(s2.word.equals(w2))
482518 c.preserves.put(i2,w2);
483 - else if(!good2 || ((inTitle||inContext) && diff2 <= 2 && !foundInContext.contains(w2)))
 519+ else if((!good2 && !info.foundInTitles.contains(w2))
 520+ || ((inTitle||inContext) && diff2 <= 2 && !info.foundInContext.contains(w2)))
484521 c.substitutes.put(i2,s2.word);
485522 else
486523 accept = false;
@@ -522,7 +559,7 @@
523560 return sq;
524561 }
525562 }
526 - log.info("Spell-checking based on phrases...");
 563+ log.debug("Spell-checking based on phrases...");
527564 // find best suggestion based on phrases
528565 HashMap<Integer,String> preserveTokens = new HashMap<Integer,String>();
529566 HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>();
@@ -544,11 +581,12 @@
545582 for(int i=0;i<tokens.size();i++){
546583 if(preserveTokens.containsKey(i) || proposedChanges.containsKey(i))
547584 continue;
 585+ String w = tokens.get(i).termText();
548586 ArrayList<SuggestResult> sug = wordSug.get(i);
549587 if(sug == null)
550588 continue;
551589 SuggestResult s = sug.get(0);
552 - if(!s.isExactMatch() && acceptWordChange(tokens.get(i).termText(),s)){
 590+ if(!s.isExactMatch() && !info.foundInTitles.contains(w) && acceptWordChange(w,s)){
553591 distance += s.dist;
554592 proposedChanges.put(i,s.word);
555593 if(using.equals("phrases"))
@@ -564,7 +602,7 @@
565603 SuggestResult tr = titleRes.get(0);
566604 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,tr.word,tokens);
567605 if(changes != null){
568 - if(tr.dist <= distance && (betterRank(tr.frequency,firstRank) || proposedChanges.equals(changes))){
 606+ if(tr.dist <= distance && (betterRank(tr.frequency,info.firstRank) || proposedChanges.equals(changes))){
569607 // we found a much better suggestion !
570608 proposedChanges = changes;
571609 alwaysReplace.addAll(proposedChanges.keySet());
@@ -690,6 +728,26 @@
691729 }
692730 return b;
693731 }
 732+ /** Get frequency of a word if it exists (0 if not) */
 733+ private int wordFrequency(String w, Namespaces ns) throws IOException {
 734+ if(ns == null){ // default
 735+ TermDocs td = reader.termDocs(new Term("word",w));
 736+ if(td.next())
 737+ return getFrequency(reader.document(td.doc()),null);
 738+ return 0;
 739+ } else{ // other
 740+ int freq = 0;
 741+ TermDocs td = reader.termDocs(new Term(ns.prefix+"word",w));
 742+ if(td.next())
 743+ freq = getFrequency(reader.document(td.doc()),ns);
 744+ if(ns.additional){ // also look in main
 745+ TermDocs td2 = reader.termDocs(new Term("word",w));
 746+ if(td2.next())
 747+ freq += getFrequency(reader.document(td2.doc()),null);
 748+ }
 749+ return freq;
 750+ }
 751+ }
694752
695753 /** Return true if (stripped) title exists in the index */
696754 private boolean titleExists(String w, Namespaces ns) throws IOException{
@@ -762,8 +820,9 @@
763821 if(w.equals(nt))
764822 continue; // trying to substitute same
765823 // incorrect words, or doesn't stem to same
766 - boolean sameStem = (alwaysReplace.contains(e.getKey()))? false : filters.stemsToSame(FastWikiTokenizerEngine.decompose(w),FastWikiTokenizerEngine.decompose(nt));
767 - if(!sameStem || (sameStem && !wordExists(w,ns))){
 824+ boolean sameStem = (alwaysReplace.contains(e.getKey()))? false : filters.stemsToSame(FastWikiTokenizerEngine.decompose(w),FastWikiTokenizerEngine.decompose(nt)) || filters.stemsToSame(w,nt);
 825+ //if(!sameStem || (sameStem && !wordExists(w,ns))){
 826+ if(!sameStem){
768827 int so = t.startOffset();
769828 int eo = t.endOffset();
770829 if(so != start)
@@ -940,7 +999,7 @@
9411000 }
9421001 });
9431002
944 - log.info("Sorted changes: "+changes);
 1003+ log.debug("Sorted changes: "+changes);
9451004
9461005 HashMap<Integer,String> accept = new HashMap<Integer,String>();
9471006 HashMap<Integer,String> preserve = new HashMap<Integer,String>();
@@ -971,7 +1030,7 @@
9721031 break;
9731032 }
9741033 if(changesBadWord){
975 - log.info("Considering "+c);
 1034+ log.debug("Considering "+c);
9761035 boolean acceptChange = true;
9771036 for(Entry<Integer,String> e : c.substitutes.entrySet()){
9781037 String acceptedTerm = accept.get(e.getKey());
@@ -983,7 +1042,7 @@
9841043 }
9851044 }
9861045 if(acceptChange && (dist + c.dist < maxDist)){
987 - log.info("Applying "+c);
 1046+ log.debug("Applying "+c);
9881047 processedChange.add(i);
9891048 for(Entry<Integer,String> e : c.substitutes.entrySet()){
9901049 accept.put(e.getKey(),e.getValue());
@@ -1014,7 +1073,7 @@
10151074 }
10161075 }
10171076 if(acceptChange && (dist + c.dist < maxDist)){
1018 - log.info("Applying "+c);
 1077+ log.debug("Applying "+c);
10191078 processedChange.add(i);
10201079 for(Entry<Integer,String> e : c.substitutes.entrySet()){
10211080 accept.put(e.getKey(),e.getValue());
@@ -1058,7 +1117,7 @@
10591118 }
10601119
10611120 /** Merge two result sets */
1062 - public ArrayList<SuggestResult> mergeResults(ArrayList<SuggestResult> main, ArrayList<SuggestResult> add, int num){
 1121+ public ArrayList<SuggestResult> mergeResults(ArrayList<SuggestResult> main, ArrayList<SuggestResult> add, int num, Filtering filter){
10631122 // merge
10641123 HashMap<String,SuggestResult> map = new HashMap<String,SuggestResult>();
10651124 ArrayList<SuggestResult> toAdd = new ArrayList<SuggestResult>();
@@ -1074,7 +1133,10 @@
10751134 }
10761135 main.addAll(toAdd);
10771136 // re-sort
1078 - Collections.sort(main,new SuggestResult.Comparator());
 1137+ if(filter == Filtering.WEAK)
 1138+ Collections.sort(main,new SuggestResult.ComparatorNoCommonMisspell());
 1139+ else
 1140+ Collections.sort(main,new SuggestResult.Comparator());
10791141 // trim
10801142 ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
10811143 for(int i=0;i<num && i<main.size();i++)
@@ -1097,7 +1159,7 @@
10981160 ArrayList<SuggestResult> res = suggestWordsOnNamespaces(word,word,num,num,namespaces,filter);
10991161 if(namespaces.additional){
11001162 ArrayList<SuggestResult> def = suggestWordsOnNamespaces(word,word,num,num,null,filter); // add from default
1101 - return mergeResults(def,res,num);
 1163+ return mergeResults(def,res,num,filter);
11021164 }
11031165 return res;
11041166 }
@@ -1135,7 +1197,10 @@
11361198 res.add(r);
11371199 }
11381200 // sort
1139 - Collections.sort(res,new SuggestResult.Comparator());
 1201+ if(filter == Filtering.WEAK)
 1202+ Collections.sort(res,new SuggestResult.ComparatorNoCommonMisspell());
 1203+ else
 1204+ Collections.sort(res,new SuggestResult.Comparator());
11401205 ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
11411206 for(int i=0;i<num && i<res.size();i++)
11421207 ret.add(res.get(i));
@@ -1148,9 +1213,7 @@
11491214 }
11501215
11511216 private int getFrequency(Document d, Namespaces namespaces) {
1152 - String prefix = "";
1153 - if(namespaces != null) // namespaces=null -> default namespace, empty -> all
1154 - prefix = namespaces.prefix;
 1217+ String prefix = getPrefix(namespaces);
11551218 int freq = 0;
11561219 if(namespaces == null)
11571220 freq = Integer.parseInt(d.get(prefix+"freq"));
@@ -1168,39 +1231,46 @@
11691232 return freq;
11701233 }
11711234
 1235+ /** @return {frequency (int), inTitle (boolean)} */
11721236 private Object[] getPhrase(String phrase, Namespaces namespaces) throws IOException {
1173 - String prefix = "";
1174 - if(namespaces != null) // namespaces=null -> default namespace, empty -> all
1175 - prefix = namespaces.prefix;
1176 -
 1237+ String prefix = getPrefix(namespaces);
11771238 int freq = 0;
1178 - boolean inTitle = false;
1179 - TermDocs td = reader.termDocs(new Term(prefix+"phrase",phrase));
1180 - if(td.next()){
1181 - Document d = reader.document(td.doc());
1182 - if(namespaces == null){
1183 - freq = Integer.parseInt(d.get(prefix+"freq"));
 1239+ boolean inTitle = false;
 1240+ // default namespaces
 1241+ if(namespaces == null || namespaces.additional){
 1242+ TermDocs td = reader.termDocs(new Term("phrase",phrase));
 1243+ if(td.next()){
 1244+ Document d = reader.document(td.doc());
 1245+ String f = d.get("freq");
 1246+ freq = Integer.parseInt(f);
11841247 String it = d.get("intitle");
11851248 if(it!=null && it.equals("1"))
11861249 inTitle = true;
1187 - } else{ // all namespaces
1188 - if(namespaces.namespaces.isEmpty()){
1189 - freq = Integer.parseInt(d.get(prefix+"freq"));
1190 - String it = d.get("intitle");
1191 - if(it!=null && it.equals("1"))
1192 - inTitle = true;
1193 -
1194 - } else{
 1250+ }
 1251+ }
 1252+ // other
 1253+ if(namespaces!=null){
 1254+ TermDocs td = reader.termDocs(new Term(prefix+"phrase",phrase));
 1255+ if(td.next()){
 1256+ Document d = reader.document(td.doc());
 1257+ String it = d.get(prefix+"intitle");
 1258+ if(it!=null && it.equals("1"))
 1259+ inTitle = true;
 1260+
 1261+ if(namespaces.namespaces.isEmpty()){ // all
 1262+ String f = d.get(prefix+"freq");
 1263+ if(f != null)
 1264+ freq += Integer.parseInt(f);
 1265+ } else{ // some subset
11951266 for(Integer i : namespaces.namespaces){
11961267 String f = d.get(prefix+"freq_"+i);
1197 - if(f != null){
 1268+ if(f != null)
11981269 freq += Integer.parseInt(f);
1199 - inTitle = true;
1200 - }
1201 - }
 1270+ }
12021271 }
12031272 }
12041273 }
 1274+
12051275 return new Object[] { freq, inTitle};
12061276 }
12071277
@@ -1211,7 +1281,7 @@
12121282 ArrayList<SuggestResult> res = suggestTitlesOnNamespaces(title,num,pool_size,distance,namespaces);
12131283 if(namespaces.additional){
12141284 ArrayList<SuggestResult> main = suggestTitlesOnNamespaces(title,num,pool_size,distance,null);
1215 - return mergeResults(main,res,num);
 1285+ return mergeResults(main,res,num,Filtering.STRONG);
12161286 }
12171287 return res;
12181288 }
@@ -1329,25 +1399,19 @@
13301400 }
13311401
13321402 /** Try to split a word into 2 words which make up a phrase */
1333 - public SuggestResult suggestSplit(String word, int minFreq){
1334 - int freq = 0;
1335 - Hits hits;
 1403+ public SuggestResult suggestSplit(String word, Namespaces ns){
13361404 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
13371405 try {
13381406 // find frequency
1339 - hits = searcher.search(new TermQuery(new Term("word",word)));
1340 - if(hits.length() == 1)
1341 - freq = Integer.parseInt(hits.doc(0).get("freq"));
 1407+ int wordFreq = wordFrequency(word,ns);
13421408
13431409 // try different splits
13441410 for(int i=1;i<word.length()-1;i++){
13451411 String phrase = word.substring(0,i) + "_" + word.substring(i);
1346 - hits = searcher.search(new TermQuery(new Term("phrase",phrase)));
1347 - if(hits.length() > 0){
1348 - int pfreq = Integer.parseInt(hits.doc(0).get("freq"));
1349 - if(pfreq >= freq && pfreq > minFreq)
1350 - res.add(new SuggestResult(phrase,pfreq,2));
1351 - }
 1412+ Object[] ret = getPhrase(phrase,ns);
 1413+ int freq = (Integer)ret[0];
 1414+ if(freq > wordFreq)
 1415+ res.add(new SuggestResult(phrase,freq,2));
13521416 }
13531417 if(res.size() > 0){
13541418 Collections.sort(res,new SuggestResult.Comparator());
@@ -1361,14 +1425,13 @@
13621426 }
13631427
13641428 /** Returns suggestion if joining words makes sense */
1365 - public SuggestResult suggestJoin(String word1, String word2, int minFreq){
 1429+ public SuggestResult suggestJoin(String word1, String word2, Namespaces ns){
13661430 try {
1367 - Hits hits = searcher.search(new TermQuery(new Term("word",word1+word2)));
1368 - if(hits.length() > 0){
1369 - int freq = Integer.parseInt(hits.doc(0).get("freq"));
1370 - if(freq >= minFreq)
1371 - return new SuggestResult(word1+word2,freq,1);
1372 - }
 1431+ Object[] ret = getPhrase(word1+"_"+word2,ns);
 1432+ int freqPhrase = (Integer)ret[0];
 1433+ int freqJoin = wordFrequency(word1+word2,ns);
 1434+ if(freqJoin > 0 && freqJoin > freqPhrase)
 1435+ return new SuggestResult(word1+word2,freqJoin,1);
13731436 } catch (IOException e) {
13741437 log.warn("I/O error while suggesting join on "+iid+" : "+e.getMessage());
13751438 e.printStackTrace();
@@ -1379,7 +1442,10 @@
13801443 /** Fetch a set of strings for fuzzy queries */
13811444 public ArrayList<SuggestResult> getFuzzy(String word, NamespaceFilter nsf){
13821445 Namespaces ns = makeNamespaces(nsf);
1383 - ArrayList<SuggestResult> sug = suggestWords(word,POOL_FUZZY,ns,Filtering.WEAK);
 1446+ int pool = POOL_FUZZY;
 1447+ if(word.length() <= 4)
 1448+ pool *= 2;
 1449+ ArrayList<SuggestResult> sug = suggestWords(word,pool,ns,Filtering.WEAK);
13841450 ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
13851451 for(int i=0;i<MAX_FUZZY && i<sug.size();i++){
13861452 ret.add(sug.get(i));
@@ -1388,7 +1454,8 @@
13891455 }
13901456
13911457 protected void logRequest(String searchterm, String using, long start){
1392 - log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms");
 1458+ if(useLogging)
 1459+ log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms");
13931460 }
13941461
13951462 }
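
The changes above replace the old global minFreq threshold with per-namespace frequency lookups: the new wordFrequency() and the reworked getPhrase() consult the ns_-prefixed fields (plus the default fields when ns.additional is set), and suggestSplit()/suggestJoin() now decide by comparing those frequencies directly. A minimal standalone sketch of the join heuristic, with a toy map standing in for the Lucene spell-check index (all names here are illustrative, not part of the patch):

    import java.util.Map;

    /** Toy model of the join heuristic in Suggest.suggestJoin(): propose joining
     *  two query words only when the concatenation occurs more often than the
     *  corresponding two-word phrase in the queried namespaces. */
    public class JoinHeuristicSketch {
        public static void main(String[] args) {
            // stand-in for the spell-check index: word/phrase -> frequency
            Map<String, Integer> freq = Map.of("database", 120, "data_base", 3);
            int freqJoin = freq.getOrDefault("database", 0);    // wordFrequency(w1+w2, ns)
            int freqPhrase = freq.getOrDefault("data_base", 0); // getPhrase(w1+"_"+w2, ns)
            if (freqJoin > 0 && freqJoin > freqPhrase)
                System.out.println("JOIN: database (freq=" + freqJoin + ", dist=1)");
            else
                System.out.println("keep the words separate");
        }
    }
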
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -73,15 +73,15 @@
7474 System.out.println(r);
7575 }
7676
77 - System.out.println("SPLIT: "+sc.suggestSplit(text,0));
 77+ System.out.println("SPLIT: "+sc.suggestSplit(text,null));
7878 }
7979 if(last != null){
80 - System.out.println("JOIN: "+sc.suggestJoin(last,text,0));
 80+ System.out.println("JOIN: "+sc.suggestJoin(last,text,null));
8181 }
8282 last = text;
8383 }
8484 }
85 - System.out.println("#suggest: "+sc.suggest(inputtext,parser.tokenizeBareText(inputtext),new HashSet<String>(),new HashSet<String>(),0,new NamespaceFilter("0")));
 85+ System.out.println("#suggest: "+sc.suggest(inputtext,parser.tokenizeBareText(inputtext),new Suggest.ExtraInfo(new HashSet<String>(),new HashSet<String>(),new HashSet<String>(),0),new NamespaceFilter("0")));
8686 System.out.println("(finished in "+(System.currentTimeMillis()-start)+" ms)");
8787 }
8888
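
The test shows the migrated entry point: the formerly separate phrases/foundInContext/firstRank arguments are now bundled into a Suggest.ExtraInfo. Judging from the uses of info.foundInTitles, info.foundInContext and info.firstRank in Suggest.java above, the constructor order appears to be (phrases, foundInContext, foundInTitles, firstRank) — an assumption, since the class body is not part of this hunk. A hedged migration sketch for a caller (imports assume the usual lsearch package layout):

    import java.util.ArrayList;
    import java.util.HashSet;
    import org.apache.lucene.analysis.Token;
    import org.wikimedia.lsearch.search.NamespaceFilter;
    import org.wikimedia.lsearch.spell.Suggest;
    import org.wikimedia.lsearch.spell.SuggestQuery;

    class SuggestCallSketch {
        /** Old call: sc.suggest(text, tokens, phrases, foundInContext, firstRank, nsf) */
        SuggestQuery migrated(Suggest sc, String text, ArrayList<Token> tokens) throws Exception {
            Suggest.ExtraInfo info = new Suggest.ExtraInfo(
                    new HashSet<String>(),  // phrases from the query
                    new HashSet<String>(),  // words already found in context
                    new HashSet<String>(),  // words already found in titles (new in r32149)
                    0);                     // rank of the first search hit
            return sc.suggest(text, tokens, info, new NamespaceFilter("0"));
        }
    }
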
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java
@@ -14,6 +14,7 @@
1515 import org.apache.lucene.document.Field.Store;
1616 import org.apache.lucene.index.CorruptIndexException;
1717 import org.apache.lucene.index.IndexWriter;
 18+import org.apache.lucene.index.Term;
1819 import org.wikimedia.lsearch.analyzers.Analyzers;
1920 import org.wikimedia.lsearch.analyzers.FieldBuilder;
2021 import org.wikimedia.lsearch.analyzers.FilterFactory;
@@ -44,6 +45,7 @@
4546 protected String langCode;
4647 protected Analyzer analyzer;
4748 protected HashSet<String> stopWords;
 49+ protected NamespaceFilter nsf;
4850
5052 /** Make a new index, and init writer on it (on importPath()) */
5052 public static CleanIndexWriter newForWrite(IndexId iid) throws IOException{
@@ -63,9 +65,10 @@
6466 GlobalConfiguration global = GlobalConfiguration.getInstance();
6567 this.iid = iid;
6668 this.builder = new FieldBuilder(iid,FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
67 - this.langCode = global.getLanguage(iid.getDBname());
 69+ this.langCode = iid.getLangCode();
6870 analyzer = Analyzers.getIndexerAnalyzer(builder);
6971 this.stopWords = StopWords.getPredefinedSet(iid);
 72+ nsf = global.getDefaultNamespace(iid);
7073
7174 HashSet<String> stopWords = new HashSet<String>();
7275 for(String w : StopWords.getStopWords(iid))
@@ -83,6 +86,19 @@
8487 writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
8588 }
8689
 90+ public void deleteArticleInfo(String pageId) throws IOException {
 91+ writer.deleteDocuments(new Term("key",pageId));
 92+ }
 93+
 94+ /** Call this to add information about the article into the index */
 95+ public void addArticleInfo(Article a){
 96+ // only for articles in default namespace(s)
 97+ if(nsf.contains(Integer.parseInt(a.getNamespace())))
 98+ addArticle(a);
 99+ else
 100+ addTitleOnly(a);
 101+ }
 102+
87103 /** Add single article */
88104 protected void addArticle(Article a){
89105 //if(!WikiIndexModifier.checkAddPreconditions(a,langCode))
@@ -102,8 +118,9 @@
103119 }
104120
105121 /** Add title/redirect with ranks information only */
106 - public void addTitleOnly(Article article) {
 122+ protected void addTitleOnly(Article article) {
107123 Document doc = new Document();
 124+ doc.add(new Field("key",article.getIndexKey(),Store.NO,Index.UN_TOKENIZED));
108125 doc.add(new Field("ns_title",article.getTitle(),Store.YES,Index.TOKENIZED));
109126 doc.add(new Field("ns_namespace",article.getNamespace(),Store.YES,Index.UN_TOKENIZED));
110127 doc.add(new Field("ns_rank",Integer.toString(article.getReferences()),Store.YES,Index.NO));
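
With the stored key field and the new deleteArticleInfo()/addArticleInfo() pair, the spell-check precursor index supports the delete-then-readd cycle that incremental updates need, and namespace routing moves from the importer into the writer. A minimal sketch of that cycle, assuming Article.getIndexKey() yields the page id stored in key (consistent with "page_id keys for precursor indexes" in the commit message):

    import java.io.IOException;
    import org.wikimedia.lsearch.beans.Article;
    import org.wikimedia.lsearch.spell.CleanIndexWriter;

    class IncrementalSpellUpdateSketch {
        /** Replace a page's entry in the precursor index: delete by key, then re-add. */
        static void update(CleanIndexWriter writer, Article a) throws IOException {
            writer.deleteArticleInfo(a.getIndexKey()); // assumption: index key == page id
            writer.addArticleInfo(a); // full article in default namespaces, title-only otherwise
        }
    }
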
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -83,7 +83,7 @@
8484 if(minPhraseFreq < 1)
8585 minPhraseFreq = 1;
8686 this.createNew = createNew;
87 - this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 87+ this.langCode=iid.getLangCode();
8888 this.ngramWriter = new NgramIndexer();
8989 this.registry = IndexRegistry.getInstance();
9090 }
@@ -222,7 +222,7 @@
223223 while((word = dict.next()) != null){
224224 String w = word.getWord();
225225 if(w.contains("_")){ // phrase
226 - addNsPhrase(w,ir);
 226+ addNsPhrase(w,ir,true);
227227 } else{ // word
228228 addNsWord(w,ir);
229229 }
@@ -329,7 +329,7 @@
330330 }
331331
332332 /** Add phrase in namespace other than default */
333 - public void addNsPhrase(String phrase, IndexReader ir) throws IOException {
 333+ public void addNsPhrase(String phrase, IndexReader ir, boolean inTitle) throws IOException {
334334 if(phrase.length() <= 2){
335335 log.warn("Invalid phrase: "+phrase);
336336 return;
@@ -342,6 +342,9 @@
343343 for(Entry<String,SimpleInt> e : freq.entrySet()){
344344 doc.add(new Field("ns_freq_"+e.getKey(), Integer.toString(e.getValue().count), Field.Store.YES, Field.Index.NO));
345345 }
 346+ if(inTitle){
 347+ doc.add(new Field("ns_intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED));
 348+ }
346349 ngramWriter.addDocument(doc);
347350 }
348351
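
Phrases in non-default namespaces now carry an ns_intitle marker, so the in-title bonus that getPhrase() applies works outside the main namespace as well. A sketch of the read side, mirroring getPhrase() under the assumption that addNsPhrase() indexes the phrase term under the ns_ prefix (the Lucene 2.x TermDocs API matches its usage elsewhere in this patch):

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermDocs;

    class NsPhraseReadSketch {
        /** True if the stored namespace phrase was flagged as occurring in a title. */
        static boolean phraseInTitle(IndexReader reader, String phrase) throws IOException {
            TermDocs td = reader.termDocs(new Term("ns_phrase", phrase)); // assumed field name
            if (td.next()) {
                Document d = reader.document(td.doc());
                return "1".equals(d.get("ns_intitle"));
            }
            return false;
        }
    }
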
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java
@@ -47,15 +47,14 @@
4848 CleanIndexWriter writer;
4949 String langCode;
5050 Links links;
51 - NamespaceFilter nsf;
5251
5352 public CleanIndexImporter(IndexId iid, String langCode) throws IOException{
5453 Configuration.open(); // make sure configuration is loaded
5554 this.writer = CleanIndexWriter.newForWrite(iid);
5655 this.langCode = langCode;
57 - this.links = Links.openForRead(iid,iid.getLinks().getImportPath());
58 - nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
59 - log.info("Rebuilding for namespaces: "+nsf);
 56+ this.links = Links.openStandalone(iid);
 57+
 58+ //log.info("Rebuilding for namespaces: "+nsf);
6059 }
6160 public void writeRevision(Revision revision) throws IOException {
6261 this.revision = revision;
@@ -72,8 +71,8 @@
7372 ArrayList<String> redirectsHere = links.getRedirectsTo(key);
7473 references -= redirectsHere.size(); // we want raw rank, without redirects
7574
76 - if(redirectTargetNamespace<0 || !nsf.contains(redirectTargetNamespace))
77 - redirectTo = null; // redirect to other namespace
 75+ if(redirectTargetNamespace<0 || redirectTargetNamespace != page.Title.Namespace)
 76+ redirectTo = null; // redirect to different namespace
7877 }
7978 Date date = new Date(revision.Timestamp.getTimeInMillis());
8079
@@ -88,11 +87,7 @@
8988 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,redirectTo,
9089 references,redirectTargetNamespace,redirects,new ArrayList<RelatedTitle>(),anchors,date);
9190
92 - // only for articles in default namespace(s)
93 - if(nsf.contains(page.Title.Namespace))
94 - writer.addArticle(article);
95 - else
96 - writer.addTitleOnly(article);
 91+ writer.addArticleInfo(article);
9792 }
9893
9994 public void close() throws IOException {
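
Two behavioral shifts in the importer: the links index is opened standalone rather than from the import path, and a redirect is now kept only when it targets the page's own namespace (previously: any namespace in the default search set). Isolated, the new filter amounts to this standalone sketch (names illustrative):

    class RedirectFilterSketch {
        /** Keep a redirect target only if it stays within the page's namespace. */
        static String filterRedirect(String redirectTo, int targetNs, int pageNs) {
            if (targetNs < 0 || targetNs != pageNs)
                return null; // redirect to a different namespace is dropped
            return redirectTo;
        }
    }
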
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java
@@ -60,7 +60,7 @@
6161 }
6262 long start = System.currentTimeMillis();
6363 try {
64 - rebuildFromLinksNew(iid);
 64+ rebuildFromLinks(iid);
6565 } catch (IOException e) {
6666 log.fatal("Rebuild I/O error: "+e.getMessage());
6767 e.printStackTrace();
@@ -71,84 +71,9 @@
7272
7373 System.out.println("Finished generating related in "+formatTime(end-start));
7474 }
75 -
76 - @Deprecated
77 - public static void rebuildFromDump(String inputfile, IndexId iid) throws IOException{
78 - GlobalConfiguration global = GlobalConfiguration.getInstance();
79 - String langCode = global.getLanguage(iid);
80 - log.info("First pass, getting a list of valid articles...");
81 - // first pass - titles
82 - InputStream input = null;
83 - input = Tools.openInputFile(inputfile);
84 - NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
85 - TitleReader tr = new TitleReader(iid,langCode,nsf);
86 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
87 - reader.readDump();
88 - input.close();
89 - CompactLinks links = tr.getTitles();
90 - tr = null; // GC
91 -
92 - log.info("Second pass, geting in/out links...");
93 - // second pass - in/out links
94 - input = Tools.openInputFile(inputfile);
95 - LinkReader rr = new LinkReader(links,iid);
96 - reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
97 - reader.readDump();
98 - links.compactAll();
99 - store(links,iid);
100 - }
10175
102 - /**
103 - * Rebuild related articles index for iid
104 - * @throws IOException
105 - */
106 - @Deprecated
107 - public static void rebuildFromLinks(IndexId iid) throws IOException {
108 - CompactLinks links = new CompactLinks();
109 - Links temp = Links.openForRead(iid,iid.getLinks().getImportPath());
110 -
111 - NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
112 - log.info("Reading titles in default search");
113 - Dictionary dict = temp.getKeys();
114 - Word w;
115 - HashMap<Integer,CompactArticleLinks> keyCache = new HashMap<Integer,CompactArticleLinks>();
116 - while((w = dict.next()) != null){
117 - String key = w.getWord();
118 - int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
119 - if(nsf.contains(ns)){
120 - links.add(key,temp.getNumInLinks(key));
121 - keyCache.put(temp.getDocId(key),links.get(key));
122 - }
123 - }
124 -
125 - log.info("Reading in/out links");
126 - dict = temp.getKeys();
127 - while((w = dict.next()) != null){
128 - String key = w.getWord();
129 - int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
130 - if(nsf.contains(ns)){
131 - CompactArticleLinks l = links.get(key);
132 - // inlinks
133 - l.setInLinks(temp.getInLinks(l,keyCache));
134 - // outlinks
135 - ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>();
136 - for(String k : temp.getOutLinks(key).toCollection()){
137 - CompactArticleLinks cs = links.get(k);
138 - if(cs != null)
139 - out.add(cs);
140 - }
141 - l.setOutLinks(out);
142 - }
143 - }
144 - temp.close();
145 - temp = null; // GC
146 - keyCache = null; // GC
147 -
148 - store(links,iid);
149 - }
150 -
15176 /** Calculate from links index */
152 - public static void rebuildFromLinksNew(IndexId iid) throws IOException {
 77+ public static void rebuildFromLinks(IndexId iid) throws IOException {
15378 Links links = Links.openForRead(iid,iid.getLinks().getImportPath());
15479 RelatedStorage store = new RelatedStorage(iid);
15580
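
With the dump-based rebuildFromDump() and the old CompactLinks-based variant removed, rebuildFromLinksNew() takes over the plain rebuildFromLinks() name, so related-article data is always computed from the links index. Invocation after this revision (a minimal sketch; "wikilucene" is the example dbrole used elsewhere in this patch, and global configuration is assumed to be loaded as in the other entry points):

    import java.io.IOException;
    import org.wikimedia.lsearch.config.IndexId;
    import org.wikimedia.lsearch.related.RelatedBuilder;

    class RebuildRelatedSketch {
        public static void main(String[] args) throws IOException {
            // assumes configuration has been loaded, as CleanIndexImporter does
            IndexId iid = IndexId.get("wikilucene");
            RelatedBuilder.rebuildFromLinks(iid); // was rebuildFromLinksNew() before r32149
        }
    }
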
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java
@@ -132,10 +132,10 @@
133133 }
134134
135135 // inherit javadoc
136 - public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int[] df, int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases) throws RemoteException{
 136+ public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int[] df, int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws RemoteException{
137137 IndexId iid = IndexId.get(dbrole);
138138 try{
139 - return Highlight.highlight(hits,iid,terms,df,maxDoc,words,StopWords.getPredefinedSet(iid),exactCase,null,sortByPhrases);
 139+ return Highlight.highlight(hits,iid,terms,df,maxDoc,words,StopWords.getPredefinedSet(iid),exactCase,null,sortByPhrases,alwaysIncludeFirst);
140140 } catch(IOException e){
141141 throw new RemoteException("IOException on "+dbrole,e);
142142 }
@@ -151,10 +151,10 @@
152152 }
153153 }
154154
155 - public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf) throws RemoteException {
 155+ public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf) throws RemoteException {
156156 IndexId iid = IndexId.get(dbrole);
157157 try{
158 - return new Suggest(iid).suggest(searchterm,tokens,phrases,foundInContext,firstRank,nsf);
 158+ return new Suggest(iid).suggest(searchterm,tokens,info,nsf);
159159 } catch(Exception e){
160160 e.printStackTrace();
161161 throw new RemoteException("Exception on "+dbrole,e);
@@ -171,6 +171,16 @@
172172 }
173173 }
174174
 175+ public SearchResults searchRelated(String dbrole, String searchterm, int offset, int limit) throws RemoteException {
 176+ IndexId iid = IndexId.get(dbrole);
 177+ try{
 178+ return new SearchEngine().searchRelatedLocal(iid,searchterm,offset,limit);
 179+ } catch(IOException e){
 180+ e.printStackTrace();
 181+ throw new RemoteException("Exception on "+dbrole,e);
 182+ }
 183+ }
 184+
175185 protected RMIMessengerImpl(){
176186 networkStatus = null;
177187 indexRegistry = null;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java
@@ -32,6 +32,7 @@
3333 import org.wikimedia.lsearch.search.SuffixFilterWrapper;
3434 import org.wikimedia.lsearch.search.SuffixNamespaceWrapper;
3535 import org.wikimedia.lsearch.search.Wildcards;
 36+import org.wikimedia.lsearch.spell.Suggest;
3637 import org.wikimedia.lsearch.spell.SuggestQuery;
3738 import org.wikimedia.lsearch.spell.SuggestResult;
3839
@@ -247,13 +248,13 @@
248249 }
249250 }
250251
251 - public Highlight.ResultSet highlight(String host, ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases){
 252+ public Highlight.ResultSet highlight(String host, ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst){
252253 try{
253254 RMIMessenger r = messengerFromCache(host);
254 - return r.highlight(hits,dbrole,terms,df,maxDoc,words,exactCase,sortByPhrases);
 255+ return r.highlight(hits,dbrole,terms,df,maxDoc,words,exactCase,sortByPhrases,alwaysIncludeFirst);
255256 } catch(Exception e){
256257 e.printStackTrace();
257 - return new Highlight.ResultSet(new HashMap<String,HighlightResult>(),new HashSet<String>(),new HashSet<String>(),false,0);
 258+ return new Highlight.ResultSet(new HashMap<String,HighlightResult>(),new HashSet<String>(),new HashSet<String>(),false,0,new HashSet<String>());
258259 }
259260 }
260261
@@ -279,10 +280,10 @@
280281 }
281282 }
282283
283 - public SuggestQuery suggest(String host, String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf){
 284+ public SuggestQuery suggest(String host, String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf){
284285 try{
285286 RMIMessenger r = messengerFromCache(host);
286 - return r.suggest(dbrole,searchterm,tokens,phrases,foundInContext,firstRank,nsf);
 287+ return r.suggest(dbrole,searchterm,tokens,info,nsf);
287288 } catch(Exception e){
288289 if(host == null){
289290 log.warn("Cannot find spell-check host for "+dbrole);
@@ -304,9 +305,29 @@
305306 return r.getFuzzy(dbrole,word,nsf);
306307 } catch(Exception e){
307308 e.printStackTrace();
308 - log.warn("Error invoking getFuzzyt() on "+host+" : "+e.getMessage());
 309+ log.warn("Error invoking getFuzzy() on "+host+" : "+e.getMessage());
309310 return new ArrayList<SuggestResult>();
310311 }
311312 }
 313+
 314+ /** dbrole points to the original dbrole, not .related; e.g. wikilucene, not wikilucene.related */
 315+ public SearchResults searchRelated(String host, String dbrole, String searchterm, int offset, int limit){
 316+ try{
 317+ RMIMessenger r = messengerFromCache(host);
 318+ return r.searchRelated(dbrole,searchterm,offset,limit);
 319+ } catch(Exception e){
 320+ e.printStackTrace();
 321+ log.warn("Error invoking searchRelated() on "+host+" : "+e.getMessage());
 322+ if(host!=null && !isLocal(host)){
 323+ if(cache == null)
 324+ cache = SearcherCache.getInstance();
 325+ cache.invalidateSearchable(IndexId.get(dbrole),host);
 326+ }
 327+ SearchResults res = new SearchResults();
 328+ res.setErrorMsg("Error searching related index: "+e.getMessage());
 329+ return res;
 330+ }
 331+
 332+ }
312333
313334 }
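
This is the client half of distributed related search: on failure the cached remote searchable for the host is invalidated, so the load balancer stops routing there, and an error-bearing SearchResults comes back instead of an exception reaching the frontend. A hedged usage sketch (the no-argument constructor and host selection are assumptions; dbrole stays the base index name, per the javadoc above):

    import org.wikimedia.lsearch.beans.SearchResults;
    import org.wikimedia.lsearch.interoperability.RMIMessengerClient;

    class RelatedSearchCallSketch {
        static SearchResults firstPage(String host) {
            RMIMessengerClient messenger = new RMIMessengerClient(); // assumed default ctor
            // dbrole is "wikilucene", not "wikilucene.related" (see method comment)
            return messenger.searchRelated(host, "wikilucene", "apple pie", 0, 20);
        }
    }
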
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java
@@ -22,6 +22,7 @@
2323 import org.wikimedia.lsearch.search.NamespaceFilterWrapper;
2424 import org.wikimedia.lsearch.search.SuffixFilterWrapper;
2525 import org.wikimedia.lsearch.search.SuffixNamespaceWrapper;
 26+import org.wikimedia.lsearch.spell.Suggest;
2627 import org.wikimedia.lsearch.spell.SuggestQuery;
2728 import org.wikimedia.lsearch.spell.SuggestResult;
2829
@@ -133,9 +134,9 @@
134135 * @param maxDoc - max number of documents in the index (needed for idf calculation)
135136 * @param words - main phrase words, gives extra score
136137 * @param exactCase - if this is an exact case query
137 - * @return map: key -> highlighting result
 138+ * @return resultset
138139 */
139 - public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases) throws RemoteException;
 140+ public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws RemoteException;
140141
141142 /**
142143 * Search grouped titles, similar logic to that of searchPart()
@@ -161,7 +162,7 @@
162163 * @return
163164 * @throws RemoteException
164165 */
165 - public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf) throws RemoteException;
 166+ public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf) throws RemoteException;
166167
167168 /**
168169 * Fetch words for fuzzy queries (e.g. query~)
@@ -172,5 +173,17 @@
173174 * @return
174175 * @throws RemoteException
175176 */
176 - public ArrayList<SuggestResult> getFuzzy(String dbrole, String word, NamespaceFilter nsf) throws RemoteException;
 177+ public ArrayList<SuggestResult> getFuzzy(String dbrole, String word, NamespaceFilter nsf) throws RemoteException;
 178+
 179+ /**
 180+ * Search a remote related index
 181+ *
 182+ * @param dbrole
 183+ * @param searchterm
 184+ * @param limit
 185+ * @param offset
 186+ * @return
 187+ * @throws RemoteException
 188+ */
 189+ public SearchResults searchRelated(String dbrole, String searchterm, int offset, int limit) throws RemoteException;
177190 }
Index: branches/lucene-search-2.1/webinterface/lsweb.py
@@ -401,10 +401,10 @@
402402 self.wfile.write('</body></html>')
403403 except HTTPError:
404404 self.send_error(400,'Bad request')
405 - self.wfile.write("Error in query")
 405+ self.wfile.write("<div>Error in query</div>")
406406 except URLError:
407407 self.send_error(500,'Internal Server Error')
408 - self.wfile.write("Cannot connect to lucene search 2 daemon")
 408+ self.wfile.write("<div>Cannot connect to lucene search 2 daemon</div>")
409409 delta_time = time.time() - start_time
410410 print '[%s] Processed query %s in %d ms' %(time.strftime("%Y-%m-%d %H:%M:%S"),self.path,int(delta_time*1000))
411411 elif s[2] == '/':
