r25462 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r25461 | r25462 | r25463 >
Date: 22:06, 3 September 2007
Author: rainman
Status: old
Tags:
Comment:
* Rewrite link analysis for a lower memory profile; this slows down complete
rebuilds, but not too much for larger wikis (maybe 20%)
* Drop MySQL storage; store link analysis in a specialized Lucene index instead
* Extract anchor text (related-articles extraction seems to be too slow)
* Clean up the spellcheck code, use only one index
* Make suggest index be rebuilt for the default namespaces to be searched
(wgNamespacesToBeSearchedDefault), or something like that
Modified paths:
  • /branches/lucene-search-2.1/.project (modified) (history)
  • /branches/lucene-search-2.1/lsearch-global.conf (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Related.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringList.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NamespaceFreq.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/WordsIndexer.java (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/ArticleAnalytics.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/MySQLStorage.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/Storage.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/suggest (deleted) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java (modified) (history)
  • /branches/lucene-search-2.1/test-data/mwsearch-global.test (modified) (history)

Diff

Index: branches/lucene-search-2.1/.project
@@ -1,6 +1,6 @@
22 <?xml version="1.0" encoding="UTF-8"?>
33 <projectDescription>
4 - <name>search-2.0</name>
 4+ <name>search-2</name>
55 <comment>JavaCC Nature</comment>
66 <projects>
77 </projects>
Index: branches/lucene-search-2.1/test-data/mwsearch-global.test
@@ -9,7 +9,7 @@
1010 # aspell <language>
1111 [Database]
1212 entest : (mainsplit), (mainpart,false,2,10), (restpart,true,2)
13 -entest : (ngram), (spell_words,3,20), (spell_titles,1,2,20)
 13+entest : (ngram), (spell,1,2)
1414 detest,rutest : (single,true,2,10)
1515 frtest : (split,3) (part1) (part2) (part3)
1616 srwiki : (single)
@@ -28,7 +28,7 @@
2929 192.168.0.10 :entest.mainpart
3030 192.168.0.2 : entest.restpart, rutest
3131 [Search-Group]
32 -192.168.0.1 : njawiki entest.spell_words entest.spell_titles
 32+192.168.0.1 : njawiki entest.spell
3333
3434 # Index nodes
3535 # host: db1.role, db2.role
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/ArticleAnalytics.java
@@ -0,0 +1,99 @@
 2+package org.wikimedia.lsearch.storage;
 3+
 4+import java.util.Collection;
 5+
 6+import org.wikimedia.lsearch.ranks.Related;
 7+
 8+/**
 9+ * Various link analysis info about the article
 10+ *
 11+ * @author rainman
 12+ *
 13+ */
 14+public class ArticleAnalytics {
 15+ String key;
 16+ int references;
 17+ String redirectTarget;
 18+ Collection<String> anchorText;
 19+ Collection<Related> related;
 20+ Collection<String> redirectKeys;
 21+
 22+ /**
 23+ * @param key - article key (ns:title)
 24+ * @param references - number of links to article
 25+ * @param redirectTarget - if article is redirect - target article key, otherwise null
 26+ * @param anchorText - anchor texts
 27+ * @param related - related articles (ns:title)
 28+ * @param redirectKeys - articles that redirect here (ns:title)
 29+ *
 30+ */
 31+ public ArticleAnalytics(String key, int references, String redirectTarget, Collection<String> anchorText, Collection<Related> related, Collection<String> redirectKeys) {
 32+ this.key = key;
 33+ this.references = references;
 34+ this.redirectTarget = redirectTarget;
 35+ this.anchorText = anchorText;
 36+ this.related = related;
 37+ this.redirectKeys = redirectKeys;
 38+ }
 39+
 40+ @Override
 41+ public String toString() {
 42+ return key+" : ref="+references+", redirect_to="+redirectTarget+", anchor="+anchorText+", redirects="+redirectKeys+", related="+related;
 43+ }
 44+
 45+ public boolean isRedirect(){
 46+ return redirectTarget != null;
 47+ }
 48+
 49+
 50+ public Collection<String> getAnchorText() {
 51+ return anchorText;
 52+ }
 53+
 54+ public void setAnchorText(Collection<String> anchorText) {
 55+ this.anchorText = anchorText;
 56+ }
 57+
 58+ public String getKey() {
 59+ return key;
 60+ }
 61+
 62+ public void setKey(String key) {
 63+ this.key = key;
 64+ }
 65+
 66+ public Collection<String> getRedirectKeys() {
 67+ return redirectKeys;
 68+ }
 69+
 70+ public void setRedirectKeys(Collection<String> redirectKeys) {
 71+ this.redirectKeys = redirectKeys;
 72+ }
 73+
 74+ public String getRedirectTarget() {
 75+ return redirectTarget;
 76+ }
 77+
 78+ public void setRedirectTarget(String redirectTarget) {
 79+ this.redirectTarget = redirectTarget;
 80+ }
 81+
 82+ public int getReferences() {
 83+ return references;
 84+ }
 85+
 86+ public void setReferences(int references) {
 87+ this.references = references;
 88+ }
 89+
 90+ public Collection<Related> getRelated() {
 91+ return related;
 92+ }
 93+
 94+ public void setRelated(Collection<Related> related) {
 95+ this.related = related;
 96+ }
 97+
 98+
 99+
 100+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/Storage.java
@@ -11,7 +11,7 @@
1212 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
1313 import org.wikimedia.lsearch.ranks.Related;
1414 import org.wikimedia.lsearch.ranks.RelatedTitle;
15 -
 15+@Deprecated
1616 abstract public class Storage {
1717 static protected Storage instance = null;
1818
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java
@@ -0,0 +1,106 @@
 2+package org.wikimedia.lsearch.storage;
 3+
 4+import java.io.IOException;
 5+import java.util.Collection;
 6+
 7+import org.apache.log4j.Logger;
 8+import org.apache.lucene.document.Document;
 9+import org.apache.lucene.document.Field;
 10+import org.apache.lucene.index.IndexReader;
 11+import org.apache.lucene.index.IndexWriter;
 12+import org.apache.lucene.index.Term;
 13+import org.apache.lucene.index.TermDocs;
 14+import org.wikimedia.lsearch.analyzers.SplitAnalyzer;
 15+import org.wikimedia.lsearch.beans.LocalIndex;
 16+import org.wikimedia.lsearch.config.IndexId;
 17+import org.wikimedia.lsearch.config.IndexRegistry;
 18+import org.wikimedia.lsearch.index.IndexThread;
 19+import org.wikimedia.lsearch.ranks.Related;
 20+import org.wikimedia.lsearch.ranks.StringList;
 21+
 22+/**
 23+ * Store/retrieve link analysis results
 24+ *
 25+ * @author rainman
 26+ *
 27+ */
 28+public class LinkAnalysisStorage {
 29+ static Logger log = Logger.getLogger(LinkAnalysisStorage.class);
 30+ protected IndexId iid;
 31+ protected IndexWriter writer = null;
 32+ protected IndexReader reader = null;
 33+ protected IndexRegistry registry = IndexRegistry.getInstance();
 34+
 35+ public LinkAnalysisStorage(IndexId iid){
 36+ this.iid = iid.getLinkAnalysis();
 37+ }
 38+
 39+ protected void ensureWrite() throws IOException{
 40+ if(writer == null){
 41+ writer = new IndexWriter(iid.getImportPath(), new SplitAnalyzer(), true);
 42+ }
 43+ }
 44+
 45+ protected void ensureRead() throws IOException{
 46+ if(reader == null){
 47+ LocalIndex li = registry.getLatestSnapshot(iid);
 48+ if(li == null)
 49+ throw new IOException("There are no snapshots for "+iid);
 50+
 51+ reader = IndexReader.open(li.getPath());
 52+ }
 53+ }
 54+ /**
 55+ * Add rank analysis stuff for a single article
 56+ * @throws IOException
 57+ */
 58+ public void addAnalitics(ArticleAnalytics aa) throws IOException{
 59+ ensureWrite();
 60+ //log.info("Writing analitics "+aa);
 61+ Document doc = new Document();
 62+ doc.add(new Field("key",aa.key,Field.Store.YES,Field.Index.UN_TOKENIZED));
 63+ doc.add(new Field("references",Integer.toString(aa.references),Field.Store.YES,Field.Index.NO));
 64+ doc.add(new Field("anchor",new StringList(aa.anchorText).toString(),Field.Store.YES,Field.Index.NO));
 65+ doc.add(new Field("related",new StringList(Related.convertToStringList(aa.related)).toString(),Field.Store.YES,Field.Index.NO));
 66+ doc.add(new Field("redirect",new StringList(aa.redirectKeys).toString(),Field.Store.YES,Field.Index.NO));
 67+ if(aa.redirectTarget != null)
 68+ doc.add(new Field("redirect_to",aa.redirectTarget,Field.Store.YES,Field.Index.NO));
 69+ writer.addDocument(doc);
 70+ }
 71+
 72+ public void snapshot() throws IOException{
 73+ if(writer != null){
 74+ writer.optimize();
 75+ writer.close();
 76+ writer = null;
 77+ IndexThread.makeIndexSnapshot(iid,iid.getImportPath());
 78+ registry.refreshSnapshots(iid);
 79+ }
 80+ }
 81+
 82+ /**
 83+ * Read analitics from latest link analysis index snapshot
 84+ * @param key ns:title
 85+ * @return
 86+ * @throws IOException
 87+ */
 88+ public ArticleAnalytics getAnalitics(String key) throws IOException{
 89+ ensureRead();
 90+
 91+ TermDocs td = reader.termDocs(new Term("key",key));
 92+ if(td.next()){
 93+ Document d = reader.document(td.doc());
 94+ int ref = Integer.parseInt(d.get("references"));
 95+ StringList anchor = new StringList(d.get("anchor"));
 96+ StringList related = new StringList(d.get("related"));
 97+ StringList redirect = new StringList(d.get("redirect"));
 98+ String redirectTarget = d.get("redirect_to");
 99+ return new ArticleAnalytics(key,ref,redirectTarget,
 100+ anchor.toCollection(),
 101+ Related.convertToRelatedList(related.toCollection()),
 102+ redirect.toCollection());
 103+ }
 104+
 105+ return null;
 106+ }
 107+}
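
For illustration, a minimal usage sketch of the new storage class (not part of this commit; the wiki name and values are hypothetical): write one article's analytics, publish a snapshot, then read it back. Note that getAnalitics() reads from the latest registered snapshot, so snapshot() must have run before reading.

    import java.util.ArrayList;
    import org.wikimedia.lsearch.config.IndexId;
    import org.wikimedia.lsearch.ranks.Related;
    import org.wikimedia.lsearch.storage.ArticleAnalytics;
    import org.wikimedia.lsearch.storage.LinkAnalysisStorage;

    public class LinkAnalysisSketch {
        public static void main(String[] args) throws Exception {
            // the constructor resolves entest -> entest.link_analysis internally
            LinkAnalysisStorage store = new LinkAnalysisStorage(IndexId.get("entest"));
            ArrayList<String> anchors = new ArrayList<String>();
            anchors.add("Douglas Adams");
            store.addAnalitics(new ArticleAnalytics(
                "0:Douglas Adams",          // key (ns:title)
                42,                         // number of incoming links
                null,                       // not a redirect
                anchors,                    // anchor texts
                new ArrayList<Related>(),   // related articles (left empty here)
                new ArrayList<String>()));  // keys of redirects pointing here
            store.snapshot();               // optimize, close, publish a snapshot
            ArticleAnalytics aa = store.getAnalitics("0:Douglas Adams");
            System.out.println(aa);
        }
    }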
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/MySQLStorage.java
@@ -31,6 +31,7 @@
3232 * @author rainman
3333 *
3434 */
 35+@Deprecated
3536 public class MySQLStorage extends Storage {
3637 static Logger log = Logger.getLogger(MySQLStorage.class);
3738 protected Configuration config;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -66,6 +66,8 @@
6767 protected Hashtable<String,String> wgLanguageCode = null;
6868 /** wgServer, suffix -> server (default server is "default")*/
6969 protected Hashtable<String,String> wgServer = null;
 70+ /** wgNamespacesToBeSearchedDefault from InitialiseSettings, suffix -> lang code */
 71+ protected Hashtable<String,NamespaceFilter> wgDefaultSearch = null;
7072
7173 /** info about this host */
7274 protected static InetAddress myHost;
@@ -183,6 +185,9 @@
184186 database.get(dbname).put(dbpart,new Hashtable<String,String>());
185187 }
186188 }
 189+ // add the link analysis to indexers
 190+ if(!types.contains("link_analysis"))
 191+ database.get(dbname).put("link_analysis",new Hashtable<String,String>());
187192 // add spellcheck indexes
188193 /* if(!types.contains("spell_words"))
189194 database.get(dbname).put("spell_words",new Hashtable<String,String>());
@@ -209,8 +214,7 @@
210215 }
211216 }
212217 // spell check indexes are searched by default if they exist
213 - addToList(hostsearch,dbname+".spell_words");
214 - addToList(hostsearch,dbname+".spell_titles");
 218+ addToList(hostsearch,dbname+".spell");
215219 }
216220 }
217221
@@ -231,7 +235,7 @@
232236 } else if(typeid.matches("nspart[1-9][0-9]*")){
233237 type = "nssplit";
234238 dbrole = dbname + "." + typeid;
235 - } else if(typeid.equals("spell_words") || typeid.equals("spell_titles")){
 239+ } else if(typeid.equals("spell") || typeid.equals("link_analysis")){
236240 type = typeid;
237241 dbrole = dbname + "." + typeid;
238242 } else
@@ -250,7 +254,7 @@
251255 }
252256 }
253257 boolean searched = (getSearchHosts(dbrole).size() != 0);
254 - if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit"))){
 258+ if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit") || typeid.equals("link_analysis"))){
255259 if(verbose)
256260 System.out.println("WARNING: in Global Configuration: index "+dbrole+" is not searched by any host.");
257261 }
@@ -455,6 +459,7 @@
456460 String text = parser.readURL(new URL(initset));
457461 wgLanguageCode = parser.getLanguages(text);
458462 wgServer = parser.getServer(text);
 463+ wgDefaultSearch = parser.getDefaultSearch(text);
459464 } catch (IOException e) {
460465 System.out.println("Error: Cannot read InitialiseSettings.php from url "+initset+" : "+e.getMessage());
461466 }
@@ -516,7 +521,7 @@
517522 } else if(typeid.matches("nspart[1-9][0-9]*")){
518523 type = "nssplit";
519524 dbrole = dbname + "." + typeid;
520 - } else if(typeid.equals("spell_words") || typeid.equals("spell_titles")){
 525+ } else if(typeid.equals("spell") || typeid.equals("link_analysis")){
521526 type = typeid;
522527 dbrole = dbname + "." + typeid;
523528 } else
@@ -802,27 +807,14 @@
803808
804809 dbroles.put(type,params);
805810
806 - } else if(type.equals("spell_words")){
 811+ } else if(type.equals("spell")){
807812 // all params are optional, if absent default will be used
808813 if(tokens.length>1)
809 - params.put("minFreq",tokens[1]);
810 - if(tokens.length>2)
811 - params.put("minHits",tokens[2]);
812 -
813 - if(tokens.length>3 && verbose)
814 - System.out.println("Unrecognized suggest parameters in ("+role+")");
815 -
816 - dbroles.put(type,params);
817 - } else if(type.equals("spell_titles")){
818 - // all params are optional, if absent default will be used
819 - if(tokens.length>1)
820814 params.put("wordsMinFreq",tokens[1]);
821815 if(tokens.length>2)
822816 params.put("phrasesMinFreq",tokens[2]);
823 - if(tokens.length>3)
824 - params.put("minHits",tokens[3]);
825817
826 - if(tokens.length>4 && verbose)
 818+ if(tokens.length>3 && verbose)
827819 System.out.println("Unrecognized suggest parameters in ("+role+")");
828820
829821 dbroles.put(type,params);
@@ -1102,6 +1094,19 @@
11031095 GlobalConfiguration.verbose = verbose;
11041096 }
11051097
 1098+ public NamespaceFilter getDefaultNamespace(IndexId iid){
 1099+ return getDefaultNamespace(iid.getDBname());
 1100+ }
 1101+ public NamespaceFilter getDefaultNamespace(String dbname){
 1102+ if(wgDefaultSearch != null){
 1103+ if(wgDefaultSearch.containsKey(dbname))
 1104+ return wgDefaultSearch.get(dbname);
 1105+ else if(wgDefaultSearch.containsKey("default"))
 1106+ return wgDefaultSearch.get("default");
 1107+ }
 1108+ return new NamespaceFilter(0);
 1109+ }
11061110
 1111+
11071112
11081113 }
\ No newline at end of file
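
The lookup order of the new getDefaultNamespace() is: the exact dbname entry in wgNamespacesToBeSearchedDefault, then the "default" entry, then a filter containing only the main namespace (0). A short sketch (not part of this commit; the dbname is hypothetical):

    import org.wikimedia.lsearch.config.GlobalConfiguration;
    import org.wikimedia.lsearch.search.NamespaceFilter;

    public class DefaultNamespaceSketch {
        public static void main(String[] args) {
            GlobalConfiguration global = GlobalConfiguration.getInstance();
            // falls back to the "default" entry, then to new NamespaceFilter(0)
            NamespaceFilter def = global.getDefaultNamespace("entest");
            System.out.println(def.contains(0)); // true unless overridden
        }
    }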
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -58,7 +58,7 @@
5959 /** If true, this machine is an indexer for this index */
6060 protected boolean myIndex;
6161
62 - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL_WORDS, SPELL_TITLES };
 62+ protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS };
6363
6464 /** Type of index, enumeration */
6565 protected IndexType type;
@@ -156,10 +156,10 @@
157157 this.type = IndexType.SPLIT;
158158 else if(type.equals("nssplit"))
159159 this.type = IndexType.NSSPLIT;
160 - else if(type.equals("spell_words"))
161 - this.type = IndexType.SPELL_WORDS;
162 - else if(type.equals("spell_titles"))
163 - this.type = IndexType.SPELL_TITLES;
 160+ else if(type.equals("spell"))
 161+ this.type = IndexType.SPELL;
 162+ else if(type.equals("link_analysis"))
 163+ this.type = IndexType.LINK_ANALYSIS;
164164
165165 // parts
166166 String[] parts = dbrole.split("\\.");
@@ -251,18 +251,14 @@
252252 public boolean isNssplit(){
253253 return type == IndexType.NSSPLIT;
254254 }
255 - /** If this is the spell-check index for words */
256 - public boolean isSpellWords(){
257 - return type == IndexType.SPELL_WORDS;
 255+ /** If this is the spell-check index */
 256+ public boolean isSpell(){
 257+ return type == IndexType.SPELL;
258258 }
259 - /** It this is the spell-check index for phrases and words from titles */
260 - public boolean isSpellTitles(){
261 - return type == IndexType.SPELL_TITLES;
 259+ /** If this is the link-analysis index */
 260+ public boolean isLinkAnalysis(){
 261+ return type == IndexType.LINK_ANALYSIS;
262262 }
263 - /** If this is one of the spell-check indexes */
264 - public boolean isSpellCheck(){
265 - return isSpellWords() || isSpellTitles();
266 - }
267263
268264 /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */
269265 public int getPartNum() {
@@ -374,7 +370,6 @@
375371 return tempPath;
376372 }
377373
378 -
379374 /** Get search path with resolved symlinks */
380375 public String getCanonicalSearchPath(){
381376 try {
@@ -411,7 +406,7 @@
412407
413408 /** get all hosts that search db this iid belongs to */
414409 public HashSet<String> getDBSearchHosts(){
415 - if(isSingle() || isSpellWords() || isSpellTitles())
 410+ if(isSingle() || isSpell() || isLinkAnalysis())
416411 return searchHosts;
417412 else{
418413 // add all hosts that search: dbname and all parts
@@ -462,7 +457,7 @@
463458 */
464459 public HashSet<String> getPhysicalIndexes() {
465460 HashSet<String> ret = new HashSet<String>();
466 - if(isSingle() || isSpellWords() || isSpellTitles())
 461+ if(isSingle() || isSpell() || isLinkAnalysis())
467462 ret.add(dbrole);
468463 else if(isMainsplit() || isSplit() || isNssplit()){
469464 for(String p : splitParts)
@@ -534,13 +529,13 @@
535530 }
536531
537532 /** Get the coresponding spell words iid */
538 - public IndexId getSpellWords() {
539 - return get(dbname+".spell_words");
 533+ public IndexId getSpell() {
 534+ return get(dbname+".spell");
540535 }
541536
542 - /** Get the coresponding spell titles iid */
543 - public IndexId getSpellTitles() {
544 - return get(dbname+".spell_titles");
 537+ /** Get the link analysis iid */
 538+ public IndexId getLinkAnalysis() {
 539+ return get(dbname+".link_analysis");
545540 }
546541
547542
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java
@@ -91,6 +91,16 @@
9292 return included.get(namespace);
9393 }
9494
 95+ /** Set bit for namespace to true */
 96+ public void set(int namespace){
 97+ included.set(namespace);
 98+ }
 99+
 100+ /** Set bit for namespace to false */
 101+ public void unset(int namespace){
 102+ included.set(namespace,false);
 103+ }
 104+
95105 public boolean contains(int namespace){
96106 return included.get(namespace);
97107 }
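
The new set()/unset() mutators are what getDefaultSearch() in PHPParser (further below) uses to build per-wiki filters. A tiny sketch (not part of this commit; the namespace numbers are hypothetical):

    import org.wikimedia.lsearch.search.NamespaceFilter;

    public class NamespaceFilterSketch {
        public static void main(String[] args) {
            NamespaceFilter nsf = new NamespaceFilter(); // starts with no namespaces
            nsf.set(0);      // include the main namespace
            nsf.set(100);    // include a project namespace
            nsf.unset(100);  // and take it out again
            System.out.println(nsf.contains(0) + " " + nsf.contains(100)); // true false
        }
    }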
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -40,11 +40,9 @@
4141 global = GlobalConfiguration.getInstance();
4242
4343 Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup");
44 - if(iid.isSpellCheck()){
 44+ if(iid.isSpell()); // no warmup for spell-checkers
 45+ else if(warmup == null){
4546 makeNamespaceFilters(is,iid);
46 - log.info("Warmed up spell-check index "+iid);
47 - } else if(warmup == null){
48 - makeNamespaceFilters(is,iid);
4947 simpleWarmup(is,iid);
5048 log.info("Warmed up "+iid);
5149 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -40,7 +40,7 @@
4141 import org.wikimedia.lsearch.config.GlobalConfiguration;
4242 import org.wikimedia.lsearch.config.IndexId;
4343 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
44 -import org.wikimedia.lsearch.spell.api.TitleIndexer;
 44+import org.wikimedia.lsearch.spell.api.SpellCheckIndexer;
4545 import org.wikimedia.lsearch.util.Localization;
4646
4747 /**
@@ -370,15 +370,6 @@
371371 boolean succ = succAdd; // it's OK if articles cannot be deleted
372372 trans.commit();
373373
374 - // if there is a titles spell-check index, update it
375 - if(iid.getSpellTitles() != null){
376 - TitleIndexer spell = new TitleIndexer(iid);
377 - trans = new Transaction(iid.getSpellTitles());
378 - trans.begin();
379 - spell.update(updateRecords);
380 - trans.commit();
381 - }
382 -
383374 // send reports back to the main indexer host
384375 RMIMessengerClient messenger = new RMIMessengerClient();
385376 if(modifier.reportQueue.size() != 0)
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java
@@ -2,39 +2,36 @@
33
44 import java.io.IOException;
55 import java.io.Reader;
 6+import java.util.Iterator;
67
78 import org.apache.lucene.analysis.Analyzer;
89 import org.apache.lucene.analysis.Token;
910 import org.apache.lucene.analysis.TokenStream;
1011 import org.apache.lucene.analysis.Tokenizer;
 12+import org.wikimedia.lsearch.ranks.StringList;
1113
1214 /** Split the text by some specific char */
1315 public class SplitAnalyzer extends Analyzer {
1416 class SplitTokenStream extends Tokenizer {
15 - String[] tokens;
 17+ Iterator<String> it = null;
1618 int in = 0;
1719 int start = 0;
18 - SplitTokenStream(String inputStr){
19 - tokens = inputStr.split(""+splitChar);
 20+ SplitTokenStream(String input){
 21+ it = new StringList(input).iterator();
2022 }
2123 @Override
2224 public Token next() throws IOException {
23 - if(in >= tokens.length)
 25+ if(!it.hasNext())
2426 return null;
2527 else{
 28+ String str = it.next();
2629 int s = start;
27 - int e = start + tokens[in].length();
28 - start = e + 1;
29 - return new Token(tokens[in++],s,e);
30 - }
 30+ int e = start + str.length();
 31+ return new Token(str,s,e);
 32+ }
3133 }
3234 }
33 - char splitChar;
3435
35 - public SplitAnalyzer(char splitChar){
36 - this.splitChar = splitChar;
37 - }
38 -
3936 @Override
4037 public TokenStream tokenStream(String fieldName, String text) {
4138 return new SplitTokenStream(text);
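
Since SplitAnalyzer no longer takes a split character, its input is now expected in StringList serialization ('\0'-delimited). A sketch of the new contract (not part of this commit; the field name and values are hypothetical). Note that the rewritten SplitTokenStream never advances start, so every token reports a start offset of 0; this looks harmless here since offsets are not used for these fields.

    import java.util.Arrays;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.wikimedia.lsearch.analyzers.SplitAnalyzer;
    import org.wikimedia.lsearch.ranks.StringList;

    public class SplitAnalyzerSketch {
        public static void main(String[] args) throws Exception {
            // serialize two link entries, then tokenize them back out
            String in = new StringList(Arrays.asList("0:Foo|", "0:Foo|some anchor")).toString();
            TokenStream ts = new SplitAnalyzer().tokenStream("links", in);
            Token t;
            while ((t = ts.next()) != null)
                System.out.println(t.termText()); // one token per list element
        }
    }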
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -69,7 +69,7 @@
7070 }
7171 public void writeEndPage() throws IOException {
7272 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),
73 - references,redirects,new ArrayList<RelatedTitle>()); // references and related titles are set correctly later (in incremental updater)
 73+ references,redirects,new ArrayList<RelatedTitle>(), new ArrayList<String>()); // references and related titles are set correctly later (in incremental updater)
7474 log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects);
7575 records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE));
7676 log.debug(iid+": Update for "+article);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -31,6 +31,8 @@
3232 import org.wikimedia.lsearch.ranks.OldLinks;
3333 import org.wikimedia.lsearch.ranks.Related;
3434 import org.wikimedia.lsearch.ranks.RelatedTitle;
 35+import org.wikimedia.lsearch.storage.ArticleAnalytics;
 36+import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
3537 import org.wikimedia.lsearch.storage.Storage;
3638 import org.wikimedia.lsearch.util.Localization;
3739 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -284,7 +286,6 @@
285287 }
286288
287289 protected static void fetchReferencesAndRelated(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException {
288 - Storage store = Storage.getInstance();
289290 ArrayList<Title> titles = new ArrayList<Title>();
290291 for(IndexUpdateRecord rec : records){
291292 if(rec.isDelete())
@@ -298,23 +299,31 @@
299300 }
300301 }
301302 // fetch
302 - OldLinks links = new OldLinks(store.getPageReferences(titles,dbname));
303 - HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname);
 303+ LinkAnalysisStorage store = new LinkAnalysisStorage(IndexId.get(dbname));
 304+ //OldLinks links = new OldLinks(store.getPageReferences(titles,dbname));
 305+ //HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname);
304306 // update
305307 for(IndexUpdateRecord rec : records){
306308 if(rec.isDelete())
307309 continue;
308310 Article ar = rec.getArticle();
309311 Title t = ar.makeTitle();
 312+ ArticleAnalytics aa = store.getAnalitics(t.getKey());
 313+ ArrayList<String> anchors = new ArrayList<String>();
 314+ anchors.addAll(aa.getAnchorText());
310315 // set references
311 - ar.setReferences(links.getLinks(t.getKey()));
 316+ ar.setReferences(aa.getReferences());
312317 if(ar.getRedirects() != null){
313318 for(Redirect r : ar.getRedirects()){
314 - r.setReferences(links.getLinks(r.makeTitle().getKey()));
 319+ ArticleAnalytics raa = store.getAnalitics(r.makeTitle().getKey());
 320+ r.setReferences(raa.getReferences());
 321+ anchors.addAll(raa.getAnchorText());
315322 }
316323 }
 324+ // set anchors
 325+ ar.setAnchorText(anchors);
317326 // set related
318 - ArrayList<RelatedTitle> rt = rel.get(t.getKey());
 327+ /*ArrayList<RelatedTitle> rt = rel.get(t.getKey());
319328 if(rt != null){
320329 Collections.sort(rt,new Comparator<RelatedTitle>() {
321330 public int compare(RelatedTitle o1, RelatedTitle o2){
@@ -325,7 +334,7 @@
326335 }
327336 });
328337 ar.setRelated(rt);
329 - }
 338+ }*/
330339 }
331340 }
332341 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java
@@ -12,6 +12,8 @@
1313 import java.util.regex.Matcher;
1414 import java.util.regex.Pattern;
1515
 16+import org.wikimedia.lsearch.search.NamespaceFilter;
 17+
1618 /**
1719 * Extract some variable from MediaWiki MessageXX.php files. In particular,
1820 * the localized namespace names (needed for proper parsing of wiki code).
@@ -160,6 +162,58 @@
161163 return servers;
162164 }
163165
 166+ /** Get wgNamespacesToBeSearchedDefault from InitialiseSettings */
 167+ public Hashtable<String,NamespaceFilter> getDefaultSearch(String text){
 168+ text = text.replaceAll("(#.*)",""); // strip comments
 169+ Hashtable<String,NamespaceFilter> ret = new Hashtable<String,NamespaceFilter>();
 170+
 171+ int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
 172+ //Pattern wgns = Pattern.compile("[\"']wgNamespacesToBeSearchedDefault[\"']\\s*=>\\s*array\\s*\\(((.*?\\(.*?\\).*?)+)\\)",flags);
 173+ Pattern db = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags);
 174+ Pattern entry = Pattern.compile("(-?[0-9]+)\\s*=>\\s*([01])",flags);
 175+ String t = fetchArray(text,"'wgNamespacesToBeSearchedDefault'");
 176+ Matcher md = db.matcher(t);
 177+ while(md.find()){
 178+ String dbname = md.group(1);
 179+ NamespaceFilter nsf = new NamespaceFilter();
 180+ Matcher me = entry.matcher(md.group(2));
 181+ while(me.find()){
 182+ if(!me.group(2).equals("0"))
 183+ nsf.set(Integer.parseInt(me.group(1)));
 184+ }
 185+ ret.put(dbname,nsf);
 186+ }
 187+ return ret;
 188+ }
 189+
 190+ /** Fetch an array by balancing out parentheses */
 191+ public String fetchArray(String text, String var){
 192+ int start = text.indexOf(var);
 193+ if(start == -1)
 194+ return null;
 195+ char[] t = text.toCharArray();
 196+ int level = 0; boolean ret = false;
 197+ boolean comment = false;
 198+ for(int i=start+var.length();i<t.length;i++){
 199+ if(level == 0 && ret)
 200+ return new String(t,start+var.length(),i-start-var.length());
 201+ if(comment){
 202+ if(t[i] == '#')
 203+ comment = false;
 204+ else
 205+ continue;
 206+ }
 207+ if(t[i] == '('){
 208+ ret = true;
 209+ level ++;
 210+ } else if(t[i] == ')')
 211+ level--;
 212+ else if(t[i] == '#')
 213+ comment = true;
 214+ }
 215+ return null;
 216+ }
 217+
164218 public String readFile(String path){
165219 char buffer[] = new char[32768];
166220 String text = "";
@@ -221,6 +275,7 @@
222276 String initset = p.readURL(new URL("file:///home/rainman/Desktop/InitialiseSettings.php"));
223277 System.out.println(p.getLanguages(initset));
224278 System.out.println(p.getServer(initset));
 279+ System.out.println(p.getDefaultSearch(initset));
225280
226281
227282 }
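
To make the new parsing concrete, a sketch of what getDefaultSearch() extracts (not part of this commit; the dbnames and the settings fragment are made up, and a no-argument PHPParser constructor is assumed):

    import java.util.Hashtable;
    import org.wikimedia.lsearch.search.NamespaceFilter;
    import org.wikimedia.lsearch.util.PHPParser;

    public class DefaultSearchSketch {
        public static void main(String[] args) {
            String text =
                "'wgNamespacesToBeSearchedDefault' => array(\n" +
                "    'default' => array( 0 => 1 ),\n" +
                "    'detest'  => array( 0 => 1, 100 => 1 ),\n" +
                ");";
            Hashtable<String,NamespaceFilter> def = new PHPParser().getDefaultSearch(text);
            System.out.println(def.get("detest").contains(100)); // true
            System.out.println(def.get("default").contains(0));  // true
        }
    }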
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java
@@ -4,16 +4,20 @@
55 import java.io.InputStream;
66 import java.util.ArrayList;
77 import java.util.Arrays;
 8+import java.util.BitSet;
89 import java.util.Collection;
910 import java.util.Collections;
1011 import java.util.Comparator;
1112 import java.util.HashMap;
1213 import java.util.HashSet;
 14+import java.util.Iterator;
1315 import java.util.PriorityQueue;
1416 import java.util.Map.Entry;
1517
1618 import org.apache.log4j.Logger;
1719 import org.apache.lucene.document.Field.Store;
 20+import org.apache.lucene.index.Term;
 21+import org.apache.lucene.index.TermDocs;
1822 import org.mediawiki.dumper.ProgressFilter;
1923 import org.mediawiki.dumper.Tools;
2024 import org.mediawiki.importer.XmlDumpReader;
@@ -24,6 +28,10 @@
2529 import org.wikimedia.lsearch.config.IndexId;
2630 import org.wikimedia.lsearch.index.IndexThread;
2731 import org.wikimedia.lsearch.spell.SuggestResult;
 32+import org.wikimedia.lsearch.spell.api.Dictionary;
 33+import org.wikimedia.lsearch.spell.api.Dictionary.Word;
 34+import org.wikimedia.lsearch.storage.ArticleAnalytics;
 35+import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
2836 import org.wikimedia.lsearch.storage.Storage;
2937 import org.wikimedia.lsearch.util.Localization;
3038 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -44,6 +52,7 @@
4553 public static void main(String[] args) throws IOException {
4654 String inputfile = null;
4755 String dbname = null;
 56+ boolean useExistingTemp = false;
4857
4958 System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n");
5059
@@ -51,17 +60,26 @@
5261 log = Logger.getLogger(RankBuilder.class);
5362
5463 if(args.length < 2){
55 - System.out.println("Syntax: java RankBuilder <inputfile> <dbname>");
 64+ System.out.println("Syntax: java RankBuilder [-t] <inputfile> <dbname>");
 65+ System.out.println("Options:");
 66+ System.out.println(" -t - use existing temporary ranking index");
5667 return;
5768 }
58 - inputfile = args[0];
59 - dbname = args[1];
 69+ for(int i=0;i<args.length;i++){
 70+ if(args[i].equals("-t"))
 71+ useExistingTemp = true;
 72+ else if(inputfile == null)
 73+ inputfile = args[i];
 74+ else if(dbname == null)
 75+ dbname = args[i];
 76+ }
6077 if(inputfile == null || dbname == null){
6178 System.out.println("Please specify both input xml file and database name");
6279 return;
6380 }
6481
6582 String langCode = GlobalConfiguration.getInstance().getLanguage(dbname);
 83+ IndexId iid = IndexId.get(dbname);
6684 // preload
6785 UnicodeDecomposer.getInstance();
6886 Localization.readLocalization(langCode);
@@ -69,19 +87,51 @@
7088
7189 long start = System.currentTimeMillis();
7290
73 - // regenerate link info
74 - OldLinks links = processLinks(inputfile,getTitles(inputfile,langCode),langCode,LinkReader.NO_REDIRECTS);
75 - links.compactAll();
76 - Storage store = Storage.getInstance();
77 - store.storePageReferences(links.getAll(),dbname);
78 - storeRelated(store,links,dbname);
 91+ // link info
 92+ Links links = null;
 93+ if(useExistingTemp)
 94+ links = Links.openExisting(iid);
 95+ else
 96+ links = processLinks(inputfile,getTitles(inputfile,langCode,iid),langCode);
 97+ //links.cacheInLinks();
 98+ /*log.info("Creating ref count cache");
 99+ HashMap<String,Integer> refCount = new HashMap<String,Integer>();
 100+ HashMap<Integer,String> keyCache = new HashMap<Integer,String>();
 101+ Word w; Dictionary d = links.getKeys();
 102+ while((w = d.next()) != null){
 103+ String key = w.getWord();
 104+ refCount.put(key,links.getNumInLinks(key));
 105+ keyCache.put(links.getDocId(key),key);
 106+ } */
 107+ storeLinkAnalysis(links,iid);
 108+ //Storage store = Storage.getInstance();
 109+ //store.storePageReferences(links.getAll(),dbname);
 110+ //storeRelated(store,links,dbname);
79111
80112 long end = System.currentTimeMillis();
81113
82114 System.out.println("Finished generating ranks in "+formatTime(end-start));
83115 }
84116
85 - public static OldLinks processLinks(String inputfile, OldLinks links, String langCode, boolean readRedirects) {
 117+ public static void storeLinkAnalysis(Links links, IndexId iid) throws IOException{
 118+ log.info("Storing link analysis data");
 119+ LinkAnalysisStorage store = new LinkAnalysisStorage(iid);
 120+ Word w;
 121+ Dictionary keys = links.getKeys();
 122+ while((w = keys.next()) != null){
 123+ String key = w.getWord();
 124+ int ref = links.getNumInLinks(key);
 125+ String redirectTarget = links.getRedirectTarget(key);
 126+ ArrayList<String> anchor = links.getAnchors(key);
 127+ ArrayList<Related> related = new ArrayList<Related>(); //FIXME: too slow getRelated(key,links,refCount,keyCache);
 128+ ArrayList<String> redirect = links.getRedirectsTo(key);
 129+ store.addAnalitics(new ArticleAnalytics(key,ref,redirectTarget,anchor,related,redirect));
 130+ }
 131+ store.snapshot();
 132+
 133+ }
 134+
 135+ public static Links processLinks(String inputfile, Links links, String langCode) {
86136 log.info("Second pass, calculating article links...");
87137 InputStream input = null;
88138 // second pass - calculate page ranks
@@ -92,10 +142,11 @@
93143 return null;
94144 }
95145 // calculate ranks
96 - LinkReader rr = new LinkReader(links,langCode,readRedirects);
 146+ LinkReader rr = new LinkReader(links,langCode);
97147 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
98148 try {
99149 reader.readDump();
 150+ links.flush();
100151 } catch (IOException e) {
101152 log.fatal("I/O error reading dump while calculating ranks for from "+inputfile);
102153 return null;
@@ -103,7 +154,7 @@
104155 return links;
105156 }
106157
107 - public static OldLinks getTitles(String inputfile,String langCode) {
 158+ public static Links getTitles(String inputfile,String langCode,IndexId iid) {
108159 log.info("First pass, getting a list of valid articles...");
109160 InputStream input = null;
110161 try {
@@ -112,49 +163,56 @@
113164 log.fatal("I/O error opening "+inputfile);
114165 return null;
115166 }
116 - // first pass, get titles
117 - TitleReader tr = new TitleReader(langCode);
118 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
119167 try {
 168+ // first pass, get titles
 169+ TitleReader tr = new TitleReader(langCode,iid);
 170+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
120171 reader.readDump();
121172 input.close();
 173+ Links links = tr.getLinks();
 174+ links.flush();
 175+ return links;
122176 } catch (IOException e) {
123177 log.fatal("I/O error reading dump while getting titles from "+inputfile);
124178 return null;
125179 }
126 - return tr.getTitles();
 180+
127181 }
128182
129 - public static void storeRelated(Storage store, OldLinks links, String dbname) throws IOException{
130 - int num = 0;
131 - int total = links.getAll().size();
132 - ArrayList<Related> buf = new ArrayList<Related>();
133 - for(CompactArticleLinks cs : links.getAll()){
134 - num++;
135 - log.debug("["+num+"/"+total+" - "+cs.linksInIndex+"] "+cs.toString());
136 - buf.addAll(getRelated(cs,links));
137 - if(buf.size() > 10000){
138 - store.storeRelatedPages(buf,dbname);
139 - buf.clear();
140 - }
141 - }
142 - }
143 -
144183 /**
145184 * Get related articles, sorted descending by score
 185+ * @throws IOException
146186 */
147 - public static ArrayList<Related> getRelated(CompactArticleLinks cs, OldLinks links){
 187+ public static ArrayList<Related> getRelated(String key, Links links, HashMap<String,Integer> refCache, HashMap<Integer,String> keyCache) throws IOException{
148188 ArrayList<Related> ret = new ArrayList<Related>();
149189
150 - HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>();
151 - if(cs.linksIn != null){
152 - for(CompactArticleLinks csl : cs.linksIn)
153 - ll.add(csl);
 190+ HashMap<String,Integer> map = new HashMap<String,Integer>();
 191+ int i = 1;
 192+ ArrayList<String> inLinks = links.getInLinks(key,keyCache);
 193+ for(String in : inLinks){
 194+ map.put(in,i++);
154195 }
155 - for(CompactArticleLinks from : ll){
156 - double score = relatedScore(cs,ll,from);
 196+ HashSet<Long> internal = new HashSet<Long>();
 197+ for(Entry<String,Integer> e : map.entrySet()){
 198+ String from = e.getKey();
 199+ long inx = e.getValue();
 200+ long offset = inx << 32;
 201+ StringList sl = links.getOutLinks(from);
 202+ Iterator<String> it = sl.iterator();
 203+ while(it.hasNext()){
 204+ Integer inx2 = map.get(it.next());
 205+ if(inx2 != null){
 206+ internal.add(offset + inx2);
 207+ }
 208+ }
 209+ }
 210+ for(Entry<String,Integer> e : map.entrySet()){
 211+ String from = e.getKey();
 212+ int inx = e.getValue();
 213+ //double score = relatedScore(links,in,from,refCount);
 214+ double score = relatedScore(inx,internal,inLinks,refCache);
157215 if(score != 0)
158 - ret.add(new Related(cs,from,score));
 216+ ret.add(new Related(key,from,score));
159217 }
160218 Collections.sort(ret,new Comparator<Related>() {
161219 public int compare(Related o1, Related o2){
@@ -171,7 +229,7 @@
172230 * Get related titles (RelatedTitle is used in Article)
173231 */
174232 public static ArrayList<RelatedTitle> getRelatedTitles(CompactArticleLinks cs, OldLinks links){
175 - ArrayList<Related> rel = getRelated(cs,links);
 233+ ArrayList<Related> rel = null; // getRelated(cs,links);
176234 ArrayList<RelatedTitle> ret = new ArrayList<RelatedTitle>();
177235 for(Related r : rel){
178236 ret.add(new RelatedTitle(new Title(r.relates.toString()),r.score));
@@ -186,23 +244,49 @@
187245 return d;
188246 }
189247
190 - public static double relatedScore(CompactArticleLinks p, HashSet<CompactArticleLinks> ll, CompactArticleLinks q){
 248+ //public static double relatedScore(Links links, HashSet<String> inLinks, String from, HashMap<String,Integer> refCount) throws IOException{
 249+ public static double relatedScore(long q, HashSet<Long> internal, ArrayList<String> inLinks, HashMap<String,Integer> refCache){
 250+ //Collection<String> qInLinks = links.getInLinksFromCache(from);
 251+ //Collection<String> qOutLinks = links.getOutLinks(from).toCollection();
191252 double score = 0;
 253+ for(Long l : internal){
 254+ long l1 = l >> 32;
 255+ long l2 = l - (l1 << 32);
 256+ if(l1 == q && l2 == q)
 257+ continue;
 258+ else if(l1 == q)
 259+ score += 1.0/norm(refCache.get(inLinks.get((int) (l2 - 1))));
 260+ else if(l2 == q)
 261+ score += 1.0/norm(refCache.get(inLinks.get((int) (l1 - 1))));
 262+ }
 263+ /*for(int i=1;i<=inLinks.size();i++){
 264+ if(i!=q && internal.contains(i*q)){
 265+ score += 1.0/norm(refCache.get(inLinks.get(i-1)));
 266+ }
 267+ } */
 268+
192269 // all r that links to q
193 - for(int i=0;i<q.linksInIndex;i++){
194 - CompactArticleLinks r = q.linksIn[i];
195 - if(r != q && r.links != 0 && ll.contains(r)){
196 - score += 1.0/norm(r.links);
 270+ /*for(String r : qInLinks){
 271+ if(!refCount.containsKey(r))
 272+ System.out.println("ERROR for key "+r);
 273+ //int ref = links.getNumInLinks(r);
 274+ int ref = refCount.get(r);
 275+ if(!r.equals(from) && ref != 0 && inLinks.contains(r)){
 276+ score += 1.0/norm(ref);
197277 }
198278
199279 }
200280 // all r that q links to
201 - for(int i=0;i<q.linksOutIndex;i++){
202 - CompactArticleLinks r = q.linksOut[i];
203 - if(r != q && r.links!=0 && ll.contains(r)){
204 - score += 1.0/norm(r.links);
 281+ for(String r : qOutLinks){
 282+ //int ref = links.getNumInLinks(r);
 283+ if(!refCount.containsKey(r))
 284+ System.out.println("ERROR for key "+r);
 285+ int ref = refCount.get(r);
 286+ if(!r.equals(from) && ref != 0 && inLinks.contains(r)){
 287+ score += 1.0/norm(ref);
205288 }
206 - }
 289+
 290+ } */
207291 return score;
208292 }
209293
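
The new getRelated() numbers each in-linking article from 1 and records an edge between in-links i and j as the single long (i << 32) + j; relatedScore() then scans those packed pairs for ones involving q. A standalone sketch of the encoding (not part of this commit):

    public class PairEncodingSketch {
        // pack an edge between 1-based in-link indexes i and j into one long
        static long encode(int i, int j) { return ((long) i << 32) + j; }
        static long from(long e) { return e >> 32; }               // upper 32 bits
        static long to(long e)   { return e - ((e >> 32) << 32); } // lower 32 bits

        public static void main(String[] args) {
            long e = encode(3, 17);
            System.out.println(from(e) + " -> " + to(e)); // 3 -> 17
        }
    }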
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringList.java
@@ -0,0 +1,128 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import java.util.ArrayList;
 5+import java.util.Collection;
 6+import java.util.Iterator;
 7+
 8+/**
 9+ * Maintain a list of string, with emphasis on
 10+ * efficient serialization and deserialization.
 11+ *
 12+ * Note: string length is limited to BUFFER_SIZE chars
 13+ *
 14+ * @author rainman
 15+ *
 16+ */
 17+public class StringList {
 18+ public static final int BUFFER_SIZE = 300;
 19+ /** delimiter used during serialization/deserialization */
 20+ public static final char DELIMITER = '\0';
 21+
 22+ protected char[] serialized = null;
 23+ protected Collection<String> collection = null;
 24+ protected String serializedStr = null;
 25+
 26+
 27+ /** Build a list from a serialized input string */
 28+ public StringList(String serialized){
 29+ if(serialized == null)
 30+ this.serialized = new char[0];
 31+ else
 32+ this.serialized = serialized.toCharArray();
 33+ }
 34+
 35+ /** Build from a collection of strings */
 36+ public StringList(Collection<String> inputCollection){
 37+ this.collection = inputCollection;
 38+ }
 39+
 40+ public Iterator<String> iterator(){
 41+ if(collection != null)
 42+ return collection.iterator();
 43+ else if(serialized != null)
 44+ return new StringListIterator();
 45+ else
 46+ return null;
 47+ }
 48+
 49+ public Collection<String> toCollection(){
 50+ if(collection != null)
 51+ return collection;
 52+ Iterator<String> it = iterator();
 53+ collection = new ArrayList<String>();
 54+ if(it != null){
 55+ while(it.hasNext())
 56+ collection.add(it.next());
 57+ }
 58+ return collection;
 59+
 60+ }
 61+
 62+ public String serialize(){
 63+ if(serialized != null)
 64+ return new String(serialized);
 65+ else if(serializedStr != null)
 66+ return serializedStr;
 67+ else if(collection == null)
 68+ throw new RuntimeException("String list to be serialized is null");
 69+ StringBuilder sb = new StringBuilder();
 70+ boolean first = true;
 71+ for(String s : collection){
 72+ if(!first)
 73+ sb.append(DELIMITER);
 74+ sb.append(s);
 75+ first = false;
 76+ }
 77+ serializedStr = sb.toString();
 78+ return serializedStr;
 79+ }
 80+
 81+
 82+
 83+ @Override
 84+ public String toString() {
 85+ return serialize();
 86+ }
 87+
 88+
 89+
 90+ class StringListIterator implements Iterator<String> {
 91+ char[] buffer = new char[BUFFER_SIZE];
 92+ int length = 0;
 93+ int pos = 0; // position in serialized[]
 94+
 95+ public boolean hasNext() {
 96+ if(pos < serialized.length)
 97+ return true;
 98+ else
 99+ return false;
 100+ }
 101+
 102+ public String next() {
 103+ if(!hasNext())
 104+ return null;
 105+ length = 0;
 106+ for(;pos<serialized.length;pos++){
 107+ if(serialized[pos] == DELIMITER){
 108+ pos++; // position on first char of next string
 109+ break;
 110+ }
 111+ if(length >= buffer.length){ // should never happen with wiki-titles
 112+ char[] newbuf = new char[buffer.length*2];
 113+ System.arraycopy(buffer,0,newbuf,0,buffer.length);
 114+ buffer = newbuf;
 115+ }
 116+ buffer[length++] = serialized[pos];
 117+ }
 118+ return new String(buffer,0,length);
 119+ }
 120+
 121+ public void remove() {
 122+ throw new UnsupportedOperationException();
 123+ }
 124+
 125+ }
 126+
 127+
 128+
 129+}
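
A round-trip sketch of the new class (not part of this commit; the values are hypothetical). This is the serialization LinkAnalysisStorage uses for the anchor, related and redirect fields:

    import java.util.ArrayList;
    import java.util.Collection;
    import org.wikimedia.lsearch.ranks.StringList;

    public class StringListSketch {
        public static void main(String[] args) {
            ArrayList<String> in = new ArrayList<String>();
            in.add("0:Foo|");
            in.add("0:Foo|anchor text");
            String stored = new StringList(in).toString(); // '\0'-delimited string
            Collection<String> out = new StringList(stored).toCollection();
            System.out.println(out.size()); // 2, same elements back
        }
    }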
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java
@@ -12,6 +12,8 @@
1313 import org.mediawiki.importer.Revision;
1414 import org.mediawiki.importer.Siteinfo;
1515 import org.wikimedia.lsearch.beans.ArticleLinks;
 16+import org.wikimedia.lsearch.beans.Title;
 17+import org.wikimedia.lsearch.config.IndexId;
1618 import org.wikimedia.lsearch.util.Localization;
1719
1820 /**
@@ -23,11 +25,12 @@
2426 public class TitleReader implements DumpWriter{
2527 Page page;
2628 Revision revision;
27 - OldLinks links = new OldLinks();
 29+ Links links;
2830 protected String langCode;
2931
30 - public TitleReader(String langCode){
 32+ public TitleReader(String langCode, IndexId iid) throws IOException{
3133 this.langCode = langCode;
 34+ this.links = Links.createNew(iid);
3235 }
3336
3437 public void writeRevision(Revision revision) throws IOException {
@@ -38,9 +41,9 @@
3942 }
4043 public void writeEndPage() throws IOException {
4144 String key = page.Title.Namespace+":"+page.Title.Text;
42 - links.add(key,0);
 45+ links.addTitle(new Title(key));
4346 }
44 - public OldLinks getTitles() {
 47+ public Links getLinks() {
4548 return links;
4649 }
4750 public void close() throws IOException {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java
@@ -32,21 +32,12 @@
3333 Revision revision;
3434 Siteinfo siteinfo;
3535 /** ns:title -> number of referring articles */
36 - OldLinks links;
 36+ Links links;
3737 HashSet<String> interwiki;
3838 String langCode;
39 - boolean readRedirects;
40 -
41 - public static final boolean READ_REDIRECTS = true;
42 - public static final boolean NO_REDIRECTS = false;
4339
44 - public LinkReader(OldLinks links, String langCode){
45 - this(links,langCode,false);
46 - }
47 -
48 - public LinkReader(OldLinks links, String langCode, boolean readRedirects){
 40+ public LinkReader(Links links, String langCode){
4941 this.links = links;
50 - this.readRedirects = readRedirects;
5142 if(langCode == null || langCode.equals(""))
5243 langCode = "en";
5344 this.langCode = langCode;
@@ -59,113 +50,8 @@
6051 this.page = page;
6152 }
6253 public void writeEndPage() throws IOException {
63 - CompactArticleLinks p = links.get(page.Title.Namespace+":"+page.Title.Text);
64 - // register redirect
65 - Title redirect = Localization.getRedirectTitle(revision.Text,langCode);
66 - if(redirect != null && readRedirects){
67 - CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle());
68 - if(cs != null){
69 - links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs);
70 - }
71 - } else if(redirect == null){ // process only non-redirects
72 - processLinks(p,revision.Text,page.Title.Namespace);
73 - }
74 - }
75 -
76 - /** Find the links object for the ns:title key */
77 - protected CompactArticleLinks findArticleLinks(int ns, String title){
78 - String key;
79 - CompactArticleLinks rank;
80 - if(title.length() == 0)
81 - return null;
82 - // try exact match
83 - key = ns+":"+title;
84 - rank = links.get(key);
85 - if(rank != null)
86 - return rank;
87 - // try lowercase
88 - key = ns+":"+title.toLowerCase();
89 - rank = links.get(key);
90 - if(rank != null)
91 - return rank;
92 - // try lowercase with first letter upper case
93 - if(title.length()==1)
94 - key = ns+":"+title.toUpperCase();
95 - else
96 - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase();
97 - rank = links.get(key);
98 - if(rank != null)
99 - return rank;
100 - // try title case
101 - key = ns+":"+WordUtils.capitalize(title);
102 - rank = links.get(key);
103 - if(rank != null)
104 - return rank;
105 - // try upper case
106 - key = ns+":"+title.toUpperCase();
107 - rank = links.get(key);
108 - if(rank != null)
109 - return rank;
110 - // try capitalizing at word breaks
111 - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
112 - rank = links.get(key);
113 - if(rank != null)
114 - return rank;
115 -
116 - return null;
117 - }
118 -
119 - /** Extract all links from this page, and increment ref count for linked pages */
120 - protected void processLinks(CompactArticleLinks p, String text, int namespace) {
121 - Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
122 - Matcher matcher = linkPat.matcher(text);
123 - int ns; String title;
124 - boolean escaped;
125 -
126 - HashSet<CompactArticleLinks> pagelinks = new HashSet<CompactArticleLinks>();
127 - while(matcher.find()){
128 - String link = matcher.group(1);
129 - int fragment = link.lastIndexOf('#');
130 - if(fragment != -1)
131 - link = link.substring(0,fragment);
132 - //System.out.println("Got link "+link);
133 - if(link.startsWith(":")){
134 - escaped = true;
135 - link = link.substring(1);
136 - } else escaped = false;
137 - ns = 0;
138 - title = link;
139 - // check for ns:title syntax
140 - String[] parts = link.split(":",2);
141 - if(parts.length == 2 && parts[0].length() > 1){
142 - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
143 - if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14)))
144 - continue; // categories, ignore
145 - if(inx!=null && inx < 0)
146 - continue; // special pages, ignore
147 - if(inx != null){
148 - ns = inx;
149 - title = parts[1];
150 - }
151 -
152 - // ignore interwiki links
153 - if(interwiki.contains(parts[0]))
154 - continue;
155 - }
156 - if(ns == 0 && namespace!=0)
157 - continue; // skip links from other namespaces into the main namespace
158 - // register as link
159 - CompactArticleLinks target = findArticleLinks(ns,title);
160 - if(target != null)
161 - pagelinks.add(target);
162 - }
163 - // increment page ranks
164 - for(CompactArticleLinks rank : pagelinks){
165 - rank.links++;
166 - rank.addInLink(p);
167 - p.addOutLink(rank);
168 - }
169 - }
 54+ links.addArticleInfo(revision.Text,new Title(page.Title.Namespace,page.Title.Text));
 55+ }
17056 public void writeSiteinfo(Siteinfo info) throws IOException {
17157 siteinfo = info;
17258 }
@@ -178,7 +64,7 @@
17965 public void writeStartWiki() throws IOException {
18066 // nop
18167 }
182 - public OldLinks getRanks() {
 68+ public Links getLinks() {
18369 return links;
18470 }
18571
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -10,7 +10,9 @@
1111 import java.util.regex.Pattern;
1212
1313 import org.apache.commons.lang.WordUtils;
 14+import org.apache.log4j.Logger;
1415 import org.apache.lucene.analysis.Analyzer;
 16+import org.apache.lucene.analysis.SimpleAnalyzer;
1517 import org.apache.lucene.document.Document;
1618 import org.apache.lucene.document.Field;
1719 import org.apache.lucene.index.IndexReader;
@@ -18,15 +20,21 @@
1921 import org.apache.lucene.index.Term;
2022 import org.apache.lucene.index.TermDocs;
2123 import org.apache.lucene.index.TermEnum;
 24+import org.apache.lucene.store.Directory;
 25+import org.apache.lucene.store.RAMDirectory;
2226 import org.wikimedia.lsearch.analyzers.SplitAnalyzer;
2327 import org.wikimedia.lsearch.beans.Article;
2428 import org.wikimedia.lsearch.beans.Title;
2529 import org.wikimedia.lsearch.config.GlobalConfiguration;
2630 import org.wikimedia.lsearch.config.IndexId;
2731 import org.wikimedia.lsearch.index.WikiIndexModifier;
 32+import org.wikimedia.lsearch.spell.api.Dictionary;
 33+import org.wikimedia.lsearch.spell.api.LuceneDictionary;
 34+import org.wikimedia.lsearch.spell.api.Dictionary.Word;
2835 import org.wikimedia.lsearch.util.Localization;
2936
3037 public class Links {
 38+ static Logger log = Logger.getLogger(Links.class);
3139 protected IndexId iid;
3240 protected String langCode;
3341 protected IndexWriter writer = null;
@@ -34,24 +42,61 @@
3543 protected HashSet<String> interwiki = new HashSet<String>();
3644 protected IndexReader reader = null;
3745 protected String path;
 46+ protected enum State { MODIFIED_TITLES, FLUSHED, MODIFIED_ARTICLES, READ };
 47+ protected State state;
 48+ protected Directory directory;
3849
39 - public static final char DELIMITER = '\0';
40 -
4150 private Links(IndexId iid){
4251 this.iid = iid;
4352 this.langCode = GlobalConfiguration.getInstance().getLanguage(iid);
4453 }
4554
 55+ public static Links openExisting(IndexId iid) throws IOException{
 56+ Links links = new Links(iid);
 57+ links.path = iid.getTempPath();
 58+ log.info("Using index at "+links.path);
 59+ links.writer = WikiIndexModifier.openForWrite(links.path,false);
 60+ initWriter(links.writer);
 61+ links.reader = IndexReader.open(links.path);
 62+ links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
 63+ links.interwiki = Localization.getInterwiki();
 64+ links.state = State.FLUSHED;
 65+ links.directory = links.writer.getDirectory();
 66+ return links;
 67+ }
 68+
 69+ private static void initWriter(IndexWriter writer) {
 70+ writer.setMergeFactor(20);
 71+ writer.setMaxBufferedDocs(500);
 72+ writer.setUseCompoundFile(true);
 73+ }
 74+
4675 public static Links createNew(IndexId iid) throws IOException{
4776 Links links = new Links(iid);
4877 links.path = iid.getTempPath();
 78+ log.info("Making index at "+links.path);
4979 links.writer = WikiIndexModifier.openForWrite(links.path,true);
5080 links.reader = IndexReader.open(links.path);
5181 links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
5282 links.interwiki = Localization.getInterwiki();
 83+ links.state = State.FLUSHED;
 84+ links.directory = links.writer.getDirectory();
5385 return links;
5486 }
5587
 88+ public static Links createNewInMemory(IndexId iid) throws IOException{
 89+ Links links = new Links(iid);
 90+ links.path = iid.getTempPath();
 91+ log.info("Making index at "+links.path);
 92+ links.writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true);
 93+ links.reader = IndexReader.open(links.path);
 94+ links.nsmap = Localization.getLocalizedNamespaces(links.langCode);
 95+ links.interwiki = Localization.getInterwiki();
 96+ links.state = State.FLUSHED;
 97+ links.directory = links.writer.getDirectory();
 98+ return links;
 99+ }
 100+
56101 /** Add more entries to namespace mapping (ns_name -> ns_index) */
57102 public void addToNamespaceMap(HashMap<String,Integer> map){
58103 for(Entry<String,Integer> e : map.entrySet()){
@@ -64,11 +109,31 @@
65110 public void flush() throws IOException{
66111 // close & optimize
67112 reader.close();
 113+ if(writer != null){
 114+ writer.optimize();
 115+ writer.close();
 116+ }
 117+ // reopen
 118+ writer = new IndexWriter(directory, new SimpleAnalyzer(), false);
 119+ initWriter(writer);
 120+ reader = IndexReader.open(path);
 121+ state = State.FLUSHED;
 122+ }
 123+
 124+ /**
 125+ * Flush, and stop using this instance for writing.
 126+ * Can still read.
 127+ * @throws IOException
 128+ */
 129+ public void flushForRead() throws IOException{
 130+ // close & optimize
 131+ reader.close();
68132 writer.optimize();
69133 writer.close();
70134 // reopen
71 - writer = WikiIndexModifier.openForWrite(path,false);
72 - reader = IndexReader.open(path);
 135+ reader = IndexReader.open(path);
 136+ writer = null;
 137+ state = State.READ;
73138 }
74139
75140 /** Add a title to enable proper link analysis when adding articles
@@ -77,21 +142,23 @@
78143 Document doc = new Document();
79144 doc.add(new Field("namespace",Integer.toString(t.getNamespace()),Field.Store.YES,Field.Index.UN_TOKENIZED));
80145 doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED));
81 - doc.add(new Field("key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
82 - writer.addDocument(doc);
 146+ doc.add(new Field("title_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
 147+ writer.addDocument(doc);
 148+ state = State.MODIFIED_TITLES;
83149 }
84150
85151 /** Add links and other info from article
86152 * @throws IOException */
87 - public void addArticleInfo(Article a) throws IOException{
 153+ public void addArticleInfo(String text, Title t) throws IOException{
 154+ if(state == State.MODIFIED_TITLES)
 155+ flush();
88156 Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
89 - String text = a.getContents();
90 - int namespace = Integer.parseInt(a.getNamespace());
 157+ int namespace = t.getNamespace();
91158 Matcher matcher = linkPat.matcher(text);
92159 int ns; String title;
93160 boolean escaped;
94161 HashSet<String> pagelinks = new HashSet<String>();
95 - HashSet<String> linkkeys = new HashSet<String>();
 162+ HashSet<String> linkkeys = new HashSet<String>();
96163
97164 Title redirect = Localization.getRedirectTitle(text,langCode);
98165 String redirectsTo = null;
@@ -101,10 +168,12 @@
102169 while(matcher.find()){
103170 String link = matcher.group(1);
104171 String anchor = matcher.group(2);
 172+ if(anchor != null && anchor.length()>1 && anchor.substring(1).equalsIgnoreCase(title(link)))
 173+ anchor = null; // anchor same as link text
105174 int fragment = link.lastIndexOf('#');
106175 if(fragment != -1)
107176 link = link.substring(0,fragment);
108 - System.out.println("Got link "+link+"|"+anchor);
 177+ //System.out.println("Got link "+link+anchor);
109178 if(link.startsWith(":")){
110179 escaped = true;
111180 link = link.substring(1);
@@ -132,40 +201,33 @@
133202 continue; // skip links from other namespaces into the main namespace
134203 String target = findTargetLink(ns,title);
135204 if(target != null){
 205+ //System.out.println("Found "+link);
136206 linkkeys.add(target); // for outlink storage
137207 pagelinks.add(target+"|"); // for backlinks
138 - if(anchor != null && !"".equals(anchor))
139 - pagelinks.add(target+"|"+anchor); // for efficient anchortext extraction
 208+ if(anchor != null && !"|".equals(anchor))
 209+ pagelinks.add(target+anchor); // for efficient anchortext extraction
140210 }
141211 }
142212 }
143 - // index article
144 - Analyzer an = new SplitAnalyzer(DELIMITER);
 213+ // index article
 214+ StringList sl = new StringList(pagelinks);
 215+ StringList lk = new StringList(linkkeys);
 216+ Analyzer an = new SplitAnalyzer();
145217 Document doc = new Document();
146 - doc.add(new Field("namespace",a.getNamespace(),Field.Store.YES,Field.Index.UN_TOKENIZED));
147 - doc.add(new Field("title",a.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED));
148 - doc.add(new Field("article_key",a.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
 218+ doc.add(new Field("namespace",t.getNamespaceAsString(),Field.Store.YES,Field.Index.UN_TOKENIZED));
 219+ doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED));
 220+ doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
149221 if(redirectsTo != null)
150222 doc.add(new Field("redirect",redirectsTo,Field.Store.YES,Field.Index.UN_TOKENIZED));
151 - else
152 - doc.add(new Field("links",join(pagelinks,DELIMITER),Field.Store.NO,Field.Index.TOKENIZED));
153 - doc.add(new Field("links_stored",join(linkkeys,DELIMITER),Field.Store.YES,Field.Index.NO));
 223+ else{
 224+ doc.add(new Field("links",sl.toString(),Field.Store.NO,Field.Index.TOKENIZED));
 225+ doc.add(new Field("links_stored",lk.toString(),Field.Store.YES,Field.Index.TOKENIZED));
 226+ }
154227
155228 writer.addDocument(doc,an);
 229+ state = State.MODIFIED_ARTICLES;
156230 }
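
Review note: the term scheme built here is worth spelling out: every resolved target contributes a bare "target|" term (counted by getNumInLinks below), plus a "target|anchortext" term when the anchor differs from the link text. A self-contained sketch of just the regex part; namespace resolution via findTargetLink and redirect handling are deliberately skipped:

    import java.util.TreeSet;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class LinkTermDemo {
        public static void main(String[] args) {
            Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
            String text = "See [[Douglas Adams]] and [[HHGTTG|the Guide]].";
            TreeSet<String> pagelinks = new TreeSet<String>();
            Matcher m = linkPat.matcher(text);
            while (m.find()) {
                String link = m.group(1);    // link target
                String anchor = m.group(2);  // includes the leading '|', or null
                pagelinks.add(link + "|");   // backlink-count term
                if (anchor != null && !"|".equals(anchor))
                    pagelinks.add(link + anchor);  // anchor-text term
            }
            // prints [Douglas Adams|, HHGTTG|, HHGTTG|the Guide]
            System.out.println(pagelinks);
        }
    }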
157231
158 - protected String join(Collection<String> strs, char join){
159 - StringBuilder sb = new StringBuilder();
160 - boolean first = true;
161 - for(String s : strs){
162 - if(!first)
163 - sb.append(join);
164 - sb.append(s);
165 - first = false;
166 - }
167 - return sb.toString();
168 - }
169 -
170232 /** Find the target key (ns:title) to which the link points
171233 * @throws IOException */
172234 protected String findTargetLink(int ns, String title) throws IOException{
@@ -174,55 +236,102 @@
175237 return null;
176238 // try exact match
177239 key = ns+":"+title;
178 - if(reader.docFreq(new Term("key",key)) != 0)
 240+ if(reader.docFreq(new Term("title_key",key)) != 0)
179241 return key;
180242 // try lowercase
181243 key = ns+":"+title.toLowerCase();
182 - if(reader.docFreq(new Term("key",key)) != 0)
 244+ if(reader.docFreq(new Term("title_key",key)) != 0)
183245 return key;
184246 // try lowercase with first letter upper case
185247 if(title.length()==1)
186248 key = ns+":"+title.toUpperCase();
187249 else
188250 key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase();
189 - if(reader.docFreq(new Term("key",key)) != 0)
 251+ if(reader.docFreq(new Term("title_key",key)) != 0)
190252 return key;
191253 // try title case
192254 key = ns+":"+WordUtils.capitalize(title);
193 - if(reader.docFreq(new Term("key",key)) != 0)
 255+ if(reader.docFreq(new Term("title_key",key)) != 0)
194256 return key;
195257 // try upper case
196258 key = ns+":"+title.toUpperCase();
197 - if(reader.docFreq(new Term("key",key)) != 0)
 259+ if(reader.docFreq(new Term("title_key",key)) != 0)
198260 return key;
199261 // try capitalizing at word breaks
200262 key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
201 - if(reader.docFreq(new Term("key",key)) != 0)
 263+ if(reader.docFreq(new Term("title_key",key)) != 0)
202264 return key;
203265
204266 return null;
205267 }
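
Review note: the lookup order above is fixed, probing title_key until the first hit. For reference, the candidate keys in order (a sketch assuming title.length() > 1; the single-character special case substitutes upper case for the third variant, as in the code):

    String[] candidates = {
        ns + ":" + title,                                           // exact
        ns + ":" + title.toLowerCase(),                             // all lower
        ns + ":" + title.substring(0,1).toUpperCase()
                 + title.substring(1).toLowerCase(),                // first letter upper
        ns + ":" + WordUtils.capitalize(title),                     // Title Case
        ns + ":" + title.toUpperCase(),                             // ALL UPPER
        ns + ":" + WordUtils.capitalize(title,
            new char[]{' ','-','(',')','}','{','.',',','?','!'})    // capitalize at breaks
    };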
206268
207269 /** Get number of backlinks to this title */
208 - public int getNumInLinks(Title t) throws IOException{
209 - return reader.docFreq(new Term("links",t.getKey()+"|"));
 270+ public int getNumInLinks(String key) throws IOException{
 271+ return reader.docFreq(new Term("links",key+"|"));
210272 }
211273
212274 /** Get all article titles that redirect to given title */
213 - public ArrayList<Title> getRedirectsTo(Title t) throws IOException{
214 - ArrayList<Title> ret = new ArrayList<Title>();
215 - TermDocs td = reader.termDocs(new Term("redirect",t.getKey()));
 275+ public ArrayList<String> getRedirectsTo(String key) throws IOException{
 276+ ArrayList<String> ret = new ArrayList<String>();
 277+ TermDocs td = reader.termDocs(new Term("redirect",key));
216278 while(td.next()){
217 - ret.add(new Title(reader.document(td.doc()).get("article_key")));
 279+ ret.add(reader.document(td.doc()).get("article_key"));
218280 }
219281 return ret;
220282 }
221283
 284+ protected void ensureRead() throws IOException {
 285+ if(state != State.READ)
 286+ flushForRead();
 287+ }
 288+
 289+
 290+ /** Whether the article is a redirect
 291+ * @throws IOException */
 292+ public boolean isRedirect(String key) throws IOException{
 293+ ensureRead();
 294+ TermDocs td = reader.termDocs(new Term("article_key",key));
 295+ if(td.next()){
 296+ if(reader.document(td.doc()).get("redirect")!=null)
 297+ return true;
 298+ }
 299+ return false;
 300+ }
 301+
 302+ /** If the article is a redirect, return its target, else null */
 303+ public String getRedirectTarget(String key) throws IOException{
 304+ ensureRead();
 305+ TermDocs td = reader.termDocs(new Term("article_key",key));
 306+ if(td.next()){
 307+ return reader.document(td.doc()).get("redirect");
 308+ }
 309+ return null;
 310+ }
 311+
 312+ /** Get only anchors without frequency */
 313+ public ArrayList<String> getAnchors(String key) throws IOException{
 314+ ensureRead();
 315+ ArrayList<String> ret = new ArrayList<String>();
 316+ TermEnum te = reader.terms(new Term("links",key+"|"));
 317+ while(te.next()){
 318+ String t = te.term().text();
 319+ if(!t.startsWith(key) || !te.term().field().equals("links"))
 320+ break;
 321+ ret.add(t.substring(key.length()+1));
 322+ }
 323+ return ret;
 324+ }
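
Review note: because each link is a single indexed term of the form "ns:title|" or "ns:title|anchor" in the links field, plain term statistics answer both questions without touching stored fields. Sketch against an open IndexReader, with the same Lucene calls as above; key is e.g. "0:Douglas Adams":

    int backlinks = reader.docFreq(new Term("links", key + "|"));  // bare-term count
    TermEnum te = reader.terms(new Term("links", key + "|"));      // positioned at "key|"
    while (te.next()) {   // next() first skips the bare "key|" term (empty anchor)
        Term t = te.term();
        if (!"links".equals(t.field()) || !t.text().startsWith(key))
            break;                                                 // left this key's range
        System.out.println(t.text().substring(key.length() + 1));  // the anchor text
    }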
 325+
 326+ /** Get title part of the key (ns:title) */
 327+ private String title(String key) {
 328+ return key.substring(key.indexOf(':')+1);
 329+ }
 330+
222331 /** Get anchor texts for given title
223332 * @throws IOException */
224 - public ArrayList<AnchorText> getAnchorText(Title t) throws IOException{
 333+ public ArrayList<AnchorText> getAnchorText(String key) throws IOException{
 334+ ensureRead();
225335 ArrayList<AnchorText> ret = new ArrayList<AnchorText>();
226 - String key = t.getKey();
227336 TermEnum te = reader.terms(new Term("links",key+"|"));
228337 while(te.next()){
229338 if(!te.term().text().startsWith(key) || !te.term().field().equals("links"))
@@ -233,7 +342,7 @@
234343 }
235344
236345 static public class AnchorText {
237 - public String text;
 346+ public String text; /** ns:title **/
238347 public int freq;
239348 public AnchorText(String text, int freq) {
240349 this.text = text;
@@ -243,25 +352,81 @@
244353
245354 /** Get all article titles linking to given title
246355 * @throws IOException */
247 - public ArrayList<Title> getInLinks(Title t) throws IOException{
248 - ArrayList<Title> ret = new ArrayList<Title>();
249 - TermDocs td = reader.termDocs(new Term("links",t.getKey()+"|"));
 356+ public ArrayList<String> getInLinks(String key, HashMap<Integer,String> keyCache) throws IOException{
 357+ ensureRead();
 358+ ArrayList<String> ret = new ArrayList<String>();
 359+ TermDocs td = reader.termDocs(new Term("links",key+"|"));
250360 while(td.next()){
251 - ret.add(new Title(reader.document(td.doc()).get("article_key")));
 361+ //ret.add(keyCache.get(td.doc()));
 362+ ret.add(reader.document(td.doc()).get("article_key"));
252363 }
253364 return ret;
254365 }
255366
256367 /** Get links from this article to other articles */
257 - public ArrayList<Title> getOutLinks(Title t) throws IOException{
258 - ArrayList<Title> ret = new ArrayList<Title>();
259 - TermDocs td = reader.termDocs(new Term("article_key",t.getKey()));
 368+ public StringList getOutLinks(String key) throws IOException{
 369+ ensureRead();
 370+ TermDocs td = reader.termDocs(new Term("article_key",key));
260371 if(td.next()){
261 - String links = reader.document(td.doc()).get("links_stored");
262 - for(String key : links.split(""+DELIMITER)){
263 - ret.add(new Title(key));
 372+ return new StringList(reader.document(td.doc()).get("links_stored"));
 373+ }
 374+ return null;
 375+ }
 376+
 377+ public Dictionary getKeys() throws IOException{
 378+ ensureRead();
 379+ return new LuceneDictionary(reader,"article_key");
 380+ }
 381+ @Deprecated
 382+ protected void cacheInLinks() throws IOException{
 383+ if(state != State.FLUSHED)
 384+ flush();
 385+ log.info("Caching in-links");
 386+ int count = 0;
 387+ // docid -> key
 388+ HashMap<Integer,String> keyCache = new HashMap<Integer,String>();
 389+ Dictionary dict = new LuceneDictionary(reader,"article_key");
 390+ Word w;
 391+ // build key cache
 392+ while((w = dict.next()) != null){
 393+ String key = w.getWord();
 394+ TermDocs td = reader.termDocs(new Term("article_key",key));
 395+ if(td.next()){
 396+ keyCache.put(td.doc(),key);
 397+ } else
 398+ log.error("Cannot find article for key "+key);
 399+ }
 400+
 401+ // get inlinks
 402+ for(String key : keyCache.values()){
 403+ ArrayList<String> in = getInLinks(key,keyCache);
 404+ Document doc = new Document();
 405+ doc.add(new Field("inlinks_key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
 406+ doc.add(new Field("inlinks",new StringList(in).toString(),Field.Store.YES,Field.Index.UN_TOKENIZED));
 407+ writer.addDocument(doc);
 408+ count ++;
 409+ if(count % 1000 == 0){
 410+ System.out.println("Cached inlinks for "+count);
264411 }
265412 }
266 - return ret;
267413 }
 414+
 415+ /** Get all article titles linking to given title (from inlinks cache)
 416+ * @throws IOException */
 417+ public Collection<String> getInLinksFromCache(String key) throws IOException{
 418+ ensureRead();
 419+ TermDocs td = reader.termDocs(new Term("inlinks_key",key));
 420+ while(td.next()){
 421+ return new StringList(reader.document(td.doc()).get("inlinks")).toCollection();
 422+ }
 423+ return new ArrayList<String>();
 424+ }
 425+
 426+ public Integer getDocId(String key) throws IOException {
 427+ TermDocs td = reader.termDocs(new Term("article_key",key));
 428+ if(td.next()){
 429+ return td.doc();
 430+ }
 431+ return null;
 432+ }
268433 }
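
Review note: the hand-rolled join()/DELIMITER serialization removed above is replaced throughout by the new StringList class added in this revision. A round-trip sketch of the API as used in this file:

    StringList out = new StringList(linkkeys);                  // from a Collection<String>
    String stored = out.toString();                             // value put in links_stored
    Collection<String> back = new StringList(stored).toCollection();  // and back again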
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Related.java
@@ -1,22 +1,55 @@
22 package org.wikimedia.lsearch.ranks;
33
 4+import java.util.ArrayList;
 5+import java.util.Collection;
 6+
47 public class Related {
5 - protected CompactArticleLinks title;
6 - protected CompactArticleLinks relates;
 8+ protected String title;
 9+ protected String relates;
710 protected double score;
8 - public Related(CompactArticleLinks title, CompactArticleLinks relates, double score) {
 11+ public Related(String title, String relates, double score) {
912 this.title = title;
1013 this.relates = relates;
1114 this.score = score;
1215 }
 16+
 17+ public Related(String serialized) {
 18+ this.title = null;
 19+ int i = serialized.indexOf(' ');
 20+ this.score = Double.parseDouble(serialized.substring(0,i));
 21+ this.relates = serialized.substring(i+1);
 22+ }
 23+
1324 @Override
1425 public String toString() {
1526 return title+"->"+relates+" : "+score;
1627 }
17 - public CompactArticleLinks getRelates() {
 28+
 29+
 30+ public static ArrayList<String> convertToStringList(Collection<Related> rel){
 31+ ArrayList<String> ret = new ArrayList<String>();
 32+ for(Related r : rel){
 33+ ret.add(r.serialize());
 34+ }
 35+ return ret;
 36+ }
 37+
 38+ public static ArrayList<Related> convertToRelatedList(Collection<String> sl){
 39+ ArrayList<Related> ret = new ArrayList<Related>();
 40+ for(String s : sl){
 41+ ret.add(new Related(s));
 42+ }
 43+ return ret;
 44+ }
 45+
 46+ public String serialize(){
 47+ return score+" "+relates;
 48+ }
 49+
 50+ public String getRelates() {
1851 return relates;
1952 }
20 - public void setRelates(CompactArticleLinks relates) {
 53+ public void setRelates(String relates) {
2154 this.relates = relates;
2255 }
2356 public double getScore() {
@@ -25,10 +58,10 @@
2659 public void setScore(double score) {
2760 this.score = score;
2861 }
29 - public CompactArticleLinks getTitle() {
 62+ public String getTitle() {
3063 return title;
3164 }
32 - public void setTitle(CompactArticleLinks title) {
 65+ public void setTitle(String title) {
3366 this.title = title;
3467 }
3568
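
Review note: Related now serializes as "score relates" (score first, one space, then the target key); title is not part of the serialized form and comes back null. Round-trip sketch:

    Related r = new Related("0:A", "0:B", 0.75);
    String s = r.serialize();          // "0.75 0:B"
    Related back = new Related(s);     // getTitle() == null, getRelates() == "0:B", score 0.75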
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java
@@ -26,6 +26,8 @@
2727 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
2828 import org.wikimedia.lsearch.ranks.OldLinks;
2929 import org.wikimedia.lsearch.ranks.RelatedTitle;
 30+import org.wikimedia.lsearch.storage.ArticleAnalytics;
 31+import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
3032 import org.wikimedia.lsearch.util.Localization;
3133
3234 /**
@@ -40,11 +42,13 @@
4143 Revision revision;
4244 CleanIndexWriter writer;
4345 String langCode;
 46+ LinkAnalysisStorage las;
4447
4548 public CleanIndexImporter(IndexId iid, String langCode) throws IOException{
4649 Configuration.open(); // make sure configuration is loaded
4750 this.writer = new CleanIndexWriter(iid);
4851 this.langCode = langCode;
 52+ this.las = new LinkAnalysisStorage(iid);
4953 }
5054 public void writeRevision(Revision revision) throws IOException {
5155 this.revision = revision;
@@ -53,22 +57,27 @@
5458 this.page = page;
5559 }
5660 public void writeEndPage() throws IOException {
 61+ String key = page.Title.Namespace+":"+page.Title.Text;
 62+ ArticleAnalytics aa = las.getAnalitics(key);
 63+ int references = aa.getReferences();
 64+ boolean isRedirect = aa.isRedirect();
 65+
 66+ // make list of redirects
5767 ArrayList<Redirect> redirects = new ArrayList<Redirect>();
58 - boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode) != null;
59 - ArrayList<RelatedTitle> related = new ArrayList<RelatedTitle>();
 68+ ArrayList<String> anchors = new ArrayList<String>();
 69+ anchors.addAll(aa.getAnchorText());
 70+ for(String rk : aa.getRedirectKeys()){
 71+ String[] parts = rk.toString().split(":",2);
 72+ ArticleAnalytics raa = las.getAnalitics(rk);
 73+ redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],raa.getReferences()));
 74+ anchors.addAll(raa.getAnchorText());
 75+ }
6076 // make article
61 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects,related);
62 - //if(page.Title.Namespace != 0)
63 - // article.setContents("");
 77+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,
 78+ references,redirects,new ArrayList<RelatedTitle>(),anchors);
 79+ // Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects,related);
6480
65 - writer.addMainArticle(article);
66 - //writer.addAllArticle(article);
67 - // generate phrases
68 - /* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false);
69 - ArrayList<Token> tokens = parser.parse();
70 - for(int i=0;i<tokens.size()-1;i++){
71 - phrases.addPhrase(tokens.get(i).termText(),tokens.get(i+1).termText());
72 - } */
 81+ writer.addArticle(article);
7382 }
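
Review note: keys are "namespace:title" strings throughout, so the split(":",2) above keeps any further colons inside the title itself. For example:

    String[] parts = "4:Wikipedia:Sandbox".split(":", 2);
    // parts[0] == "4" (namespace), parts[1] == "Wikipedia:Sandbox" (colon intact)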
7483
7584 public void close() throws IOException {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -8,8 +8,6 @@
99 import java.util.HashMap;
1010 import java.util.HashSet;
1111 import java.util.Iterator;
12 -import java.util.LinkedList;
13 -import java.util.List;
1412 import java.util.Set;
1513 import java.util.WeakHashMap;
1614 import java.util.Map.Entry;
@@ -40,7 +38,6 @@
4139 import org.wikimedia.lsearch.search.NamespaceFilter;
4240 import org.wikimedia.lsearch.search.NamespaceFilterWrapper;
4341 import org.wikimedia.lsearch.search.SearcherCache;
44 -import org.wikimedia.lsearch.spell.api.NamespaceFreq;
4542 import org.wikimedia.lsearch.spell.api.NgramIndexer;
4643 import org.wikimedia.lsearch.spell.dist.DoubleMetaphone;
4744 import org.wikimedia.lsearch.spell.dist.EditDistance;
@@ -48,11 +45,8 @@
4946 public class Suggest {
5047 static Logger log = Logger.getLogger(Suggest.class);
5148 protected IndexId iid;
52 - protected IndexSearcher words;
53 - protected IndexSearcher titles;
54 - protected IndexReader titlesReader;
55 - protected int minHitsWords;
56 - protected int minHitsTitles;
 49+ protected IndexSearcher searcher;
 50+ protected IndexReader reader;
5751 protected static WeakHashMap<IndexSearcher,Set<String>> stopWordsIndexes = new WeakHashMap<IndexSearcher,Set<String>>();
5852 protected Set<String> stopWords;
5953
@@ -100,26 +94,23 @@
10195 SearcherCache cache = SearcherCache.getInstance();
10296 GlobalConfiguration global = GlobalConfiguration.getInstance();
10397 this.iid = iid;
104 - this.words = cache.getLocalSearcher(iid.getSpellWords());
105 - this.titles = cache.getLocalSearcher(iid.getSpellTitles());
106 - this.titlesReader = titles.getIndexReader();
107 - this.minHitsWords = global.getIntDBParam(iid.getDBname(),"spell_words","minHits",20);
108 - this.minHitsTitles = global.getIntDBParam(iid.getDBname(),"spell_titles","minHits",20);
 98+ this.searcher = cache.getLocalSearcher(iid.getSpell());
 99+ this.reader = searcher.getIndexReader();
109100
110101 synchronized(stopWordsIndexes){
111 - if(!stopWordsIndexes.containsKey(titles)){
 102+ if(!stopWordsIndexes.containsKey(searcher)){
112103 Set<String> s = Collections.synchronizedSet(new HashSet<String>());
113 - stopWordsIndexes.put(titles,s);
114 - TermDocs d = titles.getIndexReader().termDocs(new Term("metadata_key","stopWords"));
 104+ stopWordsIndexes.put(searcher,s);
 105+ TermDocs d = searcher.getIndexReader().termDocs(new Term("metadata_key","stopWords"));
115106 if(d.next()){
116 - String val = titles.doc(d.doc()).get("metadata_value");
 107+ String val = searcher.doc(d.doc()).get("metadata_value");
117108 for(String sw : val.split(" ")){
118109 s.add(sw);
119110 }
120111 }
121112 }
122113 this.stopWords = new HashSet<String>();
123 - this.stopWords.addAll(stopWordsIndexes.get(titles));
 114+ this.stopWords.addAll(stopWordsIndexes.get(searcher));
124115 log.info("Using stop words "+stopWords);
125116 }
126117 }
@@ -224,7 +215,7 @@
225216 continue;
226217
227218 String phrase = w+gap+w2;
228 - if(titlesReader.docFreq(new Term("phrase",phrase)) != 0){
 219+ if(reader.docFreq(new Term("phrase",phrase)) != 0){
229220 correctPhrases.add(i);
230221 correctPhrases.add(i2);
231222 } else if(correctWords.contains(w) && correctWords.contains(w2)){
@@ -258,9 +249,9 @@
259250 // suggest word
260251 ArrayList<SuggestResult> sug;
261252 if(correctWords.contains(w))
262 - sug = suggestWordsFromTitle(w,w,nsf,POOL/2,POOL/2);
 253+ sug = suggestWords(w,w,nsf,POOL/2,POOL/2);
263254 else
264 - sug = suggestWordsFromTitle(w,nsf,POOL);
 255+ sug = suggestWords(w,nsf,POOL);
265256 if(sug.size() > 0){
266257 wordSug.add(sug);
267258 SuggestResult maybeStopWord = null;
@@ -287,7 +278,7 @@
288279 possibleStopWords.add(null);
289280 }
290281 // suggest split
291 - SuggestResult split = suggestSplitFromTitle(w,nsf,minFreq);
 282+ SuggestResult split = suggestSplit(w,nsf,minFreq);
292283 if(split != null){
293284 Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT);
294285 sc.substitutes.put(i,split.word.replace("_"," "));
@@ -297,7 +288,7 @@
298289 if(i-1 >= 0
299290 && (wordSug.get(i-1)==null || wordSug.get(i-1).get(0).dist!=0)
300291 && (wordSug.get(i)==null || wordSug.get(i).get(0).dist!=0)){
301 - SuggestResult join = suggestJoinFromTitle(tokens.get(i-1).termText(),w,nsf,minFreq);
 292+ SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,nsf,minFreq);
302293 if(join != null){
303294 Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);
304295 sc.substitutes.put(i-1,"");
@@ -356,12 +347,12 @@
357348 String phrase = s1.word+gap+s2.word;
358349 int freq = 0;
359350 boolean inTitle = false;
360 - TermDocs td = titlesReader.termDocs(new Term("phrase",phrase));
 351+ TermDocs td = reader.termDocs(new Term("phrase",phrase));
361352 if(td.next()){
362353 int docid = td.doc();
363 - String f = titlesReader.document(docid).get("freq");
 354+ String f = reader.document(docid).get("freq");
364355 freq = Integer.parseInt(f.substring(2,f.length()-1));
365 - String it = titlesReader.document(docid).get("intitle");
 356+ String it = reader.document(docid).get("intitle");
366357 if(it!=null && it.equals("1"))
367358 inTitle = true;
368359
@@ -413,7 +404,7 @@
414405 }
415406 if(madeChanges){
416407 // check if some title exactly matches the spell-checked query
417 - if(titlesReader.docFreq(new Term("title",title.toLowerCase())) != 0){
 408+ if(reader.docFreq(new Term("title",title.toLowerCase())) != 0){
418409 log.info("Found title match for "+title);
419410 return new SuggestQuery(tidy(title),tidy(formated));
420411 }
@@ -431,7 +422,7 @@
432423 if(r.getDist() > maxdist)
433424 break;
434425 String title = r.getWord();
435 - if(titlesReader.docFreq(new Term("title",title.toLowerCase())) != 0){
 426+ if(reader.docFreq(new Term("title",title.toLowerCase())) != 0){
436427 log.info("Found title match for "+title);
437428 return new SuggestQuery(tidy(title),tidy(markSuggestion(searchterm,t,title)));
438429 }
@@ -492,48 +483,6 @@
493484 }
494485
495486 return null;
496 - }
497 -
498 - protected boolean addPhraseSuggestion(ArrayList<Token> tokens, int i1, int i2, ArrayList<Change> suggestions, NamespaceFilter nsf, int minFreq) {
499 - Token t1 = tokens.get(i1);
500 - Token t2 = tokens.get(i2);
501 - if(t2.type().equals(t1.type())){
502 - String word1 = t1.termText();
503 - String word2 = t2.termText();
504 - if(stopWords.contains(word1) || stopWords.contains(word2))
505 - return false;
506 - log.info("spell-check phrase \""+word1+" "+word2+"\"");
507 - // phrase
508 - ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq);
509 - if(r.size() > 0){
510 - SuggestResult res = r.get(0);
511 - String[] ph = res.word.split("_");
512 - if(ph.length == 2){
513 - // figure out which words need to be changed
514 - Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE);
515 - if(!ph[0].equals(word1))
516 - sc.substitutes.put(i1,ph[0]);
517 - else
518 - sc.preserves.put(i1,ph[0]);
519 - if(!ph[1].equals(word2))
520 - sc.substitutes.put(i2,ph[1]);
521 - else
522 - sc.preserves.put(i2,ph[1]);
523 - suggestions.add(sc);
524 - } else
525 - log.error("Unexpected phrase in suggest result "+res);
526 - }
527 - // join
528 - SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq);
529 - if(join != null){
530 - Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);
531 - sc.substitutes.put(i1,"");
532 - sc.substitutes.put(i2,join.word);
533 - suggestions.add(sc);
534 - }
535 - return true;
536 - }
537 - return false;
538487 }
539488
540489 protected String markSuggestion(String formated, Token t, String newWord){
@@ -616,59 +565,12 @@
617566 return new Object[] {proposedChanges, preservedWords};
618567 }
619568
620 - /** Suggest some words from the words index */
621 - public ArrayList<SuggestResult> suggestWords(String word, int num){
622 - Metric metric = new Metric(word);
623 - BooleanQuery bq = new BooleanQuery();
624 - addQuery(bq,"metaphone1",metric.meta1,2);
625 - addQuery(bq,"metaphone2",metric.meta2,2);
626 - bq.add(makeWordQuery(word,""),BooleanClause.Occur.SHOULD);
627 -
628 - try {
629 - TopDocs docs = words.search(bq,null,POOL);
630 - ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
631 - int minfreq = -1;
632 - // fetch results, calculate various edit distances
633 - for(ScoreDoc sc : docs.scoreDocs){
634 - Document d = words.doc(sc.doc);
635 - String w = d.get("word");
636 - SuggestResult r = new SuggestResult(w,
637 - Integer.parseInt(d.get("freq")),
638 - metric);
639 - if(word.equals(r.word)){
640 - minfreq = r.frequency;
641 - }
642 - if(acceptWord(r,metric))
643 - res.add(r);
644 - }
645 - // filter out
646 - if(minfreq != -1){
647 - for(int i=0;i<res.size();){
648 - if(res.get(i).frequency < minfreq ){
649 - res.remove(i);
650 - } else
651 - i++;
652 - }
653 - }
654 - // sort
655 - Collections.sort(res,new SuggestResult.Comparator());
656 - ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
657 - for(int i=0;i<num && i<res.size();i++)
658 - ret.add(res.get(i));
659 - return ret;
660 - } catch (IOException e) {
661 - log.error("Cannot get suggestions for "+word+" at "+iid+" : "+e.getMessage());
662 - e.printStackTrace();
663 - return new ArrayList<SuggestResult>();
664 - }
665 - }
666 -
667 - public ArrayList<SuggestResult> suggestWordsFromTitle(String word, NamespaceFilter nsf, int num){
668 - ArrayList<SuggestResult> r1 = suggestWordsFromTitle(word,word,nsf,POOL,POOL);
 569+ public ArrayList<SuggestResult> suggestWords(String word, NamespaceFilter nsf, int num){
 570+ ArrayList<SuggestResult> r1 = suggestWords(word,word,nsf,POOL,POOL);
669571 if(r1 != null && r1.size() > 0){
670572 if(r1.get(0).dist == 0)
671573 return r1;
672 - ArrayList<SuggestResult> r2 = suggestWordsFromTitle(word,r1.get(0).word,nsf,POOL/2,POOL/2);
 574+ ArrayList<SuggestResult> r2 = suggestWords(word,r1.get(0).word,nsf,POOL/2,POOL/2);
673575 if(r2 != null && r2.size() > 0){
674576 HashSet<SuggestResult> hr = new HashSet<SuggestResult>();
675577 hr.addAll(r1); hr.addAll(r2);
@@ -682,54 +584,27 @@
683585 return r1;
684586 }
685587
686 - public ArrayList<SuggestResult> suggestWordsFromTitle(String word, String searchword, NamespaceFilter nsf, int num, int pool_size){
 588+ public ArrayList<SuggestResult> suggestWords(String word, String searchword, NamespaceFilter nsf, int num, int pool_size){
687589 Metric metric = new Metric(word);
688590 BooleanQuery bq = new BooleanQuery();
689591 bq.add(makeWordQuery(searchword,"word"),BooleanClause.Occur.SHOULD);
690592
691593 try {
692 - TopDocs docs = titles.search(bq,new NamespaceFilterWrapper(nsf),pool_size);
 594+ TopDocs docs = searcher.search(bq,new NamespaceFilterWrapper(nsf),pool_size);
693595 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
694 - int minfreq = -1;
695596 // fetch results, calculate various edit distances
696597 for(ScoreDoc sc : docs.scoreDocs){
697 - Document d = titles.doc(sc.doc);
 598+ Document d = searcher.doc(sc.doc);
698599 String w = d.get("word");
699600 String f = d.get("freq");
700601 String meta1 = d.get("meta1");
701602 String meta2 = d.get("meta2");
702603 SuggestResult r = new SuggestResult(w, // new NamespaceFreq(d.get("freq")).getFrequency(nsf),
703604 Integer.parseInt(f.substring(2,f.length()-1)),
704 - metric, meta1, meta2);
705 - if(word.equals(r.word)){
706 - minfreq = r.frequency;
707 - }
 605+ metric, meta1, meta2);
708606 if(acceptWord(r,metric))
709607 res.add(r);
710608 }
711 - // filter out
712 - /*if(minfreq != -1){
713 - for(int i=0;i<res.size();){
714 - if(res.get(i).frequency < minfreq ){
715 - res.remove(i);
716 - } else
717 - i++;
718 - }
719 - } */
720 - // suggest simple inversion since it probably won't be found
721 - /* if(word.length() == 2){
722 - String inv = NgramIndexer.reverse(word);
723 - TermDocs td = titlesReader.termDocs(new Term("word",inv));
724 - int freq = 0;
725 - if(td.next()){
726 - freq = new NamespaceFreq(titlesReader.document(td.doc()).get("freq")).getFrequency(nsf);
727 - SuggestResult r = new SuggestResult(inv,
728 - freq,
729 - metric);
730 - //if(acceptWord(r,metric))
731 - res.add(r);
732 - }
733 - } */
734609 // sort
735610 Collections.sort(res,new SuggestResult.Comparator());
736611 ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
@@ -785,22 +660,22 @@
786661 }
787662
788663 /** Try to split word into 2 words which make up a phrase */
789 - public SuggestResult suggestSplitFromTitle(String word, NamespaceFilter nsf, int minFreq){
 664+ public SuggestResult suggestSplit(String word, NamespaceFilter nsf, int minFreq){
790665 int freq = 0;
791666 Hits hits;
792667 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
793668 try {
794669 // find frequency
795 - hits = titles.search(new TermQuery(new Term("word",word)),new NamespaceFilterWrapper(nsf));
 670+ hits = searcher.search(new TermQuery(new Term("word",word)),new NamespaceFilterWrapper(nsf));
796671 if(hits.length() == 1)
797 - freq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf);
 672+ freq = Integer.parseInt(hits.doc(0).get("freq"));
798673
799674 // try different splits
800675 for(int i=1;i<word.length()-1;i++){
801676 String phrase = word.substring(0,i) + "_" + word.substring(i);
802 - hits = titles.search(new TermQuery(new Term("phrase",phrase)),new NamespaceFilterWrapper(nsf));
 677+ hits = searcher.search(new TermQuery(new Term("phrase",phrase)),new NamespaceFilterWrapper(nsf));
803678 if(hits.length() > 0){
804 - int pfreq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf);
 679+ int pfreq = Integer.parseInt(hits.doc(0).get("freq"));
805680 if(pfreq >= freq && pfreq > minFreq)
806681 res.add(new SuggestResult(phrase,pfreq,2));
807682 }
@@ -817,11 +692,11 @@
818693 }
819694
820695 /** Returns suggestion if joining words makes sense */
821 - public SuggestResult suggestJoinFromTitle(String word1, String word2, NamespaceFilter nsf, int minFreq){
 696+ public SuggestResult suggestJoin(String word1, String word2, NamespaceFilter nsf, int minFreq){
822697 try {
823 - Hits hits = titles.search(new TermQuery(new Term("word",word1+word2)),new NamespaceFilterWrapper(nsf));
 698+ Hits hits = searcher.search(new TermQuery(new Term("word",word1+word2)),new NamespaceFilterWrapper(nsf));
824699 if(hits.length() > 0){
825 - int freq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf);
 700+ int freq = Integer.parseInt(hits.doc(0).get("freq"));
826701 if(freq >= minFreq)
827702 return new SuggestResult(word1+word2,freq,1);
828703 }
@@ -832,55 +707,6 @@
833708 return null;
834709 }
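
Review note: split and join now read frequency straight out of the single spell index; the per-namespace NamespaceFreq decoding is gone. A usage sketch, given an already-constructed Suggest instance sc and NamespaceFilter nsf (the input words are made up):

    SuggestResult split = sc.suggestSplit("britishisles", nsf, 0);
    // non-null only if some "british_isles"-style phrase is at least as frequent
    // as the unsplit word and above minFreq; callers replace '_' with ' '
    SuggestResult join = sc.suggestJoin("data", "base", nsf, 0);
    // non-null only if the concatenation "database" meets minFreq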
835710
836 - /** Suggest phrase from a titles index, if the phrase is correct will return it as first result */
837 - public ArrayList<SuggestResult> suggestPhraseFromTitle(String word1, String word2, int num, NamespaceFilter nsf, int minFreq){
838 - String phrase = word1+"_"+word2;
839 - Query q = makeWordQuery(phrase,"phrase");
840 - Metric m1 = new Metric(word1);
841 - Metric m2 = new Metric(word2);
842 - Metric metric = new Metric(phrase);
843 - try {
844 - TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),POOL/2);
845 - ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
846 - int minfreq = (minFreq == 0)? -1 : minFreq;
847 - // fetch results
848 - for(ScoreDoc sc : docs.scoreDocs){
849 - Document d = titles.doc(sc.doc);
850 - String p = d.get("phrase");
851 - int freq = new NamespaceFreq(d.get("freq")).getFrequency(nsf);
852 - SuggestResult r = new SuggestResult(p,freq,metric);
853 - if(phrase.equals(r.word) && minfreq == -1){
854 - minfreq = r.frequency;
855 - }
856 - String[] words = p.split("_");
857 - SuggestResult r1 = new SuggestResult(words[0],freq,m1);
858 - SuggestResult r2 = new SuggestResult(words[1],freq,m2);
859 - if(r.dist < phrase.length() / 2 && acceptWord(r1,m1) && acceptWord(r2,m2)) // don't add if it will change more than half of the phrase
860 - res.add(r);
861 - }
862 - // filter out
863 - if(minfreq != -1){
864 - for(int i=0;i<res.size();){
865 - if(res.get(i).frequency < minfreq ){
866 - res.remove(i);
867 - } else
868 - i++;
869 - }
870 - }
871 - // sort
872 - Collections.sort(res,new SuggestResult.Comparator());
873 - // get first num results
874 - while(res.size() > num){
875 - res.remove(res.size()-1);
876 - }
877 - return res;
878 - } catch (IOException e) {
879 - log.error("Cannot get suggestions for "+phrase+" at "+iid+" : "+e.getMessage());
880 - e.printStackTrace();
881 - return new ArrayList<SuggestResult>();
882 - }
883 - }
884 -
885711 /** check if two words have same stemmed variants */
886712 public boolean stemsToSame(String word1, String word2, FilterFactory filters){
887713 if(!filters.hasStemmer())
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -62,19 +62,14 @@
6363 if(text.length()>=2){
6464 System.out.println("METAPHONES: "+dmeta.doubleMetaphone(text)+", "+dmeta.doubleMetaphone(text,true));
6565 System.out.println("SUGGEST: ");
66 - for(SuggestResult r : sc.suggestWords(text,10)){
 66+ for(SuggestResult r : sc.suggestWords(text,new NamespaceFilter(ns),10)){
6767 System.out.println(r);
6868 }
69 - System.out.println("SUGGEST_TITLE: ");
70 - for(SuggestResult r : sc.suggestWordsFromTitle(text,new NamespaceFilter(ns),10)){
71 - System.out.println(r);
72 - }
7369
74 - System.out.println("SPLIT: "+sc.suggestSplitFromTitle(text,new NamespaceFilter(ns),0));
 70+ System.out.println("SPLIT: "+sc.suggestSplit(text,new NamespaceFilter(ns),0));
7571 }
7672 if(last != null){
77 - System.out.println("JOIN: "+sc.suggestJoinFromTitle(last,text,new NamespaceFilter(ns),0));
78 - System.out.println("PHRASE: "+sc.suggestPhraseFromTitle(last,text,2,new NamespaceFilter(ns),0));
 73+ System.out.println("JOIN: "+sc.suggestJoin(last,text,new NamespaceFilter(ns),0));
7974 }
8075 last = text;
8176 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java
@@ -20,6 +20,7 @@
2121 import org.wikimedia.lsearch.index.IndexUpdateRecord;
2222 import org.wikimedia.lsearch.index.WikiIndexModifier;
2323 import org.wikimedia.lsearch.index.WikiSimilarity;
 24+import org.wikimedia.lsearch.search.NamespaceFilter;
2425 import org.wikimedia.lsearch.util.HighFreqTerms;
2526
2627 /**
@@ -32,37 +33,85 @@
3334 public class CleanIndexWriter {
3435 static Logger log = Logger.getLogger(CleanIndexWriter.class);
3536 protected IndexId iid;
36 - protected IndexWriter writerMain;
37 - protected IndexWriter writerAll;
 37+ protected IndexWriter writer;
3838 protected FieldBuilder builder;
3939 protected String langCode;
 40+ protected NamespaceFilter nsf;
4041
4142 public static final String[] ENGLISH_STOP_WORDS = {
42 - "a", "an", "and", "are", "as", "at", "be", "but", "by",
43 - "for", "if", "in", "into", "is", "it",
44 - "no", "not", "of", "on", "or", "such",
45 - "that", "the", "their", "then", "there", "these",
46 - "they", "this", "to", "was", "will", "with"
47 - };
 43+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
 44+ "for", "if", "in", "into", "is", "it",
 45+ "no", "not", "of", "on", "or", "such",
 46+ "that", "the", "their", "then", "there", "these",
 47+ "they", "this", "to", "was", "will", "with"
 48+ };
 49+
 50+ public final static String[] FRENCH_STOP_WORDS = {
 51+ "a", "afin", "ai", "ainsi", "apres", "attendu", "au", "aujourd", "auquel", "aussi",
 52+ "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
 53+ "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
 54+ "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
 55+ "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
 56+ "dedans", "dehors", "dela", "depuis", "derriere", "des", "desormais", "desquelles",
 57+ "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
 58+ "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "des", "elle", "elles",
 59+ "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepte", "hormis",
 60+ "hors", "helas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
 61+ "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "la",
 62+ "ma", "mais", "malgre", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
 63+ "moins", "mon", "moyennant", "meme", "memes", "n", "ne", "ni", "non", "nos", "notre",
 64+ "nous", "neanmoins", "notre", "notres", "on", "ont", "ou", "outre", "ou", "par", "parmi",
 65+ "partant", "pas", "passe", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
 66+ "proche", "pres", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
 67+ "qui", "quoi", "quoique", "revoici", "revoila", "s", "sa", "sans", "sauf", "se", "selon",
 68+ "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
 69+ "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
 70+ "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
 71+ "voici", "voila", "vos", "votre", "vous", "vu", "votre", "votres", "y", "a", "ca", "es",
 72+ "ete", "etre", "o"
 73+ };
4874
 75+ public final static String[] GERMAN_STOP_WORDS = {
 76+ "einer", "eine", "eines", "einem", "einen",
 77+ "der", "die", "das", "dass", "daß",
 78+ "du", "er", "sie", "es",
 79+ "was", "wer", "wie", "wir",
 80+ "und", "oder", "ohne", "mit",
 81+ "am", "im", "in", "aus", "auf",
 82+ "ist", "sein", "war", "wird",
 83+ "ihr", "ihre", "ihres",
 84+ "als", "für", "von", "mit",
 85+ "dich", "dir", "mich", "mir",
 86+ "mein", "sein", "kein",
 87+ "durch", "wegen", "wird"
 88+ };
 89+
4990 public CleanIndexWriter(IndexId iid) throws IOException{
 91+ GlobalConfiguration global = GlobalConfiguration.getInstance();
5092 this.iid = iid;
5193 this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
52 - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 94+ this.langCode = global.getLanguage(iid.getDBname());
5395 HashSet<String> stopWords = new HashSet<String>();
54 - if(langCode.equals("en")){
55 - for(String w : ENGLISH_STOP_WORDS)
56 - stopWords.add(w);
 96+ String[] words = null;
 97+ if(langCode.equals("en"))
 98+ words = ENGLISH_STOP_WORDS;
 99+ else if(langCode.equals("de"))
 100+ words = GERMAN_STOP_WORDS;
 101+ else if(langCode.equals("fr"))
 102+ words = FRENCH_STOP_WORDS;
 103+
 104+ if(words != null){
 105+ for(String w : words)
 106+ stopWords.add(w);
57107 } else{
58108 stopWords.addAll(HighFreqTerms.getHighFreqTerms(iid.getDB(),"contents",20));
59109 }
60110 log.info("Using phrase stopwords: "+stopWords);
61111 builder.getBuilder().getFilters().setStopWords(stopWords);
62 - String pathMain = iid.getSpellWords().getTempPath();
63 - //String pathAll = iid.getSpellTitles().getTempPath();
64 - writerMain = open(pathMain);
65 - //writerAll = open(pathAll);
66 - addMetadata(writerMain,"stopWords",stopWords);
 112+ String path = iid.getSpell().getTempPath();
 113+ writer = open(path);
 114+ addMetadata(writer,"stopWords",stopWords);
 115+ nsf = global.getDefaultNamespace(iid);
67116 }
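
Review note: stop words for the phrase filter are now selected per language, with a corpus fallback. The rule above, condensed into a sketch (any language without a built-in list gets its 20 most frequent contents terms instead):

    static String[] builtinStopWords(String langCode) {
        if ("en".equals(langCode)) return ENGLISH_STOP_WORDS;
        if ("de".equals(langCode)) return GERMAN_STOP_WORDS;
        if ("fr".equals(langCode)) return FRENCH_STOP_WORDS;
        return null;  // caller falls back to HighFreqTerms.getHighFreqTerms(db,"contents",20)
    }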
68117
69118 protected IndexWriter open(String path) throws IOException {
@@ -88,16 +137,12 @@
89138 return writer;
90139 }
91140
92 - /** Add to index used for spell_words */
93 - public void addMainArticle(Article a){
94 - if(a.getNamespace().equals("0"))
95 - addArticle(a,writerMain);
 141+ /** Add to index used for spell-check */
 142+ public void addArticle(Article a){
 143+ if(nsf.contains(Integer.parseInt(a.getNamespace())))
 144+ addArticle(a,writer);
96145 }
97 - /** Add to inde used for spell_titles */
98 - public void addAllArticle(Article a){
99 - //addArticle(a,writerAll);
100 - }
101 -
 146+
102147 /** Add single article */
103148 protected void addArticle(Article a, IndexWriter writer){
104149 if(!WikiIndexModifier.checkAddPreconditions(a,langCode))
@@ -121,12 +166,10 @@
122167 * @throws IOException */
123168 public void close() throws IOException{
124169 try{
125 - writerMain.optimize();
126 - writerMain.close();
127 - //writerAll.optimize();
128 - //writerAll.close();
 170+ writer.optimize();
 171+ writer.close();
129172 } catch(IOException e){
130 - log.warn("I/O error optimizing/closing index at "+iid.getTempPath());
 173+ log.error("I/O error optimizing/closing index at "+iid.getTempPath()+" : "+e.getMessage());
131174 throw e;
132175 }
133176 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java
@@ -2,46 +2,18 @@
33
44 import java.io.IOException;
55 import java.io.InputStream;
6 -import java.util.ArrayList;
7 -import java.util.HashMap;
8 -import java.util.HashSet;
9 -import java.util.Hashtable;
10 -import java.util.Map.Entry;
116
127 import org.apache.log4j.Logger;
13 -import org.apache.lucene.analysis.Token;
14 -import org.apache.lucene.document.Document;
15 -import org.apache.lucene.index.IndexReader;
16 -import org.apache.lucene.index.Term;
17 -import org.apache.lucene.search.CachingWrapperFilter;
18 -import org.apache.lucene.search.Filter;
19 -import org.apache.lucene.search.Hits;
20 -import org.apache.lucene.search.IndexSearcher;
21 -import org.apache.lucene.search.PhraseQuery;
22 -import org.apache.lucene.search.QueryFilter;
23 -import org.apache.lucene.search.TermQuery;
24 -import org.apache.lucene.store.FSDirectory;
258 import org.mediawiki.dumper.ProgressFilter;
269 import org.mediawiki.dumper.Tools;
2710 import org.mediawiki.importer.XmlDumpReader;
28 -import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
29 -import org.wikimedia.lsearch.analyzers.WikiQueryParser;
3011 import org.wikimedia.lsearch.config.Configuration;
3112 import org.wikimedia.lsearch.config.GlobalConfiguration;
3213 import org.wikimedia.lsearch.config.IndexId;
33 -import org.wikimedia.lsearch.config.IndexRegistry;
34 -import org.wikimedia.lsearch.importer.DumpImporter;
3514 import org.wikimedia.lsearch.index.IndexThread;
36 -import org.wikimedia.lsearch.search.NamespaceFilter;
37 -import org.wikimedia.lsearch.spell.api.LuceneDictionary;
38 -import org.wikimedia.lsearch.spell.api.NamespaceFreq;
39 -import org.wikimedia.lsearch.spell.api.TitleIndexer;
40 -import org.wikimedia.lsearch.spell.api.WordsIndexer;
41 -import org.wikimedia.lsearch.spell.api.Dictionary.Word;
 15+import org.wikimedia.lsearch.spell.api.SpellCheckIndexer;
4216 import org.wikimedia.lsearch.util.Localization;
43 -import org.wikimedia.lsearch.util.StringCounter;
4417 import org.wikimedia.lsearch.util.UnicodeDecomposer;
45 -import org.wikimedia.lsearch.util.StringCounter.Count;
4618
4719 /**
4820 * Build suggest (did you mean...) indexes
@@ -55,12 +27,12 @@
5628 String inputfile = null;
5729 String dbname = null;
5830
59 - System.out.println("MediaWiki Lucene search indexer - build suggestions index.\n");
 31+ System.out.println("MediaWiki Lucene search indexer - build spelling suggestion index.\n");
6032
6133 Configuration.open();
6234
6335 if(args.length !=1 && args.length != 2){
64 - System.out.println("Syntax: java SpellCheckBuilder <dbname> [<dumpfile>]");
 36+ System.out.println("Syntax: java SuggestBuilder <dbname> [<dumpfile>]");
6537 return;
6638 }
6739 inputfile = args.length>1? args[1] : null;
@@ -75,10 +47,9 @@
7648
7749 long start = System.currentTimeMillis();
7850 IndexId iid = IndexId.get(dbname);
79 - IndexId words = iid.getSpellWords();
80 - IndexId titles = iid.getSpellTitles();
81 - if(words == null || titles == null){
82 - log.fatal("Index "+iid+" doesn't have both spell-check indexes assigned. Enable them in global configuration.");
 51+ IndexId spell = iid.getSpell();
 52+ if(spell == null){
 53+ log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable it in global configuration.");
8354 return;
8455 }
8556
@@ -95,7 +66,7 @@
9667
9768 // make fresh clean index
9869 try {
99 - CleanIndexImporter importer = new CleanIndexImporter(words,langCode);
 70+ CleanIndexImporter importer = new CleanIndexImporter(spell,langCode);
10071 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000));
10172 reader.readDump();
10273 importer.closeIndex();
@@ -106,36 +77,19 @@
10778 }
10879 }
10980 }
110 - // make words index
111 - /*log.info("Making words index");
112 - try {
113 - LuceneDictionary dict = new LuceneDictionary(IndexReader.open(words.getTempPath()),"contents");
114 - WordsIndexer writer = new WordsIndexer(words.getImportPath(),(dbname.equals("wikilucene")? 3 : 50));
115 - writer.createIndex();
116 - Word word;
117 - while((word = dict.next()) != null){
118 - writer.addWord(word);
119 - }
120 - writer.closeAndOptimze();
121 - } catch (IOException e) {
122 - log.fatal("Cannot open clean dictionary for "+words+" : "+e.getMessage());
123 - e.printStackTrace();
124 - return;
125 - }*/
12681
127 - log.info("Making suggest title index");
 82+ log.info("Making spell-check index");
12883 // make phrase index
12984
130 - TitleIndexer tInx = new TitleIndexer(titles);
 85+ SpellCheckIndexer tInx = new SpellCheckIndexer(spell);
13186 tInx.createFromTempIndex();
13287
13388 long end = System.currentTimeMillis();
13489
13590 // make snapshots
136 - //IndexThread.makeIndexSnapshot(words,words.getImportPath());
137 - IndexThread.makeIndexSnapshot(titles,titles.getImportPath());
 91+ IndexThread.makeIndexSnapshot(spell,spell.getImportPath());
13892
139 - System.out.println("Finished making suggest index in "+formatTime(end-start));
 93+ System.out.println("Finished making spell-check index in "+formatTime(end-start));
14094 }
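
Review note: end to end, the rebuilt pipeline now touches exactly one spell-check index per database. Condensed from the code above (dump-stream setup and error handling elided):

    IndexId spell = iid.getSpell();                          // single spell-check index
    CleanIndexImporter importer = new CleanIndexImporter(spell, langCode);
    new XmlDumpReader(input, new ProgressFilter(importer, 1000)).readDump();
    importer.closeIndex();                                   // temp "clean" index done
    new SpellCheckIndexer(spell).createFromTempIndex();      // ngram + phrase index
    IndexThread.makeIndexSnapshot(spell, spell.getImportPath());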
14195
14296 private static String formatTime(long l) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/WordsIndexer.java
@@ -1,58 +0,0 @@
2 -package org.wikimedia.lsearch.spell.api;
3 -
4 -import java.io.IOException;
5 -
6 -import org.apache.log4j.Logger;
7 -import org.apache.lucene.analysis.SimpleAnalyzer;
8 -import org.apache.lucene.document.Document;
9 -import org.apache.lucene.document.Field;
10 -import org.wikimedia.lsearch.spell.api.Dictionary.Word;
11 -import org.wikimedia.lsearch.spell.dist.DoubleMetaphone;
12 -
13 -/**
14 - * Create the index with words. Overview:
15 - * - 1 word = 1 document
16 - * - split the word into ngrams and index those
17 - *
18 - * @author rainman
19 - *
20 - */
21 -public class WordsIndexer {
22 - static Logger log = Logger.getLogger(WordsIndexer.class);
23 - protected DoubleMetaphone dmeta;
24 - /** If word occurs less that minFreq times, it will be discarded */
25 - protected int minFreq;
26 - protected NgramIndexer indexer;
27 - String path;
28 -
29 - public WordsIndexer(String path, int minFreq) throws IOException {
30 - this.path = path;
31 - this.minFreq = minFreq;
32 - this.dmeta = new DoubleMetaphone();
33 - this.indexer = new NgramIndexer();
34 - }
35 -
36 - public void createIndex() throws IOException{
37 - indexer.createIndex(path, new SimpleAnalyzer());
38 - }
39 -
40 - /** Add word to the index, make sure index is open */
41 - public void addWord(Word word){
42 - if(word.frequency < minFreq)
43 - return;
44 - if(word.getWord().length() < 2)
45 - return;
46 - Document doc = new Document();
47 - indexer.createNgramFields(doc,"",word.word);
48 - doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED));
49 - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.NO));
50 - doc.add(new Field("metaphone1",dmeta.doubleMetaphone(word.word), Field.Store.NO, Field.Index.UN_TOKENIZED));
51 - doc.add(new Field("metaphone2",dmeta.doubleMetaphone(word.word,true), Field.Store.NO, Field.Index.UN_TOKENIZED));
52 -
53 - indexer.addDocument(doc);
54 - }
55 -
56 - public void closeAndOptimze() throws IOException{
57 - indexer.closeAndOptimize();
58 - }
59 -}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java
@@ -1,527 +0,0 @@
2 -package org.wikimedia.lsearch.spell.api;
3 -
4 -import java.io.IOException;
5 -import java.util.ArrayList;
6 -import java.util.Collection;
7 -import java.util.HashMap;
8 -import java.util.HashSet;
9 -import java.util.Map.Entry;
10 -
11 -import org.apache.log4j.Logger;
12 -import org.apache.lucene.analysis.SimpleAnalyzer;
13 -import org.apache.lucene.analysis.Token;
14 -import org.apache.lucene.document.Document;
15 -import org.apache.lucene.document.Field;
16 -import org.apache.lucene.index.IndexReader;
17 -import org.apache.lucene.index.Term;
18 -import org.apache.lucene.index.TermDocs;
19 -import org.apache.lucene.search.Hits;
20 -import org.apache.lucene.search.IndexSearcher;
21 -import org.apache.lucene.search.MultiSearcher;
22 -import org.apache.lucene.search.PhraseQuery;
23 -import org.apache.lucene.search.Query;
24 -import org.apache.lucene.search.SearchableMul;
25 -import org.apache.lucene.search.Searcher;
26 -import org.apache.lucene.search.TermQuery;
27 -import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
28 -import org.wikimedia.lsearch.analyzers.FieldNameFactory;
29 -import org.wikimedia.lsearch.config.GlobalConfiguration;
30 -import org.wikimedia.lsearch.config.IndexId;
31 -import org.wikimedia.lsearch.config.IndexRegistry;
32 -import org.wikimedia.lsearch.index.IndexUpdateRecord;
33 -import org.wikimedia.lsearch.index.WikiIndexModifier;
34 -import org.wikimedia.lsearch.search.IndexSearcherMul;
35 -import org.wikimedia.lsearch.search.WikiSearcher;
36 -import org.wikimedia.lsearch.spell.api.Dictionary.Word;
37 -import org.wikimedia.lsearch.spell.dist.DoubleMetaphone;
38 -import org.wikimedia.lsearch.util.HighFreqTerms;
39 -
40 -/**
41 - * Index words and phrases from article titles.
42 - *
43 - * Fields:
44 - * * word - word from title
45 - * * phrase - phrase like douglas_adams
46 - * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3)
47 - * * namespace - namespaces where the word/phrase is present
48 - *
49 - * @author rainman
50 - *
51 - */
52 -public class TitleIndexer {
53 - static Logger log = Logger.getLogger(TitleIndexer.class);
54 - protected NgramIndexer ngramWriter;
55 - public static final boolean NEW_INDEX = true;
56 - protected boolean createNew;
57 - protected int minWordFreq, minPhraseFreq;
58 - protected IndexId iid,titles;
59 - protected String langCode;
60 - protected IndexRegistry registry;
61 - protected DoubleMetaphone dmeta = new DoubleMetaphone();
62 -
63 - public TitleIndexer(IndexId iid){
64 - this(iid,false);
65 - }
66 -
67 - public TitleIndexer(IndexId titles, boolean createNew){
68 - this.titles = titles;
69 - this.iid = titles.getDB();
70 - GlobalConfiguration global = GlobalConfiguration.getInstance();
71 - this.minWordFreq = global.getIntDBParam(iid.getDBname(),"spell_titles","wordsMinFreq",3);
72 - this.minPhraseFreq = global.getIntDBParam(iid.getDBname(),"spell_titles","phrasesMinFreq",1);
73 - this.createNew = createNew;
74 - this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
75 - this.ngramWriter = new NgramIndexer();
76 - this.registry = IndexRegistry.getInstance();
77 - }
78 -
79 - protected Searcher makeSearcher(IndexId main) throws IOException{
80 - if(main.isSingle())
81 - return new IndexSearcherMul(registry.getLatestSnapshot(main).path);
82 - else{
83 - ArrayList<IndexSearcherMul> searchers = new ArrayList<IndexSearcherMul>();
84 - for(String part : main.getPhysicalIndexes()){
85 - searchers.add(new IndexSearcherMul(registry.getLatestSnapshot(IndexId.get(part)).path));
86 - }
87 - return new MultiSearcher(searchers.toArray(new SearchableMul[]{}));
88 - }
89 - }
90 -
91 - /** Returns {NamespaceFreq, HashSet<Integer>} */
92 - protected Object[] getFreqAndNamespaces(Searcher searcher, int[] namespaces, int[] ranks, Query q) throws IOException {
93 - Hits hits = searcher.search(q);
94 - NamespaceFreq wnf = new NamespaceFreq();
95 - HashSet<Integer> ns = new HashSet<Integer>();
96 - for(int i=0;i<hits.length();i++){
97 - /*Document d = hits.doc(i);
98 - int n = Integer.parseInt(d.get("namespace"));
99 - String rr = d.get("rank");
100 - int r = rr==null? 0 : Integer.parseInt(d.get("rank")); */
101 - int id = hits.id(i);
102 - int n = namespaces[id];
103 - int r = ranks[id];
104 - wnf.incFrequency(n,r);
105 - ns.add(n);
106 - }
107 - return new Object[] {wnf,ns};
108 - }
109 -
110 - protected Object[] getFreqAndNamespaces(Searcher searcher, int[] ns, int[] ranks, String word) throws IOException {
111 - return getFreqAndNamespaces(searcher,ns,ranks,new TermQuery(new Term("title",word)));
112 - }
113 -
114 - protected Object[] getFreqAndNamespaces(Searcher searcher, int[] ns, int[] ranks, String[] phrase) throws IOException{
115 - PhraseQuery pq = new PhraseQuery();
116 - for(String p : phrase){
117 - pq.add(new Term("title",p));
118 - }
119 - return getFreqAndNamespaces(searcher,ns,ranks,pq);
120 - }
121 -
122 - protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, Query q) throws IOException{
123 - Hits hits = searcher.search(q);
124 - NamespaceFreq wnf = new NamespaceFreq();
125 - //wnf.setFrequency(-10,hits.length());
126 - for(int j=0;j<hits.length();j++){
127 - wnf.incFrequency(namespaces[hits.id(j)]);
128 - }
129 - return wnf;
130 - }
131 -
132 - /** Get frequency for a single word */
133 - protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String word) throws IOException{
134 - return getFrequency(searcher,namespaces,new TermQuery(new Term("contents",word)));
135 - }
136 -
137 - /** Get frequency of phrase (invidual words as array) */
138 - protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{
139 - PhraseQuery pq = new PhraseQuery();
140 - for(String p : phrase){
141 - pq.add(new Term("contents",p));
142 - }
143 - return getFrequency(searcher,namespaces,pq);
144 - }
145 -
146 - /** Get namespaces where word appears in title */
147 - protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, Query q) throws IOException{
148 - Hits hits = searcher.search(q);
149 - HashSet<Integer> ns = new HashSet<Integer>();
150 - for(int j=0;j<hits.length();j++){
151 - ns.add(namespaces[hits.id(j)]);
152 - }
153 - return ns;
154 - }
155 -
156 - protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String word) throws IOException{
157 - return getNamespaces(searcher,namespaces,new TermQuery(new Term("title",word)));
158 - }
159 -
160 - protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{
161 - PhraseQuery pq = new PhraseQuery();
162 - for(String p : phrase){
163 - pq.add(new Term("title",p));
164 - }
165 - return getNamespaces(searcher,namespaces,pq);
166 - }
167 -
168 - /**
169 - * Returns the namespace for each doc_id
170 - * @throws IOException
171 - * @FIXME: assumes optimized index
172 - */
173 - protected Object[] makeNamespaceMap(Searcher searcher) throws IOException{
174 - log.debug("Making namespace map...");
175 - int[] namespaces = new int[searcher.maxDoc()];
176 - int[] ranks = new int[searcher.maxDoc()];
177 - for(int i=0;i<namespaces.length;i++){
178 - namespaces[i] = -100;
179 - Document doc = searcher.doc(i);
180 - if(doc != null){
181 - namespaces[i] = Integer.parseInt(doc.get("namespace"));
182 - String rr = doc.get("rank");
183 - ranks[i] = rr==null? 0 : Integer.parseInt(rr);
184 - }
185 - }
186 - log.debug("Done making namespace map");
187 - return new Object[] {namespaces,ranks};
188 - }
189 -
190 - /**
191 - * Create new index from an index *snapshot* by reading all terms in the index.
192 - * Index will be created in the import directory.
193 - */
194 - @SuppressWarnings("unchecked")
195 - public void createFromSnapshot(){
196 - String path = titles.getImportPath(); // dest where to put index
197 - try{
198 - log.debug("Creating new suggest index");
199 - ngramWriter.createIndex(path,new SimpleAnalyzer());
200 - Searcher searcher = makeSearcher(iid);
201 - //IndexSearcher searcher = new IndexSearcherMul(iid.getSpellTitles().getTempPath());
202 - // map doc_id -> namespace
203 - //int[] namespaces = makeNamespaceMap(searcher);
204 - Object[] nsr = makeNamespaceMap(searcher);
205 - int[] namespaces = (int[]) nsr[0];
206 - int[] ranks = (int[]) nsr[1];
207 - int totalAdded = 0, lastReport=0;
208 -
209 - for(String dbrole : iid.getPhysicalIndexes()){
210 - log.info("Processing index "+dbrole);
211 - if(!ngramWriter.isOpen()) // if we closed the index previously
212 - ngramWriter.reopenIndex(path,new SimpleAnalyzer());
213 -
214 - IndexId part = IndexId.get(dbrole);
215 - //IndexReader ir = searcher.getIndexReader();
216 - IndexReader ir = IndexReader.open(registry.getLatestSnapshot(part).path);
217 - LuceneDictionary dict = new LuceneDictionary(ir,"title");
218 - IndexSearcher ngramSearcher = new IndexSearcher(path);
219 - Word word;
220 - // get all words, and all phrases beginning with word
221 - while((word = dict.next()) != null){
222 - log.debug("Processing word "+word);
223 - String w = word.getWord();
224 -
225 - // check if word is already in the index
226 - if(ngramSearcher.docFreq(new Term("word",w)) != 0)
227 - continue;
228 -
229 - int freq = searcher.docFreq(new Term("contents",w));
230 - if(freq > minWordFreq){
231 - // index word
232 - Object[] ret = getFreqAndNamespaces(searcher,namespaces,ranks,w);
233 - NamespaceFreq wnf = (NamespaceFreq) ret[0];
234 - Collection<Integer> wns = (Collection<Integer>) ret[1];
235 - //NamespaceFreq wnf = getFrequency(searcher,namespaces,w);
236 - if(wnf.getFrequency() > minWordFreq){
237 - //Collection<Integer> wns = getNamespaces(searcher,namespaces,w);
238 - addWord(w,wnf,wns);
239 - }
240 - }
241 - if(freq > minPhraseFreq){
242 - // index phrases
243 - HashSet<String> phrases = new HashSet<String>();
244 - Hits hits = searcher.search(new TermQuery(new Term("title",w)));
245 - // from titles find phrases beginning with word
246 - for(int i=0;i<hits.length();i++){
247 - Document doc = hits.doc(i);
248 - // tokenize to make phrases
249 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false);
250 - ArrayList<Token> tokens = parser.parse();
251 - for(int j=0;j<tokens.size()-1;j++){
252 - Token t = tokens.get(j);
253 - // ignore aliases
254 - if(t.getPositionIncrement() == 0)
255 - continue;
256 - // find phrases beginning with the target word
257 - if(w.equals(t.termText())){
258 - phrases.add(t.termText()+"_"+tokens.get(j+1).termText());
259 - }
260 - }
261 - }
262 - log.debug("Adding "+phrases.size()+" phrases "+phrases);
263 - // index phrases
264 - for(String phrase : phrases){
265 - Object[] ret = getFreqAndNamespaces(searcher,namespaces,ranks,phrase.split("_"));
266 - NamespaceFreq nf = (NamespaceFreq) ret[0];
267 - Collection<Integer> pns = (Collection<Integer>) ret[1];
268 - //NamespaceFreq nf = getFrequency(searcher,namespaces,phrase.split("_"));
269 - if(nf.getFrequency() > minPhraseFreq){
270 - //Collection<Integer> pns = getNamespaces(searcher,namespaces,phrase.split("_"));
271 - addPhrase(phrase,nf,pns,false);
272 - }
273 - }
274 - totalAdded += phrases.size();
275 - if(totalAdded - lastReport > 1000){
276 - log.info("Processed "+totalAdded+" phrases");
277 - lastReport = totalAdded;
278 - }
279 - }
280 - }
281 - log.debug("Finished index "+iid+", closing/optimizing.");
282 - ir.close();
283 - ngramSearcher.close();
284 - ngramWriter.closeAndOptimize();
285 - }
286 - searcher.close();
287 - } catch (IOException e) {
288 - log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
289 - e.printStackTrace();
290 - return;
291 - }
292 - }
293 -
294 - public void createFromTempIndex(){
295 - String path = titles.getImportPath(); // dest where to put index
296 - FieldNameFactory fields = new FieldNameFactory();
297 - final String title = fields.title();
298 - final String contents = fields.contents();
299 - final String alttitle = fields.alttitle();
300 - try {
301 - ngramWriter.createIndex(path,new SimpleAnalyzer());
302 - IndexReader ir = IndexReader.open(iid.getSpellWords().getTempPath());
303 - HashSet<String> stopWords = new HashSet<String>();
304 - TermDocs td = ir.termDocs(new Term("metadata_key","stopWords"));
305 - if(td.next()){
306 - for(String s : ir.document(td.doc()).get("metadata_value").split(" "))
307 - stopWords.add(s);
308 - }
309 - addMetadata("stopWords",stopWords);
310 -
311 - // add all titles
312 - for(int i=0;i<ir.maxDoc();i++){
313 - if(ir.isDeleted(i))
314 - continue;
315 - String titleText = ir.document(i).get(title);
316 - if(titleText != null)
317 - addTitle(titleText);
318 - // FIXME: alttitle field is not generated!
319 - for(int j=0;j<WikiIndexModifier.ALT_TITLES;j++){
320 - String altTitleText = ir.document(i).get(alttitle+j);
321 - if(altTitleText != null)
322 - addTitle(altTitleText);
323 - }
324 - }
325 -
326 - LuceneDictionary dict = new LuceneDictionary(ir,contents);
327 - Word word;
328 - while((word = dict.next()) != null){
329 - String w = word.getWord();
330 - int freq = word.getFrequency();
331 - if(w.contains("_")){ // phrase
332 - String[] words = w.split("_+");
333 - if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1]))
334 - continue;
335 - boolean allowed = true;
336 - for(String ww : words){
337 - // allow only those phrases consisting of title words
338 - if(ir.docFreq(new Term(title,ww)) == 0){
339 - allowed = false;
340 - break;
341 - }
342 - }
343 - if(allowed && freq > minPhraseFreq){
344 - boolean inTitle = ir.docFreq(new Term(title,w))!= 0;
345 - NamespaceFreq nsf = new NamespaceFreq();
346 - nsf.setFrequency(0,freq);
347 - ArrayList<Integer> nss = new ArrayList<Integer>();
348 - nss.add(0);
349 - addPhrase(w,nsf,nss,inTitle);
350 - }
351 - } else{
352 - if(freq > minWordFreq){
353 - NamespaceFreq nsf = new NamespaceFreq();
354 - nsf.setFrequency(0,freq);
355 - ArrayList<Integer> nss = new ArrayList<Integer>();
356 - nss.add(0);
357 - addWord(w,nsf,nss);
358 - }
359 - }
360 - }
361 - //ngramWriter.closeAndOptimize();
362 - //ngramWriter.reopenIndex(path,new SimpleAnalyzer());
363 - //IndexReader ngramReader = ngramWriter.getReader();
364 - // add stuff from titles with stop words
365 - dict = new LuceneDictionary(ir,title);
366 - while((word = dict.next()) != null){
367 - String w = word.getWord();
368 - if(w.contains("_")){ // phrase
369 - String[] words = w.split("_+");
370 - if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1])){
371 - int freq = ir.docFreq(new Term("contents",w));
372 - NamespaceFreq nsf = new NamespaceFreq();
373 - nsf.setFrequency(0,freq);
374 - ArrayList<Integer> nss = new ArrayList<Integer>();
375 - nss.add(0);
376 - addPhrase(w,nsf,nss,true);
377 - }
378 - }
379 - }
380 - ngramWriter.closeAndOptimize();
381 - ir.close();
382 -
383 - } catch (IOException e) {
384 - log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
385 - e.printStackTrace();
386 - return;
387 - }
388 -
389 - }
390 -
391 - /**
392 - * Register a title in the index, without tokenization, just lowercase.
393 - *
394 - * @param title
395 - */
396 - public void addTitle(String title){
397 - Document doc = new Document();
398 - doc.add(new Field("title", title.toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED));
399 - ngramWriter.addDocument(doc);
400 - }
401 - /**
402 - * Add phrase to index
403 - *
404 - * @param phrase - 2+ words joined with underscore
405 - * @param nf - frequencies of phrase in various namespaces
406 - * @param namespaces - namespaces where phrase appears in title
407 - */
408 - public void addPhrase(String phrase, NamespaceFreq nf, Collection<Integer> namespaces, boolean inTitle){
409 - String freq = nf.serialize(minPhraseFreq);
410 - if(freq.length() == 0)
411 - return;
412 - if(phrase.length() <= 2){
413 - log.warn("Invalid phrase: "+phrase);
414 - return;
415 - }
416 - Document doc = new Document();
417 - //ngramWriter.createNgramFields(doc,"phrase",phrase);
418 - doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
419 - doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO));
420 - for(Integer ns : namespaces){
421 - doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED));
422 - }
423 - if(inTitle)
424 - doc.add(new Field("intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED));
425 -
426 - ngramWriter.addDocument(doc);
427 - }
428 -
429 - /**
430 - * Add into metadata_key and metadata_value.
431 - * Collection is assumed to contain words (without spaces)
432 - */
433 - public void addMetadata(String key, Collection<String> values){
434 - StringBuilder sb = new StringBuilder();
435 - // serialize by joining with spaces
436 - for(String val : values){
437 - if(sb.length() != 0)
438 - sb.append(" ");
439 - sb.append(val);
440 - }
441 - Document doc = new Document();
442 - doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED));
443 - doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO));
444 -
445 - ngramWriter.addDocument(doc);
446 - }
447 -
448 - /** Add ordinary word to the index
449 - *
450 - * @param word - word to add
451 - * @param nf - frequencies in namespaces
452 - * @param namespaces - namespaces where word appears in title
453 - */
454 - public void addWord(String word, NamespaceFreq nf, Collection<Integer> namespaces){
455 - if(word.length() < 2)
456 - return;
457 - String freq = nf.serialize();
458 - if(freq.length() == 0)
459 - return;
460 - Document doc = new Document();
461 - ngramWriter.createNgramFields(doc,"word",word);
462 - doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED));
463 - doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO));
464 - doc.add(new Field("meta1",dmeta.doubleMetaphone(word), Field.Store.YES, Field.Index.NO));
465 - doc.add(new Field("meta2",dmeta.doubleMetaphone(word,true), Field.Store.YES, Field.Index.NO));
466 - for(Integer ns : namespaces){
467 - doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED));
468 - }
469 -
470 - ngramWriter.addDocument(doc);
471 - }
472 -
473 - /** Update the index */
474 - public void update(Collection<IndexUpdateRecord> records){
475 - /*String path = iid.getIndexPath();
476 - try{
477 - log.info("Updating suggest index for "+iid+" with "+records.size());
478 - IndexReader ir = IndexReader.open(path);
479 - Searcher searcher = makeSearcher(iid.getDB());
480 - // TODO: don't use namespaces, but fetch fields, it's likely to be more efficient for small updates
481 - int[] namespaces = makeNamespaceMap(searcher);
482 - // get all words and phrases
483 - HashSet<String> words = new HashSet<String>();
484 - HashSet<String> phrases = new HashSet<String>();
485 - for(IndexUpdateRecord rec : records){
486 - String title = rec.getArticle().getTitle();
487 - ArrayList<Token> tokens = new FastWikiTokenizerEngine(title,langCode,false).parse();
488 - String last = null;
489 - // register word/phrases
490 - for(Token t : tokens){
491 - String w = t.termText();
492 - words.add(w);
493 - if(last != null){
494 - phrases.add(last+"_"+w);
495 - }
496 - last = w;
497 - }
498 - }
499 - searcher.close();
500 -
501 - // batch delete old values
502 - for(String word : words){
503 - ir.deleteDocuments(new Term("word",word));
504 - }
505 - for(String phrase : phrases){
506 - ir.deleteDocuments(new Term("phrase",phrase));
507 - }
508 - ir.close();
509 - ngramWriter.reopenIndex(path,new SimpleAnalyzer());
510 -
511 - // batch add new stuff
512 - for(String word : words){
513 - addWord(word,getFrequency(searcher,namespaces,word),getNamespaces(searcher,namespaces,word));
514 - }
515 - for(String phrase : phrases){
516 - String[] ph = phrase.split("_");
517 - addPhrase(phrase,getFrequency(searcher,namespaces,ph),getNamespaces(searcher,namespaces,ph));
518 - }
519 -
520 - ngramWriter.close();
521 - } catch(IOException e){
522 - log.error("Cannot update index for "+iid+" : "+e.getMessage());
523 - e.printStackTrace();
524 - return;
525 - }*/
526 - }
527 -
528 -}
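The phrase harvesting in createFromSnapshot above pairs each title token with its immediate successor (tokens with a position increment of 0 are aliases and get skipped), producing candidates like douglas_adams. A minimal sketch of that bigram construction in plain Java; a whitespace split stands in for FastWikiTokenizerEngine, and the sample titles are invented:

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // Sketch of the bigram-phrase harvesting in createFromSnapshot above.
    // Assumption: a whitespace split stands in for FastWikiTokenizerEngine.
    public class PhraseSketch {
        static Set<String> phrasesStartingWith(String word, List<String> titles) {
            Set<String> phrases = new HashSet<String>();
            for (String title : titles) {
                String[] tokens = title.toLowerCase().split("\\s+");
                for (int j = 0; j < tokens.length - 1; j++) {
                    if (word.equals(tokens[j]))
                        phrases.add(tokens[j] + "_" + tokens[j + 1]); // word_next
                }
            }
            return phrases;
        }

        public static void main(String[] args) {
            List<String> titles = new ArrayList<String>();
            titles.add("Douglas Adams");
            titles.add("Douglas fir");
            // prints [douglas_adams, douglas_fir] (in some order)
            System.out.println(phrasesStartingWith("douglas", titles));
        }
    }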
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NamespaceFreq.java
@@ -1,117 +0,0 @@
2 -package org.wikimedia.lsearch.spell.api;
3 -
4 -import java.util.BitSet;
5 -import java.util.HashMap;
6 -import java.util.Set;
7 -import java.util.Map.Entry;
8 -
9 -import org.wikimedia.lsearch.search.NamespaceFilter;
10 -
11 -/** Mapping from namespaces to frequencies */
12 -public class NamespaceFreq {
13 - class IntWrap{
14 - int val = 0;
15 - IntWrap() {}
16 - IntWrap(int value){ val = value; }
17 - IntWrap(String value){ val = Integer.parseInt(value); }
18 - public String toString(){ return ""+val; }
19 - }
20 - /** namespace -> frequency */
21 - protected HashMap<Integer,IntWrap> nsmap = new HashMap<Integer,IntWrap>();
22 -
23 - /** Construct from serialized field value */
24 - public NamespaceFreq(String field){
25 - String[] pairs = field.split(" ");
26 - for(String pair : pairs){
27 - if(pair.length() == 0)
28 - continue;
29 - String[] nsf = pair.split(":");
30 - if(nsf.length == 2)
31 - nsmap.put(Integer.parseInt(nsf[0]),new IntWrap(nsf[1]));
32 - else {
33 - throw new RuntimeException("Bad syntax for namespace-frequency pairs : "+field);
34 - }
35 - }
36 - }
37 -
38 - public NamespaceFreq() {
39 - }
40 -
41 - /** Get frequency of term for one namespace */
42 - public int getFrequency(int namespace){
43 - if(nsmap.containsKey(-10))
44 - return nsmap.get(-10).val;
45 - else if(nsmap.containsKey(namespace))
46 - return nsmap.get(namespace).val;
47 - else
48 - return 0;
49 - }
50 -
51 - /** Get frequency of term over some set of namespaces */
52 - public int getFrequency(NamespaceFilter nsf){
53 - if(nsmap.containsKey(-10))
54 - return nsmap.get(-10).val;
55 - int sum = 0;
56 - BitSet ns = nsf.getIncluded();
57 - for(int i=ns.nextSetBit(0); i>=0; i=ns.nextSetBit(i+1)){
58 - sum += getFrequency(i);
59 - }
60 - return sum;
61 - }
62 -
63 - /** Get total frequency of term over all namespaces */
64 - public int getFrequency(){
65 - if(nsmap.containsKey(-10))
66 - return nsmap.get(-10).val;
67 - int sum = 0;
68 - for(IntWrap i : nsmap.values()){
69 - sum += i.val;
70 - }
71 - return sum;
72 - }
73 -
74 - /** Serialize only if total frequency is at least minFreq */
75 - public String serialize(int minFreq){
76 - StringBuilder sb = new StringBuilder();
77 - int sum = 0;
78 - for(Entry<Integer,IntWrap> e : nsmap.entrySet()){
79 - sum += e.getValue().val;
80 - sb.append(e.getKey());
81 - sb.append(":");
82 - sb.append(e.getValue());
83 - sb.append(" ");
84 - }
85 - if(sum < minFreq)
86 - return "";
87 - return sb.toString();
88 - }
89 -
90 - /** Serialize into a field format: ns:freq ns2:freq2 ... */
91 - public String serialize(){
92 - return serialize(0);
93 - }
94 -
95 - /** Modify frequency value for some namespace */
96 - public void setFrequency(int namespace, int frequency){
97 - nsmap.put(namespace,new IntWrap(frequency));
98 - }
99 -
100 - /** Increment term frequency in namespace */
101 - public void incFrequency(int namespace){
102 - incFrequency(namespace,1);
103 - }
104 -
105 - /** Increment term frequency in namespace */
106 - public void incFrequency(int namespace, int inc){
107 - if(nsmap.containsKey(namespace)){
108 - nsmap.get(namespace).val+=inc;
109 - } else
110 - nsmap.put(namespace,new IntWrap(inc));
111 - }
112 -
113 - /** Get all namespaces where term has nonzero frequency */
114 - public Set<Integer> getNamespaces(){
115 - return nsmap.keySet();
116 - }
117 -
118 -}
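NamespaceFreq, deleted above, carried per-namespace counts in a space-separated ns:freq field, with namespace -10 acting as a wildcard that overrides the sum. A round-trip sketch of just that field format in plain Java; the counts are invented:

    import java.util.HashMap;
    import java.util.Map;

    // Round-trip sketch of the deleted NamespaceFreq field format
    // ("ns:freq ns2:freq2 ..."); the counts are invented.
    public class NsFreqFormatSketch {
        static Map<Integer, Integer> parse(String field) {
            Map<Integer, Integer> map = new HashMap<Integer, Integer>();
            for (String pair : field.split(" ")) {
                if (pair.length() == 0)
                    continue;
                String[] nsf = pair.split(":");
                map.put(Integer.parseInt(nsf[0]), Integer.parseInt(nsf[1]));
            }
            return map;
        }

        public static void main(String[] args) {
            Map<Integer, Integer> m = parse("0:234 1:12 14:3");
            int total = 0;
            for (int freq : m.values())
                total += freq; // total frequency over all namespaces
            System.out.println(total); // 249
        }
    }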
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -0,0 +1,230 @@
 2+package org.wikimedia.lsearch.spell.api;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+import java.util.HashMap;
 8+import java.util.HashSet;
 9+import java.util.Map.Entry;
 10+
 11+import org.apache.log4j.Logger;
 12+import org.apache.lucene.analysis.SimpleAnalyzer;
 13+import org.apache.lucene.analysis.Token;
 14+import org.apache.lucene.document.Document;
 15+import org.apache.lucene.document.Field;
 16+import org.apache.lucene.index.IndexReader;
 17+import org.apache.lucene.index.Term;
 18+import org.apache.lucene.index.TermDocs;
 19+import org.apache.lucene.search.Hits;
 20+import org.apache.lucene.search.IndexSearcher;
 21+import org.apache.lucene.search.MultiSearcher;
 22+import org.apache.lucene.search.PhraseQuery;
 23+import org.apache.lucene.search.Query;
 24+import org.apache.lucene.search.SearchableMul;
 25+import org.apache.lucene.search.Searcher;
 26+import org.apache.lucene.search.TermQuery;
 27+import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 28+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
 29+import org.wikimedia.lsearch.config.GlobalConfiguration;
 30+import org.wikimedia.lsearch.config.IndexId;
 31+import org.wikimedia.lsearch.config.IndexRegistry;
 32+import org.wikimedia.lsearch.index.IndexUpdateRecord;
 33+import org.wikimedia.lsearch.index.WikiIndexModifier;
 34+import org.wikimedia.lsearch.search.IndexSearcherMul;
 35+import org.wikimedia.lsearch.search.WikiSearcher;
 36+import org.wikimedia.lsearch.spell.api.Dictionary.Word;
 37+import org.wikimedia.lsearch.spell.dist.DoubleMetaphone;
 38+import org.wikimedia.lsearch.util.HighFreqTerms;
 39+
 40+/**
 41+ * Index words and phrases from article titles.
 42+ *
 43+ * Fields:
 44+ * * word - word from title
 45+ * * phrase - phrase like douglas_adams
 46+ * * freq - stored frequency of the word/phrase (plain integer)
 47+ * * intitle - flag set when a phrase also appears in a title
 48+ *
 49+ * @author rainman
 50+ *
 51+ */
 52+public class SpellCheckIndexer {
 53+ static Logger log = Logger.getLogger(SpellCheckIndexer.class);
 54+ protected NgramIndexer ngramWriter;
 55+ public static final boolean NEW_INDEX = true;
 56+ protected boolean createNew;
 57+ protected int minWordFreq, minPhraseFreq;
 58+ protected IndexId iid,titles;
 59+ protected String langCode;
 60+ protected IndexRegistry registry;
 61+ protected DoubleMetaphone dmeta = new DoubleMetaphone();
 62+
 63+ public SpellCheckIndexer(IndexId iid){
 64+ this(iid,false);
 65+ }
 66+
 67+ public SpellCheckIndexer(IndexId titles, boolean createNew){
 68+ this.titles = titles;
 69+ this.iid = titles.getDB();
 70+ GlobalConfiguration global = GlobalConfiguration.getInstance();
 71+ this.minWordFreq = global.getIntDBParam(iid.getDBname(),"spell","wordsMinFreq",3);
 72+ this.minPhraseFreq = global.getIntDBParam(iid.getDBname(),"spell","phrasesMinFreq",1);
 73+ this.createNew = createNew;
 74+ this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 75+ this.ngramWriter = new NgramIndexer();
 76+ this.registry = IndexRegistry.getInstance();
 77+ }
 78+
 79+ public void createFromTempIndex(){
 80+ String path = titles.getImportPath(); // dest where to put index
 81+ FieldNameFactory fields = new FieldNameFactory();
 82+ final String title = fields.title();
 83+ final String contents = fields.contents();
 84+ final String alttitle = fields.alttitle();
 85+ try {
 86+ ngramWriter.createIndex(path,new SimpleAnalyzer());
 87+ IndexReader ir = IndexReader.open(iid.getSpell().getTempPath());
 88+ HashSet<String> stopWords = new HashSet<String>();
 89+ TermDocs td = ir.termDocs(new Term("metadata_key","stopWords"));
 90+ if(td.next()){
 91+ for(String s : ir.document(td.doc()).get("metadata_value").split(" "))
 92+ stopWords.add(s);
 93+ }
 94+ addMetadata("stopWords",stopWords);
 95+
 96+ log.info("Adding titles");
 97+ // add all titles
 98+ for(int i=0;i<ir.maxDoc();i++){
 99+ if(ir.isDeleted(i))
 100+ continue;
 101+ String titleText = ir.document(i).get(title);
 102+ if(titleText != null)
 103+ addTitle(titleText);
 104+ for(int j=0;j<WikiIndexModifier.ALT_TITLES;j++){
 105+ String altTitleText = ir.document(i).get(alttitle+j);
 106+ if(altTitleText != null)
 107+ addTitle(altTitleText);
 108+ }
 109+ }
 110+ log.info("Adding words and phrases");
 111+ LuceneDictionary dict = new LuceneDictionary(ir,contents);
 112+ Word word;
 113+ while((word = dict.next()) != null){
 114+ String w = word.getWord();
 115+ int freq = word.getFrequency();
 116+ if(w.contains("_")){ // phrase
 117+ String[] words = w.split("_+");
 118+ if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1]))
 119+ continue;
 120+ boolean allowed = true;
 121+ for(String ww : words){
 122+ // allow only those phrases consisting of title words
 123+ if(ir.docFreq(new Term(title,ww)) == 0){
 124+ allowed = false;
 125+ break;
 126+ }
 127+ }
 128+ if(allowed && freq > minPhraseFreq){
 129+ boolean inTitle = ir.docFreq(new Term(title,w))!= 0;
 130+ addPhrase(w,freq,inTitle);
 131+ }
 132+ } else{
 133+ if(freq > minWordFreq){
 134+ addWord(w,freq);
 135+ }
 136+ }
 137+ }
 138+ log.info("Adding phrases with stop words from titles");
 139+ // add stuff from titles with stop words
 140+ dict = new LuceneDictionary(ir,title);
 141+ while((word = dict.next()) != null){
 142+ String w = word.getWord();
 143+ if(w.contains("_")){ // phrase
 144+ String[] words = w.split("_+");
 145+ if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1])){
 146+ int freq = ir.docFreq(new Term("contents",w));
 147+ addPhrase(w,freq,true);
 148+ }
 149+ }
 150+ }
 151+ ngramWriter.closeAndOptimize();
 152+ ir.close();
 153+
 154+ } catch (IOException e) {
 155+ log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage());
 156+ e.printStackTrace();
 157+ return;
 158+ }
 159+
 160+ }
 161+
 162+ /**
 163+ * Register a title in the index without tokenization; the title is only lowercased.
 164+ *
 165+ * @param title
 166+ */
 167+ public void addTitle(String title){
 168+ Document doc = new Document();
 169+ doc.add(new Field("title", title.toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED));
 170+ ngramWriter.addDocument(doc);
 171+ }
 172+ /**
 173+ * Add phrase to index
 174+ *
 175+ * @param phrase - 2+ words joined with underscore
 176+ * @param freq - frequency of the phrase in page text
 177+ * @param inTitle - whether the phrase also appears in a title
 178+ */
 179+ public void addPhrase(String phrase, int freq, boolean inTitle){
 180+ if(phrase.length() <= 2){
 181+ log.warn("Invalid phrase: "+phrase);
 182+ return;
 183+ }
 184+ Document doc = new Document();
 185+ //ngramWriter.createNgramFields(doc,"phrase",phrase);
 186+ doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED));
 187+ doc.add(new Field("freq",Integer.toString(freq), Field.Store.YES, Field.Index.NO));
 188+ if(inTitle)
 189+ doc.add(new Field("intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED));
 190+
 191+ ngramWriter.addDocument(doc);
 192+ }
 193+
 194+ /**
 195+ * Add into metadata_key and metadata_value.
 196+ * Collection is assumed to contain words (without spaces)
 197+ */
 198+ public void addMetadata(String key, Collection<String> values){
 199+ StringBuilder sb = new StringBuilder();
 200+ // serialize by joining with spaces
 201+ for(String val : values){
 202+ if(sb.length() != 0)
 203+ sb.append(" ");
 204+ sb.append(val);
 205+ }
 206+ Document doc = new Document();
 207+ doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED));
 208+ doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO));
 209+
 210+ ngramWriter.addDocument(doc);
 211+ }
 212+
 213+ /** Add ordinary word to the index
 214+ *
 215+ * @param word - word to add
 216+ * @param freq - frequency of the word in page text
 218+ */
 219+ public void addWord(String word, int freq){
 220+ if(word.length() < 2)
 221+ return;
 222+ Document doc = new Document();
 223+ ngramWriter.createNgramFields(doc,"word",word);
 224+ doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED));
 225+ doc.add(new Field("freq",Integer.toString(freq), Field.Store.YES, Field.Index.NO));
 226+ doc.add(new Field("meta1",dmeta.doubleMetaphone(word), Field.Store.YES, Field.Index.NO));
 227+ doc.add(new Field("meta2",dmeta.doubleMetaphone(word,true), Field.Store.YES, Field.Index.NO));
 228+
 229+ ngramWriter.addDocument(doc);
 230+ }
 231+}
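Compared to the deleted TitleIndexer, the new class stores a single integer in freq and drops the per-namespace fields entirely. A minimal sketch of the word document that addWord above writes, leaving out the ngram fields contributed by NgramIndexer.createNgramFields; the sample word, count, and metaphone codes are invented:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    // Sketch of the word document written by SpellCheckIndexer.addWord above,
    // without the ngram fields that NgramIndexer.createNgramFields adds.
    public class WordDocSketch {
        static Document wordDoc(String word, int freq, String meta1, String meta2) {
            Document doc = new Document();
            doc.add(new Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("freq", Integer.toString(freq), Field.Store.YES, Field.Index.NO));
            doc.add(new Field("meta1", meta1, Field.Store.YES, Field.Index.NO)); // primary metaphone
            doc.add(new Field("meta2", meta2, Field.Store.YES, Field.Index.NO)); // alternate metaphone
            return doc;
        }

        public static void main(String[] args) {
            // invented values; in the indexer the codes come from DoubleMetaphone
            System.out.println(wordDoc("douglas", 249, "TKLS", "TKLS"));
        }
    }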
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java
@@ -39,6 +39,7 @@
4040 private TermEnum termEnum;
4141 private int count = 0;
4242 private String field;
 43+ private boolean first = true;
4344
4445 public LuceneDictionary(IndexReader reader, String field) {
4546 try {
@@ -55,10 +56,14 @@
5657 }
5758 try {
5859 while(true){
59 - if(!termEnum.next())
 60+ if(first){
 61+ first = false;
 62+ break;
 63+ }
 64+ else if(!termEnum.next())
6065 return null;
6166 else if(!termEnum.term().field().equals(field))
62 - continue; // skip terms that are not from the desired field
 67+ return null; // end of our field
6368
6469 break;
6570 }
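The hunk above changes what happens when the enumeration leaves the requested field: since a Lucene TermEnum returns terms sorted by field name first, a term from another field means the dictionary's field is exhausted, so next() can return null instead of skipping. The new first flag accounts for the enum already being positioned on its first term when seeded. A sketch of the equivalent loop shape, assuming the constructor seeds the enum with IndexReader.terms(new Term(field, "")):

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermEnum;

    // Sketch of per-field term iteration after the change. Assumption: the
    // enum is seeded at the first term of `field` and is therefore already
    // positioned on a valid term before the first next() call.
    public class FieldTermsSketch {
        static void printTerms(IndexReader reader, String field) throws IOException {
            TermEnum te = reader.terms(new Term(field, ""));
            try {
                while (te.term() != null && te.term().field().equals(field)) {
                    System.out.println(te.term().text());
                    if (!te.next())
                        break; // end of index
                }
            } finally {
                te.close();
            }
        }
    }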
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -66,7 +66,7 @@
6767 showTokens(text);
6868 text = "Dž (Dž), dž (dž), d' (ď), l' (ľ), t' (ť), IJ (IJ), ij (ij), LJ (LJ), Lj (Lj), lj (lj). NJ (NJ), Nj (Nj), nj (nj). All characters in parentheses are the single-unicode form; those not in parentheses are component character forms. There's also the issue of searching for AE (Æ), ae (æ), OE (Œ), & oe (œ).";
6969 showTokens(text);
70 - text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d contains רוּחַ should be treated as though it contained ";
 70+ text = "ça Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d contains רוּחַ should be treated as though it contained ";
7171 showTokens(text);
7272 text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]";
7373 showTokens(text);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java
@@ -39,7 +39,7 @@
4040 int bad=0;
4141 long start = System.currentTimeMillis();
4242 for(String[] m : DATA){
43 - ArrayList<SuggestResult> res = sc.suggestWordsFromTitle(m[0],new NamespaceFilter(0),5);
 43+ ArrayList<SuggestResult> res = sc.suggestWords(m[0],new NamespaceFilter(0),5);
4444 if(res.size() > 0){
4545 SuggestResult r = res.get(0);
4646 if(r.getWord().equals(m[1]))
@@ -48,7 +48,7 @@
4949 && res.get(1).getWord().equals(m[1]))
5050 good++;
5151 else if(r.getDist() > 1){
52 - SuggestResult split = sc.suggestSplitFromTitle(m[0],new NamespaceFilter(0),0);
 52+ SuggestResult split = sc.suggestSplit(m[0],new NamespaceFilter(0),0);
5353 if(split!=null && m[1].equals(split.getWord()))
5454 good++;
5555 else{
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -145,7 +145,7 @@
146146 assertEquals("detest",sir[3]);
147147 assertEquals("rutest",sir[4]);
148148 assertEquals("frtest",sir[5]);
149 - assertEquals(8,sir.length);
 149+ assertEquals(11,sir.length);
150150
151151 // indexLocation
152152 Hashtable indexLocation = testgc.getIndexLocation();
@@ -191,14 +191,9 @@
192192 assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki"));
193193
194194 // test suggest tag
195 - Hashtable<String,String> sug = testgc.getDBParams("entest","spell_words");
196 - assertEquals("3",sug.get("minFreq"));
197 - assertEquals("20",sug.get("minHits"));
198 -
199 - sug = testgc.getDBParams("entest","spell_titles");
 195+ Hashtable<String,String> sug = testgc.getDBParams("entest","spell");
200196 assertEquals("1",sug.get("wordsMinFreq"));
201197 assertEquals("2",sug.get("phrasesMinFreq"));
202 - assertEquals("20",sug.get("minHits"));
203198
204199 } catch (MalformedURLException e) {
205200 e.printStackTrace();
@@ -272,10 +267,10 @@
273268 assertEquals(2,njawiki2.getPartNum());
274269 assertEquals("[192.168.0.1]",njawiki2.getSearchHosts().toString());
275270
276 - IndexId sug = IndexId.get("entest.spell_words");
277 - assertTrue(sug.isSpellWords());
 271+ IndexId sug = IndexId.get("entest.spell");
 272+ assertTrue(sug.isSpell());
278273 assertFalse(sug.isLogical());
279 - assertEquals(sug,sug.getSpellWords());
 274+ assertEquals(sug,sug.getSpell());
280275
281276 }
282277 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java
@@ -99,5 +99,14 @@
100100 public void setTitle(java.lang.String title) {
101101 this.title = title;
102102 }
 103+
 104+ /**
 105+ * Get string representation of namespace
 106+ *
 107+ * @return
 108+ */
 109+ public String getNamespaceAsString(){
 110+ return Integer.toString(namespace);
 111+ }
103112
104113 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java
@@ -51,6 +51,8 @@
5252 private transient int rank;
5353 /** names of articles that relate to this article */
5454 private ArrayList<RelatedTitle> related;
 55+ /** anchor text of links pointing to this article */
 56+ private ArrayList<String> anchorText;
5557
5658 public Article(){
5759 namespace="";
@@ -61,6 +63,7 @@
6264 references = 0;
6365 redirects=new ArrayList<Redirect>();
6466 related = new ArrayList<RelatedTitle>();
 67+ anchorText = new ArrayList<String>();
6568 }
6669
6770 public Article(long pageId, Title title, String text, boolean redirect, int references) {
@@ -72,6 +75,7 @@
7376 this.references = references;
7477 this.redirects = new ArrayList<Redirect>();
7578 this.related = new ArrayList<RelatedTitle>();
 79+ this.anchorText = new ArrayList<String>();
7680 }
7781
7882 public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references) {
@@ -83,9 +87,11 @@
8488 this.references = references;
8589 this.redirects = new ArrayList<Redirect>();
8690 this.related = new ArrayList<RelatedTitle>();
 91+ this.anchorText = new ArrayList<String>();
8792 }
8893
89 - public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references, ArrayList<Redirect> redirects, ArrayList<RelatedTitle> related) {
 94+ public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references,
 95+ ArrayList<Redirect> redirects, ArrayList<RelatedTitle> related, ArrayList<String> anchorText) {
9096 this.namespace = Integer.toString(namespace);
9197 this.title = titleText;
9298 contents = text;
@@ -94,6 +100,7 @@
95101 this.references = references;
96102 this.redirects = redirects;
97103 this.related = related;
 104+ this.anchorText = anchorText;
98105 }
99106
100107 public boolean isRedirect() {
@@ -216,11 +223,14 @@
217224
218225 public void setRelated(ArrayList<RelatedTitle> related) {
219226 this.related = related;
 227+ }
 228+
 229+ public ArrayList<String> getAnchorText() {
 230+ return anchorText;
 231+ }
 232+
 233+ public void setAnchorText(ArrayList<String> anchorText) {
 234+ this.anchorText = anchorText;
220235 }
221236
222 -
223 -
224 -
225 -
226 -
227237 }
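Article grows an anchorText list next to redirects and related, so anchor text collected during link analysis rides along into indexing. A hedged construction sketch using the nine-argument constructor from the hunk above; all page values are invented and the import paths for Redirect and RelatedTitle are assumed:

    import java.util.ArrayList;
    import org.wikimedia.lsearch.beans.Article;
    import org.wikimedia.lsearch.beans.Redirect;      // import path assumed
    import org.wikimedia.lsearch.ranks.RelatedTitle;

    // Sketch: passing anchor text through the new Article constructor.
    public class ArticleSketch {
        public static void main(String[] args) {
            ArrayList<String> anchors = new ArrayList<String>();
            anchors.add("the guide's author"); // invented anchor text of an inbound link
            Article a = new Article(42L, 0, "Douglas Adams", "...wikitext...",
                    false, 17, new ArrayList<Redirect>(), new ArrayList<RelatedTitle>(), anchors);
            System.out.println(a.getAnchorText());
        }
    }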
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java
@@ -17,8 +17,10 @@
1818 import org.wikimedia.lsearch.config.IndexId;
1919 import org.wikimedia.lsearch.index.IndexThread;
2020 import org.wikimedia.lsearch.ranks.LinkReader;
 21+import org.wikimedia.lsearch.ranks.Links;
2122 import org.wikimedia.lsearch.ranks.OldLinks;
2223 import org.wikimedia.lsearch.ranks.RankBuilder;
 24+import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
2325 import org.wikimedia.lsearch.storage.Storage;
2426 import org.wikimedia.lsearch.util.Localization;
2527 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -42,7 +44,7 @@
4345 Boolean optimize = null;
4446 Integer mergeFactor = null, maxBufDocs = null;
4547 boolean newIndex = true, makeSnapshot = false;
46 - boolean snapshotDb = false; boolean updateReferences=false;
 48+ boolean snapshotDb = false, useOldLinkAnalysis = false;
4749
4850 System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
4951
@@ -50,11 +52,11 @@
5153 log = Logger.getLogger(Importer.class);
5254
5355 if(args.length < 2){
54 - System.out.println("Syntax: java Importer [-n] [-s] [-r] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
 56+ System.out.println("Syntax: java Importer [-a] [-n] [-s] [-la] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
5557 System.out.println("Options: ");
5658 System.out.println(" -a - don't create new index, append to old");
5759 System.out.println(" -s - make index snapshot when finished");
58 - System.out.println(" -r - update references info on storage backend");
 60+ System.out.println(" -la - use earlier link analysis index, don't recalculate");
5961 System.out.println(" -l limit_num - add at most limit_num articles");
6062 System.out.println(" -o optimize - true/false overrides optimization param from global settings");
6163 System.out.println(" -m mergeFactor - overrides param from global settings");
@@ -73,8 +75,8 @@
7476 maxBufDocs = Integer.parseInt(args[++i]);
7577 else if(args[i].equals("-a"))
7678 newIndex = false;
77 - else if(args[i].equals("-r"))
78 - updateReferences = true;
 79+ else if(args[i].equals("-la"))
 80+ useOldLinkAnalysis = true;
7981 else if(args[i].equals("-s"))
8082 makeSnapshot = true;
8183 else if(args[i].equals("--snapshot")){
@@ -95,6 +97,7 @@
9698 }
9799
98100 String langCode = GlobalConfiguration.getInstance().getLanguage(dbname);
 101+ IndexId iid = IndexId.get(dbname);
99102 // preload
100103 UnicodeDecomposer.getInstance();
101104 Localization.readLocalization(langCode);
@@ -102,19 +105,16 @@
103106
104107 long start = System.currentTimeMillis();
105108
106 - // regenerate link and redirect information
107 - OldLinks links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,LinkReader.READ_REDIRECTS);
108 -
109 - if(updateReferences){
 109+ if(!useOldLinkAnalysis){
 110+ // regenerate link and redirect information
 111+ Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode,iid),langCode);
110112 try {
111 - Storage.getInstance().storePageReferences(links.getAll(),dbname);
 113+ RankBuilder.storeLinkAnalysis(links,iid);
112114 } catch (IOException e) {
113 - log.error("Failed to update references info: "+e.getMessage());
 115+ log.fatal("Cannot store link analytics: "+e.getMessage());
 116+ return;
114117 }
115118 }
116 - links.generateRedirectLists();
117 - links.compactAll();
118 -
119119 log.info("Third pass, indexing articles...");
120120
121121 // open
@@ -125,9 +125,9 @@
126126 log.fatal("I/O error opening "+inputfile);
127127 return;
128128 }
129 -
 129+ LinkAnalysisStorage las = new LinkAnalysisStorage(iid);
130130 // read
131 - DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,links,langCode);
 131+ DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,las,langCode);
132132 XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000));
133133 try {
134134 reader.readDump();
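With -r gone, a rebuild that wants to reuse a previously stored link analysis index now passes -la, following the Syntax line printed above; for example (dump file name invented):

    java Importer -s -la enwiki-pages-articles.xml wikilucene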
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -21,9 +21,12 @@
2222 import org.wikimedia.lsearch.config.Configuration;
2323 import org.wikimedia.lsearch.config.IndexId;
2424 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
 25+import org.wikimedia.lsearch.ranks.Links;
2526 import org.wikimedia.lsearch.ranks.OldLinks;
2627 import org.wikimedia.lsearch.ranks.RankBuilder;
2728 import org.wikimedia.lsearch.ranks.RelatedTitle;
 29+import org.wikimedia.lsearch.storage.ArticleAnalytics;
 30+import org.wikimedia.lsearch.storage.LinkAnalysisStorage;
2831 import org.wikimedia.lsearch.util.Localization;
2932
3033 public class DumpImporter implements DumpWriter {
@@ -32,15 +35,15 @@
3336 Revision revision;
3437 SimpleIndexWriter writer;
3538 int count = 0, limit;
36 - OldLinks links;
 39+ LinkAnalysisStorage las;
3740 String langCode;
3841
3942 public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor,
40 - Integer maxBufDocs, boolean newIndex, OldLinks ranks, String langCode){
 43+ Integer maxBufDocs, boolean newIndex, LinkAnalysisStorage las, String langCode){
4144 Configuration.open(); // make sure configuration is loaded
4245 writer = new SimpleIndexWriter(IndexId.get(dbname), optimize, mergeFactor, maxBufDocs, newIndex);
4346 this.limit = limit;
44 - this.links = ranks;
 47+ this.las = las;
4548 this.langCode = langCode;
4649 }
4750 public void writeRevision(Revision revision) throws IOException {
@@ -50,28 +53,25 @@
5154 this.page = page;
5255 }
5356 public void writeEndPage() throws IOException {
54 - // get reference count
5557 String key = page.Title.Namespace+":"+page.Title.Text;
56 - CompactArticleLinks r = links.get(key);
57 - int references;
58 - boolean isRedirect = r.redirectsTo != null;
59 - if(r == null){
60 - references = 0;
61 - log.error("Reference count for "+key+" is undefined, which should never happen.");
62 - } else
63 - references = r.links;
 58+ ArticleAnalytics aa = las.getAnalitics(key);
 59+ int references = aa.getReferences();
 60+ boolean isRedirect = aa.isRedirect();
 61+
6462 // make list of redirects
6563 ArrayList<Redirect> redirects = new ArrayList<Redirect>();
66 - if(r.redirected != null){
67 - for(CompactArticleLinks rk : r.redirected){
68 - String[] parts = rk.toString().split(":",2);
69 - redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],rk.links));
70 - }
 64+ ArrayList<String> anchors = new ArrayList<String>();
 65+ anchors.addAll(aa.getAnchorText());
 66+ for(String rk : aa.getRedirectKeys()){
 67+ String[] parts = rk.toString().split(":",2);
 68+ ArticleAnalytics raa = las.getAnalitics(rk);
 69+ redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],raa.getReferences()));
 70+ anchors.addAll(raa.getAnchorText());
7171 }
72 - ArrayList<RelatedTitle> related = RankBuilder.getRelatedTitles(r,links);
 72+ //TODO: ArrayList<RelatedTitle> related = RankBuilder.getRelatedTitles(r,links);
7373 // make article
7474 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,
75 - references,redirects,related);
 75+ references,redirects,new ArrayList<RelatedTitle>(),anchors);
7676 writer.addArticle(article);
7777 count++;
7878 if(limit >= 0 && count > limit)
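DumpImporter keys the link analysis storage by a namespace:title string and later splits redirect keys back apart with a limit of 2, so colons inside the title text survive. A tiny round-trip sketch with an invented title:

    // Sketch of the "namespace:title" key round-trip used in writeEndPage above;
    // split(..., 2) keeps any colon inside the title text intact.
    public class KeySketch {
        public static void main(String[] args) {
            String key = 4 + ":" + "Village pump: proposals"; // invented title
            String[] parts = key.split(":", 2);
            System.out.println(Integer.parseInt(parts[0])); // 4
            System.out.println(parts[1]); // Village pump: proposals
        }
    }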
Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -17,7 +17,7 @@
1818 wikidev : (single) (language,sr)
1919 wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[])
2020 wikilucene : (language,en) (warmup,10)
21 -wikilucene : (spell_words,10,2) (spell_titles,3,1,2)
 21+wikilucene : (spell,3,1)
2222
2323 # Search groups
2424 # Index parts of a split index are always taken from the node's group
@@ -56,7 +56,7 @@
5757
5858 # suffix for databases that should also have exact-case index built
5959 # note: this will also turn off stemming!
60 -ExactCase.suffix=wiktionary wikilucene
 60+ExactCase.suffix=wiktionary
6161
6262 # wmf-style init file, attempt to read wgserver (for oai) and lang info
6363 # for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html
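The new spell tag replaces the old spell_words/spell_titles pair with one tuple per database; matching it against the wordsMinFreq and phrasesMinFreq parameters that SpellCheckIndexer reads, the two numbers appear to be those thresholds (an inference, not documented in the diff):

    # assumed layout: (spell, wordsMinFreq, phrasesMinFreq)
    wikilucene : (spell,3,1)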

Status & tagging log