r22892 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r22891 | r22892 | r22893 >
Date: 21:37, 10 June 2007
Author: rainman
Status: old
Tags:
Comment:
Restructured the page rank calculation (it is too expensive to obtain
it from a GROUP BY database query):
* MySQL storage backend to store page ranks; supports master/slave
write/read splitting and some load balancing. It will probably also
be used to store text for highlighting in the future.
* RankBuilder, which rebuilds page rank data from XML dumps, optimized
for low memory usage.
Modified paths:
  • /trunk/lucene-search-2.0/.classpath (modified) (history)
  • /trunk/lucene-search-2.0/lib/mysql-connector-java-3.0.17-ga-bin.jar (added) (history)
  • /trunk/lucene-search-2.0/lsearch.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Title.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexRegistry.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java (deleted) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java (deleted) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/LinkReader.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/Links.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/TitleReader.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/Storage.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/package.html (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)

Diff

Index: trunk/lucene-search-2.0/lib/mysql-connector-java-3.0.17-ga-bin.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/lucene-search-2.0/lib/mysql-connector-java-3.0.17-ga-bin.jar
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -3,6 +3,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.Map.Entry;

 import org.apache.log4j.Logger;
 import org.mediawiki.importer.DumpWriter;
@@ -65,11 +66,11 @@
 }
 public void writeEndPage() throws IOException {
 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects);
- log.info("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects);
+ log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects);
 records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE));
 log.debug(iid+": Update for "+article);
 references = 0;
- redirects.clear();
+ redirects = new ArrayList<Redirect>();
 }

 public void writeSiteinfo(Siteinfo info) throws IOException {
@@ -77,9 +78,8 @@
 // write to localization
 Iterator it = info.Namespaces.orderedEntries();
 while(it.hasNext()){
- Integer inx = (Integer)it.next();
- String prefix = info.Namespaces.getPrefix(inx);
- Localization.addCustomMapping(prefix,inx,langCode);
+ Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
 }
 }

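Note on the writeEndPage() change above: replacing redirects.clear() with a fresh ArrayList is more than a style tweak. Assuming the Article constructor stores the redirects list rather than copying it, clearing the list in place would also empty the redirects of the record just queued. A minimal standalone sketch of that aliasing hazard (the Holder class is hypothetical, for illustration only):

    import java.util.ArrayList;
    import java.util.List;

    public class AliasingDemo {
        // Holder keeps a reference to the list it is given, just as Article
        // keeps the redirects list passed to its constructor.
        static class Holder {
            final List<String> items;
            Holder(List<String> items) { this.items = items; }
        }

        public static void main(String[] args) {
            List<String> buffer = new ArrayList<String>();
            buffer.add("redirect-1");
            Holder queued = new Holder(buffer);

            // Reusing the buffer by clearing it corrupts the queued record:
            buffer.clear();
            System.out.println(queued.items);  // prints [] -- data lost

            // The pattern used in this commit: hand off the list, start a new one.
            buffer = new ArrayList<String>();
        }
    }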
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -10,15 +10,21 @@
 import java.net.Authenticator;
 import java.net.PasswordAuthentication;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Properties;

 import org.apache.log4j.Logger;
+import org.wikimedia.lsearch.beans.Article;
+import org.wikimedia.lsearch.beans.Redirect;
+import org.wikimedia.lsearch.beans.Title;
 import org.wikimedia.lsearch.config.Configuration;
 import org.wikimedia.lsearch.config.GlobalConfiguration;
 import org.wikimedia.lsearch.config.IndexId;
 import org.wikimedia.lsearch.index.IndexUpdateRecord;
 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
+import org.wikimedia.lsearch.ranks.Links;
+import org.wikimedia.lsearch.storage.Storage;
 import org.wikimedia.lsearch.util.Localization;
 import org.wikimedia.lsearch.util.UnicodeDecomposer;

@@ -160,6 +166,12 @@
 continue;
 boolean hasMore = false;
 do{
+ // fetch references for records
+ fetchReferences(records,dbname);
+ for(IndexUpdateRecord rec : records){
+ Article ar = rec.getArticle();
+ log.debug("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects());
+ }
 // send to indexer
 RMIMessengerClient messenger = new RMIMessengerClient(true);
 try {
@@ -235,4 +247,34 @@
 }
 } while(daemon);
 }
+
+ protected static void fetchReferences(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException {
+ Storage store = Storage.getInstance();
+ ArrayList<Title> titles = new ArrayList<Title>();
+ for(IndexUpdateRecord rec : records){
+ if(rec.isDelete())
+ continue;
+ Article ar = rec.getArticle();
+ titles.add(ar.makeTitle());
+ if(ar.getRedirects() != null){
+ for(Redirect r : ar.getRedirects()){
+ titles.add(r.makeTitle());
+ }
+ }
+ }
+ // fetch
+ Links links = new Links(store.getPageReferences(titles,dbname));
+ // update
+ for(IndexUpdateRecord rec : records){
+ if(rec.isDelete())
+ continue;
+ Article ar = rec.getArticle();
+ ar.setReferences(links.getLinks(ar.makeTitle().getKey()));
+ if(ar.getRedirects() != null){
+ for(Redirect r : ar.getRedirects()){
+ r.setReferences(links.getLinks(r.makeTitle().getKey()));
+ }
+ }
+ }
+ }
 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
@@ -185,7 +185,7 @@
 boolean isRed = false;
 if(line.startsWith("#redirect"))
 isRed = true;
- else if(lang != null ){
+ else if(lang != null && redirects.get(lang)!=null){
 for(String magic : redirects.get(lang)){
 if(line.startsWith(magic)){
 isRed = true;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/TitleReader.java
@@ -0,0 +1,63 @@
+package org.wikimedia.lsearch.ranks;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import org.mediawiki.importer.DumpWriter;
+import org.mediawiki.importer.Page;
+import org.mediawiki.importer.Revision;
+import org.mediawiki.importer.Siteinfo;
+import org.wikimedia.lsearch.beans.ArticleLinks;
+import org.wikimedia.lsearch.util.Localization;
+
+/**
+ * Read a HashSet of titles from dump
+ *
+ * @author rainman
+ *
+ */
+public class TitleReader implements DumpWriter{
+ Page page;
+ Revision revision;
+ Links links = new Links();
+ protected String langCode;
+
+ public TitleReader(String langCode){
+ this.langCode = langCode;
+ }
+
+ public void writeRevision(Revision revision) throws IOException {
+ this.revision = revision;
+ }
+ public void writeStartPage(Page page) throws IOException {
+ this.page = page;
+ }
+ public void writeEndPage() throws IOException {
+ String key = page.Title.Namespace+":"+page.Title.Text;
+ links.add(key,0);
+ }
+ public Links getTitles() {
+ return links;
+ }
+ public void close() throws IOException {
+ // nop
+ }
+ public void writeEndWiki() throws IOException {
+ // nop
+ }
+ public void writeSiteinfo(Siteinfo info) throws IOException {
+ // write siteinfo to localization
+ Iterator it = info.Namespaces.orderedEntries();
+ while(it.hasNext()){
+ Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
+ Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
+ }
+ }
+ public void writeStartWiki() throws IOException {
+ // nop
+ }
+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java
@@ -0,0 +1,97 @@
+package org.wikimedia.lsearch.ranks;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+
+
+/**
+ * Page links object that has been optimized to use for low
+ * memory consumption. String is being stored in a utf-8
+ * encoded byte[] array, and the same object is to be used
+ * as a key and value in a hashmap.
+ *
+ * Two objects equals iff they have the same string (other fields
+ * are ignored in equals())
+ *
+ * @author rainman
+ *
+ */
+public class CompactArticleLinks{
+ /** format: <ns>:<title> */
+ protected byte[] str;
+ public int links;
+ protected int hash = 0;
+ /** if this page is a redirect */
+ public CompactArticleLinks redirectsTo;
+ /** list of pages that redirect here */
+ public ArrayList<CompactArticleLinks> redirected;
+
+ public CompactArticleLinks(String s){
+ try {
+ str = s.getBytes("utf-8");
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ }
+ }
+
+ public CompactArticleLinks(String s, int count){
+ this(s);
+ this.links = count;
+ }
+
+ @Override
+ public String toString() {
+ try {
+ return new String(str,0,str.length,"utf-8")+", count="+links;
+ } catch (UnsupportedEncodingException e) {
+ return "";
+ }
+ }
+
+ public String getKey(){
+ try {
+ return new String(str,0,str.length,"utf-8");
+ } catch (UnsupportedEncodingException e) {
+ return "";
+ }
+ }
+
+ public void addRedirect(CompactArticleLinks from){
+ if(redirected == null)
+ redirected = new ArrayList<CompactArticleLinks>();
+ redirected.add(from);
+ }
+ @Override
+ public int hashCode() {
+ int h = hash;
+ if(h == 0){
+ int off = 0;
+
+ for (int i = 0; i < str.length; i++) {
+ h = 31*h + str[off++];
+ }
+ hash = h;
+ }
+
+ return h;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ final CompactArticleLinks other = (CompactArticleLinks) obj;
+ if(other.str.length != str.length)
+ return false;
+ for(int i=0;i<str.length;i++)
+ if(str[i] != other.str[i])
+ return false;
+ return true;
+ }
+
+
+}
\ No newline at end of file
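As the class comment notes, one CompactArticleLinks instance serves as both key and value, and equals()/hashCode() look only at the encoded string, so a throwaway probe object retrieves the canonical stored instance. A short sketch of that lookup pattern, using only the constructors and fields added above (the title is illustrative):

    import java.util.HashMap;

    import org.wikimedia.lsearch.ranks.CompactArticleLinks;

    public class ProbeDemo {
        public static void main(String[] args) {
            HashMap<CompactArticleLinks,CompactArticleLinks> map =
                new HashMap<CompactArticleLinks,CompactArticleLinks>();
            CompactArticleLinks canonical = new CompactArticleLinks("0:Main Page", 7);
            map.put(canonical, canonical);  // same object as key and value

            // equals()/hashCode() compare only the utf-8 bytes, so a fresh
            // probe with the same ns:title finds the canonical instance.
            CompactArticleLinks found = map.get(new CompactArticleLinks("0:Main Page"));
            System.out.println(found.links);  // prints 7
        }
    }

Storing the key as a UTF-8 byte[] rather than a String roughly halves per-title memory for mostly-ASCII titles (Java Strings hold UTF-16 chars plus extra object overhead), which matters when a full dump's worth of titles has to fit in the heap.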
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/LinkReader.java
@@ -0,0 +1,183 @@
+package org.wikimedia.lsearch.ranks;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang.WordUtils;
+import org.apache.log4j.Logger;
+import org.mediawiki.importer.DumpWriter;
+import org.mediawiki.importer.Page;
+import org.mediawiki.importer.Revision;
+import org.mediawiki.importer.Siteinfo;
+import org.wikimedia.lsearch.beans.Article;
+import org.wikimedia.lsearch.beans.ArticleLinks;
+import org.wikimedia.lsearch.beans.Title;
+import org.wikimedia.lsearch.config.Configuration;
+import org.wikimedia.lsearch.config.IndexId;
+import org.wikimedia.lsearch.util.Localization;
+
+/**
+ * Reads page links and references, i.e. how many times a page
+ * is referenced within other articles.
+ *
+ * @author rainman
+ *
+ */
+public class LinkReader implements DumpWriter {
+ static Logger log = Logger.getLogger(LinkReader.class);
+ Page page;
+ Revision revision;
+ Siteinfo siteinfo;
+ /** ns:title -> number of referring articles */
+ Links links;
+ HashSet<String> interwiki;
+ String langCode;
+ boolean readRedirects;
+
+ public static final boolean READ_REDIRECTS = true;
+ public static final boolean NO_REDIRECTS = false;
+
+ public LinkReader(Links links, String langCode){
+ this(links,langCode,false);
+ }
+
+ public LinkReader(Links links, String langCode, boolean readRedirects){
+ this.links = links;
+ this.readRedirects = readRedirects;
+ if(langCode == null || langCode.equals(""))
+ langCode = "en";
+ this.langCode = langCode;
+ interwiki = Localization.getInterwiki();
+ }
+ public void writeRevision(Revision revision) throws IOException {
+ this.revision = revision;
+ }
+ public void writeStartPage(Page page) throws IOException {
+ this.page = page;
+ }
+ public void writeEndPage() throws IOException {
+ if(readRedirects){
+ // register redirect
+ Title redirect = Localization.getRedirectTitle(revision.Text,langCode);
+ if( redirect !=null ){
+ CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle());
+ if(cs != null)
+ links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs);
+ return;
+ }
+ }
+ processLinks(revision.Text,page.Title.Namespace);
+ }
+
+ /** Find the links object for the ns:title key */
+ protected CompactArticleLinks findArticleLinks(int ns, String title){
+ String key;
+ CompactArticleLinks rank;
+ if(title.length() == 0)
+ return null;
+ // try exact match
+ key = ns+":"+title;
+ rank = links.get(key);
+ if(rank != null)
+ return rank;
+ // try lowercase
+ key = ns+":"+title.toLowerCase();
+ rank = links.get(key);
+ if(rank != null)
+ return rank;
+ // try lowercase with first letter upper case
+ if(title.length()==1)
+ key = ns+":"+title.toUpperCase();
+ else
+ key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase();
+ rank = links.get(key);
+ if(rank != null)
+ return rank;
+ // try title case
+ key = ns+":"+WordUtils.capitalize(title);
+ rank = links.get(key);
+ if(rank != null)
+ return rank;
+ // try upper case
+ key = ns+":"+title.toUpperCase();
+ rank = links.get(key);
+ if(rank != null)
+ return rank;
+ // try capitalizing at word breaks
+ key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
+ rank = links.get(key);
+ if(rank != null)
+ return rank;
+
+ return null;
+ }
+
+ /** Extract all links from this page, and increment ref count for linked pages */
+ protected void processLinks(String text, int namespace) {
+ Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
+ Matcher matcher = linkPat.matcher(text);
+ int ns; String title;
+ boolean escaped;
+
+ HashSet<CompactArticleLinks> pagelinks = new HashSet<CompactArticleLinks>();
+ while(matcher.find()){
+ String link = matcher.group(1);
+ int fragment = link.lastIndexOf('#');
+ if(fragment != -1)
+ link = link.substring(0,fragment);
+ //System.out.println("Got link "+link);
+ if(link.startsWith(":")){
+ escaped = true;
+ link = link.substring(1);
+ } else escaped = false;
+ ns = 0;
+ title = link;
+ // check for ns:title syntax
+ String[] parts = link.split(":",2);
+ if(parts.length == 2 && parts[0].length() > 1){
+ Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
+ if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14)))
+ continue; // categories, ignore
+ if(inx!=null && inx < 0)
+ continue; // special pages, ignore
+ if(inx != null){
+ ns = inx;
+ title = parts[1];
+ }
+
+ // ignore interwiki links
+ if(interwiki.contains(parts[0]))
+ continue;
+ }
+ if(ns == 0 && namespace!=0)
+ continue; // skip links from other namespaces into the main namespace
+ // register as link
+ CompactArticleLinks target = findArticleLinks(ns,title);
+ if(target != null)
+ pagelinks.add(target);
+ }
+ // increment page ranks
+ for(CompactArticleLinks rank : pagelinks){
+ rank.links++;
+ }
+ }
+ public void writeSiteinfo(Siteinfo info) throws IOException {
+ siteinfo = info;
+ }
+ public void close() throws IOException {
+ // nop
+ }
+ public void writeEndWiki() throws IOException {
+ // nop
+ }
+ public void writeStartWiki() throws IOException {
+ // nop
+ }
+ public Links getRanks() {
+ return links;
+ }
+
+}
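A quick standalone illustration of the wikitext link pattern at the core of processLinks; group 1 captures the link target and group 3 the optional pipe label:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class LinkPatternDemo {
        public static void main(String[] args) {
            // Same pattern as LinkReader.processLinks: non-greedy target,
            // optional |label part.
            Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
            Matcher m = linkPat.matcher("See [[Physics]] and [[Category:Science|sciences]].");
            while (m.find()) {
                System.out.println("target=" + m.group(1) + ", label=" + m.group(3));
            }
            // target=Physics, label=null
            // target=Category:Science, label=sciences
        }
    }

The lookup cascade in findArticleLinks (exact, lowercase, first-letter-capitalized, word-capitalized, uppercase) compensates for the case-insensitive first letter of MediaWiki titles and for editors' inconsistent capitalization; an exact-only match would undercount references.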
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/Links.java
@@ -0,0 +1,76 @@
+package org.wikimedia.lsearch.ranks;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map.Entry;
+
+import org.wikimedia.lsearch.beans.ArticleLinks;
+
+/**
+ * Abstraction of links retrieval and other operations related to
+ * CompactArticleLinks.
+ *
+ * @author rainman
+ *
+ */
+public class Links {
+ protected HashMap<CompactArticleLinks,CompactArticleLinks> links = new HashMap<CompactArticleLinks,CompactArticleLinks>();
+
+ public Links() {
+ }
+
+ public Links(Collection<CompactArticleLinks> col){
+ for(CompactArticleLinks c : col){
+ links.put(c,c);
+ }
+ }
+
+ /** Add new page with key and ref */
+ public void add(String key, int ref){
+ CompactArticleLinks cs = new CompactArticleLinks(key,ref);
+ links.put(cs,cs);
+ }
+
+ /** Setup redirect key -> tokey */
+ public void setRedirect(String key, String tokey){
+ CompactArticleLinks from = links.get(new CompactArticleLinks(key));
+ CompactArticleLinks to = links.get(new CompactArticleLinks(tokey));
+ from.redirectsTo = to;
+ }
+
+ /** Setup redirect key -> to */
+ public void setRedirect(String key, CompactArticleLinks to){
+ CompactArticleLinks from = links.get(new CompactArticleLinks(key));
+ from.redirectsTo = to;
+ }
+
+ /** Get links object from key */
+ public CompactArticleLinks get(String key){
+ return links.get(new CompactArticleLinks(key));
+ }
+
+ /** Get collection of all links objects */
+ public Collection<CompactArticleLinks> getAll(){
+ return links.values();
+ }
+
+ /** Get number of references (links) to article of key */
+ public int getLinks(String key){
+ CompactArticleLinks c = links.get(new CompactArticleLinks(key));
+ if(c == null)
+ return 0;
+ else
+ return c.links;
+ }
+
+ /** Generate "redirects here" lists for each article */
+ public void generateRedirectLists(){
+ for(CompactArticleLinks r : links.values()){
+ if(r.redirectsTo != null && r != r.redirectsTo){
+ r.redirectsTo.addRedirect(r);
+ }
+ }
+
+ }
+}
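A brief usage sketch of the Links API above, following the ns:title key convention used throughout this commit (page names illustrative):

    import org.wikimedia.lsearch.ranks.Links;

    public class LinksDemo {
        public static void main(String[] args) {
            Links links = new Links();
            links.add("0:Foo", 0);               // first pass registers pages
            links.add("0:Bar", 0);
            links.setRedirect("0:Bar", "0:Foo"); // Bar is a redirect to Foo

            links.get("0:Foo").links++;          // second pass counts a link
            System.out.println(links.getLinks("0:Foo"));  // prints 1

            links.generateRedirectLists();       // Foo's redirected list gains Bar
        }
    }

Note that setRedirect() and generateRedirectLists() assume both keys were registered in the first pass; get() on an unknown key returns null.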
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java
@@ -0,0 +1,130 @@
+package org.wikimedia.lsearch.ranks;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map.Entry;
+
+import org.apache.log4j.Logger;
+import org.mediawiki.dumper.ProgressFilter;
+import org.mediawiki.dumper.Tools;
+import org.mediawiki.importer.XmlDumpReader;
+import org.wikimedia.lsearch.beans.ArticleLinks;
+import org.wikimedia.lsearch.config.Configuration;
+import org.wikimedia.lsearch.config.GlobalConfiguration;
+import org.wikimedia.lsearch.config.IndexId;
+import org.wikimedia.lsearch.index.IndexThread;
+import org.wikimedia.lsearch.storage.Storage;
+import org.wikimedia.lsearch.util.Localization;
+import org.wikimedia.lsearch.util.UnicodeDecomposer;
+
+/**
+ * Main class, builds index from a database dump.
+ * Syntax: java Importer inputfile dbname
+ *
+ * @author rainman
+ *
+ */
+public class RankBuilder {
+ static Logger log = Logger.getLogger(RankBuilder.class);
+ /**
+ * @param args
+ * @throws IOException
+ */
+ public static void main(String[] args) throws IOException {
+ String inputfile = null;
+ String dbname = null;
+
+ System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n");
+
+ Configuration.open();
+ log = Logger.getLogger(RankBuilder.class);
+
+ if(args.length < 2){
+ System.out.println("Syntax: java Importer <inputfile> <dbname>");
+ return;
+ }
+ inputfile = args[0];
+ dbname = args[1];
+ if(inputfile == null || dbname == null){
+ System.out.println("Please specify both input xml file and database name");
+ return;
+ }
+
+ String langCode = GlobalConfiguration.getInstance().getLanguage(dbname);
+ // preload
+ UnicodeDecomposer.getInstance();
+ Localization.readLocalization(langCode);
+ Localization.loadInterwiki();
+
+ long start = System.currentTimeMillis();
+
+ // regenerate link info
+ Links links = processLinks(inputfile,getTitles(inputfile,langCode),langCode,LinkReader.NO_REDIRECTS);
+
+ Storage store = Storage.getInstance();
+ store.storePageReferences(links.getAll(),dbname);
+
+ /*for(CompactArticleLinks cs : links.values()){
+ System.out.println(cs);
+ }*/
+
+ long end = System.currentTimeMillis();
+
+ System.out.println("Finished generating ranks in "+formatTime(end-start));
+ }
+
+ public static Links processLinks(String inputfile, Links links, String langCode, boolean readRedirects) {
+ log.info("Second pass, calculating article links...");
+ InputStream input = null;
+ // second pass - calculate page ranks
+ try {
+ input = Tools.openInputFile(inputfile);
+ } catch (IOException e) {
+ log.fatal("I/O error opening "+inputfile);
+ return null;
+ }
+ // calculate ranks
+ LinkReader rr = new LinkReader(links,langCode,readRedirects);
+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
+ try {
+ reader.readDump();
+ } catch (IOException e) {
+ log.fatal("I/O error reading dump while calculating ranks for from "+inputfile);
+ return null;
+ }
+ return links;
+ }
+
+ public static Links getTitles(String inputfile,String langCode) {
+ log.info("First pass, getting a list of valid articles...");
+ InputStream input = null;
+ try {
+ input = Tools.openInputFile(inputfile);
+ } catch (IOException e) {
+ log.fatal("I/O error opening "+inputfile);
+ return null;
+ }
+ // first pass, get titles
+ TitleReader tr = new TitleReader(langCode);
+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
+ try {
+ reader.readDump();
+ input.close();
+ } catch (IOException e) {
+ log.fatal("I/O error reading dump while getting titles from "+inputfile);
+ return null;
+ }
+ return tr.getTitles();
+ }
+
+ private static String formatTime(long l) {
+ l /= 1000;
+ if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s";
+ else if(l >= 60) return (l%3600)/60+"m "+(l%60)+"s";
+ else return l+"s";
+ }
+
+}
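Invocation follows the Syntax line in the class comment (which still reads "java Importer"; the actual entry point is RankBuilder). Classpath elided, dump path and dbname illustrative:

    java -cp ... org.wikimedia.lsearch.ranks.RankBuilder /dumps/wikidb.xml wikidb

getTitles() first registers every ns:title key with a zero count, so that processLinks() resolves and counts only links whose targets actually exist in the dump; the finished counts then go to the storage backend via storePageReferences().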
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java
@@ -34,6 +34,10 @@
 return namespace+":"+title+" ("+references+")";
 }

+ public Title makeTitle(){
+ return new Title(namespace,title);
+ }


+
 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Title.java
@@ -16,6 +16,9 @@
 this.title = title;
 }

+ public String getKey(){
+ return namespace+":"+title;
+ }

 @Override
 public String toString() {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java
@@ -193,6 +193,10 @@
 this.redirectKeywordRanks = redirectKeywordRanks;
 }

+ /** Get title object corresponding to this article */
+ public Title makeTitle(){
+ return new Title(Integer.parseInt(namespace),title);
+ }



Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java
@@ -1,156 +0,0 @@
-package org.wikimedia.lsearch.importer;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.lang.WordUtils;
-import org.apache.log4j.Logger;
-import org.mediawiki.importer.DumpWriter;
-import org.mediawiki.importer.Page;
-import org.mediawiki.importer.Revision;
-import org.mediawiki.importer.Siteinfo;
-import org.wikimedia.lsearch.beans.Article;
-import org.wikimedia.lsearch.beans.ArticleLinks;
-import org.wikimedia.lsearch.beans.Title;
-import org.wikimedia.lsearch.config.Configuration;
-import org.wikimedia.lsearch.config.IndexId;
-import org.wikimedia.lsearch.util.Localization;
-
-/**
- * Reads page links and references, i.e. how many times a page
- * is referenced within other articles.
- *
- * @author rainman
- *
- */
-public class LinkReader implements DumpWriter {
- static Logger log = Logger.getLogger(LinkReader.class);
- Page page;
- Revision revision;
- Siteinfo siteinfo;
- /** ns:title -> number of referring articles */
- HashMap<String,ArticleLinks> links = new HashMap<String,ArticleLinks>();
- HashSet<String> interwiki;
- String langCode;
-
- public LinkReader(HashMap<String,ArticleLinks> links, String langCode){
- this.links = links;
- if(langCode == null || langCode.equals(""))
- langCode = "en";
- this.langCode = langCode;
- interwiki = Localization.getInterwiki();
- }
- public void writeRevision(Revision revision) throws IOException {
- this.revision = revision;
- }
- public void writeStartPage(Page page) throws IOException {
- this.page = page;
- }
- public void writeEndPage() throws IOException {
- ArticleLinks r = links.get(page.Title.Namespace+":"+page.Title.Text);
- // register redirect
- Title redirect = Localization.getRedirectTitle(revision.Text,langCode);
- if( redirect !=null ){
- r.redirectsTo = findArticleLinks(redirect.getNamespace(),redirect.getTitle());
- } else // process links
- processLinks(revision.Text,page.Title.Namespace);
- }
-
- /** Find the links object for the ns:title key */
- protected ArticleLinks findArticleLinks(int ns, String title){
- String key;
- ArticleLinks rank;
- // try exact match
- key = ns+":"+title;
- rank = links.get(key);
- if(rank != null)
- return rank;
- // try lowercase
- key = ns+":"+title.toLowerCase();
- rank = links.get(key);
- if(rank != null)
- return rank;
- // try title case
- key = ns+":"+WordUtils.capitalize(title);
- rank = links.get(key);
- if(rank != null)
- return rank;
- // try capitalizing at word breaks
- key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'});
- rank = links.get(key);
- if(rank != null)
- return rank;
-
- return null;
- }
-
- /** Extract all links from this page, and increment ref count for linked pages */
- protected void processLinks(String text, int namespace) {
- Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
- Matcher matcher = linkPat.matcher(text);
- int ns; String title;
- boolean escaped;
-
- HashSet<ArticleLinks> pagelinks = new HashSet<ArticleLinks>();
- while(matcher.find()){
- String link = matcher.group(1);
- int fragment = link.lastIndexOf('#');
- if(fragment != -1)
- link = link.substring(0,fragment);
- //System.out.println("Got link "+link);
- if(link.startsWith(":")){
- escaped = true;
- link = link.substring(1);
- } else escaped = false;
- ns = 0;
- title = link;
- // check for ns:title syntax
- String[] parts = link.split(":",2);
- if(parts.length == 2 && parts[0].length() > 1){
- Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase());
- if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14)))
- continue; // categories, ignore
- if(inx!=null && inx < 0)
- continue; // special pages, ignore
- if(inx != null){
- ns = inx;
- title = parts[1];
- }
-
- // ignore interwiki links
- if(interwiki.contains(parts[0]))
- continue;
- }
- if(ns == 0 && namespace!=0)
- continue; // skip links from other namespaces into the main namespace
-
- // register as link
- ArticleLinks target = findArticleLinks(ns,title);
- if(target != null)
- pagelinks.add(target);
- }
- // increment page ranks
- for(ArticleLinks rank : pagelinks){
- rank.links++;
- }
- }
- public void writeSiteinfo(Siteinfo info) throws IOException {
- siteinfo = info;
- }
- public void close() throws IOException {
- // nop
- }
- public void writeEndWiki() throws IOException {
- // nop
- }
- public void writeStartWiki() throws IOException {
- // nop
- }
- public HashMap<String, ArticleLinks> getRanks() {
- return links;
- }
-
-}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java
@@ -1,63 +0,0 @@
-package org.wikimedia.lsearch.importer;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map.Entry;
-
-import org.mediawiki.importer.DumpWriter;
-import org.mediawiki.importer.Page;
-import org.mediawiki.importer.Revision;
-import org.mediawiki.importer.Siteinfo;
-import org.wikimedia.lsearch.beans.ArticleLinks;
-import org.wikimedia.lsearch.util.Localization;
-
-/**
- * Read a HashSet of titles from dump
- *
- * @author rainman
- *
- */
-public class TitleReader implements DumpWriter{
- Page page;
- Revision revision;
- HashMap<String,ArticleLinks> titles = new HashMap<String,ArticleLinks>();
- protected String langCode;
-
- public TitleReader(String langCode){
- this.langCode = langCode;
- }
-
- public void writeRevision(Revision revision) throws IOException {
- this.revision = revision;
- }
- public void writeStartPage(Page page) throws IOException {
- this.page = page;
- }
- public void writeEndPage() throws IOException {
- String key = page.Title.Namespace+":"+page.Title.Text;
- titles.put(key,new ArticleLinks(0));
- }
- public HashMap<String,ArticleLinks> getTitles() {
- return titles;
- }
- public void close() throws IOException {
- // nop
- }
- public void writeEndWiki() throws IOException {
- // nop
- }
- public void writeSiteinfo(Siteinfo info) throws IOException {
- // write siteinfo to localization
- Iterator it = info.Namespaces.orderedEntries();
- while(it.hasNext()){
- Entry<Integer,String> pair = (Entry<Integer,String>)it.next();
- Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode);
- }
- }
- public void writeStartWiki() throws IOException {
- // nop
- }
-}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -20,6 +20,8 @@
 import org.wikimedia.lsearch.beans.Title;
 import org.wikimedia.lsearch.config.Configuration;
 import org.wikimedia.lsearch.config.IndexId;
+import org.wikimedia.lsearch.ranks.CompactArticleLinks;
+import org.wikimedia.lsearch.ranks.Links;
 import org.wikimedia.lsearch.util.Localization;

 public class DumpImporter implements DumpWriter {
@@ -28,11 +30,11 @@
 Revision revision;
 SimpleIndexWriter writer;
 int count = 0, limit;
- HashMap<String,ArticleLinks> ranks;
+ Links ranks;
 String langCode;

 public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor,
- Integer maxBufDocs, boolean newIndex, HashMap<String,ArticleLinks> ranks, String langCode){
+ Integer maxBufDocs, boolean newIndex, Links ranks, String langCode){
 Configuration.open(); // make sure configuration is loaded
 writer = new SimpleIndexWriter(IndexId.get(dbname), optimize, mergeFactor, maxBufDocs, newIndex);
 this.limit = limit;
@@ -48,7 +50,7 @@
 public void writeEndPage() throws IOException {
 // get reference count
 String key = page.Title.Namespace+":"+page.Title.Text;
- ArticleLinks r = ranks.get(key);
+ CompactArticleLinks r = ranks.get(key);
 int references;
 boolean isRedirect = r.redirectsTo != null;
 if(r == null){
@@ -59,9 +61,9 @@
 // make list of redirects
 ArrayList<Redirect> redirects = new ArrayList<Redirect>();
 if(r.redirected != null){
- for(String rk : r.redirected){
- String[] parts = rk.split(":",2);
- redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],ranks.get(rk).links));
+ for(CompactArticleLinks rk : r.redirected){
+ String[] parts = rk.toString().split(":",2);
+ redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],rk.links));
 }
 }
 // make article
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -16,6 +16,8 @@
 import org.wikimedia.lsearch.config.GlobalConfiguration;
 import org.wikimedia.lsearch.config.IndexId;
 import org.wikimedia.lsearch.index.IndexThread;
+import org.wikimedia.lsearch.ranks.Links;
+import org.wikimedia.lsearch.ranks.RankBuilder;
 import org.wikimedia.lsearch.util.Localization;
 import org.wikimedia.lsearch.util.UnicodeDecomposer;

@@ -96,8 +98,9 @@
 long start = System.currentTimeMillis();

 // regenerate link and redirect information
- HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile,langCode),langCode);
-
+ Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS);
+ links.generateRedirectLists();
+
 log.info("Third pass, indexing articles...");

 // open
@@ -148,59 +151,6 @@
 }
 }

- private static HashMap<String,ArticleLinks> processLinks(String inputfile, HashMap<String,ArticleLinks> links, String langCode) {
- log.info("Second pass, calculating article links...");
- InputStream input = null;
- // second pass - calculate page ranks
- try {
- input = Tools.openInputFile(inputfile);
- } catch (IOException e) {
- log.fatal("I/O error opening "+inputfile);
- return null;
- }
- // calculate ranks
- LinkReader rr = new LinkReader(links,langCode);
- XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
- try {
- reader.readDump();
- } catch (IOException e) {
- log.fatal("I/O error reading dump while calculating ranks for from "+inputfile);
- return null;
- }
- // generate "redirects here" lists for each article
- for(Entry<String,ArticleLinks> e : links.entrySet()){
- ArticleLinks r = e.getValue();
- if(r.redirectsTo != null && r != r.redirectsTo){
- if(r.redirectsTo.redirected == null)
- r.redirectsTo.redirected = new ArrayList<String>();
- r.redirectsTo.redirected.add(e.getKey());
- }
- }
- return links;
- }
-
- private static HashMap<String,ArticleLinks> getTitles(String inputfile,String langCode) {
- log.info("First pass, getting a list of valid articles...");
- InputStream input = null;
- try {
- input = Tools.openInputFile(inputfile);
- } catch (IOException e) {
- log.fatal("I/O error opening "+inputfile);
- return null;
- }
- // first pass, get titles
- TitleReader tr = new TitleReader(langCode);
- XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
- try {
- reader.readDump();
- input.close();
- } catch (IOException e) {
- log.fatal("I/O error reading dump while getting titles from "+inputfile);
- return null;
- }
- return tr.getTitles();
- }
-
 private static String formatTime(long l) {
 l /= 1000;
 if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s";
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/package.html
@@ -0,0 +1,3 @@
+<html><body>
+Storage of data, mainly in database. E.g. page ranks, text for highlighting...
+</body></html>
\ No newline at end of file
Property changes on: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/package.html
___________________________________________________________________
Added: svn:executable
+ *
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/Storage.java
@@ -0,0 +1,29 @@
+package org.wikimedia.lsearch.storage;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.wikimedia.lsearch.beans.Title;
+import org.wikimedia.lsearch.ranks.CompactArticleLinks;
+
+abstract public class Storage {
+ static protected Storage instance = null;
+
+ /** Get instance of Storage singleton class */
+ public static synchronized Storage getInstance(){
+ if(instance == null)
+ instance = new MySQLStorage();
+ return instance;
+ }
+
+ /**
+ * Store a complete array of page references
+ */
+ abstract public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException;
+
+ /**
+ * Fetch page references for number of titles
+ */
+ abstract public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException;
+
+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java
@@ -0,0 +1,267 @@
+package org.wikimedia.lsearch.storage;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import org.apache.log4j.Logger;
+import org.wikimedia.lsearch.beans.Title;
+import org.wikimedia.lsearch.config.Configuration;
+import org.wikimedia.lsearch.ranks.CompactArticleLinks;
+
+/**
+ * MySQL storage backend
+ *
+ *
+ * @author rainman
+ *
+ */
+public class MySQLStorage extends Storage {
+ static Logger log = Logger.getLogger(MySQLStorage.class);
+ protected Configuration config;
+ /** master host */
+ protected String master;
+ /** slave host -> % of load */
+ protected Hashtable<String,Double> slaves = null;
+ /** mysql username */
+ protected String username;
+ /** mysql password */
+ protected String password;
+ /** If we should separate data in many dbs */
+ protected boolean separate;
+ /** db where to put everything, if we are not using one db per dbname */
+ protected String defaultDB;
+ /** where sql stuff is, e.g. references_table.sql */
+ protected String lib;
+ /** table name -> create table file */
+ protected Hashtable<String,String> tableDefs = new Hashtable<String,String>();
+
+ protected MySQLStorage() {
+ config = Configuration.open();
+ try {
+ Class.forName("com.mysql.jdbc.Driver");
+ } catch (ClassNotFoundException e) {
+ log.error("Cannot load mysql jdbc driver, class not found: "+e.getMessage());
+ }
+
+ lib = config.getString("Storage","lib","./sql");
+
+ master = config.getString("Storage","master","localhost");
+ String[] ss = config.getArray("Storage","slaves");
+ if(ss != null){
+ Hashtable<String,Double> rawslaves = new Hashtable<String,Double>();
+ for(String slave : ss){
+ String[] parts = slave.split("->",2);
+ if(parts.length==2){
+ rawslaves.put(parts[0],Double.parseDouble(parts[1]));
+ }
+ }
+ // normalize to 1
+ double sum = 0;
+ for(Double d : rawslaves.values())
+ sum += d;
+ if(sum == 0) // in case no loads are specified
+ sum = 1;
+ slaves = new Hashtable<String,Double>();
+ for(Entry<String,Double> ed : rawslaves.entrySet())
+ slaves.put(ed.getKey(),ed.getValue()/sum);
+
+ }
+
+ username = config.getString("Storage","username","root");
+ password = config.getString("Storage","password","");
+
+ // figure out db configuration
+ separate = config.getBoolean("Storage","useSeparateDBs");
+ if(!separate){
+ defaultDB = config.getString("Storage","defaultDB");
+ if(defaultDB == null){
+ log.error("Set Storage.defaultDB in local configuration.");
+ }
+ }
+ }
+
+ /** Get connection for writing stuff, i.e. on the master */
+ protected Connection getReadConnection(String dbname) throws IOException{
+ return openConnection(dbname,false);
+ }
+
+ /** Get connection for reading of (possibly lagged) stuff, i.e. on slaves (or master if there are no slaves) */
+ protected Connection getWriteConnection(String dbname) throws IOException{
+ return openConnection(dbname,true);
+ }
+
+ /** Open connection on the master, or load-balanced on one of the slaves */
+ protected Connection openConnection(String dbname, boolean onMaster) throws IOException {
+ String host=null;
+ if(onMaster || slaves == null)
+ host = master;
+ else{
+ // load balance slaves
+ double r = Math.random();
+ for(Entry<String,Double> load : slaves.entrySet()){
+ r-=load.getValue();
+ if(r < 0){
+ host = load.getKey();
+ break;
+ }
+ }
+ }
+ String dburl = "jdbc:mysql://"+host+":3306/";
+ if(!separate && defaultDB!=null)
+ dburl += defaultDB;
+ try {
+ return DriverManager.getConnection(dburl, username, password);
+ } catch (SQLException e) {
+ log.error("Cannot establish connection to "+dburl+" - check host, db, username and password : "+e.getMessage());
+ throw new IOException("Cannot establish connection to mysql database.");
+ }
+ }
+
+ public String quote(String str){
+ return "'"+str+"'";
+ }
+
+ public String escape(String str){
+ return str.replace("'","\\'");
+ }
+
+ public String getTableName(String name, String dbname){
+ if(!separate)
+ return dbname+"_"+name;
+ else
+ return name;
+ }
+
+ // inherit javadoc
+ public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException {
+ String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE ";
+ if(titles.size()==1){
+ sql += "rf_key="+quote(escape(titles.iterator().next().getKey()));
+ } else{
+ StringBuilder sb = new StringBuilder(sql);
+ sb.append("rf_key IN (");
+ Iterator<Title> it = titles.iterator();
+ while(it.hasNext()){
+ sb.append('\'');
+ sb.append(it.next().getKey());
+ sb.append('\'');
+ if(it.hasNext())
+ sb.append(',');
+ }
+ sb.append(")");
+ sql = sb.toString();
+ }
+ try {
+ Connection conn = getReadConnection(dbname);
+ log.info("Fetching references for "+titles.size()+" pages");
+ Statement stmt = conn.createStatement();
+ ResultSet res = stmt.executeQuery(sql);
+ ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>();
+ while(res.next()){
+ ret.add(new CompactArticleLinks(res.getString("rf_key"),res.getInt("rf_references")));
+ }
+ conn.close();
+ return ret;
+ } catch (SQLException e) {
+ log.error("Cannot execute sql "+sql+" : "+e.getMessage());
+ throw new IOException(e.getMessage());
+ }
+ }
+
+ // inherit javadoc
+ public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException {
+ final int maxPerQuery = 10000;
+ Connection conn = getWriteConnection(dbname);
+ verifyTable("references",dbname,conn);
+ Iterator<CompactArticleLinks> it = refs.iterator();
+ // send chunks of maxPerQuery referenace replacements
+ while(it.hasNext()){
+ StringBuilder sb = new StringBuilder("REPLACE INTO "+getTableName("references",dbname)+" (rf_key,rf_references) VALUES ");
+ int count = 0;
+ while(it.hasNext() && count < maxPerQuery){
+ CompactArticleLinks cs = it.next();
+ sb.append("('");
+ sb.append(escape(cs.getKey()));
+ sb.append("','");
+ sb.append(cs.links);
+ count++;
+ if(it.hasNext() && count<maxPerQuery)
+ sb.append("'), ");
+ else
+ sb.append("');");
+ }
+ try {
+ log.info("Storing "+Math.min(maxPerQuery,count)+" page ranks... ");
+ Statement stmt = conn.createStatement();
+ stmt.executeUpdate(sb.toString());
+
+ } catch (SQLException e) {
+ log.error("Cannot execute replace query "+sb+" : "+e.getMessage());
+ throw new IOException(e.getMessage());
+ }
+ }
+ try {
+ conn.close(); // be sure we close the connection
+ } catch (SQLException e) {
+ }
+ }
+
+ /** Creates table if it doesn't exist */
+ protected void verifyTable(String name, String dbname, Connection conn) throws IOException {
+ // verify if table exists
+ String table = getTableName(name,dbname);
+ try {
+ log.info("Verifying table "+name+" on "+dbname);
+ Statement stmt = conn.createStatement();
+ ResultSet res = stmt.executeQuery("SHOW TABLES LIKE '"+table+"';");
+ if(res.next()) // table exists!
+ return;
+
+ } catch (SQLException e) {
+ log.error("Cannot verify table "+table+" : "+e.getMessage());
+ throw new IOException(e.getMessage());
+ }
+
+ // fetch table definition
+ String def = tableDefs.get(name);
+ if(def == null){
+ if(!lib.endsWith(Configuration.PATH_SEP))
+ lib = lib+Configuration.PATH_SEP;
+
+ BufferedReader file = new BufferedReader(new FileReader(lib+name+"_table.sql"));
+ StringBuilder sb = new StringBuilder();
+ String line;
+ while((line = file.readLine()) != null){
+ sb.append(line.replaceFirst("--.*",""));
+ }
+ def = sb.toString();
+ }
+ // preprocess dbprefix tags
+ String tdef;
+ if(!separate)
+ tdef = def.replace("/*DBprefix*/",dbname+"_");
+ else
+ tdef = def;
+ // create
+ try {
+ log.info("Creating table "+name+" on "+dbname);
+ Statement stmt = conn.createStatement();
+ stmt.executeUpdate(tdef);
+ } catch (SQLException e) {
+ log.error("Cannot create table "+table+" : "+e.getMessage());
+ throw new IOException(e.getMessage());
+ }
+ }
+}
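Two notes on the backend above. First, as committed, the javadoc comments on getReadConnection() and getWriteConnection() appear swapped relative to what the methods do; the call sites behave sensibly though: getPageReferences() reads via openConnection(dbname,false), which load-balances across slaves, while storePageReferences() writes via openConnection(dbname,true), which always targets the master. Second, the slave selection is a standard weighted random pick over loads normalized to sum to 1; a self-contained sketch of the same technique (host names illustrative):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class SlavePickDemo {
        // Weighted random pick, mirroring MySQLStorage.openConnection:
        // weights are assumed normalized so they sum to 1.
        static String pickSlave(Map<String,Double> slaves) {
            double r = Math.random();
            for (Map.Entry<String,Double> e : slaves.entrySet()) {
                r -= e.getValue();
                if (r < 0)
                    return e.getKey();
            }
            // floating-point edge case: fall back to the first entry
            return slaves.keySet().iterator().next();
        }

        public static void main(String[] args) {
            Map<String,Double> slaves = new LinkedHashMap<String,Double>();
            slaves.put("host1", 1.0 / 3);  // receives ~33% of reads
            slaves.put("host2", 2.0 / 3);  // receives ~67% of reads
            System.out.println(pickSlave(slaves));
        }
    }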
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexRegistry.java
@@ -26,11 +26,20 @@
 protected Hashtable<String,LocalIndex> latestUpdate;
 /** current search index */
 protected Hashtable<String,LocalIndex> currentSearch;
+ /** when was the last time when snapshot was refreshed */
+ protected Hashtable<String,Long> lastSnapshotRefresh = new Hashtable<String,Long>();

+ protected Object lock = new Object();
+
 protected static IndexRegistry instance = null;

 /** Get info about the latest index snapshot */
 public LocalIndex getLatestSnapshot(IndexId iid){
+ synchronized (lock) {
+ // wait at least 5 second before the next refresh
+ if(lastSnapshotRefresh.get(iid.toString()) == null || (System.currentTimeMillis() - lastSnapshotRefresh.get(iid.toString()) > 5000))
+ refreshSnapshots(iid);
+ }
 return latestSnapshot.get(iid.toString()); // hashtable is synchronized
 }

@@ -95,10 +104,11 @@
 } else if(latestSnapshot.get(iid.toString()) != null){
 latestSnapshot.remove((iid.toString()));
 }
+ lastSnapshotRefresh.put(iid.toString(),System.currentTimeMillis());
 }

 /** Refresh latest search update info */
- public synchronized void refreshUpdates(IndexId iid){
+ public synchronized void refreshUpdates(IndexId iid){
 File updateDir = new File(iid.getUpdatePath());
 LocalIndex latest = getLatestLocalIndex(updateDir,iid);
 if(latest != null){
@@ -106,6 +116,7 @@
 } else if(latestUpdate.get(iid.toString()) != null){
 latestUpdate.remove((iid.toString()));
 }
+
 }

 /** Tell registry this is the most current version of search index */
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -35,7 +35,7 @@
 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
 return f;
 }
- } else if(fieldName.equals("title") || fieldName.startsWith("alttitle")){
+ } else if(fieldName.equals("title") || fieldName.equals("stemtitle") || fieldName.startsWith("alttitle")){
 float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
 return f;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java
@@ -152,4 +152,5 @@
 public String getKey(){
 return article.getKey();
 }
+
 }
Index: trunk/lucene-search-2.0/.classpath
@@ -14,5 +14,6 @@
 <classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/>
 <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/>
 <classpathentry kind="lib" path="lib/highlighter.jar" sourcepath="/sandbox/highlighter"/>
+ <classpathentry kind="lib" path="lib/mysql-connector-java-3.0.17-ga-bin.jar"/>
 <classpathentry kind="output" path="bin"/>
 </classpath>
Index: trunk/lucene-search-2.0/lsearch.conf
@@ -51,6 +51,31 @@
 Index.maxqueuetimeout=12

 ################################################
+# Storage backend (currently mysql)
+################################################
+
+# host of database master
+Storage.master=localhost
+
+# array of host->load
+#Storage.slaves=host1->10 host2->50 host3->100
+
+# Storage.username=root
+# Storage.password=
+
+# Values:
+# true - each dbname has a separate db of that name
+# false - each dbname is a prefix for tables in a default db (set default db below)
+Storage.useSeparateDBs=false
+
+# Default db where all the stuff will be stored (if useSeparateDB=false)
+Storage.defaultDB=lsearch
+
+# Where table definitions are
+Storage.lib=/var/www/html/lucene-search-2.0/sql
+
+
+################################################
 # Log, ganglia, localization
 ################################################


Status & tagging log