Index: trunk/lucene-search-2.0/lib/mysql-connector-java-3.0.17-ga-bin.jar |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/mysql-connector-java-3.0.17-ga-bin.jar |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 1 | + application/octet-stream |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.Iterator; |
| 7 | +import java.util.Map.Entry; |
7 | 8 | |
8 | 9 | import org.apache.log4j.Logger; |
9 | 10 | import org.mediawiki.importer.DumpWriter; |
— | — | @@ -65,11 +66,11 @@ |
66 | 67 | } |
67 | 68 | public void writeEndPage() throws IOException { |
68 | 69 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects); |
69 | | - log.info("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); |
| 70 | + log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); |
70 | 71 | records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE)); |
71 | 72 | log.debug(iid+": Update for "+article); |
72 | 73 | references = 0; |
73 | | - redirects.clear(); |
| 74 | + redirects = new ArrayList<Redirect>(); |
74 | 75 | } |
75 | 76 | |
76 | 77 | public void writeSiteinfo(Siteinfo info) throws IOException { |
— | — | @@ -77,9 +78,8 @@ |
78 | 79 | // write to localization |
79 | 80 | Iterator it = info.Namespaces.orderedEntries(); |
80 | 81 | while(it.hasNext()){ |
81 | | - Integer inx = (Integer)it.next(); |
82 | | - String prefix = info.Namespaces.getPrefix(inx); |
83 | | - Localization.addCustomMapping(prefix,inx,langCode); |
| 82 | + Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
| 83 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
84 | 84 | } |
85 | 85 | } |
86 | 86 | |
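
A note on the redirects change above: the article queued in records keeps a reference to the redirects list rather than a copy, so the old redirects.clear() also emptied the list inside the already-collected article; reassigning a fresh ArrayList leaves it intact. A minimal self-contained sketch of that aliasing pitfall (class name and string keys are hypothetical stand-ins for Redirect objects):

    import java.util.ArrayList;

    public class AliasingSketch {
        public static void main(String[] args) {
            ArrayList<String> redirects = new ArrayList<String>();
            redirects.add("0:Foo");
            ArrayList<String> heldByArticle = redirects;  // the Article stores the reference, not a copy
            redirects.clear();                            // old code: empties the article's list too
            System.out.println(heldByArticle.size());     // prints 0 -- redirects lost before indexing
        }
    }
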
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java |
— | — | @@ -10,15 +10,21 @@ |
11 | 11 | import java.net.Authenticator; |
12 | 12 | import java.net.PasswordAuthentication; |
13 | 13 | import java.util.ArrayList; |
| 14 | +import java.util.HashMap; |
14 | 15 | import java.util.HashSet; |
15 | 16 | import java.util.Properties; |
16 | 17 | |
17 | 18 | import org.apache.log4j.Logger; |
| 19 | +import org.wikimedia.lsearch.beans.Article; |
| 20 | +import org.wikimedia.lsearch.beans.Redirect; |
| 21 | +import org.wikimedia.lsearch.beans.Title; |
18 | 22 | import org.wikimedia.lsearch.config.Configuration; |
19 | 23 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
20 | 24 | import org.wikimedia.lsearch.config.IndexId; |
21 | 25 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
22 | 26 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 27 | +import org.wikimedia.lsearch.ranks.Links; |
| 28 | +import org.wikimedia.lsearch.storage.Storage; |
23 | 29 | import org.wikimedia.lsearch.util.Localization; |
24 | 30 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
25 | 31 | |
— | — | @@ -160,6 +166,12 @@ |
161 | 167 | continue; |
162 | 168 | boolean hasMore = false; |
163 | 169 | do{ |
| 170 | + // fetch references for records |
| 171 | + fetchReferences(records,dbname); |
| 172 | + for(IndexUpdateRecord rec : records){ |
| 173 | + Article ar = rec.getArticle(); |
| 174 | + log.debug("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects()); |
| 175 | + } |
164 | 176 | // send to indexer |
165 | 177 | RMIMessengerClient messenger = new RMIMessengerClient(true); |
166 | 178 | try { |
— | — | @@ -235,4 +247,34 @@ |
236 | 248 | } |
237 | 249 | } while(daemon); |
238 | 250 | } |
| 251 | + |
| 252 | + protected static void fetchReferences(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException { |
| 253 | + Storage store = Storage.getInstance(); |
| 254 | + ArrayList<Title> titles = new ArrayList<Title>(); |
| 255 | + for(IndexUpdateRecord rec : records){ |
| 256 | + if(rec.isDelete()) |
| 257 | + continue; |
| 258 | + Article ar = rec.getArticle(); |
| 259 | + titles.add(ar.makeTitle()); |
| 260 | + if(ar.getRedirects() != null){ |
| 261 | + for(Redirect r : ar.getRedirects()){ |
| 262 | + titles.add(r.makeTitle()); |
| 263 | + } |
| 264 | + } |
| 265 | + } |
| 266 | + // fetch |
| 267 | + Links links = new Links(store.getPageReferences(titles,dbname)); |
| 268 | + // update |
| 269 | + for(IndexUpdateRecord rec : records){ |
| 270 | + if(rec.isDelete()) |
| 271 | + continue; |
| 272 | + Article ar = rec.getArticle(); |
| 273 | + ar.setReferences(links.getLinks(ar.makeTitle().getKey())); |
| 274 | + if(ar.getRedirects() != null){ |
| 275 | + for(Redirect r : ar.getRedirects()){ |
| 276 | + r.setReferences(links.getLinks(r.makeTitle().getKey())); |
| 277 | + } |
| 278 | + } |
| 279 | + } |
| 280 | + } |
239 | 281 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -185,7 +185,7 @@ |
186 | 186 | boolean isRed = false; |
187 | 187 | if(line.startsWith("#redirect")) |
188 | 188 | isRed = true; |
189 | | - else if(lang != null ){ |
| 189 | + else if(lang != null && redirects.get(lang)!=null){ |
190 | 190 | for(String magic : redirects.get(lang)){ |
191 | 191 | if(line.startsWith(magic)){ |
192 | 192 | isRed = true; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/TitleReader.java |
— | — | @@ -0,0 +1,63 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.HashMap; |
| 7 | +import java.util.HashSet; |
| 8 | +import java.util.Iterator; |
| 9 | +import java.util.Map.Entry; |
| 10 | + |
| 11 | +import org.mediawiki.importer.DumpWriter; |
| 12 | +import org.mediawiki.importer.Page; |
| 13 | +import org.mediawiki.importer.Revision; |
| 14 | +import org.mediawiki.importer.Siteinfo; |
| 15 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 16 | +import org.wikimedia.lsearch.util.Localization; |
| 17 | + |
| 18 | +/** |
| 19 | + * Reads page titles from a dump into a Links object |
| 20 | + * |
| 21 | + * @author rainman |
| 22 | + * |
| 23 | + */ |
| 24 | +public class TitleReader implements DumpWriter{ |
| 25 | + Page page; |
| 26 | + Revision revision; |
| 27 | + Links links = new Links(); |
| 28 | + protected String langCode; |
| 29 | + |
| 30 | + public TitleReader(String langCode){ |
| 31 | + this.langCode = langCode; |
| 32 | + } |
| 33 | + |
| 34 | + public void writeRevision(Revision revision) throws IOException { |
| 35 | + this.revision = revision; |
| 36 | + } |
| 37 | + public void writeStartPage(Page page) throws IOException { |
| 38 | + this.page = page; |
| 39 | + } |
| 40 | + public void writeEndPage() throws IOException { |
| 41 | + String key = page.Title.Namespace+":"+page.Title.Text; |
| 42 | + links.add(key,0); |
| 43 | + } |
| 44 | + public Links getTitles() { |
| 45 | + return links; |
| 46 | + } |
| 47 | + public void close() throws IOException { |
| 48 | + // nop |
| 49 | + } |
| 50 | + public void writeEndWiki() throws IOException { |
| 51 | + // nop |
| 52 | + } |
| 53 | + public void writeSiteinfo(Siteinfo info) throws IOException { |
| 54 | + // write siteinfo to localization |
| 55 | + Iterator it = info.Namespaces.orderedEntries(); |
| 56 | + while(it.hasNext()){ |
| 57 | + Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
| 58 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
| 59 | + } |
| 60 | + } |
| 61 | + public void writeStartWiki() throws IOException { |
| 62 | + // nop |
| 63 | + } |
| 64 | +} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java |
— | — | @@ -0,0 +1,97 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.UnsupportedEncodingException; |
| 5 | +import java.util.ArrayList; |
| 6 | + |
| 7 | + |
| 8 | +/** |
| 9 | + * Page links object optimized for low memory consumption: |
| 10 | + * the string is stored in a utf-8 encoded byte[] array, and |
| 11 | + * the same object is used as both key and value in a |
| 12 | + * hashmap. |
| 13 | + * |
| 14 | + * Two objects are equal iff they have the same string (other |
| 15 | + * fields are ignored in equals()). |
| 16 | + * |
| 17 | + * @author rainman |
| 18 | + * |
| 19 | + */ |
| 20 | +public class CompactArticleLinks{ |
| 21 | + /** format: <ns>:<title> */ |
| 22 | + protected byte[] str; |
| 23 | + public int links; |
| 24 | + protected int hash = 0; |
| 25 | + /** if this page is a redirect */ |
| 26 | + public CompactArticleLinks redirectsTo; |
| 27 | + /** list of pages that redirect here */ |
| 28 | + public ArrayList<CompactArticleLinks> redirected; |
| 29 | + |
| 30 | + public CompactArticleLinks(String s){ |
| 31 | + try { |
| 32 | + str = s.getBytes("utf-8"); |
| 33 | + } catch (UnsupportedEncodingException e) { |
| 34 | + e.printStackTrace(); |
| 35 | + } |
| 36 | + } |
| 37 | + |
| 38 | + public CompactArticleLinks(String s, int count){ |
| 39 | + this(s); |
| 40 | + this.links = count; |
| 41 | + } |
| 42 | + |
| 43 | + @Override |
| 44 | + public String toString() { |
| 45 | + try { |
| 46 | + return new String(str,0,str.length,"utf-8")+", count="+links; |
| 47 | + } catch (UnsupportedEncodingException e) { |
| 48 | + return ""; |
| 49 | + } |
| 50 | + } |
| 51 | + |
| 52 | + public String getKey(){ |
| 53 | + try { |
| 54 | + return new String(str,0,str.length,"utf-8"); |
| 55 | + } catch (UnsupportedEncodingException e) { |
| 56 | + return ""; |
| 57 | + } |
| 58 | + } |
| 59 | + |
| 60 | + public void addRedirect(CompactArticleLinks from){ |
| 61 | + if(redirected == null) |
| 62 | + redirected = new ArrayList<CompactArticleLinks>(); |
| 63 | + redirected.add(from); |
| 64 | + } |
| 65 | + @Override |
| 66 | + public int hashCode() { |
| 67 | + int h = hash; |
| 68 | + if(h == 0){ |
| 69 | + int off = 0; |
| 70 | + |
| 71 | + for (int i = 0; i < str.length; i++) { |
| 72 | + h = 31*h + str[off++]; |
| 73 | + } |
| 74 | + hash = h; |
| 75 | + } |
| 76 | + |
| 77 | + return h; |
| 78 | + } |
| 79 | + |
| 80 | + @Override |
| 81 | + public boolean equals(Object obj) { |
| 82 | + if (this == obj) |
| 83 | + return true; |
| 84 | + if (obj == null) |
| 85 | + return false; |
| 86 | + if (getClass() != obj.getClass()) |
| 87 | + return false; |
| 88 | + final CompactArticleLinks other = (CompactArticleLinks) obj; |
| 89 | + if(other.str.length != str.length) |
| 90 | + return false; |
| 91 | + for(int i=0;i<str.length;i++) |
| 92 | + if(str[i] != other.str[i]) |
| 93 | + return false; |
| 94 | + return true; |
| 95 | + } |
| 96 | + |
| 97 | + |
| 98 | +} |
\ No newline at end of file |
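
CompactArticleLinks is meant to be used with the same object as both key and value, so a lookup with a throwaway probe returns the stored, mutable counter. A minimal sketch of the idiom (class name and page key are hypothetical):

    import java.util.HashMap;
    import org.wikimedia.lsearch.ranks.CompactArticleLinks;

    public class LinksMapSketch {
        public static void main(String[] args) {
            HashMap<CompactArticleLinks,CompactArticleLinks> map =
                new HashMap<CompactArticleLinks,CompactArticleLinks>();
            CompactArticleLinks c = new CompactArticleLinks("0:Main Page", 0);
            map.put(c, c);                                // same object as key and value
            // equals()/hashCode() compare only the string, so a probe finds the stored object
            CompactArticleLinks stored = map.get(new CompactArticleLinks("0:Main Page"));
            stored.links++;                               // increment the stored counter in place
        }
    }
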
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/LinkReader.java |
— | — | @@ -0,0 +1,183 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.HashMap; |
| 6 | +import java.util.HashSet; |
| 7 | +import java.util.regex.Matcher; |
| 8 | +import java.util.regex.Pattern; |
| 9 | + |
| 10 | +import org.apache.commons.lang.WordUtils; |
| 11 | +import org.apache.log4j.Logger; |
| 12 | +import org.mediawiki.importer.DumpWriter; |
| 13 | +import org.mediawiki.importer.Page; |
| 14 | +import org.mediawiki.importer.Revision; |
| 15 | +import org.mediawiki.importer.Siteinfo; |
| 16 | +import org.wikimedia.lsearch.beans.Article; |
| 17 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 18 | +import org.wikimedia.lsearch.beans.Title; |
| 19 | +import org.wikimedia.lsearch.config.Configuration; |
| 20 | +import org.wikimedia.lsearch.config.IndexId; |
| 21 | +import org.wikimedia.lsearch.util.Localization; |
| 22 | + |
| 23 | +/** |
| 24 | + * Reads page links and references, i.e. how many times a page |
| 25 | + * is referenced within other articles. |
| 26 | + * |
| 27 | + * @author rainman |
| 28 | + * |
| 29 | + */ |
| 30 | +public class LinkReader implements DumpWriter { |
| 31 | + static Logger log = Logger.getLogger(LinkReader.class); |
| 32 | + Page page; |
| 33 | + Revision revision; |
| 34 | + Siteinfo siteinfo; |
| 35 | + /** ns:title -> number of referring articles */ |
| 36 | + Links links; |
| 37 | + HashSet<String> interwiki; |
| 38 | + String langCode; |
| 39 | + boolean readRedirects; |
| 40 | + |
| 41 | + public static final boolean READ_REDIRECTS = true; |
| 42 | + public static final boolean NO_REDIRECTS = false; |
| 43 | + |
| 44 | + public LinkReader(Links links, String langCode){ |
| 45 | + this(links,langCode,false); |
| 46 | + } |
| 47 | + |
| 48 | + public LinkReader(Links links, String langCode, boolean readRedirects){ |
| 49 | + this.links = links; |
| 50 | + this.readRedirects = readRedirects; |
| 51 | + if(langCode == null || langCode.equals("")) |
| 52 | + langCode = "en"; |
| 53 | + this.langCode = langCode; |
| 54 | + interwiki = Localization.getInterwiki(); |
| 55 | + } |
| 56 | + public void writeRevision(Revision revision) throws IOException { |
| 57 | + this.revision = revision; |
| 58 | + } |
| 59 | + public void writeStartPage(Page page) throws IOException { |
| 60 | + this.page = page; |
| 61 | + } |
| 62 | + public void writeEndPage() throws IOException { |
| 63 | + if(readRedirects){ |
| 64 | + // register redirect |
| 65 | + Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
| 66 | + if( redirect !=null ){ |
| 67 | + CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
| 68 | + if(cs != null) |
| 69 | + links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs); |
| 70 | + return; |
| 71 | + } |
| 72 | + } |
| 73 | + processLinks(revision.Text,page.Title.Namespace); |
| 74 | + } |
| 75 | + |
| 76 | + /** Find the links object for the ns:title key */ |
| 77 | + protected CompactArticleLinks findArticleLinks(int ns, String title){ |
| 78 | + String key; |
| 79 | + CompactArticleLinks rank; |
| 80 | + if(title.length() == 0) |
| 81 | + return null; |
| 82 | + // try exact match |
| 83 | + key = ns+":"+title; |
| 84 | + rank = links.get(key); |
| 85 | + if(rank != null) |
| 86 | + return rank; |
| 87 | + // try lowercase |
| 88 | + key = ns+":"+title.toLowerCase(); |
| 89 | + rank = links.get(key); |
| 90 | + if(rank != null) |
| 91 | + return rank; |
| 92 | + // try lowercase with first letter upper case |
| 93 | + if(title.length()==1) |
| 94 | + key = ns+":"+title.toUpperCase(); |
| 95 | + else |
| 96 | + key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase(); |
| 97 | + rank = links.get(key); |
| 98 | + if(rank != null) |
| 99 | + return rank; |
| 100 | + // try title case |
| 101 | + key = ns+":"+WordUtils.capitalize(title); |
| 102 | + rank = links.get(key); |
| 103 | + if(rank != null) |
| 104 | + return rank; |
| 105 | + // try upper case |
| 106 | + key = ns+":"+title.toUpperCase(); |
| 107 | + rank = links.get(key); |
| 108 | + if(rank != null) |
| 109 | + return rank; |
| 110 | + // try capitalizing at word breaks |
| 111 | + key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
| 112 | + rank = links.get(key); |
| 113 | + if(rank != null) |
| 114 | + return rank; |
| 115 | + |
| 116 | + return null; |
| 117 | + } |
| 118 | + |
| 119 | + /** Extract all links from this page, and increment ref count for linked pages */ |
| 120 | + protected void processLinks(String text, int namespace) { |
| 121 | + Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
| 122 | + Matcher matcher = linkPat.matcher(text); |
| 123 | + int ns; String title; |
| 124 | + boolean escaped; |
| 125 | + |
| 126 | + HashSet<CompactArticleLinks> pagelinks = new HashSet<CompactArticleLinks>(); |
| 127 | + while(matcher.find()){ |
| 128 | + String link = matcher.group(1); |
| 129 | + int fragment = link.lastIndexOf('#'); |
| 130 | + if(fragment != -1) |
| 131 | + link = link.substring(0,fragment); |
| 132 | + //System.out.println("Got link "+link); |
| 133 | + if(link.startsWith(":")){ |
| 134 | + escaped = true; |
| 135 | + link = link.substring(1); |
| 136 | + } else escaped = false; |
| 137 | + ns = 0; |
| 138 | + title = link; |
| 139 | + // check for ns:title syntax |
| 140 | + String[] parts = link.split(":",2); |
| 141 | + if(parts.length == 2 && parts[0].length() > 1){ |
| 142 | + Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
| 143 | + if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14))) |
| 144 | + continue; // categories, ignore |
| 145 | + if(inx!=null && inx < 0) |
| 146 | + continue; // special pages, ignore |
| 147 | + if(inx != null){ |
| 148 | + ns = inx; |
| 149 | + title = parts[1]; |
| 150 | + } |
| 151 | + |
| 152 | + // ignore interwiki links |
| 153 | + if(interwiki.contains(parts[0])) |
| 154 | + continue; |
| 155 | + } |
| 156 | + if(ns == 0 && namespace!=0) |
| 157 | + continue; // skip links from other namespaces into the main namespace |
| 158 | + // register as link |
| 159 | + CompactArticleLinks target = findArticleLinks(ns,title); |
| 160 | + if(target != null) |
| 161 | + pagelinks.add(target); |
| 162 | + } |
| 163 | + // increment page ranks |
| 164 | + for(CompactArticleLinks rank : pagelinks){ |
| 165 | + rank.links++; |
| 166 | + } |
| 167 | + } |
| 168 | + public void writeSiteinfo(Siteinfo info) throws IOException { |
| 169 | + siteinfo = info; |
| 170 | + } |
| 171 | + public void close() throws IOException { |
| 172 | + // nop |
| 173 | + } |
| 174 | + public void writeEndWiki() throws IOException { |
| 175 | + // nop |
| 176 | + } |
| 177 | + public void writeStartWiki() throws IOException { |
| 178 | + // nop |
| 179 | + } |
| 180 | + public Links getRanks() { |
| 181 | + return links; |
| 182 | + } |
| 183 | + |
| 184 | +} |
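
For reference, a minimal sketch of what the link pattern in processLinks() extracts, run against a hypothetical wikitext snippet; group(1) is the link target (group(3) would be the display text), and the fragment is stripped before lookup:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class LinkPatternSketch {
        public static void main(String[] args) {
            Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
            Matcher m = linkPat.matcher("See [[Help:Contents#Intro|the help]] and [[Main Page]].");
            while (m.find()) {
                String link = m.group(1);               // "Help:Contents#Intro", then "Main Page"
                int fragment = link.lastIndexOf('#');
                if (fragment != -1)
                    link = link.substring(0, fragment); // drop the section anchor
                System.out.println(link);               // prints "Help:Contents", "Main Page"
            }
        }
    }
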
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -0,0 +1,76 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
| 6 | +import java.util.HashMap; |
| 7 | +import java.util.Map.Entry; |
| 8 | + |
| 9 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 10 | + |
| 11 | +/** |
| 12 | + * Abstraction of link retrieval and other operations related to |
| 13 | + * CompactArticleLinks. |
| 14 | + * |
| 15 | + * @author rainman |
| 16 | + * |
| 17 | + */ |
| 18 | +public class Links { |
| 19 | + protected HashMap<CompactArticleLinks,CompactArticleLinks> links = new HashMap<CompactArticleLinks,CompactArticleLinks>(); |
| 20 | + |
| 21 | + public Links() { |
| 22 | + } |
| 23 | + |
| 24 | + public Links(Collection<CompactArticleLinks> col){ |
| 25 | + for(CompactArticleLinks c : col){ |
| 26 | + links.put(c,c); |
| 27 | + } |
| 28 | + } |
| 29 | + |
| 30 | + /** Add new page with key and ref */ |
| 31 | + public void add(String key, int ref){ |
| 32 | + CompactArticleLinks cs = new CompactArticleLinks(key,ref); |
| 33 | + links.put(cs,cs); |
| 34 | + } |
| 35 | + |
| 36 | + /** Setup redirect key -> tokey */ |
| 37 | + public void setRedirect(String key, String tokey){ |
| 38 | + CompactArticleLinks from = links.get(new CompactArticleLinks(key)); |
| 39 | + CompactArticleLinks to = links.get(new CompactArticleLinks(tokey)); |
| 40 | + from.redirectsTo = to; |
| 41 | + } |
| 42 | + |
| 43 | + /** Setup redirect key -> to */ |
| 44 | + public void setRedirect(String key, CompactArticleLinks to){ |
| 45 | + CompactArticleLinks from = links.get(new CompactArticleLinks(key)); |
| 46 | + from.redirectsTo = to; |
| 47 | + } |
| 48 | + |
| 49 | + /** Get links object from key */ |
| 50 | + public CompactArticleLinks get(String key){ |
| 51 | + return links.get(new CompactArticleLinks(key)); |
| 52 | + } |
| 53 | + |
| 54 | + /** Get collection of all links objects */ |
| 55 | + public Collection<CompactArticleLinks> getAll(){ |
| 56 | + return links.values(); |
| 57 | + } |
| 58 | + |
| 59 | + /** Get number of references (links) to article of key */ |
| 60 | + public int getLinks(String key){ |
| 61 | + CompactArticleLinks c = links.get(new CompactArticleLinks(key)); |
| 62 | + if(c == null) |
| 63 | + return 0; |
| 64 | + else |
| 65 | + return c.links; |
| 66 | + } |
| 67 | + |
| 68 | + /** Generate "redirects here" lists for each article */ |
| 69 | + public void generateRedirectLists(){ |
| 70 | + for(CompactArticleLinks r : links.values()){ |
| 71 | + if(r.redirectsTo != null && r != r.redirectsTo){ |
| 72 | + r.redirectsTo.addRedirect(r); |
| 73 | + } |
| 74 | + } |
| 75 | + |
| 76 | + } |
| 77 | +} |
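
A minimal usage sketch of the Links API above, with hypothetical page keys in the ns:title form used throughout; TitleReader populates it in the first pass, LinkReader fills in redirects and counts in the second:

    import org.wikimedia.lsearch.ranks.Links;

    public class LinksSketch {
        public static void main(String[] args) {
            Links links = new Links();
            links.add("0:Main Page", 0);                    // pass 1 registers valid titles
            links.add("0:MainPage", 0);
            links.setRedirect("0:MainPage", "0:Main Page"); // pass 2 records redirects
            links.generateRedirectLists();                  // build "redirects here" lists
            System.out.println(links.getLinks("0:Main Page")); // 0 for unknown keys, else count
        }
    }
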
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java |
— | — | @@ -0,0 +1,130 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.InputStream; |
| 6 | +import java.util.ArrayList; |
| 7 | +import java.util.HashMap; |
| 8 | +import java.util.HashSet; |
| 9 | +import java.util.Map.Entry; |
| 10 | + |
| 11 | +import org.apache.log4j.Logger; |
| 12 | +import org.mediawiki.dumper.ProgressFilter; |
| 13 | +import org.mediawiki.dumper.Tools; |
| 14 | +import org.mediawiki.importer.XmlDumpReader; |
| 15 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 16 | +import org.wikimedia.lsearch.config.Configuration; |
| 17 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 18 | +import org.wikimedia.lsearch.config.IndexId; |
| 19 | +import org.wikimedia.lsearch.index.IndexThread; |
| 20 | +import org.wikimedia.lsearch.storage.Storage; |
| 21 | +import org.wikimedia.lsearch.util.Localization; |
| 22 | +import org.wikimedia.lsearch.util.UnicodeDecomposer; |
| 23 | + |
| 24 | +/** |
| 25 | + * Main class, builds page rank (link count) info from a database dump. |
| 26 | + * Syntax: java RankBuilder inputfile dbname |
| 27 | + * |
| 28 | + * @author rainman |
| 29 | + * |
| 30 | + */ |
| 31 | +public class RankBuilder { |
| 32 | + static Logger log = Logger.getLogger(RankBuilder.class); |
| 33 | + /** |
| 34 | + * @param args |
| 35 | + * @throws IOException |
| 36 | + */ |
| 37 | + public static void main(String[] args) throws IOException { |
| 38 | + String inputfile = null; |
| 39 | + String dbname = null; |
| 40 | + |
| 41 | + System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n"); |
| 42 | + |
| 43 | + Configuration.open(); |
| 44 | + log = Logger.getLogger(RankBuilder.class); |
| 45 | + |
| 46 | + if(args.length < 2){ |
| 48 | + System.out.println("Syntax: java RankBuilder <inputfile> <dbname>"); |
| 48 | + return; |
| 49 | + } |
| 50 | + inputfile = args[0]; |
| 51 | + dbname = args[1]; |
| 52 | + if(inputfile == null || dbname == null){ |
| 53 | + System.out.println("Please specify both input xml file and database name"); |
| 54 | + return; |
| 55 | + } |
| 56 | + |
| 57 | + String langCode = GlobalConfiguration.getInstance().getLanguage(dbname); |
| 58 | + // preload |
| 59 | + UnicodeDecomposer.getInstance(); |
| 60 | + Localization.readLocalization(langCode); |
| 61 | + Localization.loadInterwiki(); |
| 62 | + |
| 63 | + long start = System.currentTimeMillis(); |
| 64 | + |
| 65 | + // regenerate link info |
| 66 | + Links links = processLinks(inputfile,getTitles(inputfile,langCode),langCode,LinkReader.NO_REDIRECTS); |
| 67 | + |
| 68 | + Storage store = Storage.getInstance(); |
| 69 | + store.storePageReferences(links.getAll(),dbname); |
| 70 | + |
| 71 | + /*for(CompactArticleLinks cs : links.values()){ |
| 72 | + System.out.println(cs); |
| 73 | + }*/ |
| 74 | + |
| 75 | + long end = System.currentTimeMillis(); |
| 76 | + |
| 77 | + System.out.println("Finished generating ranks in "+formatTime(end-start)); |
| 78 | + } |
| 79 | + |
| 80 | + public static Links processLinks(String inputfile, Links links, String langCode, boolean readRedirects) { |
| 81 | + log.info("Second pass, calculating article links..."); |
| 82 | + InputStream input = null; |
| 83 | + // second pass - calculate page ranks |
| 84 | + try { |
| 85 | + input = Tools.openInputFile(inputfile); |
| 86 | + } catch (IOException e) { |
| 87 | + log.fatal("I/O error opening "+inputfile); |
| 88 | + return null; |
| 89 | + } |
| 90 | + // calculate ranks |
| 91 | + LinkReader rr = new LinkReader(links,langCode,readRedirects); |
| 92 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
| 93 | + try { |
| 94 | + reader.readDump(); |
| 95 | + } catch (IOException e) { |
| 96 | + log.fatal("I/O error reading dump while calculating ranks from "+inputfile); |
| 97 | + return null; |
| 98 | + } |
| 99 | + return links; |
| 100 | + } |
| 101 | + |
| 102 | + public static Links getTitles(String inputfile,String langCode) { |
| 103 | + log.info("First pass, getting a list of valid articles..."); |
| 104 | + InputStream input = null; |
| 105 | + try { |
| 106 | + input = Tools.openInputFile(inputfile); |
| 107 | + } catch (IOException e) { |
| 108 | + log.fatal("I/O error opening "+inputfile); |
| 109 | + return null; |
| 110 | + } |
| 111 | + // first pass, get titles |
| 112 | + TitleReader tr = new TitleReader(langCode); |
| 113 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
| 114 | + try { |
| 115 | + reader.readDump(); |
| 116 | + input.close(); |
| 117 | + } catch (IOException e) { |
| 118 | + log.fatal("I/O error reading dump while getting titles from "+inputfile); |
| 119 | + return null; |
| 120 | + } |
| 121 | + return tr.getTitles(); |
| 122 | + } |
| 123 | + |
| 124 | + private static String formatTime(long l) { |
| 125 | + l /= 1000; |
| 126 | + if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s"; |
| 127 | + else if(l >= 60) return (l%3600)/60+"m "+(l%60)+"s"; |
| 128 | + else return l+"s"; |
| 129 | + } |
| 130 | + |
| 131 | +} |
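
The same two passes can be driven from other code; Importer.java below reuses them with READ_REDIRECTS and then calls generateRedirectLists(). A minimal sketch of the pipeline, assuming a hypothetical local dump dump.xml and database wikidb:

    import java.io.IOException;
    import org.wikimedia.lsearch.ranks.LinkReader;
    import org.wikimedia.lsearch.ranks.Links;
    import org.wikimedia.lsearch.ranks.RankBuilder;
    import org.wikimedia.lsearch.storage.Storage;

    public class RankPipelineSketch {
        public static void main(String[] args) throws IOException {
            Links titles = RankBuilder.getTitles("dump.xml", "en");  // pass 1: valid article keys
            Links links = RankBuilder.processLinks("dump.xml", titles,
                    "en", LinkReader.NO_REDIRECTS);                  // pass 2: count links
            Storage.getInstance().storePageReferences(links.getAll(), "wikidb");
        }
    }
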
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java |
— | — | @@ -34,6 +34,10 @@ |
35 | 35 | return namespace+":"+title+" ("+references+")"; |
36 | 36 | } |
37 | 37 | |
| 38 | + public Title makeTitle(){ |
| 39 | + return new Title(namespace,title); |
| 40 | + } |
38 | 41 | |
39 | 42 | |
| 43 | + |
40 | 44 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Title.java |
— | — | @@ -16,6 +16,9 @@ |
17 | 17 | this.title = title; |
18 | 18 | } |
19 | 19 | |
| 20 | + public String getKey(){ |
| 21 | + return namespace+":"+title; |
| 22 | + } |
20 | 23 | |
21 | 24 | @Override |
22 | 25 | public String toString() { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -193,6 +193,10 @@ |
194 | 194 | this.redirectKeywordRanks = redirectKeywordRanks; |
195 | 195 | } |
196 | 196 | |
| 197 | + /** Get title object corresponding to this article */ |
| 198 | + public Title makeTitle(){ |
| 199 | + return new Title(Integer.parseInt(namespace),title); |
| 200 | + } |
197 | 201 | |
198 | 202 | |
199 | 203 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java |
— | — | @@ -1,156 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.importer; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | -import java.util.HashMap; |
6 | | -import java.util.HashSet; |
7 | | -import java.util.regex.Matcher; |
8 | | -import java.util.regex.Pattern; |
9 | | - |
10 | | -import org.apache.commons.lang.WordUtils; |
11 | | -import org.apache.log4j.Logger; |
12 | | -import org.mediawiki.importer.DumpWriter; |
13 | | -import org.mediawiki.importer.Page; |
14 | | -import org.mediawiki.importer.Revision; |
15 | | -import org.mediawiki.importer.Siteinfo; |
16 | | -import org.wikimedia.lsearch.beans.Article; |
17 | | -import org.wikimedia.lsearch.beans.ArticleLinks; |
18 | | -import org.wikimedia.lsearch.beans.Title; |
19 | | -import org.wikimedia.lsearch.config.Configuration; |
20 | | -import org.wikimedia.lsearch.config.IndexId; |
21 | | -import org.wikimedia.lsearch.util.Localization; |
22 | | - |
23 | | -/** |
24 | | - * Reads page links and references, i.e. how many times a page |
25 | | - * is referenced within other articles. |
26 | | - * |
27 | | - * @author rainman |
28 | | - * |
29 | | - */ |
30 | | -public class LinkReader implements DumpWriter { |
31 | | - static Logger log = Logger.getLogger(LinkReader.class); |
32 | | - Page page; |
33 | | - Revision revision; |
34 | | - Siteinfo siteinfo; |
35 | | - /** ns:title -> number of referring articles */ |
36 | | - HashMap<String,ArticleLinks> links = new HashMap<String,ArticleLinks>(); |
37 | | - HashSet<String> interwiki; |
38 | | - String langCode; |
39 | | - |
40 | | - public LinkReader(HashMap<String,ArticleLinks> links, String langCode){ |
41 | | - this.links = links; |
42 | | - if(langCode == null || langCode.equals("")) |
43 | | - langCode = "en"; |
44 | | - this.langCode = langCode; |
45 | | - interwiki = Localization.getInterwiki(); |
46 | | - } |
47 | | - public void writeRevision(Revision revision) throws IOException { |
48 | | - this.revision = revision; |
49 | | - } |
50 | | - public void writeStartPage(Page page) throws IOException { |
51 | | - this.page = page; |
52 | | - } |
53 | | - public void writeEndPage() throws IOException { |
54 | | - ArticleLinks r = links.get(page.Title.Namespace+":"+page.Title.Text); |
55 | | - // register redirect |
56 | | - Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
57 | | - if( redirect !=null ){ |
58 | | - r.redirectsTo = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
59 | | - } else // process links |
60 | | - processLinks(revision.Text,page.Title.Namespace); |
61 | | - } |
62 | | - |
63 | | - /** Find the links object for the ns:title key */ |
64 | | - protected ArticleLinks findArticleLinks(int ns, String title){ |
65 | | - String key; |
66 | | - ArticleLinks rank; |
67 | | - // try exact match |
68 | | - key = ns+":"+title; |
69 | | - rank = links.get(key); |
70 | | - if(rank != null) |
71 | | - return rank; |
72 | | - // try lowercase |
73 | | - key = ns+":"+title.toLowerCase(); |
74 | | - rank = links.get(key); |
75 | | - if(rank != null) |
76 | | - return rank; |
77 | | - // try title case |
78 | | - key = ns+":"+WordUtils.capitalize(title); |
79 | | - rank = links.get(key); |
80 | | - if(rank != null) |
81 | | - return rank; |
82 | | - // try capitalizing at word breaks |
83 | | - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
84 | | - rank = links.get(key); |
85 | | - if(rank != null) |
86 | | - return rank; |
87 | | - |
88 | | - return null; |
89 | | - } |
90 | | - |
91 | | - /** Extract all links from this page, and increment ref count for linked pages */ |
92 | | - protected void processLinks(String text, int namespace) { |
93 | | - Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
94 | | - Matcher matcher = linkPat.matcher(text); |
95 | | - int ns; String title; |
96 | | - boolean escaped; |
97 | | - |
98 | | - HashSet<ArticleLinks> pagelinks = new HashSet<ArticleLinks>(); |
99 | | - while(matcher.find()){ |
100 | | - String link = matcher.group(1); |
101 | | - int fragment = link.lastIndexOf('#'); |
102 | | - if(fragment != -1) |
103 | | - link = link.substring(0,fragment); |
104 | | - //System.out.println("Got link "+link); |
105 | | - if(link.startsWith(":")){ |
106 | | - escaped = true; |
107 | | - link = link.substring(1); |
108 | | - } else escaped = false; |
109 | | - ns = 0; |
110 | | - title = link; |
111 | | - // check for ns:title syntax |
112 | | - String[] parts = link.split(":",2); |
113 | | - if(parts.length == 2 && parts[0].length() > 1){ |
114 | | - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
115 | | - if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14))) |
116 | | - continue; // categories, ignore |
117 | | - if(inx!=null && inx < 0) |
118 | | - continue; // special pages, ignore |
119 | | - if(inx != null){ |
120 | | - ns = inx; |
121 | | - title = parts[1]; |
122 | | - } |
123 | | - |
124 | | - // ignore interwiki links |
125 | | - if(interwiki.contains(parts[0])) |
126 | | - continue; |
127 | | - } |
128 | | - if(ns == 0 && namespace!=0) |
129 | | - continue; // skip links from other namespaces into the main namespace |
130 | | - |
131 | | - // register as link |
132 | | - ArticleLinks target = findArticleLinks(ns,title); |
133 | | - if(target != null) |
134 | | - pagelinks.add(target); |
135 | | - } |
136 | | - // increment page ranks |
137 | | - for(ArticleLinks rank : pagelinks){ |
138 | | - rank.links++; |
139 | | - } |
140 | | - } |
141 | | - public void writeSiteinfo(Siteinfo info) throws IOException { |
142 | | - siteinfo = info; |
143 | | - } |
144 | | - public void close() throws IOException { |
145 | | - // nop |
146 | | - } |
147 | | - public void writeEndWiki() throws IOException { |
148 | | - // nop |
149 | | - } |
150 | | - public void writeStartWiki() throws IOException { |
151 | | - // nop |
152 | | - } |
153 | | - public HashMap<String, ArticleLinks> getRanks() { |
154 | | - return links; |
155 | | - } |
156 | | - |
157 | | -} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java |
— | — | @@ -1,63 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.importer; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | -import java.util.ArrayList; |
6 | | -import java.util.HashMap; |
7 | | -import java.util.HashSet; |
8 | | -import java.util.Iterator; |
9 | | -import java.util.Map.Entry; |
10 | | - |
11 | | -import org.mediawiki.importer.DumpWriter; |
12 | | -import org.mediawiki.importer.Page; |
13 | | -import org.mediawiki.importer.Revision; |
14 | | -import org.mediawiki.importer.Siteinfo; |
15 | | -import org.wikimedia.lsearch.beans.ArticleLinks; |
16 | | -import org.wikimedia.lsearch.util.Localization; |
17 | | - |
18 | | -/** |
19 | | - * Read a HashSet of titles from dump |
20 | | - * |
21 | | - * @author rainman |
22 | | - * |
23 | | - */ |
24 | | -public class TitleReader implements DumpWriter{ |
25 | | - Page page; |
26 | | - Revision revision; |
27 | | - HashMap<String,ArticleLinks> titles = new HashMap<String,ArticleLinks>(); |
28 | | - protected String langCode; |
29 | | - |
30 | | - public TitleReader(String langCode){ |
31 | | - this.langCode = langCode; |
32 | | - } |
33 | | - |
34 | | - public void writeRevision(Revision revision) throws IOException { |
35 | | - this.revision = revision; |
36 | | - } |
37 | | - public void writeStartPage(Page page) throws IOException { |
38 | | - this.page = page; |
39 | | - } |
40 | | - public void writeEndPage() throws IOException { |
41 | | - String key = page.Title.Namespace+":"+page.Title.Text; |
42 | | - titles.put(key,new ArticleLinks(0)); |
43 | | - } |
44 | | - public HashMap<String,ArticleLinks> getTitles() { |
45 | | - return titles; |
46 | | - } |
47 | | - public void close() throws IOException { |
48 | | - // nop |
49 | | - } |
50 | | - public void writeEndWiki() throws IOException { |
51 | | - // nop |
52 | | - } |
53 | | - public void writeSiteinfo(Siteinfo info) throws IOException { |
54 | | - // write siteinfo to localization |
55 | | - Iterator it = info.Namespaces.orderedEntries(); |
56 | | - while(it.hasNext()){ |
57 | | - Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
58 | | - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
59 | | - } |
60 | | - } |
61 | | - public void writeStartWiki() throws IOException { |
62 | | - // nop |
63 | | - } |
64 | | -} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -20,6 +20,8 @@ |
21 | 21 | import org.wikimedia.lsearch.beans.Title; |
22 | 22 | import org.wikimedia.lsearch.config.Configuration; |
23 | 23 | import org.wikimedia.lsearch.config.IndexId; |
| 24 | +import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 25 | +import org.wikimedia.lsearch.ranks.Links; |
24 | 26 | import org.wikimedia.lsearch.util.Localization; |
25 | 27 | |
26 | 28 | public class DumpImporter implements DumpWriter { |
— | — | @@ -28,11 +30,11 @@ |
29 | 31 | Revision revision; |
30 | 32 | SimpleIndexWriter writer; |
31 | 33 | int count = 0, limit; |
32 | | - HashMap<String,ArticleLinks> ranks; |
| 34 | + Links ranks; |
33 | 35 | String langCode; |
34 | 36 | |
35 | 37 | public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor, |
36 | | - Integer maxBufDocs, boolean newIndex, HashMap<String,ArticleLinks> ranks, String langCode){ |
| 38 | + Integer maxBufDocs, boolean newIndex, Links ranks, String langCode){ |
37 | 39 | Configuration.open(); // make sure configuration is loaded |
38 | 40 | writer = new SimpleIndexWriter(IndexId.get(dbname), optimize, mergeFactor, maxBufDocs, newIndex); |
39 | 41 | this.limit = limit; |
— | — | @@ -48,7 +50,7 @@ |
49 | 51 | public void writeEndPage() throws IOException { |
50 | 52 | // get reference count |
51 | 53 | String key = page.Title.Namespace+":"+page.Title.Text; |
52 | | - ArticleLinks r = ranks.get(key); |
| 54 | + CompactArticleLinks r = ranks.get(key); |
53 | 55 | int references; |
54 | 56 | boolean isRedirect = r.redirectsTo != null; |
55 | 57 | if(r == null){ |
— | — | @@ -59,9 +61,9 @@ |
60 | 62 | // make list of redirects |
61 | 63 | ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
62 | 64 | if(r.redirected != null){ |
63 | | - for(String rk : r.redirected){ |
64 | | - String[] parts = rk.split(":",2); |
65 | | - redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],ranks.get(rk).links)); |
| 65 | + for(CompactArticleLinks rk : r.redirected){ |
| 66 | + String[] parts = rk.toString().split(":",2); |
| 67 | + redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],rk.links)); |
66 | 68 | } |
67 | 69 | } |
68 | 70 | // make article |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -16,6 +16,8 @@ |
17 | 17 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
18 | 18 | import org.wikimedia.lsearch.config.IndexId; |
19 | 19 | import org.wikimedia.lsearch.index.IndexThread; |
| 20 | +import org.wikimedia.lsearch.ranks.Links; |
| 21 | +import org.wikimedia.lsearch.ranks.RankBuilder; |
20 | 22 | import org.wikimedia.lsearch.util.Localization; |
21 | 23 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
22 | 24 | |
— | — | @@ -96,8 +98,9 @@ |
97 | 99 | long start = System.currentTimeMillis(); |
98 | 100 | |
99 | 101 | // regenerate link and redirect information |
100 | | - HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile,langCode),langCode); |
101 | | - |
| 102 | + Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS); |
| 103 | + links.generateRedirectLists(); |
| 104 | + |
102 | 105 | log.info("Third pass, indexing articles..."); |
103 | 106 | |
104 | 107 | // open |
— | — | @@ -148,59 +151,6 @@ |
149 | 152 | } |
150 | 153 | } |
151 | 154 | |
152 | | - private static HashMap<String,ArticleLinks> processLinks(String inputfile, HashMap<String,ArticleLinks> links, String langCode) { |
153 | | - log.info("Second pass, calculating article links..."); |
154 | | - InputStream input = null; |
155 | | - // second pass - calculate page ranks |
156 | | - try { |
157 | | - input = Tools.openInputFile(inputfile); |
158 | | - } catch (IOException e) { |
159 | | - log.fatal("I/O error opening "+inputfile); |
160 | | - return null; |
161 | | - } |
162 | | - // calculate ranks |
163 | | - LinkReader rr = new LinkReader(links,langCode); |
164 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
165 | | - try { |
166 | | - reader.readDump(); |
167 | | - } catch (IOException e) { |
168 | | - log.fatal("I/O error reading dump while calculating ranks for from "+inputfile); |
169 | | - return null; |
170 | | - } |
171 | | - // generate "redirects here" lists for each article |
172 | | - for(Entry<String,ArticleLinks> e : links.entrySet()){ |
173 | | - ArticleLinks r = e.getValue(); |
174 | | - if(r.redirectsTo != null && r != r.redirectsTo){ |
175 | | - if(r.redirectsTo.redirected == null) |
176 | | - r.redirectsTo.redirected = new ArrayList<String>(); |
177 | | - r.redirectsTo.redirected.add(e.getKey()); |
178 | | - } |
179 | | - } |
180 | | - return links; |
181 | | - } |
182 | | - |
183 | | - private static HashMap<String,ArticleLinks> getTitles(String inputfile,String langCode) { |
184 | | - log.info("First pass, getting a list of valid articles..."); |
185 | | - InputStream input = null; |
186 | | - try { |
187 | | - input = Tools.openInputFile(inputfile); |
188 | | - } catch (IOException e) { |
189 | | - log.fatal("I/O error opening "+inputfile); |
190 | | - return null; |
191 | | - } |
192 | | - // first pass, get titles |
193 | | - TitleReader tr = new TitleReader(langCode); |
194 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
195 | | - try { |
196 | | - reader.readDump(); |
197 | | - input.close(); |
198 | | - } catch (IOException e) { |
199 | | - log.fatal("I/O error reading dump while getting titles from "+inputfile); |
200 | | - return null; |
201 | | - } |
202 | | - return tr.getTitles(); |
203 | | - } |
204 | | - |
205 | 155 | private static String formatTime(long l) { |
206 | 156 | l /= 1000; |
207 | 157 | if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s"; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/package.html |
— | — | @@ -0,0 +1,3 @@ |
| 2 | +<html><body> |
| 3 | +Storage of data, mainly in a database, e.g. page ranks and text for highlighting. |
| 4 | +</body></html> |
\ No newline at end of file |
Property changes on: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/package.html |
___________________________________________________________________ |
Added: svn:executable |
1 | 5 | + * |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/Storage.java |
— | — | @@ -0,0 +1,29 @@ |
| 2 | +package org.wikimedia.lsearch.storage; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.Collection; |
| 6 | + |
| 7 | +import org.wikimedia.lsearch.beans.Title; |
| 8 | +import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 9 | + |
| 10 | +abstract public class Storage { |
| 11 | + static protected Storage instance = null; |
| 12 | + |
| 13 | + /** Get instance of Storage singleton class */ |
| 14 | + public static synchronized Storage getInstance(){ |
| 15 | + if(instance == null) |
| 16 | + instance = new MySQLStorage(); |
| 17 | + return instance; |
| 18 | + } |
| 19 | + |
| 20 | + /** |
| 21 | + * Store a complete array of page references |
| 22 | + */ |
| 23 | + abstract public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException; |
| 24 | + |
| 25 | + /** |
| 26 | + * Fetch page references for number of titles |
| 27 | + */ |
| 28 | + abstract public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException; |
| 29 | + |
| 30 | +} |
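
A minimal sketch of the consumer side of this interface, mirroring fetchReferences() in IncrementalUpdater above; the database name wikidb is hypothetical:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collection;
    import org.wikimedia.lsearch.beans.Title;
    import org.wikimedia.lsearch.ranks.CompactArticleLinks;
    import org.wikimedia.lsearch.storage.Storage;

    public class StorageSketch {
        public static void main(String[] args) throws IOException {
            Storage store = Storage.getInstance();      // currently always a MySQLStorage
            ArrayList<Title> titles = new ArrayList<Title>();
            titles.add(new Title(0, "Main Page"));      // keyed as ns:title via Title.getKey()
            Collection<CompactArticleLinks> refs = store.getPageReferences(titles, "wikidb");
            for (CompactArticleLinks c : refs)
                System.out.println(c);                  // e.g. "0:Main Page, count=42"
        }
    }
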
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java |
— | — | @@ -0,0 +1,267 @@ |
| 2 | +package org.wikimedia.lsearch.storage; |
| 3 | + |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.FileNotFoundException; |
| 6 | +import java.io.FileReader; |
| 7 | +import java.io.IOException; |
| 8 | +import java.sql.Connection; |
| 9 | +import java.sql.DriverManager; |
| 10 | +import java.sql.ResultSet; |
| 11 | +import java.sql.SQLException; |
| 12 | +import java.sql.Statement; |
| 13 | +import java.util.ArrayList; |
| 14 | +import java.util.Collection; |
| 15 | +import java.util.Hashtable; |
| 16 | +import java.util.Iterator; |
| 17 | +import java.util.Map.Entry; |
| 18 | + |
| 19 | +import org.apache.log4j.Logger; |
| 20 | +import org.wikimedia.lsearch.beans.Title; |
| 21 | +import org.wikimedia.lsearch.config.Configuration; |
| 22 | +import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 23 | + |
| 24 | +/** |
| 25 | + * MySQL storage backend |
| 26 | + * |
| 27 | + * |
| 28 | + * @author rainman |
| 29 | + * |
| 30 | + */ |
| 31 | +public class MySQLStorage extends Storage { |
| 32 | + static Logger log = Logger.getLogger(MySQLStorage.class); |
| 33 | + protected Configuration config; |
| 34 | + /** master host */ |
| 35 | + protected String master; |
| 36 | + /** slave host -> % of load */ |
| 37 | + protected Hashtable<String,Double> slaves = null; |
| 38 | + /** mysql username */ |
| 39 | + protected String username; |
| 40 | + /** mysql password */ |
| 41 | + protected String password; |
| 42 | + /** If data should be separated into one db per dbname */ |
| 43 | + protected boolean separate; |
| 44 | + /** db where everything is stored, if we are not using one db per dbname */ |
| 45 | + protected String defaultDB; |
| 46 | + /** where sql stuff is, e.g. references_table.sql */ |
| 47 | + protected String lib; |
| 48 | + /** table name -> create table file */ |
| 49 | + protected Hashtable<String,String> tableDefs = new Hashtable<String,String>(); |
| 50 | + |
| 51 | + protected MySQLStorage() { |
| 52 | + config = Configuration.open(); |
| 53 | + try { |
| 54 | + Class.forName("com.mysql.jdbc.Driver"); |
| 55 | + } catch (ClassNotFoundException e) { |
| 56 | + log.error("Cannot load mysql jdbc driver, class not found: "+e.getMessage()); |
| 57 | + } |
| 58 | + |
| 59 | + lib = config.getString("Storage","lib","./sql"); |
| 60 | + |
| 61 | + master = config.getString("Storage","master","localhost"); |
| 62 | + String[] ss = config.getArray("Storage","slaves"); |
| 63 | + if(ss != null){ |
| 64 | + Hashtable<String,Double> rawslaves = new Hashtable<String,Double>(); |
| 65 | + for(String slave : ss){ |
| 66 | + String[] parts = slave.split("->",2); |
| 67 | + if(parts.length==2){ |
| 68 | + rawslaves.put(parts[0],Double.parseDouble(parts[1])); |
| 69 | + } |
| 70 | + } |
| 71 | + // normalize to 1 |
| 72 | + double sum = 0; |
| 73 | + for(Double d : rawslaves.values()) |
| 74 | + sum += d; |
| 75 | + if(sum == 0) // in case no loads are specified |
| 76 | + sum = 1; |
| 77 | + slaves = new Hashtable<String,Double>(); |
| 78 | + for(Entry<String,Double> ed : rawslaves.entrySet()) |
| 79 | + slaves.put(ed.getKey(),ed.getValue()/sum); |
| 80 | + |
| 81 | + } |
| 82 | + |
| 83 | + username = config.getString("Storage","username","root"); |
| 84 | + password = config.getString("Storage","password",""); |
| 85 | + |
| 86 | + // figure out db configuration |
| 87 | + separate = config.getBoolean("Storage","useSeparateDBs"); |
| 88 | + if(!separate){ |
| 89 | + defaultDB = config.getString("Storage","defaultDB"); |
| 90 | + if(defaultDB == null){ |
| 91 | + log.error("Set Storage.defaultDB in local configuration."); |
| 92 | + } |
| 93 | + } |
| 94 | + } |
| 95 | + |
| 96 | + /** Get connection for reading of (possibly lagged) stuff, i.e. on slaves (or master if there are no slaves) */ |
| 97 | + protected Connection getReadConnection(String dbname) throws IOException{ |
| 98 | + return openConnection(dbname,false); |
| 99 | + } |
| 100 | + |
| 101 | + /** Get connection for writing stuff, i.e. on the master */ |
| 102 | + protected Connection getWriteConnection(String dbname) throws IOException{ |
| 103 | + return openConnection(dbname,true); |
| 104 | + } |
| 105 | + |
| 106 | + /** Open connection on the master, or load-balanced on one of the slaves */ |
| 107 | + protected Connection openConnection(String dbname, boolean onMaster) throws IOException { |
| 108 | + String host=null; |
| 109 | + if(onMaster || slaves == null) |
| 110 | + host = master; |
| 111 | + else{ |
| 112 | + // load balance slaves |
| 113 | + double r = Math.random(); |
| 114 | + for(Entry<String,Double> load : slaves.entrySet()){ |
| 115 | + r-=load.getValue(); |
| 116 | + if(r < 0){ |
| 117 | + host = load.getKey(); |
| 118 | + break; |
| 119 | + } |
| 120 | + } |
| 121 | + } |
| 122 | + String dburl = "jdbc:mysql://"+host+":3306/"; |
| 123 | + if(!separate && defaultDB!=null) |
| 124 | + dburl += defaultDB; |
| 125 | + try { |
| 126 | + return DriverManager.getConnection(dburl, username, password); |
| 127 | + } catch (SQLException e) { |
| 128 | + log.error("Cannot establish connection to "+dburl+" - check host, db, username and password : "+e.getMessage()); |
| 129 | + throw new IOException("Cannot establish connection to mysql database."); |
| 130 | + } |
| 131 | + } |
| 132 | + |
| 133 | + public String quote(String str){ |
| 134 | + return "'"+str+"'"; |
| 135 | + } |
| 136 | + |
| 137 | + public String escape(String str){ |
| 138 | + return str.replace("'","\\'"); |
| 139 | + } |
| 140 | + |
| 141 | + public String getTableName(String name, String dbname){ |
| 142 | + if(!separate) |
| 143 | + return dbname+"_"+name; |
| 144 | + else |
| 145 | + return name; |
| 146 | + } |
| 147 | + |
| 148 | + // inherit javadoc |
| 149 | + public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException { |
| 150 | + String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE "; |
| 151 | + if(titles.size()==1){ |
| 152 | + sql += "rf_key="+quote(escape(titles.iterator().next().getKey())); |
| 153 | + } else{ |
| 154 | + StringBuilder sb = new StringBuilder(sql); |
| 155 | + sb.append("rf_key IN ("); |
| 156 | + Iterator<Title> it = titles.iterator(); |
| 157 | + while(it.hasNext()){ |
| 158 | + sb.append('\''); |
| 159 | + sb.append(escape(it.next().getKey())); |
| 160 | + sb.append('\''); |
| 161 | + if(it.hasNext()) |
| 162 | + sb.append(','); |
| 163 | + } |
| 164 | + sb.append(")"); |
| 165 | + sql = sb.toString(); |
| 166 | + } |
| 167 | + try { |
| 168 | + Connection conn = getReadConnection(dbname); |
| 169 | + log.info("Fetching references for "+titles.size()+" pages"); |
| 170 | + Statement stmt = conn.createStatement(); |
| 171 | + ResultSet res = stmt.executeQuery(sql); |
| 172 | + ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>(); |
| 173 | + while(res.next()){ |
| 174 | + ret.add(new CompactArticleLinks(res.getString("rf_key"),res.getInt("rf_references"))); |
| 175 | + } |
| 176 | + conn.close(); |
| 177 | + return ret; |
| 178 | + } catch (SQLException e) { |
| 179 | + log.error("Cannot execute sql "+sql+" : "+e.getMessage()); |
| 180 | + throw new IOException(e.getMessage()); |
| 181 | + } |
| 182 | + } |
| 183 | + |
| 184 | + // inherit javadoc |
| 185 | + public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException { |
| 186 | + final int maxPerQuery = 10000; |
| 187 | + Connection conn = getWriteConnection(dbname); |
| 188 | + verifyTable("references",dbname,conn); |
| 189 | + Iterator<CompactArticleLinks> it = refs.iterator(); |
| 190 | + // send chunks of maxPerQuery reference replacements |
| 191 | + while(it.hasNext()){ |
| 192 | + StringBuilder sb = new StringBuilder("REPLACE INTO "+getTableName("references",dbname)+" (rf_key,rf_references) VALUES "); |
| 193 | + int count = 0; |
| 194 | + while(it.hasNext() && count < maxPerQuery){ |
| 195 | + CompactArticleLinks cs = it.next(); |
| 196 | + sb.append("('"); |
| 197 | + sb.append(escape(cs.getKey())); |
| 198 | + sb.append("','"); |
| 199 | + sb.append(cs.links); |
| 200 | + count++; |
| 201 | + if(it.hasNext() && count<maxPerQuery) |
| 202 | + sb.append("'), "); |
| 203 | + else |
| 204 | + sb.append("');"); |
| 205 | + } |
| 206 | + try { |
| 207 | + log.info("Storing "+Math.min(maxPerQuery,count)+" page ranks... "); |
| 208 | + Statement stmt = conn.createStatement(); |
| 209 | + stmt.executeUpdate(sb.toString()); |
| 210 | + |
| 211 | + } catch (SQLException e) { |
| 212 | + log.error("Cannot execute replace query "+sb+" : "+e.getMessage()); |
| 213 | + throw new IOException(e.getMessage()); |
| 214 | + } |
| 215 | + } |
| 216 | + try { |
| 217 | + conn.close(); // be sure we close the connection |
| 218 | + } catch (SQLException e) { |
| 219 | + } |
| 220 | + } |
| 221 | + |
| 222 | + /** Creates table if it doesn't exist */ |
| 223 | + protected void verifyTable(String name, String dbname, Connection conn) throws IOException { |
| 224 | + // verify if table exists |
| 225 | + String table = getTableName(name,dbname); |
| 226 | + try { |
| 227 | + log.info("Verifying table "+name+" on "+dbname); |
| 228 | + Statement stmt = conn.createStatement(); |
| 229 | + ResultSet res = stmt.executeQuery("SHOW TABLES LIKE '"+table+"';"); |
| 230 | + if(res.next()) // table exists! |
| 231 | + return; |
| 232 | + |
| 233 | + } catch (SQLException e) { |
| 234 | + log.error("Cannot verify table "+table+" : "+e.getMessage()); |
| 235 | + throw new IOException(e.getMessage()); |
| 236 | + } |
| 237 | + |
| 238 | + // fetch table definition |
| 239 | + String def = tableDefs.get(name); |
| 240 | + if(def == null){ |
| 241 | + if(!lib.endsWith(Configuration.PATH_SEP)) |
| 242 | + lib = lib+Configuration.PATH_SEP; |
| 243 | + |
| 244 | + BufferedReader file = new BufferedReader(new FileReader(lib+name+"_table.sql")); |
| 245 | + StringBuilder sb = new StringBuilder(); |
| 246 | + String line; |
| 247 | + while((line = file.readLine()) != null){ |
| 248 | + sb.append(line.replaceFirst("--.*","")); |
| 249 | + } |
| 250 | + def = sb.toString(); |
| 251 | + } |
| 252 | + // preprocess dbprefix tags |
| 253 | + String tdef; |
| 254 | + if(!separate) |
| 255 | + tdef = def.replace("/*DBprefix*/",dbname+"_"); |
| 256 | + else |
| 257 | + tdef = def; |
| 258 | + // create |
| 259 | + try { |
| 260 | + log.info("Creating table "+name+" on "+dbname); |
| 261 | + Statement stmt = conn.createStatement(); |
| 262 | + stmt.executeUpdate(tdef); |
| 263 | + } catch (SQLException e) { |
| 264 | + log.error("Cannot create table "+table+" : "+e.getMessage()); |
| 265 | + throw new IOException(e.getMessage()); |
| 266 | + } |
| 267 | + } |
| 268 | +} |
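
The slave selection in openConnection() is a standard weighted random pick over loads that the constructor has normalized to sum to 1. A standalone sketch with hypothetical host names:

    import java.util.Hashtable;
    import java.util.Map.Entry;

    public class SlavePickSketch {
        public static void main(String[] args) {
            Hashtable<String,Double> slaves = new Hashtable<String,Double>();
            slaves.put("db1", 0.1);
            slaves.put("db2", 0.9);
            double r = Math.random();
            String host = null;
            for (Entry<String,Double> load : slaves.entrySet()) {
                r -= load.getValue();            // each slave owns a slice of [0,1)
                if (r < 0) { host = load.getKey(); break; }
            }
            System.out.println(host);            // "db2" roughly 90% of the time
        }
    }
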
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexRegistry.java |
— | — | @@ -26,11 +26,20 @@ |
27 | 27 | protected Hashtable<String,LocalIndex> latestUpdate; |
28 | 28 | /** current search index */ |
29 | 29 | protected Hashtable<String,LocalIndex> currentSearch; |
| 30 | + /** when was the last time when snapshot was refreshed */ |
| 31 | + protected Hashtable<String,Long> lastSnapshotRefresh = new Hashtable<String,Long>(); |
30 | 32 | |
| 33 | + protected Object lock = new Object(); |
| 34 | + |
31 | 35 | protected static IndexRegistry instance = null; |
32 | 36 | |
33 | 37 | /** Get info about the latest index snapshot */ |
34 | 38 | public LocalIndex getLatestSnapshot(IndexId iid){ |
| 39 | + synchronized (lock) { |
| 40 | + // wait at least 5 seconds before the next refresh |
| 41 | + if(lastSnapshotRefresh.get(iid.toString()) == null || (System.currentTimeMillis() - lastSnapshotRefresh.get(iid.toString()) > 5000)) |
| 42 | + refreshSnapshots(iid); |
| 43 | + } |
35 | 44 | return latestSnapshot.get(iid.toString()); // hashtable is synchronized |
36 | 45 | } |
37 | 46 | |
— | — | @@ -95,10 +104,11 @@ |
96 | 105 | } else if(latestSnapshot.get(iid.toString()) != null){ |
97 | 106 | latestSnapshot.remove((iid.toString())); |
98 | 107 | } |
| 108 | + lastSnapshotRefresh.put(iid.toString(),System.currentTimeMillis()); |
99 | 109 | } |
100 | 110 | |
101 | 111 | /** Refresh latest search update info */ |
102 | | - public synchronized void refreshUpdates(IndexId iid){ |
| 112 | + public synchronized void refreshUpdates(IndexId iid){ |
103 | 113 | File updateDir = new File(iid.getUpdatePath()); |
104 | 114 | LocalIndex latest = getLatestLocalIndex(updateDir,iid); |
105 | 115 | if(latest != null){ |
— | — | @@ -106,6 +116,7 @@ |
107 | 117 | } else if(latestUpdate.get(iid.toString()) != null){ |
108 | 118 | latestUpdate.remove((iid.toString())); |
109 | 119 | } |
| 120 | + |
110 | 121 | } |
111 | 122 | |
112 | 123 | /** Tell registry this is the most current version of search index */ |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
37 | 37 | return f; |
38 | 38 | } |
39 | | - } else if(fieldName.equals("title") || fieldName.startsWith("alttitle")){ |
| 39 | + } else if(fieldName.equals("title") || fieldName.equals("stemtitle") || fieldName.startsWith("alttitle")){ |
40 | 40 | float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
41 | 41 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
42 | 42 | return f; |
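
For reference, the branch this change extends computes lengthNorm(n) = 1/(sqrt(n)*n) = n^(-3/2): a one-token title gets 1.0, a four-token title 1/(2*4) = 0.125, so matches in short titles are weighted sharply higher. The patch simply applies the same formula to the new stemtitle field alongside title and alttitle*.
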
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java |
— | — | @@ -152,4 +152,5 @@ |
153 | 153 | public String getKey(){ |
154 | 154 | return article.getKey(); |
155 | 155 | } |
| 156 | + |
156 | 157 | } |
Index: trunk/lucene-search-2.0/.classpath |
— | — | @@ -14,5 +14,6 @@ |
15 | 15 | <classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/> |
16 | 16 | <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/> |
17 | 17 | <classpathentry kind="lib" path="lib/highlighter.jar" sourcepath="/sandbox/highlighter"/> |
| 18 | + <classpathentry kind="lib" path="lib/mysql-connector-java-3.0.17-ga-bin.jar"/> |
18 | 19 | <classpathentry kind="output" path="bin"/> |
19 | 20 | </classpath> |
Index: trunk/lucene-search-2.0/lsearch.conf |
— | — | @@ -51,6 +51,31 @@ |
52 | 52 | Index.maxqueuetimeout=12 |
53 | 53 | |
54 | 54 | ################################################ |
| 55 | +# Storage backend (currently mysql) |
| 56 | +################################################ |
| 57 | + |
| 58 | +# host of database master |
| 59 | +Storage.master=localhost |
| 60 | + |
| 61 | +# array of host->load |
| 62 | +#Storage.slaves=host1->10 host2->50 host3->100 |
| 63 | + |
| 64 | +# Storage.username=root |
| 65 | +# Storage.password= |
| 66 | + |
| 67 | +# Values: |
| 68 | +# true - each dbname has a separate db of that name |
| 69 | +# false - each dbname is a prefix for tables in a default db (set default db below) |
| 70 | +Storage.useSeparateDBs=false |
| 71 | + |
| 72 | +# Default db where all the tables will be stored (if useSeparateDBs=false) |
| 73 | +Storage.defaultDB=lsearch |
| 74 | + |
| 75 | +# Where table definitions are |
| 76 | +Storage.lib=/var/www/html/lucene-search-2.0/sql |
| 77 | + |
| 78 | + |
| 79 | +################################################ |
55 | 80 | # Log, ganglia, localization |
56 | 81 | ################################################ |
57 | 82 | |