Index: trunk/lucene-search-2.0/lib/mysql-connector-java-3.0.17-ga-bin.jar |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/mysql-connector-java-3.0.17-ga-bin.jar |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 1 | + application/octet-stream |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.Iterator; |
| 7 | +import java.util.Map.Entry; |
7 | 8 | |
8 | 9 | import org.apache.log4j.Logger; |
9 | 10 | import org.mediawiki.importer.DumpWriter; |
— | — | @@ -65,11 +66,11 @@ |
66 | 67 | } |
67 | 68 | public void writeEndPage() throws IOException { |
68 | 69 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects); |
69 | | - log.info("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); |
| 70 | + log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); |
70 | 71 | records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE)); |
71 | 72 | log.debug(iid+": Update for "+article); |
72 | 73 | references = 0; |
73 | | - redirects.clear(); |
| 74 | + redirects = new ArrayList<Redirect>(); |
74 | 75 | } |
75 | 76 | |
76 | 77 | public void writeSiteinfo(Siteinfo info) throws IOException { |
— | — | @@ -77,9 +78,8 @@ |
78 | 79 | // write to localization |
79 | 80 | Iterator it = info.Namespaces.orderedEntries(); |
80 | 81 | while(it.hasNext()){ |
81 | | - Integer inx = (Integer)it.next(); |
82 | | - String prefix = info.Namespaces.getPrefix(inx); |
83 | | - Localization.addCustomMapping(prefix,inx,langCode); |
| 82 | + Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
| 83 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
84 | 84 | } |
85 | 85 | } |
86 | 86 | |
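
A note on the redirects change above: the article queued in records keeps a reference to the redirects list rather than a copy, so the old redirects.clear() also emptied the list inside the already-collected article; reassigning a fresh ArrayList leaves it intact. A minimal self-contained sketch of that aliasing pitfall (class name and string keys are hypothetical stand-ins for Redirect objects):

    import java.util.ArrayList;

    public class AliasingSketch {
        public static void main(String[] args) {
            ArrayList<String> redirects = new ArrayList<String>();
            redirects.add("0:Foo");
            ArrayList<String> heldByArticle = redirects;  // the Article stores the reference, not a copy
            redirects.clear();                            // old code: empties the article's list too
            System.out.println(heldByArticle.size());     // prints 0 -- redirects lost before indexing
        }
    }
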
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java |
— | — | @@ -10,15 +10,21 @@ |
11 | 11 | import java.net.Authenticator; |
12 | 12 | import java.net.PasswordAuthentication; |
13 | 13 | import java.util.ArrayList; |
| 14 | +import java.util.HashMap; |
14 | 15 | import java.util.HashSet; |
15 | 16 | import java.util.Properties; |
16 | 17 | |
17 | 18 | import org.apache.log4j.Logger; |
| 19 | +import org.wikimedia.lsearch.beans.Article; |
| 20 | +import org.wikimedia.lsearch.beans.Redirect; |
| 21 | +import org.wikimedia.lsearch.beans.Title; |
18 | 22 | import org.wikimedia.lsearch.config.Configuration; |
19 | 23 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
20 | 24 | import org.wikimedia.lsearch.config.IndexId; |
21 | 25 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
22 | 26 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 27 | +import org.wikimedia.lsearch.ranks.Links; |
| 28 | +import org.wikimedia.lsearch.storage.Storage; |
23 | 29 | import org.wikimedia.lsearch.util.Localization; |
24 | 30 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
25 | 31 | |
— | — | @@ -160,6 +166,12 @@ |
161 | 167 | continue; |
162 | 168 | boolean hasMore = false; |
163 | 169 | do{ |
| 170 | + // fetch references for records |
| 171 | + fetchReferences(records,dbname); |
| 172 | + for(IndexUpdateRecord rec : records){ |
| 173 | + Article ar = rec.getArticle(); |
| 174 | + log.debug("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects()); |
| 175 | + } |
164 | 176 | // send to indexer |
165 | 177 | RMIMessengerClient messenger = new RMIMessengerClient(true); |
166 | 178 | try { |
— | — | @@ -235,4 +247,34 @@ |
236 | 248 | } |
237 | 249 | } while(daemon); |
238 | 250 | } |
| 251 | + |
| 252 | + protected static void fetchReferences(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException { |
| 253 | + Storage store = Storage.getInstance(); |
| 254 | + ArrayList<Title> titles = new ArrayList<Title>(); |
| 255 | + for(IndexUpdateRecord rec : records){ |
| 256 | + if(rec.isDelete()) |
| 257 | + continue; |
| 258 | + Article ar = rec.getArticle(); |
| 259 | + titles.add(ar.makeTitle()); |
| 260 | + if(ar.getRedirects() != null){ |
| 261 | + for(Redirect r : ar.getRedirects()){ |
| 262 | + titles.add(r.makeTitle()); |
| 263 | + } |
| 264 | + } |
| 265 | + } |
| 266 | + // fetch |
| 267 | + Links links = new Links(store.getPageReferences(titles,dbname)); |
| 268 | + // update |
| 269 | + for(IndexUpdateRecord rec : records){ |
| 270 | + if(rec.isDelete()) |
| 271 | + continue; |
| 272 | + Article ar = rec.getArticle(); |
| 273 | + ar.setReferences(links.getLinks(ar.makeTitle().getKey())); |
| 274 | + if(ar.getRedirects() != null){ |
| 275 | + for(Redirect r : ar.getRedirects()){ |
| 276 | + r.setReferences(links.getLinks(r.makeTitle().getKey())); |
| 277 | + } |
| 278 | + } |
| 279 | + } |
| 280 | + } |
239 | 281 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -185,7 +185,7 @@ |
186 | 186 | boolean isRed = false; |
187 | 187 | if(line.startsWith("#redirect")) |
188 | 188 | isRed = true; |
189 | | - else if(lang != null ){ |
| 189 | + else if(lang != null && redirects.get(lang)!=null){ |
190 | 190 | for(String magic : redirects.get(lang)){ |
191 | 191 | if(line.startsWith(magic)){ |
192 | 192 | isRed = true; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/TitleReader.java |
— | — | @@ -0,0 +1,63 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.HashMap; |
| 7 | +import java.util.HashSet; |
| 8 | +import java.util.Iterator; |
| 9 | +import java.util.Map.Entry; |
| 10 | + |
| 11 | +import org.mediawiki.importer.DumpWriter; |
| 12 | +import org.mediawiki.importer.Page; |
| 13 | +import org.mediawiki.importer.Revision; |
| 14 | +import org.mediawiki.importer.Siteinfo; |
| 15 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 16 | +import org.wikimedia.lsearch.util.Localization; |
| 17 | + |
| 18 | +/** |
| 19 | + * Reads page titles from a dump into a Links object |
| 20 | + * |
| 21 | + * @author rainman |
| 22 | + * |
| 23 | + */ |
| 24 | +public class TitleReader implements DumpWriter{ |
| 25 | + Page page; |
| 26 | + Revision revision; |
| 27 | + Links links = new Links(); |
| 28 | + protected String langCode; |
| 29 | + |
| 30 | + public TitleReader(String langCode){ |
| 31 | + this.langCode = langCode; |
| 32 | + } |
| 33 | + |
| 34 | + public void writeRevision(Revision revision) throws IOException { |
| 35 | + this.revision = revision; |
| 36 | + } |
| 37 | + public void writeStartPage(Page page) throws IOException { |
| 38 | + this.page = page; |
| 39 | + } |
| 40 | + public void writeEndPage() throws IOException { |
| 41 | + String key = page.Title.Namespace+":"+page.Title.Text; |
| 42 | + links.add(key,0); |
| 43 | + } |
| 44 | + public Links getTitles() { |
| 45 | + return links; |
| 46 | + } |
| 47 | + public void close() throws IOException { |
| 48 | + // nop |
| 49 | + } |
| 50 | + public void writeEndWiki() throws IOException { |
| 51 | + // nop |
| 52 | + } |
| 53 | + public void writeSiteinfo(Siteinfo info) throws IOException { |
| 54 | + // write siteinfo to localization |
| 55 | + Iterator it = info.Namespaces.orderedEntries(); |
| 56 | + while(it.hasNext()){ |
| 57 | + Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
| 58 | + Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
| 59 | + } |
| 60 | + } |
| 61 | + public void writeStartWiki() throws IOException { |
| 62 | + // nop |
| 63 | + } |
| 64 | +} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java |
— | — | @@ -0,0 +1,97 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.UnsupportedEncodingException; |
| 5 | +import java.util.ArrayList; |
| 6 | + |
| 7 | + |
| 8 | +/** |
| 9 | + * Page links object optimized for low memory consumption: |
| 10 | + * the string is stored in a utf-8 encoded byte[] array, and |
| 11 | + * the same object is used as both key and value in a |
| 12 | + * hashmap. |
| 13 | + * |
| 14 | + * Two objects are equal iff they have the same string (other |
| 15 | + * fields are ignored in equals()). |
| 16 | + * |
| 17 | + * @author rainman |
| 18 | + * |
| 19 | + */ |
| 20 | +public class CompactArticleLinks{ |
| 21 | + /** format: <ns>:<title> */ |
| 22 | + protected byte[] str; |
| 23 | + public int links; |
| 24 | + protected int hash = 0; |
| 25 | + /** if this page is a redirect */ |
| 26 | + public CompactArticleLinks redirectsTo; |
| 27 | + /** list of pages that redirect here */ |
| 28 | + public ArrayList<CompactArticleLinks> redirected; |
| 29 | + |
| 30 | + public CompactArticleLinks(String s){ |
| 31 | + try { |
| 32 | + str = s.getBytes("utf-8"); |
| 33 | + } catch (UnsupportedEncodingException e) { |
| 34 | + e.printStackTrace(); |
| 35 | + } |
| 36 | + } |
| 37 | + |
| 38 | + public CompactArticleLinks(String s, int count){ |
| 39 | + this(s); |
| 40 | + this.links = count; |
| 41 | + } |
| 42 | + |
| 43 | + @Override |
| 44 | + public String toString() { |
| 45 | + try { |
| 46 | + return new String(str,0,str.length,"utf-8")+", count="+links; |
| 47 | + } catch (UnsupportedEncodingException e) { |
| 48 | + return ""; |
| 49 | + } |
| 50 | + } |
| 51 | + |
| 52 | + public String getKey(){ |
| 53 | + try { |
| 54 | + return new String(str,0,str.length,"utf-8"); |
| 55 | + } catch (UnsupportedEncodingException e) { |
| 56 | + return ""; |
| 57 | + } |
| 58 | + } |
| 59 | + |
| 60 | + public void addRedirect(CompactArticleLinks from){ |
| 61 | + if(redirected == null) |
| 62 | + redirected = new ArrayList<CompactArticleLinks>(); |
| 63 | + redirected.add(from); |
| 64 | + } |
| 65 | + @Override |
| 66 | + public int hashCode() { |
| 67 | + int h = hash; |
| 68 | + if(h == 0){ |
| 69 | + int off = 0; |
| 70 | + |
| 71 | + for (int i = 0; i < str.length; i++) { |
| 72 | + h = 31*h + str[off++]; |
| 73 | + } |
| 74 | + hash = h; |
| 75 | + } |
| 76 | + |
| 77 | + return h; |
| 78 | + } |
| 79 | + |
| 80 | + @Override |
| 81 | + public boolean equals(Object obj) { |
| 82 | + if (this == obj) |
| 83 | + return true; |
| 84 | + if (obj == null) |
| 85 | + return false; |
| 86 | + if (getClass() != obj.getClass()) |
| 87 | + return false; |
| 88 | + final CompactArticleLinks other = (CompactArticleLinks) obj; |
| 89 | + if(other.str.length != str.length) |
| 90 | + return false; |
| 91 | + for(int i=0;i<str.length;i++) |
| 92 | + if(str[i] != other.str[i]) |
| 93 | + return false; |
| 94 | + return true; |
| 95 | + } |
| 96 | + |
| 97 | + |
| 98 | +} |
\ No newline at end of file |
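
CompactArticleLinks is meant to be used with the same object as both key and value, so a lookup with a throwaway probe returns the stored, mutable counter. A minimal sketch of the idiom (class name and page key are hypothetical):

    import java.util.HashMap;
    import org.wikimedia.lsearch.ranks.CompactArticleLinks;

    public class LinksMapSketch {
        public static void main(String[] args) {
            HashMap<CompactArticleLinks,CompactArticleLinks> map =
                new HashMap<CompactArticleLinks,CompactArticleLinks>();
            CompactArticleLinks c = new CompactArticleLinks("0:Main Page", 0);
            map.put(c, c);                                // same object as key and value
            // equals()/hashCode() compare only the string, so a probe finds the stored object
            CompactArticleLinks stored = map.get(new CompactArticleLinks("0:Main Page"));
            stored.links++;                               // increment the stored counter in place
        }
    }
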
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/LinkReader.java |
— | — | @@ -0,0 +1,183 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.HashMap; |
| 6 | +import java.util.HashSet; |
| 7 | +import java.util.regex.Matcher; |
| 8 | +import java.util.regex.Pattern; |
| 9 | + |
| 10 | +import org.apache.commons.lang.WordUtils; |
| 11 | +import org.apache.log4j.Logger; |
| 12 | +import org.mediawiki.importer.DumpWriter; |
| 13 | +import org.mediawiki.importer.Page; |
| 14 | +import org.mediawiki.importer.Revision; |
| 15 | +import org.mediawiki.importer.Siteinfo; |
| 16 | +import org.wikimedia.lsearch.beans.Article; |
| 17 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 18 | +import org.wikimedia.lsearch.beans.Title; |
| 19 | +import org.wikimedia.lsearch.config.Configuration; |
| 20 | +import org.wikimedia.lsearch.config.IndexId; |
| 21 | +import org.wikimedia.lsearch.util.Localization; |
| 22 | + |
| 23 | +/** |
| 24 | + * Reads page links and references, i.e. how many times a page |
| 25 | + * is referenced within other articles. |
| 26 | + * |
| 27 | + * @author rainman |
| 28 | + * |
| 29 | + */ |
| 30 | +public class LinkReader implements DumpWriter { |
| 31 | + static Logger log = Logger.getLogger(LinkReader.class); |
| 32 | + Page page; |
| 33 | + Revision revision; |
| 34 | + Siteinfo siteinfo; |
| 35 | + /** ns:title -> number of referring articles */ |
| 36 | + Links links; |
| 37 | + HashSet<String> interwiki; |
| 38 | + String langCode; |
| 39 | + boolean readRedirects; |
| 40 | + |
| 41 | + public static final boolean READ_REDIRECTS = true; |
| 42 | + public static final boolean NO_REDIRECTS = false; |
| 43 | + |
| 44 | + public LinkReader(Links links, String langCode){ |
| 45 | + this(links,langCode,false); |
| 46 | + } |
| 47 | + |
| 48 | + public LinkReader(Links links, String langCode, boolean readRedirects){ |
| 49 | + this.links = links; |
| 50 | + this.readRedirects = readRedirects; |
| 51 | + if(langCode == null || langCode.equals("")) |
| 52 | + langCode = "en"; |
| 53 | + this.langCode = langCode; |
| 54 | + interwiki = Localization.getInterwiki(); |
| 55 | + } |
| 56 | + public void writeRevision(Revision revision) throws IOException { |
| 57 | + this.revision = revision; |
| 58 | + } |
| 59 | + public void writeStartPage(Page page) throws IOException { |
| 60 | + this.page = page; |
| 61 | + } |
| 62 | + public void writeEndPage() throws IOException { |
| 63 | + if(readRedirects){ |
| 64 | + // register redirect |
| 65 | + Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
| 66 | + if( redirect !=null ){ |
| 67 | + CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
| 68 | + if(cs != null) |
| 69 | + links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs); |
| 70 | + return; |
| 71 | + } |
| 72 | + } |
| 73 | + processLinks(revision.Text,page.Title.Namespace); |
| 74 | + } |
| 75 | + |
| 76 | + /** Find the links object for the ns:title key */ |
| 77 | + protected CompactArticleLinks findArticleLinks(int ns, String title){ |
| 78 | + String key; |
| 79 | + CompactArticleLinks rank; |
| 80 | + if(title.length() == 0) |
| 81 | + return null; |
| 82 | + // try exact match |
| 83 | + key = ns+":"+title; |
| 84 | + rank = links.get(key); |
| 85 | + if(rank != null) |
| 86 | + return rank; |
| 87 | + // try lowercase |
| 88 | + key = ns+":"+title.toLowerCase(); |
| 89 | + rank = links.get(key); |
| 90 | + if(rank != null) |
| 91 | + return rank; |
| 92 | + // try lowercase with first letter upper case |
| 93 | + if(title.length()==1) |
| 94 | + key = ns+":"+title.toUpperCase(); |
| 95 | + else |
| 96 | + key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase(); |
| 97 | + rank = links.get(key); |
| 98 | + if(rank != null) |
| 99 | + return rank; |
| 100 | + // try title case |
| 101 | + key = ns+":"+WordUtils.capitalize(title); |
| 102 | + rank = links.get(key); |
| 103 | + if(rank != null) |
| 104 | + return rank; |
| 105 | + // try upper case |
| 106 | + key = ns+":"+title.toUpperCase(); |
| 107 | + rank = links.get(key); |
| 108 | + if(rank != null) |
| 109 | + return rank; |
| 110 | + // try capitalizing at word breaks |
| 111 | + key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
| 112 | + rank = links.get(key); |
| 113 | + if(rank != null) |
| 114 | + return rank; |
| 115 | + |
| 116 | + return null; |
| 117 | + } |
| 118 | + |
| 119 | + /** Extract all links from this page, and increment ref count for linked pages */ |
| 120 | + protected void processLinks(String text, int namespace) { |
| 121 | + Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
| 122 | + Matcher matcher = linkPat.matcher(text); |
| 123 | + int ns; String title; |
| 124 | + boolean escaped; |
| 125 | + |
| 126 | + HashSet<CompactArticleLinks> pagelinks = new HashSet<CompactArticleLinks>(); |
| 127 | + while(matcher.find()){ |
| 128 | + String link = matcher.group(1); |
| 129 | + int fragment = link.lastIndexOf('#'); |
| 130 | + if(fragment != -1) |
| 131 | + link = link.substring(0,fragment); |
| 132 | + //System.out.println("Got link "+link); |
| 133 | + if(link.startsWith(":")){ |
| 134 | + escaped = true; |
| 135 | + link = link.substring(1); |
| 136 | + } else escaped = false; |
| 137 | + ns = 0; |
| 138 | + title = link; |
| 139 | + // check for ns:title syntax |
| 140 | + String[] parts = link.split(":",2); |
| 141 | + if(parts.length == 2 && parts[0].length() > 1){ |
| 142 | + Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
| 143 | + if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14))) |
| 144 | + continue; // categories, ignore |
| 145 | + if(inx!=null && inx < 0) |
| 146 | + continue; // special pages, ignore |
| 147 | + if(inx != null){ |
| 148 | + ns = inx; |
| 149 | + title = parts[1]; |
| 150 | + } |
| 151 | + |
| 152 | + // ignore interwiki links |
| 153 | + if(interwiki.contains(parts[0])) |
| 154 | + continue; |
| 155 | + } |
| 156 | + if(ns == 0 && namespace!=0) |
| 157 | + continue; // skip links from other namespaces into the main namespace |
| 158 | + // register as link |
| 159 | + CompactArticleLinks target = findArticleLinks(ns,title); |
| 160 | + if(target != null) |
| 161 | + pagelinks.add(target); |
| 162 | + } |
| 163 | + // increment page ranks |
| 164 | + for(CompactArticleLinks rank : pagelinks){ |
| 165 | + rank.links++; |
| 166 | + } |
| 167 | + } |
| 168 | + public void writeSiteinfo(Siteinfo info) throws IOException { |
| 169 | + siteinfo = info; |
| 170 | + } |
| 171 | + public void close() throws IOException { |
| 172 | + // nop |
| 173 | + } |
| 174 | + public void writeEndWiki() throws IOException { |
| 175 | + // nop |
| 176 | + } |
| 177 | + public void writeStartWiki() throws IOException { |
| 178 | + // nop |
| 179 | + } |
| 180 | + public Links getRanks() { |
| 181 | + return links; |
| 182 | + } |
| 183 | + |
| 184 | +} |
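
For reference, a minimal sketch of what the link pattern in processLinks() extracts, run against a hypothetical wikitext snippet; group(1) is the link target (group(3) would be the display text), and the fragment is stripped before lookup:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class LinkPatternSketch {
        public static void main(String[] args) {
            Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
            Matcher m = linkPat.matcher("See [[Help:Contents#Intro|the help]] and [[Main Page]].");
            while (m.find()) {
                String link = m.group(1);               // "Help:Contents#Intro", then "Main Page"
                int fragment = link.lastIndexOf('#');
                if (fragment != -1)
                    link = link.substring(0, fragment); // drop the section anchor
                System.out.println(link);               // prints "Help:Contents", "Main Page"
            }
        }
    }
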
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -0,0 +1,76 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
| 6 | +import java.util.HashMap; |
| 7 | +import java.util.Map.Entry; |
| 8 | + |
| 9 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 10 | + |
| 11 | +/** |
| 12 | + * Abstraction of link retrieval and other operations related to |
| 13 | + * CompactArticleLinks. |
| 14 | + * |
| 15 | + * @author rainman |
| 16 | + * |
| 17 | + */ |
| 18 | +public class Links { |
| 19 | + protected HashMap<CompactArticleLinks,CompactArticleLinks> links = new HashMap<CompactArticleLinks,CompactArticleLinks>(); |
| 20 | + |
| 21 | + public Links() { |
| 22 | + } |
| 23 | + |
| 24 | + public Links(Collection<CompactArticleLinks> col){ |
| 25 | + for(CompactArticleLinks c : col){ |
| 26 | + links.put(c,c); |
| 27 | + } |
| 28 | + } |
| 29 | + |
| 30 | + /** Add new page with key and ref */ |
| 31 | + public void add(String key, int ref){ |
| 32 | + CompactArticleLinks cs = new CompactArticleLinks(key,ref); |
| 33 | + links.put(cs,cs); |
| 34 | + } |
| 35 | + |
| 36 | + /** Setup redirect key -> tokey */ |
| 37 | + public void setRedirect(String key, String tokey){ |
| 38 | + CompactArticleLinks from = links.get(new CompactArticleLinks(key)); |
| 39 | + CompactArticleLinks to = links.get(new CompactArticleLinks(tokey)); |
| 40 | + from.redirectsTo = to; |
| 41 | + } |
| 42 | + |
| 43 | + /** Setup redirect key -> to */ |
| 44 | + public void setRedirect(String key, CompactArticleLinks to){ |
| 45 | + CompactArticleLinks from = links.get(new CompactArticleLinks(key)); |
| 46 | + from.redirectsTo = to; |
| 47 | + } |
| 48 | + |
| 49 | + /** Get links object from key */ |
| 50 | + public CompactArticleLinks get(String key){ |
| 51 | + return links.get(new CompactArticleLinks(key)); |
| 52 | + } |
| 53 | + |
| 54 | + /** Get collection of all links objects */ |
| 55 | + public Collection<CompactArticleLinks> getAll(){ |
| 56 | + return links.values(); |
| 57 | + } |
| 58 | + |
| 59 | + /** Get number of references (links) to article of key */ |
| 60 | + public int getLinks(String key){ |
| 61 | + CompactArticleLinks c = links.get(new CompactArticleLinks(key)); |
| 62 | + if(c == null) |
| 63 | + return 0; |
| 64 | + else |
| 65 | + return c.links; |
| 66 | + } |
| 67 | + |
| 68 | + /** Generate "redirects here" lists for each article */ |
| 69 | + public void generateRedirectLists(){ |
| 70 | + for(CompactArticleLinks r : links.values()){ |
| 71 | + if(r.redirectsTo != null && r != r.redirectsTo){ |
| 72 | + r.redirectsTo.addRedirect(r); |
| 73 | + } |
| 74 | + } |
| 75 | + |
| 76 | + } |
| 77 | +} |
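
A minimal usage sketch of the Links API above, with hypothetical page keys in the ns:title form used throughout; TitleReader populates it in the first pass, LinkReader fills in redirects and counts in the second:

    import org.wikimedia.lsearch.ranks.Links;

    public class LinksSketch {
        public static void main(String[] args) {
            Links links = new Links();
            links.add("0:Main Page", 0);                    // pass 1 registers valid titles
            links.add("0:MainPage", 0);
            links.setRedirect("0:MainPage", "0:Main Page"); // pass 2 records redirects
            links.generateRedirectLists();                  // build "redirects here" lists
            System.out.println(links.getLinks("0:Main Page")); // 0 for unknown keys, else count
        }
    }
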
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java |
— | — | @@ -0,0 +1,130 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.InputStream; |
| 6 | +import java.util.ArrayList; |
| 7 | +import java.util.HashMap; |
| 8 | +import java.util.HashSet; |
| 9 | +import java.util.Map.Entry; |
| 10 | + |
| 11 | +import org.apache.log4j.Logger; |
| 12 | +import org.mediawiki.dumper.ProgressFilter; |
| 13 | +import org.mediawiki.dumper.Tools; |
| 14 | +import org.mediawiki.importer.XmlDumpReader; |
| 15 | +import org.wikimedia.lsearch.beans.ArticleLinks; |
| 16 | +import org.wikimedia.lsearch.config.Configuration; |
| 17 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 18 | +import org.wikimedia.lsearch.config.IndexId; |
| 19 | +import org.wikimedia.lsearch.index.IndexThread; |
| 20 | +import org.wikimedia.lsearch.storage.Storage; |
| 21 | +import org.wikimedia.lsearch.util.Localization; |
| 22 | +import org.wikimedia.lsearch.util.UnicodeDecomposer; |
| 23 | + |
| 24 | +/** |
| 25 | + * Main class, builds page rank (link count) info from a database dump. |
| 26 | + * Syntax: java RankBuilder inputfile dbname |
| 27 | + * |
| 28 | + * @author rainman |
| 29 | + * |
| 30 | + */ |
| 31 | +public class RankBuilder { |
| 32 | + static Logger log = Logger.getLogger(RankBuilder.class); |
| 33 | + /** |
| 34 | + * @param args |
| 35 | + * @throws IOException |
| 36 | + */ |
| 37 | + public static void main(String[] args) throws IOException { |
| 38 | + String inputfile = null; |
| 39 | + String dbname = null; |
| 40 | + |
| 41 | + System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n"); |
| 42 | + |
| 43 | + Configuration.open(); |
| 44 | + log = Logger.getLogger(RankBuilder.class); |
| 45 | + |
| 46 | + if(args.length < 2){ |
| 48 | + System.out.println("Syntax: java RankBuilder <inputfile> <dbname>"); |
| 48 | + return; |
| 49 | + } |
| 50 | + inputfile = args[0]; |
| 51 | + dbname = args[1]; |
| 52 | + if(inputfile == null || dbname == null){ |
| 53 | + System.out.println("Please specify both input xml file and database name"); |
| 54 | + return; |
| 55 | + } |
| 56 | + |
| 57 | + String langCode = GlobalConfiguration.getInstance().getLanguage(dbname); |
| 58 | + // preload |
| 59 | + UnicodeDecomposer.getInstance(); |
| 60 | + Localization.readLocalization(langCode); |
| 61 | + Localization.loadInterwiki(); |
| 62 | + |
| 63 | + long start = System.currentTimeMillis(); |
| 64 | + |
| 65 | + // regenerate link info |
| 66 | + Links links = processLinks(inputfile,getTitles(inputfile,langCode),langCode,LinkReader.NO_REDIRECTS); |
| 67 | + |
| 68 | + Storage store = Storage.getInstance(); |
| 69 | + store.storePageReferences(links.getAll(),dbname); |
| 70 | + |
| 71 | + /*for(CompactArticleLinks cs : links.values()){ |
| 72 | + System.out.println(cs); |
| 73 | + }*/ |
| 74 | + |
| 75 | + long end = System.currentTimeMillis(); |
| 76 | + |
| 77 | + System.out.println("Finished generating ranks in "+formatTime(end-start)); |
| 78 | + } |
| 79 | + |
| 80 | + public static Links processLinks(String inputfile, Links links, String langCode, boolean readRedirects) { |
| 81 | + log.info("Second pass, calculating article links..."); |
| 82 | + InputStream input = null; |
| 83 | + // second pass - calculate page ranks |
| 84 | + try { |
| 85 | + input = Tools.openInputFile(inputfile); |
| 86 | + } catch (IOException e) { |
| 87 | + log.fatal("I/O error opening "+inputfile); |
| 88 | + return null; |
| 89 | + } |
| 90 | + // calculate ranks |
| 91 | + LinkReader rr = new LinkReader(links,langCode,readRedirects); |
| 92 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
| 93 | + try { |
| 94 | + reader.readDump(); |
| 95 | + } catch (IOException e) { |
| 96 | + log.fatal("I/O error reading dump while calculating ranks from "+inputfile); |
| 97 | + return null; |
| 98 | + } |
| 99 | + return links; |
| 100 | + } |
| 101 | + |
| 102 | + public static Links getTitles(String inputfile,String langCode) { |
| 103 | + log.info("First pass, getting a list of valid articles..."); |
| 104 | + InputStream input = null; |
| 105 | + try { |
| 106 | + input = Tools.openInputFile(inputfile); |
| 107 | + } catch (IOException e) { |
| 108 | + log.fatal("I/O error opening "+inputfile); |
| 109 | + return null; |
| 110 | + } |
| 111 | + // first pass, get titles |
| 112 | + TitleReader tr = new TitleReader(langCode); |
| 113 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
| 114 | + try { |
| 115 | + reader.readDump(); |
| 116 | + input.close(); |
| 117 | + } catch (IOException e) { |
| 118 | + log.fatal("I/O error reading dump while getting titles from "+inputfile); |
| 119 | + return null; |
| 120 | + } |
| 121 | + return tr.getTitles(); |
| 122 | + } |
| 123 | + |
| 124 | + private static String formatTime(long l) { |
| 125 | + l /= 1000; |
| 126 | + if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s"; |
| 127 | + else if(l >= 60) return (l%3600)/60+"m "+(l%60)+"s"; |
| 128 | + else return l+"s"; |
| 129 | + } |
| 130 | + |
| 131 | +} |
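
The same two passes can be driven from other code; Importer.java below reuses them with READ_REDIRECTS and then calls generateRedirectLists(). A minimal sketch of the pipeline, assuming a hypothetical local dump dump.xml and database wikidb:

    import java.io.IOException;
    import org.wikimedia.lsearch.ranks.LinkReader;
    import org.wikimedia.lsearch.ranks.Links;
    import org.wikimedia.lsearch.ranks.RankBuilder;
    import org.wikimedia.lsearch.storage.Storage;

    public class RankPipelineSketch {
        public static void main(String[] args) throws IOException {
            Links titles = RankBuilder.getTitles("dump.xml", "en");  // pass 1: valid article keys
            Links links = RankBuilder.processLinks("dump.xml", titles,
                    "en", LinkReader.NO_REDIRECTS);                  // pass 2: count links
            Storage.getInstance().storePageReferences(links.getAll(), "wikidb");
        }
    }
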
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Redirect.java |
— | — | @@ -34,6 +34,10 @@ |
35 | 35 | return namespace+":"+title+" ("+references+")"; |
36 | 36 | } |
37 | 37 | |
| 38 | + public Title makeTitle(){ |
| 39 | + return new Title(namespace,title); |
| 40 | + } |
38 | 41 | |
39 | 42 | |
| 43 | + |
40 | 44 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Title.java |
— | — | @@ -16,6 +16,9 @@ |
17 | 17 | this.title = title; |
18 | 18 | } |
19 | 19 | |
| 20 | + public String getKey(){ |
| 21 | + return namespace+":"+title; |
| 22 | + } |
20 | 23 | |
21 | 24 | @Override |
22 | 25 | public String toString() { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -193,6 +193,10 @@ |
194 | 194 | this.redirectKeywordRanks = redirectKeywordRanks; |
195 | 195 | } |
196 | 196 | |
| 197 | + /** Get title object corresponding to this article */ |
| 198 | + public Title makeTitle(){ |
| 199 | + return new Title(Integer.parseInt(namespace),title); |
| 200 | + } |
197 | 201 | |
198 | 202 | |
199 | 203 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/LinkReader.java |
— | — | @@ -1,156 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.importer; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | -import java.util.HashMap; |
6 | | -import java.util.HashSet; |
7 | | -import java.util.regex.Matcher; |
8 | | -import java.util.regex.Pattern; |
9 | | - |
10 | | -import org.apache.commons.lang.WordUtils; |
11 | | -import org.apache.log4j.Logger; |
12 | | -import org.mediawiki.importer.DumpWriter; |
13 | | -import org.mediawiki.importer.Page; |
14 | | -import org.mediawiki.importer.Revision; |
15 | | -import org.mediawiki.importer.Siteinfo; |
16 | | -import org.wikimedia.lsearch.beans.Article; |
17 | | -import org.wikimedia.lsearch.beans.ArticleLinks; |
18 | | -import org.wikimedia.lsearch.beans.Title; |
19 | | -import org.wikimedia.lsearch.config.Configuration; |
20 | | -import org.wikimedia.lsearch.config.IndexId; |
21 | | -import org.wikimedia.lsearch.util.Localization; |
22 | | - |
23 | | -/** |
24 | | - * Reads page links and references, i.e. how many times a page |
25 | | - * is referenced within other articles. |
26 | | - * |
27 | | - * @author rainman |
28 | | - * |
29 | | - */ |
30 | | -public class LinkReader implements DumpWriter { |
31 | | - static Logger log = Logger.getLogger(LinkReader.class); |
32 | | - Page page; |
33 | | - Revision revision; |
34 | | - Siteinfo siteinfo; |
35 | | - /** ns:title -> number of referring articles */ |
36 | | - HashMap<String,ArticleLinks> links = new HashMap<String,ArticleLinks>(); |
37 | | - HashSet<String> interwiki; |
38 | | - String langCode; |
39 | | - |
40 | | - public LinkReader(HashMap<String,ArticleLinks> links, String langCode){ |
41 | | - this.links = links; |
42 | | - if(langCode == null || langCode.equals("")) |
43 | | - langCode = "en"; |
44 | | - this.langCode = langCode; |
45 | | - interwiki = Localization.getInterwiki(); |
46 | | - } |
47 | | - public void writeRevision(Revision revision) throws IOException { |
48 | | - this.revision = revision; |
49 | | - } |
50 | | - public void writeStartPage(Page page) throws IOException { |
51 | | - this.page = page; |
52 | | - } |
53 | | - public void writeEndPage() throws IOException { |
54 | | - ArticleLinks r = links.get(page.Title.Namespace+":"+page.Title.Text); |
55 | | - // register redirect |
56 | | - Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
57 | | - if( redirect !=null ){ |
58 | | - r.redirectsTo = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
59 | | - } else // process links |
60 | | - processLinks(revision.Text,page.Title.Namespace); |
61 | | - } |
62 | | - |
63 | | - /** Find the links object for the ns:title key */ |
64 | | - protected ArticleLinks findArticleLinks(int ns, String title){ |
65 | | - String key; |
66 | | - ArticleLinks rank; |
67 | | - // try exact match |
68 | | - key = ns+":"+title; |
69 | | - rank = links.get(key); |
70 | | - if(rank != null) |
71 | | - return rank; |
72 | | - // try lowercase |
73 | | - key = ns+":"+title.toLowerCase(); |
74 | | - rank = links.get(key); |
75 | | - if(rank != null) |
76 | | - return rank; |
77 | | - // try title case |
78 | | - key = ns+":"+WordUtils.capitalize(title); |
79 | | - rank = links.get(key); |
80 | | - if(rank != null) |
81 | | - return rank; |
82 | | - // try capitalizing at word breaks |
83 | | - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
84 | | - rank = links.get(key); |
85 | | - if(rank != null) |
86 | | - return rank; |
87 | | - |
88 | | - return null; |
89 | | - } |
90 | | - |
91 | | - /** Extract all links from this page, and increment ref count for linked pages */ |
92 | | - protected void processLinks(String text, int namespace) { |
93 | | - Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
94 | | - Matcher matcher = linkPat.matcher(text); |
95 | | - int ns; String title; |
96 | | - boolean escaped; |
97 | | - |
98 | | - HashSet<ArticleLinks> pagelinks = new HashSet<ArticleLinks>(); |
99 | | - while(matcher.find()){ |
100 | | - String link = matcher.group(1); |
101 | | - int fragment = link.lastIndexOf('#'); |
102 | | - if(fragment != -1) |
103 | | - link = link.substring(0,fragment); |
104 | | - //System.out.println("Got link "+link); |
105 | | - if(link.startsWith(":")){ |
106 | | - escaped = true; |
107 | | - link = link.substring(1); |
108 | | - } else escaped = false; |
109 | | - ns = 0; |
110 | | - title = link; |
111 | | - // check for ns:title syntax |
112 | | - String[] parts = link.split(":",2); |
113 | | - if(parts.length == 2 && parts[0].length() > 1){ |
114 | | - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
115 | | - if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14))) |
116 | | - continue; // categories, ignore |
117 | | - if(inx!=null && inx < 0) |
118 | | - continue; // special pages, ignore |
119 | | - if(inx != null){ |
120 | | - ns = inx; |
121 | | - title = parts[1]; |
122 | | - } |
123 | | - |
124 | | - // ignore interwiki links |
125 | | - if(interwiki.contains(parts[0])) |
126 | | - continue; |
127 | | - } |
128 | | - if(ns == 0 && namespace!=0) |
129 | | - continue; // skip links from other namespaces into the main namespace |
130 | | - |
131 | | - // register as link |
132 | | - ArticleLinks target = findArticleLinks(ns,title); |
133 | | - if(target != null) |
134 | | - pagelinks.add(target); |
135 | | - } |
136 | | - // increment page ranks |
137 | | - for(ArticleLinks rank : pagelinks){ |
138 | | - rank.links++; |
139 | | - } |
140 | | - } |
141 | | - public void writeSiteinfo(Siteinfo info) throws IOException { |
142 | | - siteinfo = info; |
143 | | - } |
144 | | - public void close() throws IOException { |
145 | | - // nop |
146 | | - } |
147 | | - public void writeEndWiki() throws IOException { |
148 | | - // nop |
149 | | - } |
150 | | - public void writeStartWiki() throws IOException { |
151 | | - // nop |
152 | | - } |
153 | | - public HashMap<String, ArticleLinks> getRanks() { |
154 | | - return links; |
155 | | - } |
156 | | - |
157 | | -} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/TitleReader.java |
— | — | @@ -1,63 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.importer; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | -import java.util.ArrayList; |
6 | | -import java.util.HashMap; |
7 | | -import java.util.HashSet; |
8 | | -import java.util.Iterator; |
9 | | -import java.util.Map.Entry; |
10 | | - |
11 | | -import org.mediawiki.importer.DumpWriter; |
12 | | -import org.mediawiki.importer.Page; |
13 | | -import org.mediawiki.importer.Revision; |
14 | | -import org.mediawiki.importer.Siteinfo; |
15 | | -import org.wikimedia.lsearch.beans.ArticleLinks; |
16 | | -import org.wikimedia.lsearch.util.Localization; |
17 | | - |
18 | | -/** |
19 | | - * Read a HashSet of titles from dump |
20 | | - * |
21 | | - * @author rainman |
22 | | - * |
23 | | - */ |
24 | | -public class TitleReader implements DumpWriter{ |
25 | | - Page page; |
26 | | - Revision revision; |
27 | | - HashMap<String,ArticleLinks> titles = new HashMap<String,ArticleLinks>(); |
28 | | - protected String langCode; |
29 | | - |
30 | | - public TitleReader(String langCode){ |
31 | | - this.langCode = langCode; |
32 | | - } |
33 | | - |
34 | | - public void writeRevision(Revision revision) throws IOException { |
35 | | - this.revision = revision; |
36 | | - } |
37 | | - public void writeStartPage(Page page) throws IOException { |
38 | | - this.page = page; |
39 | | - } |
40 | | - public void writeEndPage() throws IOException { |
41 | | - String key = page.Title.Namespace+":"+page.Title.Text; |
42 | | - titles.put(key,new ArticleLinks(0)); |
43 | | - } |
44 | | - public HashMap<String,ArticleLinks> getTitles() { |
45 | | - return titles; |
46 | | - } |
47 | | - public void close() throws IOException { |
48 | | - // nop |
49 | | - } |
50 | | - public void writeEndWiki() throws IOException { |
51 | | - // nop |
52 | | - } |
53 | | - public void writeSiteinfo(Siteinfo info) throws IOException { |
54 | | - // write siteinfo to localization |
55 | | - Iterator it = info.Namespaces.orderedEntries(); |
56 | | - while(it.hasNext()){ |
57 | | - Entry<Integer,String> pair = (Entry<Integer,String>)it.next(); |
58 | | - Localization.addCustomMapping(pair.getValue(),pair.getKey(),langCode); |
59 | | - } |
60 | | - } |
61 | | - public void writeStartWiki() throws IOException { |
62 | | - // nop |
63 | | - } |
64 | | -} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -20,6 +20,8 @@ |
21 | 21 | import org.wikimedia.lsearch.beans.Title; |
22 | 22 | import org.wikimedia.lsearch.config.Configuration; |
23 | 23 | import org.wikimedia.lsearch.config.IndexId; |
| 24 | +import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 25 | +import org.wikimedia.lsearch.ranks.Links; |
24 | 26 | import org.wikimedia.lsearch.util.Localization; |
25 | 27 | |
26 | 28 | public class DumpImporter implements DumpWriter { |
— | — | @@ -28,11 +30,11 @@ |
29 | 31 | Revision revision; |
30 | 32 | SimpleIndexWriter writer; |
31 | 33 | int count = 0, limit; |
32 | | - HashMap<String,ArticleLinks> ranks; |
| 34 | + Links ranks; |
33 | 35 | String langCode; |
34 | 36 | |
35 | 37 | public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor, |
36 | | - Integer maxBufDocs, boolean newIndex, HashMap<String,ArticleLinks> ranks, String langCode){ |
| 38 | + Integer maxBufDocs, boolean newIndex, Links ranks, String langCode){ |
37 | 39 | Configuration.open(); // make sure configuration is loaded |
38 | 40 | writer = new SimpleIndexWriter(IndexId.get(dbname), optimize, mergeFactor, maxBufDocs, newIndex); |
39 | 41 | this.limit = limit; |
— | — | @@ -48,7 +50,7 @@ |
49 | 51 | public void writeEndPage() throws IOException { |
50 | 52 | // get reference count |
51 | 53 | String key = page.Title.Namespace+":"+page.Title.Text; |
52 | | - ArticleLinks r = ranks.get(key); |
| 54 | + CompactArticleLinks r = ranks.get(key); |
53 | 55 | int references; |
54 | 56 | boolean isRedirect = r.redirectsTo != null; |
55 | 57 | if(r == null){ |
— | — | @@ -59,9 +61,9 @@ |
60 | 62 | // make list of redirects |
61 | 63 | ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
62 | 64 | if(r.redirected != null){ |
63 | | - for(String rk : r.redirected){ |
64 | | - String[] parts = rk.split(":",2); |
65 | | - redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],ranks.get(rk).links)); |
| 65 | + for(CompactArticleLinks rk : r.redirected){ |
| 66 | + String[] parts = rk.toString().split(":",2); |
| 67 | + redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],rk.links)); |
66 | 68 | } |
67 | 69 | } |
68 | 70 | // make article |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -16,6 +16,8 @@ |
17 | 17 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
18 | 18 | import org.wikimedia.lsearch.config.IndexId; |
19 | 19 | import org.wikimedia.lsearch.index.IndexThread; |
| 20 | +import org.wikimedia.lsearch.ranks.Links; |
| 21 | +import org.wikimedia.lsearch.ranks.RankBuilder; |
20 | 22 | import org.wikimedia.lsearch.util.Localization; |
21 | 23 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
22 | 24 | |
— | — | @@ -96,8 +98,9 @@ |
97 | 99 | long start = System.currentTimeMillis(); |
98 | 100 | |
99 | 101 | // regenerate link and redirect information |
100 | | - HashMap<String,ArticleLinks> links = processLinks(inputfile,getTitles(inputfile,langCode),langCode); |
101 | | - |
| 102 | + Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS); |
| 103 | + links.generateRedirectLists(); |
| 104 | + |
102 | 105 | log.info("Third pass, indexing articles..."); |
103 | 106 | |
104 | 107 | // open |
— | — | @@ -148,59 +151,6 @@ |
149 | 152 | } |
150 | 153 | } |
151 | 154 | |
152 | | - private static HashMap<String,ArticleLinks> processLinks(String inputfile, HashMap<String,ArticleLinks> links, String langCode) { |
153 | | - log.info("Second pass, calculating article links..."); |
154 | | - InputStream input = null; |
155 | | - // second pass - calculate page ranks |
156 | | - try { |
157 | | - input = Tools.openInputFile(inputfile); |
158 | | - } catch (IOException e) { |
159 | | - log.fatal("I/O error opening "+inputfile); |
160 | | - return null; |
161 | | - } |
162 | | - // calculate ranks |
163 | | - LinkReader rr = new LinkReader(links,langCode); |
164 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
165 | | - try { |
166 | | - reader.readDump(); |
167 | | - } catch (IOException e) { |
168 | | - log.fatal("I/O error reading dump while calculating ranks for from "+inputfile); |
169 | | - return null; |
170 | | - } |
171 | | - // generate "redirects here" lists for each article |
172 | | - for(Entry<String,ArticleLinks> e : links.entrySet()){ |
173 | | - ArticleLinks r = e.getValue(); |
174 | | - if(r.redirectsTo != null && r != r.redirectsTo){ |
175 | | - if(r.redirectsTo.redirected == null) |
176 | | - r.redirectsTo.redirected = new ArrayList<String>(); |
177 | | - r.redirectsTo.redirected.add(e.getKey()); |
178 | | - } |
179 | | - } |
180 | | - return links; |
181 | | - } |
182 | | - |
183 | | - private static HashMap<String,ArticleLinks> getTitles(String inputfile,String langCode) { |
184 | | - log.info("First pass, getting a list of valid articles..."); |
185 | | - InputStream input = null; |
186 | | - try { |
187 | | - input = Tools.openInputFile(inputfile); |
188 | | - } catch (IOException e) { |
189 | | - log.fatal("I/O error opening "+inputfile); |
190 | | - return null; |
191 | | - } |
192 | | - // first pass, get titles |
193 | | - TitleReader tr = new TitleReader(langCode); |
194 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
195 | | - try { |
196 | | - reader.readDump(); |
197 | | - input.close(); |
198 | | - } catch (IOException e) { |
199 | | - log.fatal("I/O error reading dump while getting titles from "+inputfile); |
200 | | - return null; |
201 | | - } |
202 | | - return tr.getTitles(); |
203 | | - } |
204 | | - |
205 | 155 | private static String formatTime(long l) { |
206 | 156 | l /= 1000; |
207 | 157 | if(l >= 3600) return l/3600+"h "+(l%3600)/60+"m "+(l%60)+"s"; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/package.html |
— | — | @@ -0,0 +1,3 @@ |
| 2 | +<html><body> |
| 3 | +Storage of data, mainly in a database, e.g. page ranks and text for highlighting. |
| 4 | +</body></html> |
\ No newline at end of file |
Property changes on: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/package.html |
___________________________________________________________________ |
Added: svn:executable |
1 | 5 | + * |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/Storage.java |
— | — | @@ -0,0 +1,29 @@ |
| 2 | +package org.wikimedia.lsearch.storage; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.Collection; |
| 6 | + |
| 7 | +import org.wikimedia.lsearch.beans.Title; |
| 8 | +import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 9 | + |
| 10 | +abstract public class Storage { |
| 11 | + static protected Storage instance = null; |
| 12 | + |
| 13 | + /** Get instance of Storage singleton class */ |
| 14 | + public static synchronized Storage getInstance(){ |
| 15 | + if(instance == null) |
| 16 | + instance = new MySQLStorage(); |
| 17 | + return instance; |
| 18 | + } |
| 19 | + |
| 20 | + /** |
| 21 | + * Store a complete array of page references |
| 22 | + */ |
| 23 | + abstract public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException; |
| 24 | + |
| 25 | + /** |
| 26 | + * Fetch page references for number of titles |
| 27 | + */ |
| 28 | + abstract public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException; |
| 29 | + |
| 30 | +} |
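
A minimal sketch of the consumer side of this interface, mirroring fetchReferences() in IncrementalUpdater above; the database name wikidb is hypothetical:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collection;
    import org.wikimedia.lsearch.beans.Title;
    import org.wikimedia.lsearch.ranks.CompactArticleLinks;
    import org.wikimedia.lsearch.storage.Storage;

    public class StorageSketch {
        public static void main(String[] args) throws IOException {
            Storage store = Storage.getInstance();      // currently always a MySQLStorage
            ArrayList<Title> titles = new ArrayList<Title>();
            titles.add(new Title(0, "Main Page"));      // keyed as ns:title via Title.getKey()
            Collection<CompactArticleLinks> refs = store.getPageReferences(titles, "wikidb");
            for (CompactArticleLinks c : refs)
                System.out.println(c);                  // e.g. "0:Main Page, count=42"
        }
    }
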
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java |
— | — | @@ -0,0 +1,267 @@ |
| 2 | +package org.wikimedia.lsearch.storage; |
| 3 | + |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.FileNotFoundException; |
| 6 | +import java.io.FileReader; |
| 7 | +import java.io.IOException; |
| 8 | +import java.sql.Connection; |
| 9 | +import java.sql.DriverManager; |
| 10 | +import java.sql.ResultSet; |
| 11 | +import java.sql.SQLException; |
| 12 | +import java.sql.Statement; |
| 13 | +import java.util.ArrayList; |
| 14 | +import java.util.Collection; |
| 15 | +import java.util.Hashtable; |
| 16 | +import java.util.Iterator; |
| 17 | +import java.util.Map.Entry; |
| 18 | + |
| 19 | +import org.apache.log4j.Logger; |
| 20 | +import org.wikimedia.lsearch.beans.Title; |
| 21 | +import org.wikimedia.lsearch.config.Configuration; |
| 22 | +import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 23 | + |
| 24 | +/** |
| 25 | + * MySQL storage backend |
| 26 | + * |
| 27 | + * |
| 28 | + * @author rainman |
| 29 | + * |
| 30 | + */ |
| 31 | +public class MySQLStorage extends Storage { |
| 32 | + static Logger log = Logger.getLogger(MySQLStorage.class); |
| 33 | + protected Configuration config; |
| 34 | + /** master host */ |
| 35 | + protected String master; |
| 36 | + /** slave host -> % of load */ |
| 37 | + protected Hashtable<String,Double> slaves = null; |
| 38 | + /** mysql username */ |
| 39 | + protected String username; |
| 40 | + /** mysql password */ |
| 41 | + protected String password; |
| 42 | + /** If data should be separated into one db per dbname */ |
| 43 | + protected boolean separate; |
| 44 | + /** db where everything is stored, if we are not using one db per dbname */ |
| 45 | + protected String defaultDB; |
| 46 | + /** where sql stuff is, e.g. references_table.sql */ |
| 47 | + protected String lib; |
| 48 | + /** table name -> create table file */ |
| 49 | + protected Hashtable<String,String> tableDefs = new Hashtable<String,String>(); |
| 50 | + |
| 51 | + protected MySQLStorage() { |
| 52 | + config = Configuration.open(); |
| 53 | + try { |
| 54 | + Class.forName("com.mysql.jdbc.Driver"); |
| 55 | + } catch (ClassNotFoundException e) { |
| 56 | + log.error("Cannot load mysql jdbc driver, class not found: "+e.getMessage()); |
| 57 | + } |
| 58 | + |
| 59 | + lib = config.getString("Storage","lib","./sql"); |
| 60 | + |
| 61 | + master = config.getString("Storage","master","localhost"); |
| 62 | + String[] ss = config.getArray("Storage","slaves"); |
| 63 | + if(ss != null){ |
| 64 | + Hashtable<String,Double> rawslaves = new Hashtable<String,Double>(); |
| 65 | + for(String slave : ss){ |
| 66 | + String[] parts = slave.split("->",2); |
| 67 | + if(parts.length==2){ |
| 68 | + rawslaves.put(parts[0],Double.parseDouble(parts[1])); |
| 69 | + } |
| 70 | + } |
| 71 | + // normalize to 1 |
| 72 | + double sum = 0; |
| 73 | + for(Double d : rawslaves.values()) |
| 74 | + sum += d; |
| 75 | + if(sum == 0) // in case no loads are specified |
| 76 | + sum = 1; |
| 77 | + slaves = new Hashtable<String,Double>(); |
| 78 | + for(Entry<String,Double> ed : rawslaves.entrySet()) |
| 79 | + slaves.put(ed.getKey(),ed.getValue()/sum); |
| 80 | + |
| 81 | + } |
| 82 | + |
| 83 | + username = config.getString("Storage","username","root"); |
| 84 | + password = config.getString("Storage","password",""); |
| 85 | + |
| 86 | + // figure out db configuration |
| 87 | + separate = config.getBoolean("Storage","useSeparateDBs"); |
| 88 | + if(!separate){ |
| 89 | + defaultDB = config.getString("Storage","defaultDB"); |
| 90 | + if(defaultDB == null){ |
| 91 | + log.error("Set Storage.defaultDB in local configuration."); |
| 92 | + } |
| 93 | + } |
| 94 | + } |
| 95 | + |
| 96 | + /** Get connection for reading of (possibly lagged) stuff, i.e. on slaves (or master if there are no slaves) */ |
| 97 | + protected Connection getReadConnection(String dbname) throws IOException{ |
| 98 | + return openConnection(dbname,false); |
| 99 | + } |
| 100 | + |
| 101 | + /** Get connection for writing stuff, i.e. on the master */ |
| 102 | + protected Connection getWriteConnection(String dbname) throws IOException{ |
| 103 | + return openConnection(dbname,true); |
| 104 | + } |
| 105 | + |
| 106 | + /** Open connection on the master, or load-balanced on one of the slaves */ |
| 107 | + protected Connection openConnection(String dbname, boolean onMaster) throws IOException { |
| 108 | + String host=null; |
| 109 | + if(onMaster || slaves == null) |
| 110 | + host = master; |
| 111 | + else{ |
| 112 | + // load balance slaves |
| 113 | + double r = Math.random(); |
| 114 | + for(Entry<String,Double> load : slaves.entrySet()){ |
| 115 | + r-=load.getValue(); |
| 116 | + if(r < 0){ |
| 117 | + host = load.getKey(); |
| 118 | + break; |
| 119 | + } |
| 120 | + } |
| 121 | + } |
| 122 | + String dburl = "jdbc:mysql://"+host+":3306/"; |
| 123 | + if(!separate && defaultDB!=null) |
| 124 | + dburl += defaultDB; |
| 125 | + try { |
| 126 | + return DriverManager.getConnection(dburl, username, password); |
| 127 | + } catch (SQLException e) { |
| 128 | + log.error("Cannot establish connection to "+dburl+" - check host, db, username and password : "+e.getMessage()); |
| 129 | + throw new IOException("Cannot establish connection to mysql database."); |
| 130 | + } |
| 131 | + } |
| 132 | + |
| 133 | + public String quote(String str){ |
| 134 | + return "'"+str+"'"; |
| 135 | + } |
| 136 | + |
| 137 | + public String escape(String str){ |
| 138 | + return str.replace("'","\\'"); |
| 139 | + } |
| 140 | + |
| 141 | + public String getTableName(String name, String dbname){ |
| 142 | + if(!separate) |
| 143 | + return dbname+"_"+name; |
| 144 | + else |
| 145 | + return name; |
| 146 | + } |
| 147 | + |
| 148 | + // inherit javadoc |
| 149 | + public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException { |
| 150 | + String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE "; |
| 151 | + if(titles.size()==1){ |
| 152 | + sql += "rf_key="+quote(escape(titles.iterator().next().getKey())); |
| 153 | + } else{ |
| 154 | + StringBuilder sb = new StringBuilder(sql); |
| 155 | + sb.append("rf_key IN ("); |
| 156 | + Iterator<Title> it = titles.iterator(); |
| 157 | + while(it.hasNext()){ |
| 158 | + sb.append('\''); |
| 159 | + sb.append(escape(it.next().getKey())); |
| 160 | + sb.append('\''); |
| 161 | + if(it.hasNext()) |
| 162 | + sb.append(','); |
| 163 | + } |
| 164 | + sb.append(")"); |
| 165 | + sql = sb.toString(); |
| 166 | + } |
| 167 | + try { |
| 168 | + Connection conn = getReadConnection(dbname); |
| 169 | + log.info("Fetching references for "+titles.size()+" pages"); |
| 170 | + Statement stmt = conn.createStatement(); |
| 171 | + ResultSet res = stmt.executeQuery(sql); |
| 172 | + ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>(); |
| 173 | + while(res.next()){ |
| 174 | + ret.add(new CompactArticleLinks(res.getString("rf_key"),res.getInt("rf_references"))); |
| 175 | + } |
| 176 | + conn.close(); |
| 177 | + return ret; |
| 178 | + } catch (SQLException e) { |
| 179 | + log.error("Cannot execute sql "+sql+" : "+e.getMessage()); |
| 180 | + throw new IOException(e.getMessage()); |
| 181 | + } |
| 182 | + } |
| 183 | + |
| 184 | + // inherit javadoc |
| 185 | + public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException { |
| 186 | + final int maxPerQuery = 10000; |
| 187 | + Connection conn = getWriteConnection(dbname); |
| 188 | + verifyTable("references",dbname,conn); |
| 189 | + Iterator<CompactArticleLinks> it = refs.iterator(); |
| 190 | + // send chunks of maxPerQuery reference replacements |
| 191 | + while(it.hasNext()){ |
| 192 | + StringBuilder sb = new StringBuilder("REPLACE INTO "+getTableName("references",dbname)+" (rf_key,rf_references) VALUES "); |
| 193 | + int count = 0; |
| 194 | + while(it.hasNext() && count < maxPerQuery){ |
| 195 | + CompactArticleLinks cs = it.next(); |
| 196 | + sb.append("('"); |
| 197 | + sb.append(escape(cs.getKey())); |
| 198 | + sb.append("','"); |
| 199 | + sb.append(cs.links); |
| 200 | + count++; |
| 201 | + if(it.hasNext() && count<maxPerQuery) |
| 202 | + sb.append("'), "); |
| 203 | + else |
| 204 | + sb.append("');"); |
| 205 | + } |
| 206 | + try { |
| 207 | + log.info("Storing "+Math.min(maxPerQuery,count)+" page ranks... "); |
| 208 | + Statement stmt = conn.createStatement(); |
| 209 | + stmt.executeUpdate(sb.toString()); |
| 210 | + |
| 211 | + } catch (SQLException e) { |
| 212 | + log.error("Cannot execute replace query "+sb+" : "+e.getMessage()); |
| 213 | + throw new IOException(e.getMessage()); |
| 214 | + } |
| 215 | + } |
| 216 | + try { |
| 217 | + conn.close(); // be sure we close the connection |
| 218 | + } catch (SQLException e) { |
| 219 | + } |
| 220 | + } |
| 221 | + |
| 222 | + /** Creates table if it doesn't exist */ |
| 223 | + protected void verifyTable(String name, String dbname, Connection conn) throws IOException { |
| 224 | + // verify if table exists |
| 225 | + String table = getTableName(name,dbname); |
| 226 | + try { |
| 227 | + log.info("Verifying table "+name+" on "+dbname); |
| 228 | + Statement stmt = conn.createStatement(); |
| 229 | + ResultSet res = stmt.executeQuery("SHOW TABLES LIKE '"+table+"';"); |
| 230 | + if(res.next()) // table exists! |
| 231 | + return; |
| 232 | + |
| 233 | + } catch (SQLException e) { |
| 234 | + log.error("Cannot verify table "+table+" : "+e.getMessage()); |
| 235 | + throw new IOException(e.getMessage()); |
| 236 | + } |
| 237 | + |
| 238 | + // fetch table definition |
| 239 | + String def = tableDefs.get(name); |
| 240 | + if(def == null){ |
| 241 | + if(!lib.endsWith(Configuration.PATH_SEP)) |
| 242 | + lib = lib+Configuration.PATH_SEP; |
| 243 | + |
| 244 | + BufferedReader file = new BufferedReader(new FileReader(lib+name+"_table.sql")); |
| 245 | + StringBuilder sb = new StringBuilder(); |
| 246 | + String line; |
| 247 | + while((line = file.readLine()) != null){ |
| 248 | + sb.append(line.replaceFirst("--.*","")); |
| 249 | + } |
| 250 | + def = sb.toString(); |
| 251 | + } |
| 252 | + // preprocess dbprefix tags |
| 253 | + String tdef; |
| 254 | + if(!separate) |
| 255 | + tdef = def.replace("/*DBprefix*/",dbname+"_"); |
| 256 | + else |
| 257 | + tdef = def; |
| 258 | + // create |
| 259 | + try { |
| 260 | + log.info("Creating table "+name+" on "+dbname); |
| 261 | + Statement stmt = conn.createStatement(); |
| 262 | + stmt.executeUpdate(tdef); |
| 263 | + } catch (SQLException e) { |
| 264 | + log.error("Cannot create table "+table+" : "+e.getMessage()); |
| 265 | + throw new IOException(e.getMessage()); |
| 266 | + } |
| 267 | + } |
| 268 | +} |
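
The slave selection in openConnection() is a standard weighted random pick over loads that the constructor has normalized to sum to 1. A standalone sketch with hypothetical host names:

    import java.util.Hashtable;
    import java.util.Map.Entry;

    public class SlavePickSketch {
        public static void main(String[] args) {
            Hashtable<String,Double> slaves = new Hashtable<String,Double>();
            slaves.put("db1", 0.1);
            slaves.put("db2", 0.9);
            double r = Math.random();
            String host = null;
            for (Entry<String,Double> load : slaves.entrySet()) {
                r -= load.getValue();            // each slave owns a slice of [0,1)
                if (r < 0) { host = load.getKey(); break; }
            }
            System.out.println(host);            // "db2" roughly 90% of the time
        }
    }
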
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexRegistry.java |
— | — | @@ -26,11 +26,20 @@ |
27 | 27 | protected Hashtable<String,LocalIndex> latestUpdate; |
28 | 28 | /** current search index */ |
29 | 29 | protected Hashtable<String,LocalIndex> currentSearch; |
| 30 | + /** when was the last time when snapshot was refreshed */ |
| 31 | + protected Hashtable<String,Long> lastSnapshotRefresh = new Hashtable<String,Long>(); |
30 | 32 | |
| 33 | + protected Object lock = new Object(); |
| 34 | + |
31 | 35 | protected static IndexRegistry instance = null; |
32 | 36 | |
33 | 37 | /** Get info about the latest index snapshot */ |
34 | 38 | public LocalIndex getLatestSnapshot(IndexId iid){ |
| 39 | + synchronized (lock) { |
| 40 | + // wait at least 5 seconds before the next refresh |
| 41 | + if(lastSnapshotRefresh.get(iid.toString()) == null || (System.currentTimeMillis() - lastSnapshotRefresh.get(iid.toString()) > 5000)) |
| 42 | + refreshSnapshots(iid); |
| 43 | + } |
35 | 44 | return latestSnapshot.get(iid.toString()); // hashtable is synchronized |
36 | 45 | } |
37 | 46 | |
— | — | @@ -95,10 +104,11 @@ |
96 | 105 | } else if(latestSnapshot.get(iid.toString()) != null){ |
97 | 106 | latestSnapshot.remove((iid.toString())); |
98 | 107 | } |
| 108 | + lastSnapshotRefresh.put(iid.toString(),System.currentTimeMillis()); |
99 | 109 | } |
100 | 110 | |
101 | 111 | /** Refresh latest search update info */ |
102 | | - public synchronized void refreshUpdates(IndexId iid){ |
| 112 | + public synchronized void refreshUpdates(IndexId iid){ |
103 | 113 | File updateDir = new File(iid.getUpdatePath()); |
104 | 114 | LocalIndex latest = getLatestLocalIndex(updateDir,iid); |
105 | 115 | if(latest != null){ |
— | — | @@ -106,6 +116,7 @@ |
107 | 117 | } else if(latestUpdate.get(iid.toString()) != null){ |
108 | 118 | latestUpdate.remove((iid.toString())); |
109 | 119 | } |
| 120 | + |
110 | 121 | } |
111 | 122 | |
112 | 123 | /** Tell registry this is the most current version of search index */ |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
37 | 37 | return f; |
38 | 38 | } |
39 | | - } else if(fieldName.equals("title") || fieldName.startsWith("alttitle")){ |
| 39 | + } else if(fieldName.equals("title") || fieldName.equals("stemtitle") || fieldName.startsWith("alttitle")){ |
40 | 40 | float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
41 | 41 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
42 | 42 | return f; |
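
For reference, the branch this change extends computes lengthNorm(n) = 1/(sqrt(n)*n) = n^(-3/2): a one-token title gets 1.0, a four-token title 1/(2*4) = 0.125, so matches in short titles are weighted sharply higher. The patch simply applies the same formula to the new stemtitle field alongside title and alttitle*.
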
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java |
— | — | @@ -152,4 +152,5 @@ |
153 | 153 | public String getKey(){ |
154 | 154 | return article.getKey(); |
155 | 155 | } |
| 156 | + |
156 | 157 | } |
Index: trunk/lucene-search-2.0/.classpath |
— | — | @@ -14,5 +14,6 @@ |
15 | 15 | <classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/> |
16 | 16 | <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/> |
17 | 17 | <classpathentry kind="lib" path="lib/highlighter.jar" sourcepath="/sandbox/highlighter"/> |
| 18 | + <classpathentry kind="lib" path="lib/mysql-connector-java-3.0.17-ga-bin.jar"/> |
18 | 19 | <classpathentry kind="output" path="bin"/> |
19 | 20 | </classpath> |
Index: trunk/lucene-search-2.0/lsearch.conf |
— | — | @@ -51,6 +51,31 @@ |
52 | 52 | Index.maxqueuetimeout=12 |
53 | 53 | |
54 | 54 | ################################################ |
| 55 | +# Storage backend (currently mysql) |
| 56 | +################################################ |
| 57 | + |
| 58 | +# host of database master |
| 59 | +Storage.master=localhost |
| 60 | + |
| 61 | +# array of host->load |
| 62 | +#Storage.slaves=host1->10 host2->50 host3->100 |
| 63 | + |
| 64 | +# Storage.username=root |
| 65 | +# Storage.password= |
| 66 | + |
| 67 | +# Values: |
| 68 | +# true - each dbname has a separate db of that name |
| 69 | +# false - each dbname is a prefix for tables in a default db (set default db below) |
| 70 | +Storage.useSeparateDBs=false |
| 71 | + |
| 72 | +# Default db where all the tables will be stored (if useSeparateDBs=false) |
| 73 | +Storage.defaultDB=lsearch |
| 74 | + |
| 75 | +# Where table definitions are |
| 76 | +Storage.lib=/var/www/html/lucene-search-2.0/sql |
| 77 | + |
| 78 | + |
| 79 | +################################################ |
55 | 80 | # Log, ganglia, localization |
56 | 81 | ################################################ |
57 | 82 | |