r24865 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r24864‎ | r24865 | r24866 >
Date:22:41, 16 August 2007
Author:rainman
Status:old
Tags:
Comment:
More work in progress:
* Move to a cleaner database schema: page, related tables
* Store/fetch related info (mysql)
* Prepare ground to add related info into index
* Refactored the related calculation
Modified paths:
  • /branches/lucene-search-2.1/sql/page_table.sql (added) (history)
  • /branches/lucene-search-2.1/sql/references_table.sql (deleted) (history)
  • /branches/lucene-search-2.1/sql/related_table.sql (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Related.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RelatedTitle.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/MySQLStorage.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/Storage.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java (added) (history)

Diff [purge]

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java
@@ -25,6 +25,7 @@
2626 import org.wikimedia.lsearch.config.IndexId;
2727 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
2828 import org.wikimedia.lsearch.ranks.Links;
 29+import org.wikimedia.lsearch.ranks.RelatedTitle;
2930 import org.wikimedia.lsearch.util.Localization;
3031
3132 /**
@@ -54,8 +55,9 @@
5556 public void writeEndPage() throws IOException {
5657 ArrayList<Redirect> redirects = new ArrayList<Redirect>();
5758 boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode) != null;
 59+ ArrayList<RelatedTitle> related = new ArrayList<RelatedTitle>();
5860 // make article
59 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects);
 61+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects,related);
6062 //if(page.Title.Namespace != 0)
6163 // article.setContents("");
6264
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java
@@ -16,6 +16,14 @@
1717 this.title = title;
1818 }
1919
 20+ public Title(String key){
 21+ String[] parts = key.split(":",2);
 22+ if(parts.length != 2)
 23+ throw new RuntimeException("Wrong key format in Title constructor");
 24+ this.namespace = Integer.parseInt(parts[0]);
 25+ this.title = parts[1];
 26+ }
 27+
2028 public String getKey(){
2129 return namespace+":"+title;
2230 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java
@@ -28,6 +28,8 @@
2929 import java.util.ArrayList;
3030 import java.util.Collection;
3131
 32+import org.wikimedia.lsearch.ranks.RelatedTitle;
 33+
3234 /**
3335 * Wiki article.
3436 *
@@ -47,6 +49,8 @@
4850 private transient ArrayList<Integer> redirectKeywordRanks;
4951 /** generated before indexing from the reference sto this article, and references from redirects */
5052 private transient int rank;
 53+ /** names of articles that relate to this article */
 54+ private ArrayList<RelatedTitle> related;
5155
5256 public Article(){
5357 namespace="";
@@ -56,6 +60,7 @@
5761 redirect=false;
5862 references = 0;
5963 redirects=new ArrayList<Redirect>();
 64+ related = new ArrayList<RelatedTitle>();
6065 }
6166
6267 public Article(long pageId, Title title, String text, boolean redirect, int references) {
@@ -66,6 +71,7 @@
6772 this.redirect = redirect;
6873 this.references = references;
6974 this.redirects = new ArrayList<Redirect>();
 75+ this.related = new ArrayList<RelatedTitle>();
7076 }
7177
7278 public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references) {
@@ -76,9 +82,10 @@
7783 this.pageId = pageId;
7884 this.references = references;
7985 this.redirects = new ArrayList<Redirect>();
 86+ this.related = new ArrayList<RelatedTitle>();
8087 }
8188
82 - public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references, ArrayList<Redirect> redirects) {
 89+ public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references, ArrayList<Redirect> redirects, ArrayList<RelatedTitle> related) {
8390 this.namespace = Integer.toString(namespace);
8491 this.title = titleText;
8592 contents = text;
@@ -86,6 +93,7 @@
8794 this.pageId = pageId;
8895 this.references = references;
8996 this.redirects = redirects;
 97+ this.related = related;
9098 }
9199
92100 public boolean isRedirect() {
@@ -200,9 +208,19 @@
201209
202210 public void setContents(String contents) {
203211 this.contents = contents;
 212+ }
 213+
 214+ public ArrayList<RelatedTitle> getRelated() {
 215+ return related;
 216+ }
 217+
 218+ public void setRelated(ArrayList<RelatedTitle> related) {
 219+ this.related = related;
204220 }
205221
206222
207223
208224
 225+
 226+
209227 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java
@@ -16,6 +16,7 @@
1717 import org.wikimedia.lsearch.config.GlobalConfiguration;
1818 import org.wikimedia.lsearch.config.IndexId;
1919 import org.wikimedia.lsearch.index.IndexThread;
 20+import org.wikimedia.lsearch.ranks.LinkReader;
2021 import org.wikimedia.lsearch.ranks.Links;
2122 import org.wikimedia.lsearch.ranks.RankBuilder;
2223 import org.wikimedia.lsearch.storage.Storage;
@@ -102,8 +103,8 @@
103104 long start = System.currentTimeMillis();
104105
105106 // regenerate link and redirect information
106 - Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS);
107 -
 107+ Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,LinkReader.READ_REDIRECTS);
 108+
108109 if(updateReferences){
109110 try {
110111 Storage.getInstance().storePageReferences(links.getAll(),dbname);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -22,6 +22,8 @@
2323 import org.wikimedia.lsearch.config.IndexId;
2424 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
2525 import org.wikimedia.lsearch.ranks.Links;
 26+import org.wikimedia.lsearch.ranks.RankBuilder;
 27+import org.wikimedia.lsearch.ranks.RelatedTitle;
2628 import org.wikimedia.lsearch.util.Localization;
2729
2830 public class DumpImporter implements DumpWriter {
@@ -30,7 +32,7 @@
3133 Revision revision;
3234 SimpleIndexWriter writer;
3335 int count = 0, limit;
34 - Links ranks;
 36+ Links links;
3537 String langCode;
3638
3739 public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor,
@@ -38,7 +40,7 @@
3941 Configuration.open(); // make sure configuration is loaded
4042 writer = new SimpleIndexWriter(IndexId.get(dbname), optimize, mergeFactor, maxBufDocs, newIndex);
4143 this.limit = limit;
42 - this.ranks = ranks;
 44+ this.links = ranks;
4345 this.langCode = langCode;
4446 }
4547 public void writeRevision(Revision revision) throws IOException {
@@ -50,7 +52,7 @@
5153 public void writeEndPage() throws IOException {
5254 // get reference count
5355 String key = page.Title.Namespace+":"+page.Title.Text;
54 - CompactArticleLinks r = ranks.get(key);
 56+ CompactArticleLinks r = links.get(key);
5557 int references;
5658 boolean isRedirect = r.redirectsTo != null;
5759 if(r == null){
@@ -65,9 +67,11 @@
6668 String[] parts = rk.toString().split(":",2);
6769 redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],rk.links));
6870 }
69 - }
 71+ }
 72+ ArrayList<RelatedTitle> related = RankBuilder.getRelatedTitles(r,links);
7073 // make article
71 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,references,redirects);
 74+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,
 75+ references,redirects,related);
7276 writer.addArticle(article);
7377 count++;
7478 if(limit >= 0 && count > limit)
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/Storage.java
@@ -1,10 +1,16 @@
22 package org.wikimedia.lsearch.storage;
33
44 import java.io.IOException;
 5+import java.util.ArrayList;
56 import java.util.Collection;
 7+import java.util.HashMap;
 8+import java.util.List;
 9+import java.util.Map.Entry;
610
711 import org.wikimedia.lsearch.beans.Title;
812 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
 13+import org.wikimedia.lsearch.ranks.Related;
 14+import org.wikimedia.lsearch.ranks.RelatedTitle;
915
1016 abstract public class Storage {
1117 static protected Storage instance = null;
@@ -25,5 +31,15 @@
2632 * Fetch page references for number of titles
2733 */
2834 abstract public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException;
 35+
 36+ /**
 37+ * Store some related mappings
 38+ */
 39+ abstract public void storeRelatedPages(Collection<Related> related, String dbname) throws IOException;
2940
 41+ /**
 42+ * Get related mapping for a collection of titles
 43+ */
 44+ abstract public HashMap<Title,ArrayList<RelatedTitle>> getRelatedPages(Collection<Title> titles, String dbname) throws IOException;
 45+
3046 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/MySQLStorage.java
@@ -1,7 +1,6 @@
22 package org.wikimedia.lsearch.storage;
33
44 import java.io.BufferedReader;
5 -import java.io.FileNotFoundException;
65 import java.io.FileReader;
76 import java.io.IOException;
87 import java.sql.Connection;
@@ -11,14 +10,19 @@
1211 import java.sql.Statement;
1312 import java.util.ArrayList;
1413 import java.util.Collection;
 14+import java.util.HashMap;
 15+import java.util.HashSet;
1516 import java.util.Hashtable;
1617 import java.util.Iterator;
 18+import java.util.List;
1719 import java.util.Map.Entry;
1820
1921 import org.apache.log4j.Logger;
2022 import org.wikimedia.lsearch.beans.Title;
2123 import org.wikimedia.lsearch.config.Configuration;
2224 import org.wikimedia.lsearch.ranks.CompactArticleLinks;
 25+import org.wikimedia.lsearch.ranks.Related;
 26+import org.wikimedia.lsearch.ranks.RelatedTitle;
2327
2428 /**
2529 * MySQL storage backend
@@ -162,14 +166,14 @@
163167
164168 // inherit javadoc
165169 public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException {
166 - String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE ";
 170+ String sql = "SELECT page_key, page_references from "+getTableName("page",dbname)+" WHERE ";
167171 if(titles == null || titles.size()==0)
168172 return new ArrayList<CompactArticleLinks>();
169173 else if(titles.size()==1){
170 - sql += "rf_key="+quote(escape(titles.iterator().next().getKey()));
 174+ sql += "page_key="+quote(escape(titles.iterator().next().getKey()));
171175 } else{
172176 StringBuilder sb = new StringBuilder(sql);
173 - sb.append("rf_key IN (");
 177+ sb.append("page_key IN (");
174178 Iterator<Title> it = titles.iterator();
175179 while(it.hasNext()){
176180 sb.append('\'');
@@ -178,7 +182,7 @@
179183 if(it.hasNext())
180184 sb.append(',');
181185 }
182 - sb.append(")");
 186+ sb.append(");");
183187 sql = sb.toString();
184188 }
185189 try {
@@ -188,7 +192,7 @@
189193 ResultSet res = stmt.executeQuery(sql);
190194 ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>();
191195 while(res.next()){
192 - ret.add(new CompactArticleLinks(res.getString("rf_key"),res.getInt("rf_references")));
 196+ ret.add(new CompactArticleLinks(res.getString("page_key"),res.getInt("page_references")));
193197 }
194198 conn.close();
195199 return ret;
@@ -202,11 +206,11 @@
203207 public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException {
204208 final int maxPerQuery = 10000;
205209 Connection conn = getWriteConnection(dbname);
206 - verifyTable("references",dbname,conn);
 210+ verifyTable("page",dbname,conn);
207211 Iterator<CompactArticleLinks> it = refs.iterator();
208212 // send chunks of maxPerQuery referenace replacements
209213 while(it.hasNext()){
210 - StringBuilder sb = new StringBuilder("REPLACE INTO "+getTableName("references",dbname)+" (rf_key,rf_references) VALUES ");
 214+ StringBuilder sb = new StringBuilder("INSERT INTO "+getTableName("page",dbname)+" (page_key,page_references) VALUES ");
211215 int count = 0;
212216 while(it.hasNext() && count < maxPerQuery){
213217 CompactArticleLinks cs = it.next();
@@ -218,7 +222,7 @@
219223 if(it.hasNext() && count<maxPerQuery)
220224 sb.append("'), ");
221225 else
222 - sb.append("');");
 226+ sb.append("') ON DUPLICATE KEY UPDATE page_references=VALUES(page_references);");
223227 }
224228 try {
225229 log.info("Storing "+Math.min(maxPerQuery,count)+" page ranks... ");
@@ -283,5 +287,135 @@
284288 log.error("Cannot create table "+table+" : "+e.getMessage());
285289 throw new IOException(e.getMessage());
286290 }
 291+ }
 292+
 293+ @Override
 294+ public HashMap<Title, ArrayList<RelatedTitle>> getRelatedPages(Collection<Title> titles, String dbname) throws IOException {
 295+ String page = getTableName("page",dbname);
 296+ String related = getTableName("related",dbname);
 297+ String sql = "SELECT a.page_key as 'r_to', b.page_key as 'r_related', rel_score FROM "+related+", "+page+" a, "+page+" b WHERE rel_to=a.page_id AND rel_related=b.page_id AND ";
 298+ if(titles == null || titles.size()==0)
 299+ return new HashMap<Title, ArrayList<RelatedTitle>>();
 300+ else if(titles.size()==1){
 301+ sql += "a.page_key="+quote(escape(titles.iterator().next().getKey()));
 302+ } else{
 303+ StringBuilder sb = new StringBuilder(sql);
 304+ sb.append("a.page_key IN (");
 305+ Iterator<Title> it = titles.iterator();
 306+ while(it.hasNext()){
 307+ sb.append('\'');
 308+ sb.append(escape(it.next().getKey()));
 309+ sb.append('\'');
 310+ if(it.hasNext())
 311+ sb.append(',');
 312+ }
 313+ sb.append(");");
 314+ sql = sb.toString();
 315+ }
 316+ try {
 317+ Connection conn = getReadConnection(dbname);
 318+ log.info("Fetching related info for "+titles.size()+" articles");
 319+ Statement stmt = conn.createStatement();
 320+ ResultSet res = stmt.executeQuery(sql);
 321+ HashMap<Title, ArrayList<RelatedTitle>> ret = new HashMap<Title, ArrayList<RelatedTitle>>();
 322+ while(res.next()){
 323+ Title t1 = new Title(res.getString("r_to"));
 324+ Title t2 = new Title(res.getString("r_related"));
 325+ double score = res.getDouble("rel_score");
 326+ ArrayList<RelatedTitle> rel = ret.get(t1);
 327+ if(rel == null){
 328+ rel = new ArrayList<RelatedTitle>();
 329+ ret.put(t1,rel);
 330+ }
 331+ rel.add(new RelatedTitle(t2,score));
 332+ }
 333+ conn.close();
 334+ return ret;
 335+ } catch (SQLException e) {
 336+ log.error("Cannot execute sql "+sql+" : "+e.getMessage());
 337+ throw new IOException(e.getMessage());
 338+ }
 339+ }
 340+
 341+ protected HashMap<String,Integer> getPageIDs(Collection<String> keys, String dbname, Connection conn) throws IOException{
 342+ String sql = "SELECT page_key, page_id from "+getTableName("page",dbname)+" WHERE ";
 343+ if(keys.size()==1){
 344+ sql += "page_key="+quote(escape(keys.iterator().next()));
 345+ } else{
 346+ StringBuilder sb = new StringBuilder(sql);
 347+ sb.append("page_key IN (");
 348+ Iterator<String> it = keys.iterator();
 349+ while(it.hasNext()){
 350+ sb.append('\'');
 351+ sb.append(escape(it.next()));
 352+ sb.append('\'');
 353+ if(it.hasNext())
 354+ sb.append(',');
 355+ }
 356+ sb.append(");");
 357+ sql = sb.toString();
 358+ }
 359+ try {
 360+ log.info("Fetching page ids for "+keys.size()+" pages");
 361+ Statement stmt = conn.createStatement();
 362+ ResultSet res = stmt.executeQuery(sql);
 363+ HashMap<String,Integer> map = new HashMap<String,Integer>();
 364+ while(res.next()){
 365+ map.put(res.getString("page_key"),res.getInt("page_id"));
 366+ }
 367+ return map;
 368+ } catch (SQLException e) {
 369+ log.error("Cannot execute sql "+sql+" : "+e.getMessage());
 370+ throw new IOException(e.getMessage());
 371+ }
 372+ }
 373+
 374+ @Override
 375+ public void storeRelatedPages(Collection<Related> related, String dbname) throws IOException {
 376+ Connection read = getReadConnection(dbname);
 377+ HashSet<String> keys = new HashSet<String>();
 378+ for(Related r : related){
 379+ keys.add(r.getTitle().toString());
 380+ keys.add(r.getRelates().toString());
 381+ }
 382+ HashMap<String,Integer> map = getPageIDs(keys,dbname,read);
 383+ final int maxPerQuery = 20000;
 384+ Connection write = getWriteConnection(dbname);
 385+ verifyTable("related",dbname,write);
 386+ Iterator<Related> it = related.iterator();
 387+ // send chunks of maxPerQuery referenace replacements
 388+ while(it.hasNext()){
 389+ StringBuilder sb = new StringBuilder("INSERT INTO "+getTableName("related",dbname)+" (rel_to,rel_related,rel_score) VALUES ");
 390+ int count = 0;
 391+ while(it.hasNext() && count < maxPerQuery){
 392+ Related r = it.next();
 393+ sb.append("('");
 394+ sb.append(Integer.toString(map.get(r.getTitle().toString())));
 395+ sb.append("','");
 396+ sb.append(Integer.toString(map.get(r.getRelates().toString())));
 397+ sb.append("','");
 398+ sb.append(Double.toString(r.getScore()));
 399+ count++;
 400+ if(it.hasNext() && count<maxPerQuery)
 401+ sb.append("'), ");
 402+ else
 403+ sb.append("') ON DUPLICATE KEY UPDATE rel_score=VALUES(rel_score);");
 404+ }
 405+ try {
 406+ log.info("Storing "+Math.min(maxPerQuery,count)+" related pages... ");
 407+ Statement stmt = write.createStatement();
 408+ stmt.executeUpdate(sb.toString());
 409+
 410+ } catch (SQLException e) {
 411+ log.error("Cannot execute replace query "+sb+" : "+e.getMessage());
 412+ throw new IOException(e.getMessage());
 413+ }
 414+ }
 415+ try {
 416+ write.close(); // be sure we close the connection
 417+ read.close();
 418+ } catch (SQLException e) {
 419+ }
 420+
287421 }
288422 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -60,8 +60,10 @@
6161 }
6262
6363 static public final int MAX_FIELD_LENGTH = 100000;
64 - /** number of aditional title1, title2, .. etc fields to be filled in with redirects */
65 - static public int ALT_TITLES = 3;
 64+ /** number of aditional alttitle1, alttitle2, .. etc fields to be filled in with redirects */
 65+ static public int ALT_TITLES = 3;
 66+ /** number of related fields in the index, first has the top-scored, etc, last everything else */
 67+ static public int RELATED_GROUPS = 4;
6668 /** Simple implementation of batch addition and deletion */
6769 class SimpleIndexModifier {
6870 protected IndexId iid;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -16,6 +16,9 @@
1717 import org.wikimedia.lsearch.config.GlobalConfiguration;
1818 import org.wikimedia.lsearch.config.IndexId;
1919 import org.wikimedia.lsearch.index.IndexUpdateRecord;
 20+import org.wikimedia.lsearch.ranks.RankBuilder;
 21+import org.wikimedia.lsearch.ranks.Related;
 22+import org.wikimedia.lsearch.ranks.RelatedTitle;
2023 import org.wikimedia.lsearch.util.Localization;
2124
2225 public class IndexUpdatesCollector implements DumpWriter {
@@ -65,7 +68,8 @@
6669 this.page = page;
6770 }
6871 public void writeEndPage() throws IOException {
69 - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects);
 72+ Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),
 73+ references,redirects,new ArrayList<RelatedTitle>()); // references and related titles are set correctly later (in incremental updater)
7074 log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects);
7175 records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE));
7276 log.debug(iid+": Update for "+article);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -10,8 +10,12 @@
1111 import java.net.Authenticator;
1212 import java.net.PasswordAuthentication;
1313 import java.util.ArrayList;
 14+import java.util.Collection;
 15+import java.util.Collections;
 16+import java.util.Comparator;
1417 import java.util.HashMap;
1518 import java.util.HashSet;
 19+import java.util.List;
1620 import java.util.Properties;
1721
1822 import org.apache.log4j.Logger;
@@ -23,7 +27,10 @@
2428 import org.wikimedia.lsearch.config.IndexId;
2529 import org.wikimedia.lsearch.index.IndexUpdateRecord;
2630 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 31+import org.wikimedia.lsearch.ranks.CompactArticleLinks;
2732 import org.wikimedia.lsearch.ranks.Links;
 33+import org.wikimedia.lsearch.ranks.Related;
 34+import org.wikimedia.lsearch.ranks.RelatedTitle;
2835 import org.wikimedia.lsearch.storage.Storage;
2936 import org.wikimedia.lsearch.util.Localization;
3037 import org.wikimedia.lsearch.util.UnicodeDecomposer;
@@ -185,7 +192,7 @@
186193 if(fetchReferences){
187194 try{
188195 // fetch references for records
189 - fetchReferences(records,dbname);
 196+ fetchReferencesAndRelated(records,dbname);
190197 } catch(IOException e){
191198 // FIXME: quick hack, if the table cannot be found (e.g. for new wikis) don't abort
192199 if(e.getMessage().contains("Base table or view not found")){
@@ -276,7 +283,7 @@
277284 } while(daemon);
278285 }
279286
280 - protected static void fetchReferences(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException {
 287+ protected static void fetchReferencesAndRelated(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException {
281288 Storage store = Storage.getInstance();
282289 ArrayList<Title> titles = new ArrayList<Title>();
283290 for(IndexUpdateRecord rec : records){
@@ -292,17 +299,33 @@
293300 }
294301 // fetch
295302 Links links = new Links(store.getPageReferences(titles,dbname));
 303+ HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname);
296304 // update
297305 for(IndexUpdateRecord rec : records){
298306 if(rec.isDelete())
299307 continue;
300308 Article ar = rec.getArticle();
301 - ar.setReferences(links.getLinks(ar.makeTitle().getKey()));
 309+ Title t = ar.makeTitle();
 310+ // set references
 311+ ar.setReferences(links.getLinks(t.getKey()));
302312 if(ar.getRedirects() != null){
303313 for(Redirect r : ar.getRedirects()){
304314 r.setReferences(links.getLinks(r.makeTitle().getKey()));
305315 }
306 - }
 316+ }
 317+ // set related
 318+ ArrayList<RelatedTitle> rt = rel.get(t.getKey());
 319+ if(rt != null){
 320+ Collections.sort(rt,new Comparator<RelatedTitle>() {
 321+ public int compare(RelatedTitle o1, RelatedTitle o2){
 322+ double d = o2.getScore()-o1.getScore();
 323+ if(d == 0) return 0;
 324+ else if(d > 0) return 1;
 325+ else return -1;
 326+ }
 327+ });
 328+ ar.setRelated(rt);
 329+ }
307330 }
308331 }
309332 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java
@@ -0,0 +1,103 @@
 2+package org.wikimedia.lsearch.util;
 3+
 4+public class MathFunc {
 5+
 6+ /** Calculate average value starting from start to end (end excluded) */
 7+ public static double avg(double[] val, int start, int end){
 8+ double s = 0;
 9+ for(int i=start;i<end;i++)
 10+ s+=val[i];
 11+ return s/(end-start);
 12+ }
 13+
 14+ /**
 15+ * Approximate the graph of function with num horizontal lines
 16+ * (const functions), so to minimize the maximal deviation
 17+ * @return list of discontinuities (begin points of horizontal lines)
 18+ */
 19+ public static int[] partitionList(double[] val, int num){
 20+ int[] part = new int[num+1]; // point of discontinuity
 21+ //double[] av = new double[num]; // approximate (average over included points)
 22+ // make first num-1 segments have length 1
 23+ for(int i=0;i<num;i++)
 24+ part[i]=i;
 25+ part[num] = val.length; // last point is end of values
 26+ //for(int i=0;i<num;i++)
 27+// av[i] = avg(val,part[i],part[i+1]);
 28+ // error
 29+ double err = calcErr(part,val,num);
 30+ // values at next iteration
 31+ int[] newpart = new int[num+1];
 32+ //double[] newav = new double[num];
 33+ double newerr = 0;
 34+
 35+ while(true){
 36+ for(int i=0;i<num-1;i++){
 37+ merge(i,part,newpart,val,num);
 38+ newerr = calcErr(newpart,val,num);
 39+ if(newerr < err){
 40+ copy(newpart,part);
 41+ err = newerr;
 42+ continue;
 43+ }
 44+ }
 45+ // try extending last
 46+ extend(part,newpart,val,num);
 47+ newerr = calcErr(newpart,val,num);
 48+ if(newerr < err){
 49+ copy(newpart,part);
 50+ err = newerr;
 51+ continue;
 52+ }
 53+ break;
 54+
 55+ }
 56+
 57+
 58+ return part;
 59+ }
 60+
 61+ private static void extend(int[] part, int[] newpart, double[] val, int num) {
 62+ for(int j=0;j<num;j++)
 63+ newpart[j] = part[j];
 64+ newpart[num-1] = part[num-1]+1;
 65+ newpart[num] = part[num];
 66+ /*for(int j=0;j<num;j++)
 67+ newav[j] = newav[j];
 68+ newav[num-1] = avg(val,newpart[num-1],newpart[num]); */
 69+ }
 70+
 71+ private static void copy(int[] newpart, int[] part) {
 72+ for(int i=0;i<newpart.length;i++)
 73+ part[i] = newpart[i];
 74+ /*for(int i=0;i<newav.length;i++)
 75+ av[i] = newav[i]; */
 76+
 77+ }
 78+
 79+ /** merge i to i+1, create one new part at the end */
 80+ private static void merge(int i, int[] part, int[] newpart, double[] val, int num) {
 81+ for(int j=0;j<=i;j++)
 82+ newpart[j] = part[j];
 83+ for(int j=i+1;j<num-1;j++)
 84+ newpart[j] = part[j+1];
 85+ newpart[num-1] = part[num-1]+1;
 86+ newpart[num] = part[num];
 87+ // update avg
 88+ /*for(int j=0;j<i;j++)
 89+ newavg[j] = avg[j];
 90+ for(int j=i;j<num;j++)
 91+ newavg[j] = avg(val,newpart[j],newpart[j+1]); */
 92+ }
 93+
 94+ private static double calcErr(int[] part, double[] val, int num) {
 95+ double err = 0;
 96+ for(int i=0;i<num;i++){
 97+ // max - min value
 98+ double e = val[part[i]]-val[part[i+1]-1];
 99+ if( e > err )
 100+ err = e;
 101+ }
 102+ return err;
 103+ }
 104+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java
@@ -60,17 +60,16 @@
6161 }
6262 public void writeEndPage() throws IOException {
6363 CompactArticleLinks p = links.get(page.Title.Namespace+":"+page.Title.Text);
64 - if(readRedirects){
65 - // register redirect
66 - Title redirect = Localization.getRedirectTitle(revision.Text,langCode);
67 - if( redirect !=null ){
68 - CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle());
69 - if(cs != null)
70 - links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs);
71 - return;
 64+ // register redirect
 65+ Title redirect = Localization.getRedirectTitle(revision.Text,langCode);
 66+ if(redirect != null && readRedirects){
 67+ CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle());
 68+ if(cs != null){
 69+ links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs);
7270 }
 71+ } else if(redirect == null){ // process only non-redirects
 72+ processLinks(p,revision.Text,page.Title.Namespace);
7373 }
74 - processLinks(p,revision.Text,page.Title.Namespace);
7574 }
7675
7776 /** Find the links object for the ns:title key */
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -72,12 +72,7 @@
7373 }
7474 }
7575 }
76 - /** Sort "what links here" list for each article */
77 - public void sortLinked(){
78 - for(CompactArticleLinks r : links.values()){
79 - r.sortLinked();
80 - }
81 - }
 76+
8277 /** Delete any unnecessary allocated space */
8378 public void compactAll() {
8479 for(CompactArticleLinks r : links.values()){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Related.java
@@ -0,0 +1,37 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+public class Related {
 5+ protected CompactArticleLinks title;
 6+ protected CompactArticleLinks relates;
 7+ protected double score;
 8+ public Related(CompactArticleLinks title, CompactArticleLinks relates, double score) {
 9+ this.title = title;
 10+ this.relates = relates;
 11+ this.score = score;
 12+ }
 13+ @Override
 14+ public String toString() {
 15+ return title+"->"+relates+" : "+score;
 16+ }
 17+ public CompactArticleLinks getRelates() {
 18+ return relates;
 19+ }
 20+ public void setRelates(CompactArticleLinks relates) {
 21+ this.relates = relates;
 22+ }
 23+ public double getScore() {
 24+ return score;
 25+ }
 26+ public void setScore(double score) {
 27+ this.score = score;
 28+ }
 29+ public CompactArticleLinks getTitle() {
 30+ return title;
 31+ }
 32+ public void setTitle(CompactArticleLinks title) {
 33+ this.title = title;
 34+ }
 35+
 36+
 37+
 38+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java
@@ -13,10 +13,12 @@
1414 import java.util.Map.Entry;
1515
1616 import org.apache.log4j.Logger;
 17+import org.apache.lucene.document.Field.Store;
1718 import org.mediawiki.dumper.ProgressFilter;
1819 import org.mediawiki.dumper.Tools;
1920 import org.mediawiki.importer.XmlDumpReader;
2021 import org.wikimedia.lsearch.beans.ArticleLinks;
 22+import org.wikimedia.lsearch.beans.Title;
2123 import org.wikimedia.lsearch.config.Configuration;
2224 import org.wikimedia.lsearch.config.GlobalConfiguration;
2325 import org.wikimedia.lsearch.config.IndexId;
@@ -71,12 +73,8 @@
7274 Links links = processLinks(inputfile,getTitles(inputfile,langCode),langCode,LinkReader.NO_REDIRECTS);
7375 links.compactAll();
7476 Storage store = Storage.getInstance();
75 - //store.storePageReferences(links.getAll(),dbname);
76 - printRelated(links);
77 -
78 - /*for(CompactArticleLinks cs : links.values()){
79 - System.out.println(cs);
80 - }*/
 77+ store.storePageReferences(links.getAll(),dbname);
 78+ storeRelated(store,links,dbname);
8179
8280 long end = System.currentTimeMillis();
8381
@@ -127,93 +125,58 @@
128126 return tr.getTitles();
129127 }
130128
131 - static class Related {
132 - CompactArticleLinks title;
133 - CompactArticleLinks relates;
134 - double score;
135 - HashMap<CompactArticleLinks,Double> scores;
136 - public Related(CompactArticleLinks title, CompactArticleLinks relates, double score, HashMap<CompactArticleLinks,Double> scores) {
137 - this.title = title;
138 - this.relates = relates;
139 - this.score = score;
140 - this.scores = scores;
141 - }
142 - @Override
143 - public String toString() {
144 - return title+"->"+relates+" : "+score;
145 - }
146 - }
147 -
148 - public static void printRelated(Links links){
 129+ public static void storeRelated(Storage store, Links links, String dbname) throws IOException{
149130 int num = 0;
150131 int total = links.getAll().size();
 132+ ArrayList<Related> buf = new ArrayList<Related>();
151133 for(CompactArticleLinks cs : links.getAll()){
152134 num++;
153 -
154 - ArrayList<Related> pq = new ArrayList<Related>();
155 - HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>();
156 - //HashSet<CompactArticleLinks> lin = new HashSet<CompactArticleLinks>();
157 - //HashSet<CompactArticleLinks> lout = new HashSet<CompactArticleLinks>();
158 - System.out.println("["+num+"/"+total+" - "+cs.linksInIndex+"] "+cs.toString());
159 - if(cs.linksIn != null){
160 - for(CompactArticleLinks csl : cs.linksIn)
161 - ll.add(csl);
 135+ log.debug("["+num+"/"+total+" - "+cs.linksInIndex+"] "+cs.toString());
 136+ buf.addAll(getRelated(cs,links));
 137+ if(buf.size() > 10000){
 138+ store.storeRelatedPages(buf,dbname);
 139+ buf.clear();
162140 }
163 - /* if(cs.linksOut != null){
164 - for(CompactArticleLinks csl : cs.linksOut)
165 - ll.add(csl);
166 - } */
167 - if(cs.toString().equals("0:Douglas Adams")){
168 - int b = 01;
169 - b++;
 141+ }
 142+ }
 143+
 144+ /**
 145+ * Get related articles, sorted descending by score
 146+ */
 147+ public static ArrayList<Related> getRelated(CompactArticleLinks cs, Links links){
 148+ ArrayList<Related> ret = new ArrayList<Related>();
 149+
 150+ HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>();
 151+ if(cs.linksIn != null){
 152+ for(CompactArticleLinks csl : cs.linksIn)
 153+ ll.add(csl);
 154+ }
 155+ for(CompactArticleLinks from : ll){
 156+ double score = relatedScore(cs,ll,from);
 157+ if(score != 0)
 158+ ret.add(new Related(cs,from,score));
 159+ }
 160+ Collections.sort(ret,new Comparator<Related>() {
 161+ public int compare(Related o1, Related o2){
 162+ double d = o2.score-o1.score;
 163+ if(d == 0) return 0;
 164+ else if(d > 0) return 1;
 165+ else return -1;
170166 }
171 - for(CompactArticleLinks from : ll){
172 - //double score = relatedScore(cs,ll,from);
173 - Object[] ret = relatedScore(cs,ll,from);
174 - double score = (Double) ret[0];
175 - HashMap<CompactArticleLinks,Double> scores = (HashMap<CompactArticleLinks, Double>) ret[1];
176 - if(score != 0)
177 - pq.add(new Related(cs,from,score,scores));
178 -
179 - }
180 - /*for(CompactArticleLinks to : lout){
181 - if(!lin.contains(to)){
182 - double score = relatedScore(cs,lin,lout,to);
183 - if(score != 0)
184 - pq.add(new Related(cs,to,score));
185 - }
186 - }*/
187 - if(pq.size() > 0){
188 - Collections.sort(pq,new Comparator<Related>() {
189 - public int compare(Related o1, Related o2){
190 - double d = o2.score-o1.score;
191 - if(d == 0) return 0;
192 - else if(d > 0) return 1;
193 - else return -1;
194 - }
195 - });
196 - System.out.println(cs.getKey()+" -> ");
197 - for(Related r : pq){
198 - System.out.println(" -> "+r.relates+" ("+r.score+")");
199 - if(r.scores != null){
200 - ArrayList<Entry<CompactArticleLinks, Double>> ss = new ArrayList<Entry<CompactArticleLinks, Double>>();
201 - ss.addAll(r.scores.entrySet());
202 - Collections.sort(ss,new Comparator<Entry<CompactArticleLinks, Double>>() {
203 - public int compare(Entry<CompactArticleLinks, Double> o1, Entry<CompactArticleLinks, Double> o2){
204 - double d = o2.getValue()-o1.getValue();
205 - if(d == 0) return 0;
206 - else if(d > 0) return 1;
207 - else return -1;
208 - }
209 - });
210 - for(Entry<CompactArticleLinks, Double> e : ss){
211 - System.out.println(" + "+e.getKey().toString()+" = "+e.getValue());
212 - }
213 - }
214 - }
215 - System.out.println();
216 - }
 167+ });
 168+ return ret;
 169+ }
 170+
 171+ /**
 172+ * Get related titles (RelatedTitle is used in Article)
 173+ */
 174+ public static ArrayList<RelatedTitle> getRelatedTitles(CompactArticleLinks cs, Links links){
 175+ ArrayList<Related> rel = getRelated(cs,links);
 176+ ArrayList<RelatedTitle> ret = new ArrayList<RelatedTitle>();
 177+ for(Related r : rel){
 178+ ret.add(new RelatedTitle(new Title(r.relates.toString()),r.score));
217179 }
 180+ return ret;
218181 }
219182
220183 public static double norm(double d){
@@ -223,37 +186,24 @@
224187 return d;
225188 }
226189
227 - public static Object[] relatedScore(CompactArticleLinks p, HashSet<CompactArticleLinks> ll, CompactArticleLinks q){
 190+ public static double relatedScore(CompactArticleLinks p, HashSet<CompactArticleLinks> ll, CompactArticleLinks q){
228191 double score = 0;
229 - //HashMap<CompactArticleLinks,Double> scores = new HashMap<CompactArticleLinks,Double>();
230 - //int links = q.links;
231 - // iterate the neighbourhood of q and see it they link to p
 192+ // all r that links to q
232193 for(int i=0;i<q.linksInIndex;i++){
233194 CompactArticleLinks r = q.linksIn[i];
234195 if(r != q && r.links != 0 && ll.contains(r)){
235 - //score += 1.0/(norm(q.links)*norm(r.links));
236196 score += 1.0/norm(r.links);
237 - //scores.put(r,1.0/norm(r.links));
238197 }
239198
240199 }
 200+ // all r that q links to
241201 for(int i=0;i<q.linksOutIndex;i++){
242202 CompactArticleLinks r = q.linksOut[i];
243203 if(r != q && r.links!=0 && ll.contains(r)){
244 - //score += 1.0/(norm(q.links)*norm(r.links));
245204 score += 1.0/norm(r.links);
246 - //scores.put(r,1.0/norm(r.links));
247205 }
248206 }
249 - // iterate neighbourhood of p and see if it links to q
250 - /*for(int i=0;i<p.linksInIndex;i++){
251 - CompactArticleLinks r = p.linksIn[i];
252 - if(q.hasLinkFrom(r))
253 - score += 1.0/(norm(q.links)*norm(r.links));
254 - } */
255 - //return score * (count / (double)(q.linksInIndex+q.linksOutIndex)) * q.links;
256 - //return score * q.links;
257 - return new Object[]{ score, null };
 207+ return score;
258208 }
259209
260210 private static String formatTime(long l) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RelatedTitle.java
@@ -0,0 +1,31 @@
 2+package org.wikimedia.lsearch.ranks;
 3+
 4+import org.wikimedia.lsearch.beans.Title;
 5+
 6+public class RelatedTitle {
 7+ protected Title related;
 8+ protected double score;
 9+
 10+ public RelatedTitle(Title related, double score) {
 11+ this.related = related;
 12+ this.score = score;
 13+ }
 14+ public Title getRelated() {
 15+ return related;
 16+ }
 17+ public void setRelated(Title related) {
 18+ this.related = related;
 19+ }
 20+ public double getScore() {
 21+ return score;
 22+ }
 23+ public void setScore(double score) {
 24+ this.score = score;
 25+ }
 26+ @Override
 27+ public String toString() {
 28+ return related.toString()+" ("+score+")";
 29+ }
 30+
 31+
 32+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java
@@ -133,22 +133,6 @@
134134 return h;
135135 }
136136
137 - /** Sort by hash value to be able to more quickly find an object */
138 - public void sortLinked(){
139 - if(linksIn == null)
140 - return;
141 - ArrayList<CompactArticleLinks> l = new ArrayList<CompactArticleLinks>();
142 - for(int i=0;i<linksInIndex;i++)
143 - l.add(linksIn[i]);
144 - Collections.sort(l,new Comparator<CompactArticleLinks>() {
145 - public int compare(CompactArticleLinks o1, CompactArticleLinks o2){
146 - return o1.hashCode() - o2.hashCode();
147 - }
148 - });
149 - linksIn = l.toArray(new CompactArticleLinks[] {});
150 - linksInIndex = linksIn.length;
151 - }
152 -
153137 /** Delete any excessive space in linksIn, linksOut */
154138 public void compact(){
155139 CompactArticleLinks[] n;
Index: branches/lucene-search-2.1/sql/references_table.sql
@@ -1,15 +0,0 @@
2 -CREATE TABLE /*DBprefix*/references (
3 - -- key in form <ns>:<title>
4 - rf_key varchar(255) binary NOT NULL,
5 -
6 - -- number of page links to this page
7 - rf_references int(10) unsigned NOT NULL,
8 -
9 - --
10 - PRIMARY KEY rf_key(rf_key)
11 -
12 -) TYPE=InnoDB;
13 -
Index: branches/lucene-search-2.1/sql/page_table.sql
@@ -0,0 +1,17 @@
 2+--
 3+-- Table with cached information about pages and references to id
 4+--
 5+CREATE TABLE /*DBprefix*/page (
 6+ page_id int unsigned auto_increment NOT NULL,
 7+
 8+ -- key in form <ns>:<title>
 9+ page_key varchar(255) binary NOT NULL,
 10+
 11+ -- number of page links to this page
 12+ page_references int unsigned NOT NULL,
 13+
 14+ --
 15+ PRIMARY KEY page_id(page_id),
 16+ UNIQUE (page_key)
 17+
 18+) TYPE=InnoDB;
\ No newline at end of file
Index: branches/lucene-search-2.1/sql/related_table.sql
@@ -0,0 +1,18 @@
 2+--
 3+-- Table with a mapping of related articles
 4+--
 5+CREATE TABLE /*DBprefix*/related (
 6+ -- key of article (fk to page_id)
 7+ rel_to int unsigned NOT NULL,
 8+
 9+ -- article which links to rel_id (fk to page_id)
 10+ rel_related int unsigned NOT NULL,
 11+
 12+ -- the strength of relatedness, positive
 13+ rel_score float(23) NOT NULL,
 14+
 15+ --
 16+ PRIMARY KEY re_relates(rel_to,rel_related),
 17+ INDEX (rel_to)
 18+
 19+) TYPE=InnoDB;
\ No newline at end of file

Status & tagging log