Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java |
— | — | @@ -25,6 +25,7 @@ |
26 | 26 | import org.wikimedia.lsearch.config.IndexId; |
27 | 27 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
28 | 28 | import org.wikimedia.lsearch.ranks.Links; |
| 29 | +import org.wikimedia.lsearch.ranks.RelatedTitle; |
29 | 30 | import org.wikimedia.lsearch.util.Localization; |
30 | 31 | |
31 | 32 | /** |
— | — | @@ -54,8 +55,9 @@ |
55 | 56 | public void writeEndPage() throws IOException { |
56 | 57 | ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
57 | 58 | boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode) != null; |
| 59 | + ArrayList<RelatedTitle> related = new ArrayList<RelatedTitle>(); |
58 | 60 | // make article |
59 | | - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects); |
| 61 | + Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects,related); |
60 | 62 | //if(page.Title.Namespace != 0) |
61 | 63 | // article.setContents(""); |
62 | 64 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java |
— | — | @@ -16,6 +16,14 @@ |
17 | 17 | this.title = title; |
18 | 18 | } |
19 | 19 | |
| 20 | + public Title(String key){ |
| 21 | + String[] parts = key.split(":",2); |
| 22 | + if(parts.length != 2) |
| 23 | + throw new RuntimeException("Wrong key format in Title constructor"); |
| 24 | + this.namespace = Integer.parseInt(parts[0]); |
| 25 | + this.title = parts[1]; |
| 26 | + } |
| 27 | + |
20 | 28 | public String getKey(){ |
21 | 29 | return namespace+":"+title; |
22 | 30 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -28,6 +28,8 @@ |
29 | 29 | import java.util.ArrayList; |
30 | 30 | import java.util.Collection; |
31 | 31 | |
| 32 | +import org.wikimedia.lsearch.ranks.RelatedTitle; |
| 33 | + |
32 | 34 | /** |
33 | 35 | * Wiki article. |
34 | 36 | * |
— | — | @@ -47,6 +49,8 @@ |
48 | 50 | private transient ArrayList<Integer> redirectKeywordRanks; |
49 | 51 | /** generated before indexing from the reference sto this article, and references from redirects */ |
50 | 52 | private transient int rank; |
| 53 | + /** names of articles that relate to this article */ |
| 54 | + private ArrayList<RelatedTitle> related; |
51 | 55 | |
52 | 56 | public Article(){ |
53 | 57 | namespace=""; |
— | — | @@ -56,6 +60,7 @@ |
57 | 61 | redirect=false; |
58 | 62 | references = 0; |
59 | 63 | redirects=new ArrayList<Redirect>(); |
| 64 | + related = new ArrayList<RelatedTitle>(); |
60 | 65 | } |
61 | 66 | |
62 | 67 | public Article(long pageId, Title title, String text, boolean redirect, int references) { |
— | — | @@ -66,6 +71,7 @@ |
67 | 72 | this.redirect = redirect; |
68 | 73 | this.references = references; |
69 | 74 | this.redirects = new ArrayList<Redirect>(); |
| 75 | + this.related = new ArrayList<RelatedTitle>(); |
70 | 76 | } |
71 | 77 | |
72 | 78 | public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references) { |
— | — | @@ -76,9 +82,10 @@ |
77 | 83 | this.pageId = pageId; |
78 | 84 | this.references = references; |
79 | 85 | this.redirects = new ArrayList<Redirect>(); |
| 86 | + this.related = new ArrayList<RelatedTitle>(); |
80 | 87 | } |
81 | 88 | |
82 | | - public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references, ArrayList<Redirect> redirects) { |
| 89 | + public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references, ArrayList<Redirect> redirects, ArrayList<RelatedTitle> related) { |
83 | 90 | this.namespace = Integer.toString(namespace); |
84 | 91 | this.title = titleText; |
85 | 92 | contents = text; |
— | — | @@ -86,6 +93,7 @@ |
87 | 94 | this.pageId = pageId; |
88 | 95 | this.references = references; |
89 | 96 | this.redirects = redirects; |
| 97 | + this.related = related; |
90 | 98 | } |
91 | 99 | |
92 | 100 | public boolean isRedirect() { |
— | — | @@ -200,9 +208,19 @@ |
201 | 209 | |
202 | 210 | public void setContents(String contents) { |
203 | 211 | this.contents = contents; |
| 212 | + } |
| 213 | + |
| 214 | + public ArrayList<RelatedTitle> getRelated() { |
| 215 | + return related; |
| 216 | + } |
| 217 | + |
| 218 | + public void setRelated(ArrayList<RelatedTitle> related) { |
| 219 | + this.related = related; |
204 | 220 | } |
205 | 221 | |
206 | 222 | |
207 | 223 | |
208 | 224 | |
| 225 | + |
| 226 | + |
209 | 227 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -16,6 +16,7 @@ |
17 | 17 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
18 | 18 | import org.wikimedia.lsearch.config.IndexId; |
19 | 19 | import org.wikimedia.lsearch.index.IndexThread; |
| 20 | +import org.wikimedia.lsearch.ranks.LinkReader; |
20 | 21 | import org.wikimedia.lsearch.ranks.Links; |
21 | 22 | import org.wikimedia.lsearch.ranks.RankBuilder; |
22 | 23 | import org.wikimedia.lsearch.storage.Storage; |
— | — | @@ -102,8 +103,8 @@ |
103 | 104 | long start = System.currentTimeMillis(); |
104 | 105 | |
105 | 106 | // regenerate link and redirect information |
106 | | - Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS); |
107 | | - |
| 107 | + Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,LinkReader.READ_REDIRECTS); |
| 108 | + |
108 | 109 | if(updateReferences){ |
109 | 110 | try { |
110 | 111 | Storage.getInstance().storePageReferences(links.getAll(),dbname); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -22,6 +22,8 @@ |
23 | 23 | import org.wikimedia.lsearch.config.IndexId; |
24 | 24 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
25 | 25 | import org.wikimedia.lsearch.ranks.Links; |
| 26 | +import org.wikimedia.lsearch.ranks.RankBuilder; |
| 27 | +import org.wikimedia.lsearch.ranks.RelatedTitle; |
26 | 28 | import org.wikimedia.lsearch.util.Localization; |
27 | 29 | |
28 | 30 | public class DumpImporter implements DumpWriter { |
— | — | @@ -30,7 +32,7 @@ |
31 | 33 | Revision revision; |
32 | 34 | SimpleIndexWriter writer; |
33 | 35 | int count = 0, limit; |
34 | | - Links ranks; |
| 36 | + Links links; |
35 | 37 | String langCode; |
36 | 38 | |
37 | 39 | public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor, |
— | — | @@ -38,7 +40,7 @@ |
39 | 41 | Configuration.open(); // make sure configuration is loaded |
40 | 42 | writer = new SimpleIndexWriter(IndexId.get(dbname), optimize, mergeFactor, maxBufDocs, newIndex); |
41 | 43 | this.limit = limit; |
42 | | - this.ranks = ranks; |
| 44 | + this.links = ranks; |
43 | 45 | this.langCode = langCode; |
44 | 46 | } |
45 | 47 | public void writeRevision(Revision revision) throws IOException { |
— | — | @@ -50,7 +52,7 @@ |
51 | 53 | public void writeEndPage() throws IOException { |
52 | 54 | // get reference count |
53 | 55 | String key = page.Title.Namespace+":"+page.Title.Text; |
54 | | - CompactArticleLinks r = ranks.get(key); |
| 56 | + CompactArticleLinks r = links.get(key); |
55 | 57 | int references; |
56 | 58 | boolean isRedirect = r.redirectsTo != null; |
57 | 59 | if(r == null){ |
— | — | @@ -65,9 +67,11 @@ |
66 | 68 | String[] parts = rk.toString().split(":",2); |
67 | 69 | redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],rk.links)); |
68 | 70 | } |
69 | | - } |
| 71 | + } |
| 72 | + ArrayList<RelatedTitle> related = RankBuilder.getRelatedTitles(r,links); |
70 | 73 | // make article |
71 | | - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,references,redirects); |
| 74 | + Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect, |
| 75 | + references,redirects,related); |
72 | 76 | writer.addArticle(article); |
73 | 77 | count++; |
74 | 78 | if(limit >= 0 && count > limit) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/Storage.java |
— | — | @@ -1,10 +1,16 @@ |
2 | 2 | package org.wikimedia.lsearch.storage; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
5 | 6 | import java.util.Collection; |
| 7 | +import java.util.HashMap; |
| 8 | +import java.util.List; |
| 9 | +import java.util.Map.Entry; |
6 | 10 | |
7 | 11 | import org.wikimedia.lsearch.beans.Title; |
8 | 12 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 13 | +import org.wikimedia.lsearch.ranks.Related; |
| 14 | +import org.wikimedia.lsearch.ranks.RelatedTitle; |
9 | 15 | |
10 | 16 | abstract public class Storage { |
11 | 17 | static protected Storage instance = null; |
— | — | @@ -25,5 +31,15 @@ |
26 | 32 | * Fetch page references for number of titles |
27 | 33 | */ |
28 | 34 | abstract public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException; |
| 35 | + |
| 36 | + /** |
| 37 | + * Store some related mappings |
| 38 | + */ |
| 39 | + abstract public void storeRelatedPages(Collection<Related> related, String dbname) throws IOException; |
29 | 40 | |
| 41 | + /** |
| 42 | + * Get related mapping for a collection of titles |
| 43 | + */ |
| 44 | + abstract public HashMap<Title,ArrayList<RelatedTitle>> getRelatedPages(Collection<Title> titles, String dbname) throws IOException; |
| 45 | + |
30 | 46 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/MySQLStorage.java |
— | — | @@ -1,7 +1,6 @@ |
2 | 2 | package org.wikimedia.lsearch.storage; |
3 | 3 | |
4 | 4 | import java.io.BufferedReader; |
5 | | -import java.io.FileNotFoundException; |
6 | 5 | import java.io.FileReader; |
7 | 6 | import java.io.IOException; |
8 | 7 | import java.sql.Connection; |
— | — | @@ -11,14 +10,19 @@ |
12 | 11 | import java.sql.Statement; |
13 | 12 | import java.util.ArrayList; |
14 | 13 | import java.util.Collection; |
| 14 | +import java.util.HashMap; |
| 15 | +import java.util.HashSet; |
15 | 16 | import java.util.Hashtable; |
16 | 17 | import java.util.Iterator; |
| 18 | +import java.util.List; |
17 | 19 | import java.util.Map.Entry; |
18 | 20 | |
19 | 21 | import org.apache.log4j.Logger; |
20 | 22 | import org.wikimedia.lsearch.beans.Title; |
21 | 23 | import org.wikimedia.lsearch.config.Configuration; |
22 | 24 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 25 | +import org.wikimedia.lsearch.ranks.Related; |
| 26 | +import org.wikimedia.lsearch.ranks.RelatedTitle; |
23 | 27 | |
24 | 28 | /** |
25 | 29 | * MySQL storage backend |
— | — | @@ -162,14 +166,14 @@ |
163 | 167 | |
164 | 168 | // inherit javadoc |
165 | 169 | public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException { |
166 | | - String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE "; |
| 170 | + String sql = "SELECT page_key, page_references from "+getTableName("page",dbname)+" WHERE "; |
167 | 171 | if(titles == null || titles.size()==0) |
168 | 172 | return new ArrayList<CompactArticleLinks>(); |
169 | 173 | else if(titles.size()==1){ |
170 | | - sql += "rf_key="+quote(escape(titles.iterator().next().getKey())); |
| 174 | + sql += "page_key="+quote(escape(titles.iterator().next().getKey())); |
171 | 175 | } else{ |
172 | 176 | StringBuilder sb = new StringBuilder(sql); |
173 | | - sb.append("rf_key IN ("); |
| 177 | + sb.append("page_key IN ("); |
174 | 178 | Iterator<Title> it = titles.iterator(); |
175 | 179 | while(it.hasNext()){ |
176 | 180 | sb.append('\''); |
— | — | @@ -178,7 +182,7 @@ |
179 | 183 | if(it.hasNext()) |
180 | 184 | sb.append(','); |
181 | 185 | } |
182 | | - sb.append(")"); |
| 186 | + sb.append(");"); |
183 | 187 | sql = sb.toString(); |
184 | 188 | } |
185 | 189 | try { |
— | — | @@ -188,7 +192,7 @@ |
189 | 193 | ResultSet res = stmt.executeQuery(sql); |
190 | 194 | ArrayList<CompactArticleLinks> ret = new ArrayList<CompactArticleLinks>(); |
191 | 195 | while(res.next()){ |
192 | | - ret.add(new CompactArticleLinks(res.getString("rf_key"),res.getInt("rf_references"))); |
| 196 | + ret.add(new CompactArticleLinks(res.getString("page_key"),res.getInt("page_references"))); |
193 | 197 | } |
194 | 198 | conn.close(); |
195 | 199 | return ret; |
— | — | @@ -202,11 +206,11 @@ |
203 | 207 | public void storePageReferences(Collection<CompactArticleLinks> refs, String dbname) throws IOException { |
204 | 208 | final int maxPerQuery = 10000; |
205 | 209 | Connection conn = getWriteConnection(dbname); |
206 | | - verifyTable("references",dbname,conn); |
| 210 | + verifyTable("page",dbname,conn); |
207 | 211 | Iterator<CompactArticleLinks> it = refs.iterator(); |
208 | 212 | // send chunks of maxPerQuery referenace replacements |
209 | 213 | while(it.hasNext()){ |
210 | | - StringBuilder sb = new StringBuilder("REPLACE INTO "+getTableName("references",dbname)+" (rf_key,rf_references) VALUES "); |
| 214 | + StringBuilder sb = new StringBuilder("INSERT INTO "+getTableName("page",dbname)+" (page_key,page_references) VALUES "); |
211 | 215 | int count = 0; |
212 | 216 | while(it.hasNext() && count < maxPerQuery){ |
213 | 217 | CompactArticleLinks cs = it.next(); |
— | — | @@ -218,7 +222,7 @@ |
219 | 223 | if(it.hasNext() && count<maxPerQuery) |
220 | 224 | sb.append("'), "); |
221 | 225 | else |
222 | | - sb.append("');"); |
| 226 | + sb.append("') ON DUPLICATE KEY UPDATE page_references=VALUES(page_references);"); |
223 | 227 | } |
224 | 228 | try { |
225 | 229 | log.info("Storing "+Math.min(maxPerQuery,count)+" page ranks... "); |
— | — | @@ -283,5 +287,135 @@ |
284 | 288 | log.error("Cannot create table "+table+" : "+e.getMessage()); |
285 | 289 | throw new IOException(e.getMessage()); |
286 | 290 | } |
| 291 | + } |
| 292 | + |
| 293 | + @Override |
| 294 | + public HashMap<Title, ArrayList<RelatedTitle>> getRelatedPages(Collection<Title> titles, String dbname) throws IOException { |
| 295 | + String page = getTableName("page",dbname); |
| 296 | + String related = getTableName("related",dbname); |
| 297 | + String sql = "SELECT a.page_key as 'r_to', b.page_key as 'r_related', rel_score FROM "+related+", "+page+" a, "+page+" b WHERE rel_to=a.page_id AND rel_related=b.page_id AND "; |
| 298 | + if(titles == null || titles.size()==0) |
| 299 | + return new HashMap<Title, ArrayList<RelatedTitle>>(); |
| 300 | + else if(titles.size()==1){ |
| 301 | + sql += "a.page_key="+quote(escape(titles.iterator().next().getKey())); |
| 302 | + } else{ |
| 303 | + StringBuilder sb = new StringBuilder(sql); |
| 304 | + sb.append("a.page_key IN ("); |
| 305 | + Iterator<Title> it = titles.iterator(); |
| 306 | + while(it.hasNext()){ |
| 307 | + sb.append('\''); |
| 308 | + sb.append(escape(it.next().getKey())); |
| 309 | + sb.append('\''); |
| 310 | + if(it.hasNext()) |
| 311 | + sb.append(','); |
| 312 | + } |
| 313 | + sb.append(");"); |
| 314 | + sql = sb.toString(); |
| 315 | + } |
| 316 | + try { |
| 317 | + Connection conn = getReadConnection(dbname); |
| 318 | + log.info("Fetching related info for "+titles.size()+" articles"); |
| 319 | + Statement stmt = conn.createStatement(); |
| 320 | + ResultSet res = stmt.executeQuery(sql); |
| 321 | + HashMap<Title, ArrayList<RelatedTitle>> ret = new HashMap<Title, ArrayList<RelatedTitle>>(); |
| 322 | + while(res.next()){ |
| 323 | + Title t1 = new Title(res.getString("r_to")); |
| 324 | + Title t2 = new Title(res.getString("r_related")); |
| 325 | + double score = res.getDouble("rel_score"); |
| 326 | + ArrayList<RelatedTitle> rel = ret.get(t1); |
| 327 | + if(rel == null){ |
| 328 | + rel = new ArrayList<RelatedTitle>(); |
| 329 | + ret.put(t1,rel); |
| 330 | + } |
| 331 | + rel.add(new RelatedTitle(t2,score)); |
| 332 | + } |
| 333 | + conn.close(); |
| 334 | + return ret; |
| 335 | + } catch (SQLException e) { |
| 336 | + log.error("Cannot execute sql "+sql+" : "+e.getMessage()); |
| 337 | + throw new IOException(e.getMessage()); |
| 338 | + } |
| 339 | + } |
| 340 | + |
| 341 | + protected HashMap<String,Integer> getPageIDs(Collection<String> keys, String dbname, Connection conn) throws IOException{ |
| 342 | + String sql = "SELECT page_key, page_id from "+getTableName("page",dbname)+" WHERE "; |
| 343 | + if(keys.size()==1){ |
| 344 | + sql += "page_key="+quote(escape(keys.iterator().next())); |
| 345 | + } else{ |
| 346 | + StringBuilder sb = new StringBuilder(sql); |
| 347 | + sb.append("page_key IN ("); |
| 348 | + Iterator<String> it = keys.iterator(); |
| 349 | + while(it.hasNext()){ |
| 350 | + sb.append('\''); |
| 351 | + sb.append(escape(it.next())); |
| 352 | + sb.append('\''); |
| 353 | + if(it.hasNext()) |
| 354 | + sb.append(','); |
| 355 | + } |
| 356 | + sb.append(");"); |
| 357 | + sql = sb.toString(); |
| 358 | + } |
| 359 | + try { |
| 360 | + log.info("Fetching page ids for "+keys.size()+" pages"); |
| 361 | + Statement stmt = conn.createStatement(); |
| 362 | + ResultSet res = stmt.executeQuery(sql); |
| 363 | + HashMap<String,Integer> map = new HashMap<String,Integer>(); |
| 364 | + while(res.next()){ |
| 365 | + map.put(res.getString("page_key"),res.getInt("page_id")); |
| 366 | + } |
| 367 | + return map; |
| 368 | + } catch (SQLException e) { |
| 369 | + log.error("Cannot execute sql "+sql+" : "+e.getMessage()); |
| 370 | + throw new IOException(e.getMessage()); |
| 371 | + } |
| 372 | + } |
| 373 | + |
| 374 | + @Override |
| 375 | + public void storeRelatedPages(Collection<Related> related, String dbname) throws IOException { |
| 376 | + Connection read = getReadConnection(dbname); |
| 377 | + HashSet<String> keys = new HashSet<String>(); |
| 378 | + for(Related r : related){ |
| 379 | + keys.add(r.getTitle().toString()); |
| 380 | + keys.add(r.getRelates().toString()); |
| 381 | + } |
| 382 | + HashMap<String,Integer> map = getPageIDs(keys,dbname,read); |
| 383 | + final int maxPerQuery = 20000; |
| 384 | + Connection write = getWriteConnection(dbname); |
| 385 | + verifyTable("related",dbname,write); |
| 386 | + Iterator<Related> it = related.iterator(); |
| 387 | + // send chunks of maxPerQuery referenace replacements |
| 388 | + while(it.hasNext()){ |
| 389 | + StringBuilder sb = new StringBuilder("INSERT INTO "+getTableName("related",dbname)+" (rel_to,rel_related,rel_score) VALUES "); |
| 390 | + int count = 0; |
| 391 | + while(it.hasNext() && count < maxPerQuery){ |
| 392 | + Related r = it.next(); |
| 393 | + sb.append("('"); |
| 394 | + sb.append(Integer.toString(map.get(r.getTitle().toString()))); |
| 395 | + sb.append("','"); |
| 396 | + sb.append(Integer.toString(map.get(r.getRelates().toString()))); |
| 397 | + sb.append("','"); |
| 398 | + sb.append(Double.toString(r.getScore())); |
| 399 | + count++; |
| 400 | + if(it.hasNext() && count<maxPerQuery) |
| 401 | + sb.append("'), "); |
| 402 | + else |
| 403 | + sb.append("') ON DUPLICATE KEY UPDATE rel_score=VALUES(rel_score);"); |
| 404 | + } |
| 405 | + try { |
| 406 | + log.info("Storing "+Math.min(maxPerQuery,count)+" related pages... "); |
| 407 | + Statement stmt = write.createStatement(); |
| 408 | + stmt.executeUpdate(sb.toString()); |
| 409 | + |
| 410 | + } catch (SQLException e) { |
| 411 | + log.error("Cannot execute replace query "+sb+" : "+e.getMessage()); |
| 412 | + throw new IOException(e.getMessage()); |
| 413 | + } |
| 414 | + } |
| 415 | + try { |
| 416 | + write.close(); // be sure we close the connection |
| 417 | + read.close(); |
| 418 | + } catch (SQLException e) { |
| 419 | + } |
| 420 | + |
287 | 421 | } |
288 | 422 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -60,8 +60,10 @@ |
61 | 61 | } |
62 | 62 | |
63 | 63 | static public final int MAX_FIELD_LENGTH = 100000; |
64 | | - /** number of aditional title1, title2, .. etc fields to be filled in with redirects */ |
65 | | - static public int ALT_TITLES = 3; |
| 64 | + /** number of aditional alttitle1, alttitle2, .. etc fields to be filled in with redirects */ |
| 65 | + static public int ALT_TITLES = 3; |
| 66 | + /** number of related fields in the index, first has the top-scored, etc, last everything else */ |
| 67 | + static public int RELATED_GROUPS = 4; |
66 | 68 | /** Simple implementation of batch addition and deletion */ |
67 | 69 | class SimpleIndexModifier { |
68 | 70 | protected IndexId iid; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -16,6 +16,9 @@ |
17 | 17 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
18 | 18 | import org.wikimedia.lsearch.config.IndexId; |
19 | 19 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
| 20 | +import org.wikimedia.lsearch.ranks.RankBuilder; |
| 21 | +import org.wikimedia.lsearch.ranks.Related; |
| 22 | +import org.wikimedia.lsearch.ranks.RelatedTitle; |
20 | 23 | import org.wikimedia.lsearch.util.Localization; |
21 | 24 | |
22 | 25 | public class IndexUpdatesCollector implements DumpWriter { |
— | — | @@ -65,7 +68,8 @@ |
66 | 69 | this.page = page; |
67 | 70 | } |
68 | 71 | public void writeEndPage() throws IOException { |
69 | | - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(),references,redirects); |
| 72 | + Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(), |
| 73 | + references,redirects,new ArrayList<RelatedTitle>()); // references and related titles are set correctly later (in incremental updater) |
70 | 74 | log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); |
71 | 75 | records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE)); |
72 | 76 | log.debug(iid+": Update for "+article); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java |
— | — | @@ -10,8 +10,12 @@ |
11 | 11 | import java.net.Authenticator; |
12 | 12 | import java.net.PasswordAuthentication; |
13 | 13 | import java.util.ArrayList; |
| 14 | +import java.util.Collection; |
| 15 | +import java.util.Collections; |
| 16 | +import java.util.Comparator; |
14 | 17 | import java.util.HashMap; |
15 | 18 | import java.util.HashSet; |
| 19 | +import java.util.List; |
16 | 20 | import java.util.Properties; |
17 | 21 | |
18 | 22 | import org.apache.log4j.Logger; |
— | — | @@ -23,7 +27,10 @@ |
24 | 28 | import org.wikimedia.lsearch.config.IndexId; |
25 | 29 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
26 | 30 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 31 | +import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
27 | 32 | import org.wikimedia.lsearch.ranks.Links; |
| 33 | +import org.wikimedia.lsearch.ranks.Related; |
| 34 | +import org.wikimedia.lsearch.ranks.RelatedTitle; |
28 | 35 | import org.wikimedia.lsearch.storage.Storage; |
29 | 36 | import org.wikimedia.lsearch.util.Localization; |
30 | 37 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
— | — | @@ -185,7 +192,7 @@ |
186 | 193 | if(fetchReferences){ |
187 | 194 | try{ |
188 | 195 | // fetch references for records |
189 | | - fetchReferences(records,dbname); |
| 196 | + fetchReferencesAndRelated(records,dbname); |
190 | 197 | } catch(IOException e){ |
191 | 198 | // FIXME: quick hack, if the table cannot be found (e.g. for new wikis) don't abort |
192 | 199 | if(e.getMessage().contains("Base table or view not found")){ |
— | — | @@ -276,7 +283,7 @@ |
277 | 284 | } while(daemon); |
278 | 285 | } |
279 | 286 | |
280 | | - protected static void fetchReferences(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException { |
| 287 | + protected static void fetchReferencesAndRelated(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException { |
281 | 288 | Storage store = Storage.getInstance(); |
282 | 289 | ArrayList<Title> titles = new ArrayList<Title>(); |
283 | 290 | for(IndexUpdateRecord rec : records){ |
— | — | @@ -292,17 +299,33 @@ |
293 | 300 | } |
294 | 301 | // fetch |
295 | 302 | Links links = new Links(store.getPageReferences(titles,dbname)); |
| 303 | + HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname); |
296 | 304 | // update |
297 | 305 | for(IndexUpdateRecord rec : records){ |
298 | 306 | if(rec.isDelete()) |
299 | 307 | continue; |
300 | 308 | Article ar = rec.getArticle(); |
301 | | - ar.setReferences(links.getLinks(ar.makeTitle().getKey())); |
| 309 | + Title t = ar.makeTitle(); |
| 310 | + // set references |
| 311 | + ar.setReferences(links.getLinks(t.getKey())); |
302 | 312 | if(ar.getRedirects() != null){ |
303 | 313 | for(Redirect r : ar.getRedirects()){ |
304 | 314 | r.setReferences(links.getLinks(r.makeTitle().getKey())); |
305 | 315 | } |
306 | | - } |
| 316 | + } |
| 317 | + // set related |
| 318 | + ArrayList<RelatedTitle> rt = rel.get(t.getKey()); |
| 319 | + if(rt != null){ |
| 320 | + Collections.sort(rt,new Comparator<RelatedTitle>() { |
| 321 | + public int compare(RelatedTitle o1, RelatedTitle o2){ |
| 322 | + double d = o2.getScore()-o1.getScore(); |
| 323 | + if(d == 0) return 0; |
| 324 | + else if(d > 0) return 1; |
| 325 | + else return -1; |
| 326 | + } |
| 327 | + }); |
| 328 | + ar.setRelated(rt); |
| 329 | + } |
307 | 330 | } |
308 | 331 | } |
309 | 332 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/MathFunc.java |
— | — | @@ -0,0 +1,103 @@ |
| 2 | +package org.wikimedia.lsearch.util; |
| 3 | + |
| 4 | +public class MathFunc { |
| 5 | + |
| 6 | + /** Calculate average value starting from start to end (end excluded) */ |
| 7 | + public static double avg(double[] val, int start, int end){ |
| 8 | + double s = 0; |
| 9 | + for(int i=start;i<end;i++) |
| 10 | + s+=val[i]; |
| 11 | + return s/(end-start); |
| 12 | + } |
| 13 | + |
| 14 | + /** |
| 15 | + * Approximate the graph of function with num horizontal lines |
| 16 | + * (const functions), so to minimize the maximal deviation |
| 17 | + * @return list of discontinuities (begin points of horizontal lines) |
| 18 | + */ |
| 19 | + public static int[] partitionList(double[] val, int num){ |
| 20 | + int[] part = new int[num+1]; // point of discontinuity |
| 21 | + //double[] av = new double[num]; // approximate (average over included points) |
| 22 | + // make first num-1 segments have length 1 |
| 23 | + for(int i=0;i<num;i++) |
| 24 | + part[i]=i; |
| 25 | + part[num] = val.length; // last point is end of values |
| 26 | + //for(int i=0;i<num;i++) |
| 27 | +// av[i] = avg(val,part[i],part[i+1]); |
| 28 | + // error |
| 29 | + double err = calcErr(part,val,num); |
| 30 | + // values at next iteration |
| 31 | + int[] newpart = new int[num+1]; |
| 32 | + //double[] newav = new double[num]; |
| 33 | + double newerr = 0; |
| 34 | + |
| 35 | + while(true){ |
| 36 | + for(int i=0;i<num-1;i++){ |
| 37 | + merge(i,part,newpart,val,num); |
| 38 | + newerr = calcErr(newpart,val,num); |
| 39 | + if(newerr < err){ |
| 40 | + copy(newpart,part); |
| 41 | + err = newerr; |
| 42 | + continue; |
| 43 | + } |
| 44 | + } |
| 45 | + // try extending last |
| 46 | + extend(part,newpart,val,num); |
| 47 | + newerr = calcErr(newpart,val,num); |
| 48 | + if(newerr < err){ |
| 49 | + copy(newpart,part); |
| 50 | + err = newerr; |
| 51 | + continue; |
| 52 | + } |
| 53 | + break; |
| 54 | + |
| 55 | + } |
| 56 | + |
| 57 | + |
| 58 | + return part; |
| 59 | + } |
| 60 | + |
| 61 | + private static void extend(int[] part, int[] newpart, double[] val, int num) { |
| 62 | + for(int j=0;j<num;j++) |
| 63 | + newpart[j] = part[j]; |
| 64 | + newpart[num-1] = part[num-1]+1; |
| 65 | + newpart[num] = part[num]; |
| 66 | + /*for(int j=0;j<num;j++) |
| 67 | + newav[j] = newav[j]; |
| 68 | + newav[num-1] = avg(val,newpart[num-1],newpart[num]); */ |
| 69 | + } |
| 70 | + |
| 71 | + private static void copy(int[] newpart, int[] part) { |
| 72 | + for(int i=0;i<newpart.length;i++) |
| 73 | + part[i] = newpart[i]; |
| 74 | + /*for(int i=0;i<newav.length;i++) |
| 75 | + av[i] = newav[i]; */ |
| 76 | + |
| 77 | + } |
| 78 | + |
| 79 | + /** merge i to i+1, create one new part at the end */ |
| 80 | + private static void merge(int i, int[] part, int[] newpart, double[] val, int num) { |
| 81 | + for(int j=0;j<=i;j++) |
| 82 | + newpart[j] = part[j]; |
| 83 | + for(int j=i+1;j<num-1;j++) |
| 84 | + newpart[j] = part[j+1]; |
| 85 | + newpart[num-1] = part[num-1]+1; |
| 86 | + newpart[num] = part[num]; |
| 87 | + // update avg |
| 88 | + /*for(int j=0;j<i;j++) |
| 89 | + newavg[j] = avg[j]; |
| 90 | + for(int j=i;j<num;j++) |
| 91 | + newavg[j] = avg(val,newpart[j],newpart[j+1]); */ |
| 92 | + } |
| 93 | + |
| 94 | + private static double calcErr(int[] part, double[] val, int num) { |
| 95 | + double err = 0; |
| 96 | + for(int i=0;i<num;i++){ |
| 97 | + // max - min value |
| 98 | + double e = val[part[i]]-val[part[i+1]-1]; |
| 99 | + if( e > err ) |
| 100 | + err = e; |
| 101 | + } |
| 102 | + return err; |
| 103 | + } |
| 104 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java |
— | — | @@ -60,17 +60,16 @@ |
61 | 61 | } |
62 | 62 | public void writeEndPage() throws IOException { |
63 | 63 | CompactArticleLinks p = links.get(page.Title.Namespace+":"+page.Title.Text); |
64 | | - if(readRedirects){ |
65 | | - // register redirect |
66 | | - Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
67 | | - if( redirect !=null ){ |
68 | | - CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
69 | | - if(cs != null) |
70 | | - links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs); |
71 | | - return; |
| 64 | + // register redirect |
| 65 | + Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
| 66 | + if(redirect != null && readRedirects){ |
| 67 | + CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
| 68 | + if(cs != null){ |
| 69 | + links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs); |
72 | 70 | } |
| 71 | + } else if(redirect == null){ // process only non-redirects |
| 72 | + processLinks(p,revision.Text,page.Title.Namespace); |
73 | 73 | } |
74 | | - processLinks(p,revision.Text,page.Title.Namespace); |
75 | 74 | } |
76 | 75 | |
77 | 76 | /** Find the links object for the ns:title key */ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -72,12 +72,7 @@ |
73 | 73 | } |
74 | 74 | } |
75 | 75 | } |
76 | | - /** Sort "what links here" list for each article */ |
77 | | - public void sortLinked(){ |
78 | | - for(CompactArticleLinks r : links.values()){ |
79 | | - r.sortLinked(); |
80 | | - } |
81 | | - } |
| 76 | + |
82 | 77 | /** Delete any unnecessary allocated space */ |
83 | 78 | public void compactAll() { |
84 | 79 | for(CompactArticleLinks r : links.values()){ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Related.java |
— | — | @@ -0,0 +1,37 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +public class Related { |
| 5 | + protected CompactArticleLinks title; |
| 6 | + protected CompactArticleLinks relates; |
| 7 | + protected double score; |
| 8 | + public Related(CompactArticleLinks title, CompactArticleLinks relates, double score) { |
| 9 | + this.title = title; |
| 10 | + this.relates = relates; |
| 11 | + this.score = score; |
| 12 | + } |
| 13 | + @Override |
| 14 | + public String toString() { |
| 15 | + return title+"->"+relates+" : "+score; |
| 16 | + } |
| 17 | + public CompactArticleLinks getRelates() { |
| 18 | + return relates; |
| 19 | + } |
| 20 | + public void setRelates(CompactArticleLinks relates) { |
| 21 | + this.relates = relates; |
| 22 | + } |
| 23 | + public double getScore() { |
| 24 | + return score; |
| 25 | + } |
| 26 | + public void setScore(double score) { |
| 27 | + this.score = score; |
| 28 | + } |
| 29 | + public CompactArticleLinks getTitle() { |
| 30 | + return title; |
| 31 | + } |
| 32 | + public void setTitle(CompactArticleLinks title) { |
| 33 | + this.title = title; |
| 34 | + } |
| 35 | + |
| 36 | + |
| 37 | + |
| 38 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java |
— | — | @@ -13,10 +13,12 @@ |
14 | 14 | import java.util.Map.Entry; |
15 | 15 | |
16 | 16 | import org.apache.log4j.Logger; |
| 17 | +import org.apache.lucene.document.Field.Store; |
17 | 18 | import org.mediawiki.dumper.ProgressFilter; |
18 | 19 | import org.mediawiki.dumper.Tools; |
19 | 20 | import org.mediawiki.importer.XmlDumpReader; |
20 | 21 | import org.wikimedia.lsearch.beans.ArticleLinks; |
| 22 | +import org.wikimedia.lsearch.beans.Title; |
21 | 23 | import org.wikimedia.lsearch.config.Configuration; |
22 | 24 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
23 | 25 | import org.wikimedia.lsearch.config.IndexId; |
— | — | @@ -71,12 +73,8 @@ |
72 | 74 | Links links = processLinks(inputfile,getTitles(inputfile,langCode),langCode,LinkReader.NO_REDIRECTS); |
73 | 75 | links.compactAll(); |
74 | 76 | Storage store = Storage.getInstance(); |
75 | | - //store.storePageReferences(links.getAll(),dbname); |
76 | | - printRelated(links); |
77 | | - |
78 | | - /*for(CompactArticleLinks cs : links.values()){ |
79 | | - System.out.println(cs); |
80 | | - }*/ |
| 77 | + store.storePageReferences(links.getAll(),dbname); |
| 78 | + storeRelated(store,links,dbname); |
81 | 79 | |
82 | 80 | long end = System.currentTimeMillis(); |
83 | 81 | |
— | — | @@ -127,93 +125,58 @@ |
128 | 126 | return tr.getTitles(); |
129 | 127 | } |
130 | 128 | |
131 | | - static class Related { |
132 | | - CompactArticleLinks title; |
133 | | - CompactArticleLinks relates; |
134 | | - double score; |
135 | | - HashMap<CompactArticleLinks,Double> scores; |
136 | | - public Related(CompactArticleLinks title, CompactArticleLinks relates, double score, HashMap<CompactArticleLinks,Double> scores) { |
137 | | - this.title = title; |
138 | | - this.relates = relates; |
139 | | - this.score = score; |
140 | | - this.scores = scores; |
141 | | - } |
142 | | - @Override |
143 | | - public String toString() { |
144 | | - return title+"->"+relates+" : "+score; |
145 | | - } |
146 | | - } |
147 | | - |
148 | | - public static void printRelated(Links links){ |
| 129 | + public static void storeRelated(Storage store, Links links, String dbname) throws IOException{ |
149 | 130 | int num = 0; |
150 | 131 | int total = links.getAll().size(); |
| 132 | + ArrayList<Related> buf = new ArrayList<Related>(); |
151 | 133 | for(CompactArticleLinks cs : links.getAll()){ |
152 | 134 | num++; |
153 | | - |
154 | | - ArrayList<Related> pq = new ArrayList<Related>(); |
155 | | - HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>(); |
156 | | - //HashSet<CompactArticleLinks> lin = new HashSet<CompactArticleLinks>(); |
157 | | - //HashSet<CompactArticleLinks> lout = new HashSet<CompactArticleLinks>(); |
158 | | - System.out.println("["+num+"/"+total+" - "+cs.linksInIndex+"] "+cs.toString()); |
159 | | - if(cs.linksIn != null){ |
160 | | - for(CompactArticleLinks csl : cs.linksIn) |
161 | | - ll.add(csl); |
| 135 | + log.debug("["+num+"/"+total+" - "+cs.linksInIndex+"] "+cs.toString()); |
| 136 | + buf.addAll(getRelated(cs,links)); |
| 137 | + if(buf.size() > 10000){ |
| 138 | + store.storeRelatedPages(buf,dbname); |
| 139 | + buf.clear(); |
162 | 140 | } |
163 | | - /* if(cs.linksOut != null){ |
164 | | - for(CompactArticleLinks csl : cs.linksOut) |
165 | | - ll.add(csl); |
166 | | - } */ |
167 | | - if(cs.toString().equals("0:Douglas Adams")){ |
168 | | - int b = 01; |
169 | | - b++; |
| 141 | + } |
| 142 | + } |
| 143 | + |
| 144 | + /** |
| 145 | + * Get related articles, sorted descending by score |
| 146 | + */ |
| 147 | + public static ArrayList<Related> getRelated(CompactArticleLinks cs, Links links){ |
| 148 | + ArrayList<Related> ret = new ArrayList<Related>(); |
| 149 | + |
| 150 | + HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>(); |
| 151 | + if(cs.linksIn != null){ |
| 152 | + for(CompactArticleLinks csl : cs.linksIn) |
| 153 | + ll.add(csl); |
| 154 | + } |
| 155 | + for(CompactArticleLinks from : ll){ |
| 156 | + double score = relatedScore(cs,ll,from); |
| 157 | + if(score != 0) |
| 158 | + ret.add(new Related(cs,from,score)); |
| 159 | + } |
| 160 | + Collections.sort(ret,new Comparator<Related>() { |
| 161 | + public int compare(Related o1, Related o2){ |
| 162 | + double d = o2.score-o1.score; |
| 163 | + if(d == 0) return 0; |
| 164 | + else if(d > 0) return 1; |
| 165 | + else return -1; |
170 | 166 | } |
171 | | - for(CompactArticleLinks from : ll){ |
172 | | - //double score = relatedScore(cs,ll,from); |
173 | | - Object[] ret = relatedScore(cs,ll,from); |
174 | | - double score = (Double) ret[0]; |
175 | | - HashMap<CompactArticleLinks,Double> scores = (HashMap<CompactArticleLinks, Double>) ret[1]; |
176 | | - if(score != 0) |
177 | | - pq.add(new Related(cs,from,score,scores)); |
178 | | - |
179 | | - } |
180 | | - /*for(CompactArticleLinks to : lout){ |
181 | | - if(!lin.contains(to)){ |
182 | | - double score = relatedScore(cs,lin,lout,to); |
183 | | - if(score != 0) |
184 | | - pq.add(new Related(cs,to,score)); |
185 | | - } |
186 | | - }*/ |
187 | | - if(pq.size() > 0){ |
188 | | - Collections.sort(pq,new Comparator<Related>() { |
189 | | - public int compare(Related o1, Related o2){ |
190 | | - double d = o2.score-o1.score; |
191 | | - if(d == 0) return 0; |
192 | | - else if(d > 0) return 1; |
193 | | - else return -1; |
194 | | - } |
195 | | - }); |
196 | | - System.out.println(cs.getKey()+" -> "); |
197 | | - for(Related r : pq){ |
198 | | - System.out.println(" -> "+r.relates+" ("+r.score+")"); |
199 | | - if(r.scores != null){ |
200 | | - ArrayList<Entry<CompactArticleLinks, Double>> ss = new ArrayList<Entry<CompactArticleLinks, Double>>(); |
201 | | - ss.addAll(r.scores.entrySet()); |
202 | | - Collections.sort(ss,new Comparator<Entry<CompactArticleLinks, Double>>() { |
203 | | - public int compare(Entry<CompactArticleLinks, Double> o1, Entry<CompactArticleLinks, Double> o2){ |
204 | | - double d = o2.getValue()-o1.getValue(); |
205 | | - if(d == 0) return 0; |
206 | | - else if(d > 0) return 1; |
207 | | - else return -1; |
208 | | - } |
209 | | - }); |
210 | | - for(Entry<CompactArticleLinks, Double> e : ss){ |
211 | | - System.out.println(" + "+e.getKey().toString()+" = "+e.getValue()); |
212 | | - } |
213 | | - } |
214 | | - } |
215 | | - System.out.println(); |
216 | | - } |
| 167 | + }); |
| 168 | + return ret; |
| 169 | + } |
| 170 | + |
| 171 | + /** |
| 172 | + * Get related titles (RelatedTitle is used in Article) |
| 173 | + */ |
| 174 | + public static ArrayList<RelatedTitle> getRelatedTitles(CompactArticleLinks cs, Links links){ |
| 175 | + ArrayList<Related> rel = getRelated(cs,links); |
| 176 | + ArrayList<RelatedTitle> ret = new ArrayList<RelatedTitle>(); |
| 177 | + for(Related r : rel){ |
| 178 | + ret.add(new RelatedTitle(new Title(r.relates.toString()),r.score)); |
217 | 179 | } |
| 180 | + return ret; |
218 | 181 | } |
219 | 182 | |
220 | 183 | public static double norm(double d){ |
— | — | @@ -223,37 +186,24 @@ |
224 | 187 | return d; |
225 | 188 | } |
226 | 189 | |
227 | | - public static Object[] relatedScore(CompactArticleLinks p, HashSet<CompactArticleLinks> ll, CompactArticleLinks q){ |
| 190 | + public static double relatedScore(CompactArticleLinks p, HashSet<CompactArticleLinks> ll, CompactArticleLinks q){ |
228 | 191 | double score = 0; |
229 | | - //HashMap<CompactArticleLinks,Double> scores = new HashMap<CompactArticleLinks,Double>(); |
230 | | - //int links = q.links; |
231 | | - // iterate the neighbourhood of q and see it they link to p |
| 192 | + // all r that links to q |
232 | 193 | for(int i=0;i<q.linksInIndex;i++){ |
233 | 194 | CompactArticleLinks r = q.linksIn[i]; |
234 | 195 | if(r != q && r.links != 0 && ll.contains(r)){ |
235 | | - //score += 1.0/(norm(q.links)*norm(r.links)); |
236 | 196 | score += 1.0/norm(r.links); |
237 | | - //scores.put(r,1.0/norm(r.links)); |
238 | 197 | } |
239 | 198 | |
240 | 199 | } |
| 200 | + // all r that q links to |
241 | 201 | for(int i=0;i<q.linksOutIndex;i++){ |
242 | 202 | CompactArticleLinks r = q.linksOut[i]; |
243 | 203 | if(r != q && r.links!=0 && ll.contains(r)){ |
244 | | - //score += 1.0/(norm(q.links)*norm(r.links)); |
245 | 204 | score += 1.0/norm(r.links); |
246 | | - //scores.put(r,1.0/norm(r.links)); |
247 | 205 | } |
248 | 206 | } |
249 | | - // iterate neighbourhood of p and see if it links to q |
250 | | - /*for(int i=0;i<p.linksInIndex;i++){ |
251 | | - CompactArticleLinks r = p.linksIn[i]; |
252 | | - if(q.hasLinkFrom(r)) |
253 | | - score += 1.0/(norm(q.links)*norm(r.links)); |
254 | | - } */ |
255 | | - //return score * (count / (double)(q.linksInIndex+q.linksOutIndex)) * q.links; |
256 | | - //return score * q.links; |
257 | | - return new Object[]{ score, null }; |
| 207 | + return score; |
258 | 208 | } |
259 | 209 | |
260 | 210 | private static String formatTime(long l) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RelatedTitle.java |
— | — | @@ -0,0 +1,31 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import org.wikimedia.lsearch.beans.Title; |
| 5 | + |
| 6 | +public class RelatedTitle { |
| 7 | + protected Title related; |
| 8 | + protected double score; |
| 9 | + |
| 10 | + public RelatedTitle(Title related, double score) { |
| 11 | + this.related = related; |
| 12 | + this.score = score; |
| 13 | + } |
| 14 | + public Title getRelated() { |
| 15 | + return related; |
| 16 | + } |
| 17 | + public void setRelated(Title related) { |
| 18 | + this.related = related; |
| 19 | + } |
| 20 | + public double getScore() { |
| 21 | + return score; |
| 22 | + } |
| 23 | + public void setScore(double score) { |
| 24 | + this.score = score; |
| 25 | + } |
| 26 | + @Override |
| 27 | + public String toString() { |
| 28 | + return related.toString()+" ("+score+")"; |
| 29 | + } |
| 30 | + |
| 31 | + |
| 32 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java |
— | — | @@ -133,22 +133,6 @@ |
134 | 134 | return h; |
135 | 135 | } |
136 | 136 | |
137 | | - /** Sort by hash value to be able to more quickly find an object */ |
138 | | - public void sortLinked(){ |
139 | | - if(linksIn == null) |
140 | | - return; |
141 | | - ArrayList<CompactArticleLinks> l = new ArrayList<CompactArticleLinks>(); |
142 | | - for(int i=0;i<linksInIndex;i++) |
143 | | - l.add(linksIn[i]); |
144 | | - Collections.sort(l,new Comparator<CompactArticleLinks>() { |
145 | | - public int compare(CompactArticleLinks o1, CompactArticleLinks o2){ |
146 | | - return o1.hashCode() - o2.hashCode(); |
147 | | - } |
148 | | - }); |
149 | | - linksIn = l.toArray(new CompactArticleLinks[] {}); |
150 | | - linksInIndex = linksIn.length; |
151 | | - } |
152 | | - |
153 | 137 | /** Delete any excessive space in linksIn, linksOut */ |
154 | 138 | public void compact(){ |
155 | 139 | CompactArticleLinks[] n; |
Index: branches/lucene-search-2.1/sql/references_table.sql |
— | — | @@ -1,15 +0,0 @@ |
2 | | -CREATE TABLE /*DBprefix*/references ( |
3 | | - -- key in form <ns>:<title> |
4 | | - rf_key varchar(255) binary NOT NULL, |
5 | | - |
6 | | - -- number of page links to this page |
7 | | - rf_references int(10) unsigned NOT NULL, |
8 | | - |
9 | | - -- |
10 | | - PRIMARY KEY rf_key(rf_key) |
11 | | - |
12 | | -) TYPE=InnoDB; |
13 | | - |
Index: branches/lucene-search-2.1/sql/page_table.sql |
— | — | @@ -0,0 +1,17 @@ |
| 2 | +-- |
| 3 | +-- Table with cached information about pages and references to id |
| 4 | +-- |
| 5 | +CREATE TABLE /*DBprefix*/page ( |
| 6 | + page_id int unsigned auto_increment NOT NULL, |
| 7 | + |
| 8 | + -- key in form <ns>:<title> |
| 9 | + page_key varchar(255) binary NOT NULL, |
| 10 | + |
| 11 | + -- number of page links to this page |
| 12 | + page_references int unsigned NOT NULL, |
| 13 | + |
| 14 | + -- |
| 15 | + PRIMARY KEY page_id(page_id), |
| 16 | + UNIQUE (page_key) |
| 17 | + |
| 18 | +) TYPE=InnoDB; |
\ No newline at end of file |
Index: branches/lucene-search-2.1/sql/related_table.sql |
— | — | @@ -0,0 +1,18 @@ |
| 2 | +-- |
| 3 | +-- Table with a mapping of related articles |
| 4 | +-- |
| 5 | +CREATE TABLE /*DBprefix*/related ( |
| 6 | + -- key of article (fk to page_id) |
| 7 | + rel_to int unsigned NOT NULL, |
| 8 | + |
| 9 | + -- article which links to rel_id (fk to page_id) |
| 10 | + rel_related int unsigned NOT NULL, |
| 11 | + |
| 12 | + -- the strength of relatedness, positive |
| 13 | + rel_score float(23) NOT NULL, |
| 14 | + |
| 15 | + -- |
| 16 | + PRIMARY KEY re_relates(rel_to,rel_related), |
| 17 | + INDEX (rel_to) |
| 18 | + |
| 19 | +) TYPE=InnoDB; |
\ No newline at end of file |