Index: branches/lucene-search-2.1/.project |
— | — | @@ -1,6 +1,6 @@ |
2 | 2 | <?xml version="1.0" encoding="UTF-8"?> |
3 | 3 | <projectDescription> |
4 | | - <name>search-2.0</name> |
| 4 | + <name>search-2</name> |
5 | 5 | <comment>JavaCC Nature</comment> |
6 | 6 | <projects> |
7 | 7 | </projects> |
Index: branches/lucene-search-2.1/test-data/mwsearch-global.test |
— | — | @@ -9,7 +9,7 @@ |
10 | 10 | # aspell <language> |
11 | 11 | [Database] |
12 | 12 | entest : (mainsplit), (mainpart,false,2,10), (restpart,true,2) |
13 | | -entest : (ngram), (spell_words,3,20), (spell_titles,1,2,20) |
| 13 | +entest : (ngram), (spell,1,2) |
14 | 14 | detest,rutest : (single,true,2,10) |
15 | 15 | frtest : (split,3) (part1) (part2) (part3) |
16 | 16 | srwiki : (single) |
— | — | @@ -28,7 +28,7 @@ |
29 | 29 | 192.168.0.10 :entest.mainpart |
30 | 30 | 192.168.0.2 : entest.restpart, rutest |
31 | 31 | [Search-Group] |
32 | | -192.168.0.1 : njawiki entest.spell_words entest.spell_titles |
| 32 | +192.168.0.1 : njawiki entest.spell |
33 | 33 | |
34 | 34 | # Index nodes |
35 | 35 | # host: db1.role, db2.role |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/ArticleAnalytics.java |
— | — | @@ -0,0 +1,99 @@ |
| 2 | +package org.wikimedia.lsearch.storage; |
| 3 | + |
| 4 | +import java.util.Collection; |
| 5 | + |
| 6 | +import org.wikimedia.lsearch.ranks.Related; |
| 7 | + |
| 8 | +/** |
| 9 | + * Various link analysis info about the article |
| 10 | + * |
| 11 | + * @author rainman |
| 12 | + * |
| 13 | + */ |
| 14 | +public class ArticleAnalytics { |
| 15 | + String key; |
| 16 | + int references; |
| 17 | + String redirectTarget; |
| 18 | + Collection<String> anchorText; |
| 19 | + Collection<Related> related; |
| 20 | + Collection<String> redirectKeys; |
| 21 | + |
| 22 | + /** |
| 23 | + * @param key - article key (ns:title) |
| 24 | + * @param references - number of links to article |
 | 26 | + * @param redirectTarget - if the article is a redirect, the key of the target article; otherwise null |
| 26 | + * @param anchorText - anchor texts |
 | 28 | + * @param related - related articles (ns:title) |
| 28 | + * @param redirectKeys - articles that redirect here (ns:title) |
| 29 | + * |
| 30 | + */ |
| 31 | + public ArticleAnalytics(String key, int references, String redirectTarget, Collection<String> anchorText, Collection<Related> related, Collection<String> redirectKeys) { |
| 32 | + this.key = key; |
| 33 | + this.references = references; |
| 34 | + this.redirectTarget = redirectTarget; |
| 35 | + this.anchorText = anchorText; |
| 36 | + this.related = related; |
| 37 | + this.redirectKeys = redirectKeys; |
| 38 | + } |
| 39 | + |
| 40 | + @Override |
| 41 | + public String toString() { |
| 42 | + return key+" : ref="+references+", redirect_to="+redirectTarget+", anchor="+anchorText+", redirects="+redirectKeys+", related="+related; |
| 43 | + } |
| 44 | + |
| 45 | + public boolean isRedirect(){ |
| 46 | + return redirectTarget != null; |
| 47 | + } |
| 48 | + |
| 49 | + |
| 50 | + public Collection<String> getAnchorText() { |
| 51 | + return anchorText; |
| 52 | + } |
| 53 | + |
| 54 | + public void setAnchorText(Collection<String> anchorText) { |
| 55 | + this.anchorText = anchorText; |
| 56 | + } |
| 57 | + |
| 58 | + public String getKey() { |
| 59 | + return key; |
| 60 | + } |
| 61 | + |
| 62 | + public void setKey(String key) { |
| 63 | + this.key = key; |
| 64 | + } |
| 65 | + |
| 66 | + public Collection<String> getRedirectKeys() { |
| 67 | + return redirectKeys; |
| 68 | + } |
| 69 | + |
| 70 | + public void setRedirectKeys(Collection<String> redirectKeys) { |
| 71 | + this.redirectKeys = redirectKeys; |
| 72 | + } |
| 73 | + |
| 74 | + public String getRedirectTarget() { |
| 75 | + return redirectTarget; |
| 76 | + } |
| 77 | + |
| 78 | + public void setRedirectTarget(String redirectTarget) { |
| 79 | + this.redirectTarget = redirectTarget; |
| 80 | + } |
| 81 | + |
| 82 | + public int getReferences() { |
| 83 | + return references; |
| 84 | + } |
| 85 | + |
| 86 | + public void setReferences(int references) { |
| 87 | + this.references = references; |
| 88 | + } |
| 89 | + |
| 90 | + public Collection<Related> getRelated() { |
| 91 | + return related; |
| 92 | + } |
| 93 | + |
| 94 | + public void setRelated(Collection<Related> related) { |
| 95 | + this.related = related; |
| 96 | + } |
| 97 | + |
| 98 | + |
| 99 | + |
| 100 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/Storage.java |
— | — | @@ -11,7 +11,7 @@ |
12 | 12 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
13 | 13 | import org.wikimedia.lsearch.ranks.Related; |
14 | 14 | import org.wikimedia.lsearch.ranks.RelatedTitle; |
15 | | - |
| 15 | +@Deprecated |
16 | 16 | abstract public class Storage { |
17 | 17 | static protected Storage instance = null; |
18 | 18 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/LinkAnalysisStorage.java |
— | — | @@ -0,0 +1,106 @@ |
| 2 | +package org.wikimedia.lsearch.storage; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.Collection; |
| 6 | + |
| 7 | +import org.apache.log4j.Logger; |
| 8 | +import org.apache.lucene.document.Document; |
| 9 | +import org.apache.lucene.document.Field; |
| 10 | +import org.apache.lucene.index.IndexReader; |
| 11 | +import org.apache.lucene.index.IndexWriter; |
| 12 | +import org.apache.lucene.index.Term; |
| 13 | +import org.apache.lucene.index.TermDocs; |
| 14 | +import org.wikimedia.lsearch.analyzers.SplitAnalyzer; |
| 15 | +import org.wikimedia.lsearch.beans.LocalIndex; |
| 16 | +import org.wikimedia.lsearch.config.IndexId; |
| 17 | +import org.wikimedia.lsearch.config.IndexRegistry; |
| 18 | +import org.wikimedia.lsearch.index.IndexThread; |
| 19 | +import org.wikimedia.lsearch.ranks.Related; |
| 20 | +import org.wikimedia.lsearch.ranks.StringList; |
| 21 | + |
| 22 | +/** |
| 23 | + * Store/retrieve link analysis results |
| 24 | + * |
| 25 | + * @author rainman |
| 26 | + * |
| 27 | + */ |
| 28 | +public class LinkAnalysisStorage { |
| 29 | + static Logger log = Logger.getLogger(LinkAnalysisStorage.class); |
| 30 | + protected IndexId iid; |
| 31 | + protected IndexWriter writer = null; |
| 32 | + protected IndexReader reader = null; |
| 33 | + protected IndexRegistry registry = IndexRegistry.getInstance(); |
| 34 | + |
| 35 | + public LinkAnalysisStorage(IndexId iid){ |
| 36 | + this.iid = iid.getLinkAnalysis(); |
| 37 | + } |
| 38 | + |
| 39 | + protected void ensureWrite() throws IOException{ |
| 40 | + if(writer == null){ |
| 41 | + writer = new IndexWriter(iid.getImportPath(), new SplitAnalyzer(), true); |
| 42 | + } |
| 43 | + } |
| 44 | + |
| 45 | + protected void ensureRead() throws IOException{ |
| 46 | + if(reader == null){ |
| 47 | + LocalIndex li = registry.getLatestSnapshot(iid); |
| 48 | + if(li == null) |
| 49 | + throw new IOException("There are no snapshots for "+iid); |
| 50 | + |
| 51 | + reader = IndexReader.open(li.getPath()); |
| 52 | + } |
| 53 | + } |
| 54 | + /** |
 | 55 | + * Add link-analysis data for a single article |
| 56 | + * @throws IOException |
| 57 | + */ |
| 58 | + public void addAnalitics(ArticleAnalytics aa) throws IOException{ |
| 59 | + ensureWrite(); |
| 60 | + //log.info("Writing analitics "+aa); |
| 61 | + Document doc = new Document(); |
| 62 | + doc.add(new Field("key",aa.key,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 63 | + doc.add(new Field("references",Integer.toString(aa.references),Field.Store.YES,Field.Index.NO)); |
| 64 | + doc.add(new Field("anchor",new StringList(aa.anchorText).toString(),Field.Store.YES,Field.Index.NO)); |
| 65 | + doc.add(new Field("related",new StringList(Related.convertToStringList(aa.related)).toString(),Field.Store.YES,Field.Index.NO)); |
| 66 | + doc.add(new Field("redirect",new StringList(aa.redirectKeys).toString(),Field.Store.YES,Field.Index.NO)); |
| 67 | + if(aa.redirectTarget != null) |
| 68 | + doc.add(new Field("redirect_to",aa.redirectTarget,Field.Store.YES,Field.Index.NO)); |
| 69 | + writer.addDocument(doc); |
| 70 | + } |
| 71 | + |
| 72 | + public void snapshot() throws IOException{ |
| 73 | + if(writer != null){ |
| 74 | + writer.optimize(); |
| 75 | + writer.close(); |
| 76 | + writer = null; |
| 77 | + IndexThread.makeIndexSnapshot(iid,iid.getImportPath()); |
| 78 | + registry.refreshSnapshots(iid); |
| 79 | + } |
| 80 | + } |
| 81 | + |
| 82 | + /** |
 | 83 | + * Read analytics from the latest link-analysis index snapshot |
| 84 | + * @param key ns:title |
 | 85 | + * @return the stored analytics, or null if the key is not indexed |
| 86 | + * @throws IOException |
| 87 | + */ |
| 88 | + public ArticleAnalytics getAnalitics(String key) throws IOException{ |
| 89 | + ensureRead(); |
| 90 | + |
| 91 | + TermDocs td = reader.termDocs(new Term("key",key)); |
| 92 | + if(td.next()){ |
| 93 | + Document d = reader.document(td.doc()); |
| 94 | + int ref = Integer.parseInt(d.get("references")); |
| 95 | + StringList anchor = new StringList(d.get("anchor")); |
| 96 | + StringList related = new StringList(d.get("related")); |
| 97 | + StringList redirect = new StringList(d.get("redirect")); |
| 98 | + String redirectTarget = d.get("redirect_to"); |
| 99 | + return new ArticleAnalytics(key,ref,redirectTarget, |
| 100 | + anchor.toCollection(), |
| 101 | + Related.convertToRelatedList(related.toCollection()), |
| 102 | + redirect.toCollection()); |
| 103 | + } |
| 104 | + |
| 105 | + return null; |
| 106 | + } |
| 107 | +} |
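A minimal usage sketch of the class above, not part of the patch: the dbname `entest` comes from the test config earlier in this diff, the article data is made up, and imports plus exception handling are omitted. The `addAnalitics`/`getAnalitics` spellings are taken verbatim from the class.

    // Write one record, publish a snapshot, then read it back.
    IndexId iid = IndexId.get("entest");
    LinkAnalysisStorage storage = new LinkAnalysisStorage(iid);
    ArticleAnalytics aa = new ArticleAnalytics(
        "0:Main Page",                        // key (ns:title)
        42,                                   // number of incoming links
        null,                                 // not a redirect
        Arrays.asList("main page", "home"),   // anchor texts
        new ArrayList<Related>(),             // related articles
        Arrays.asList("0:MainPage"));         // redirects pointing here
    storage.addAnalitics(aa);
    storage.snapshot();                       // optimize, close, register snapshot
    ArticleAnalytics read = storage.getAnalitics("0:Main Page");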
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/storage/MySQLStorage.java |
— | — | @@ -31,6 +31,7 @@ |
32 | 32 | * @author rainman |
33 | 33 | * |
34 | 34 | */ |
| 35 | +@Deprecated |
35 | 36 | public class MySQLStorage extends Storage { |
36 | 37 | static Logger log = Logger.getLogger(MySQLStorage.class); |
37 | 38 | protected Configuration config; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -66,6 +66,8 @@ |
67 | 67 | protected Hashtable<String,String> wgLanguageCode = null; |
68 | 68 | /** wgServer, suffix -> server (default server is "default")*/ |
69 | 69 | protected Hashtable<String,String> wgServer = null; |
 | 70 | + /** wgNamespacesToBeSearchedDefault from InitialiseSettings, dbname -> default search namespaces */ |
| 71 | + protected Hashtable<String,NamespaceFilter> wgDefaultSearch = null; |
70 | 72 | |
71 | 73 | /** info about this host */ |
72 | 74 | protected static InetAddress myHost; |
— | — | @@ -183,6 +185,9 @@ |
184 | 186 | database.get(dbname).put(dbpart,new Hashtable<String,String>()); |
185 | 187 | } |
186 | 188 | } |
| 189 | + // add the link analysis to indexers |
| 190 | + if(!types.contains("link_analysis")) |
| 191 | + database.get(dbname).put("link_analysis",new Hashtable<String,String>()); |
187 | 192 | // add spellcheck indexes |
188 | 193 | /* if(!types.contains("spell_words")) |
189 | 194 | database.get(dbname).put("spell_words",new Hashtable<String,String>()); |
— | — | @@ -209,8 +214,7 @@ |
210 | 215 | } |
211 | 216 | } |
212 | 217 | // spell check indexes are searched by default if they exist |
213 | | - addToList(hostsearch,dbname+".spell_words"); |
214 | | - addToList(hostsearch,dbname+".spell_titles"); |
| 218 | + addToList(hostsearch,dbname+".spell"); |
215 | 219 | } |
216 | 220 | } |
217 | 221 | |
— | — | @@ -231,7 +235,7 @@ |
232 | 236 | } else if(typeid.matches("nspart[1-9][0-9]*")){ |
233 | 237 | type = "nssplit"; |
234 | 238 | dbrole = dbname + "." + typeid; |
235 | | - } else if(typeid.equals("spell_words") || typeid.equals("spell_titles")){ |
| 239 | + } else if(typeid.equals("spell") || typeid.equals("link_analysis")){ |
236 | 240 | type = typeid; |
237 | 241 | dbrole = dbname + "." + typeid; |
238 | 242 | } else |
— | — | @@ -250,7 +254,7 @@ |
251 | 255 | } |
252 | 256 | } |
253 | 257 | boolean searched = (getSearchHosts(dbrole).size() != 0); |
254 | | - if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit"))){ |
| 258 | + if(!searched && !(typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit") || typeid.equals("link_analysis"))){ |
255 | 259 | if(verbose) |
256 | 260 | System.out.println("WARNING: in Global Configuration: index "+dbrole+" is not searched by any host."); |
257 | 261 | } |
— | — | @@ -455,6 +459,7 @@ |
456 | 460 | String text = parser.readURL(new URL(initset)); |
457 | 461 | wgLanguageCode = parser.getLanguages(text); |
458 | 462 | wgServer = parser.getServer(text); |
| 463 | + wgDefaultSearch = parser.getDefaultSearch(text); |
459 | 464 | } catch (IOException e) { |
460 | 465 | System.out.println("Error: Cannot read InitialiseSettings.php from url "+initset+" : "+e.getMessage()); |
461 | 466 | } |
— | — | @@ -516,7 +521,7 @@ |
517 | 522 | } else if(typeid.matches("nspart[1-9][0-9]*")){ |
518 | 523 | type = "nssplit"; |
519 | 524 | dbrole = dbname + "." + typeid; |
520 | | - } else if(typeid.equals("spell_words") || typeid.equals("spell_titles")){ |
| 525 | + } else if(typeid.equals("spell") || typeid.equals("link_analysis")){ |
521 | 526 | type = typeid; |
522 | 527 | dbrole = dbname + "." + typeid; |
523 | 528 | } else |
— | — | @@ -802,27 +807,14 @@ |
803 | 808 | |
804 | 809 | dbroles.put(type,params); |
805 | 810 | |
806 | | - } else if(type.equals("spell_words")){ |
| 811 | + } else if(type.equals("spell")){ |
807 | 812 | // all params are optional, if absent default will be used |
808 | 813 | if(tokens.length>1) |
809 | | - params.put("minFreq",tokens[1]); |
810 | | - if(tokens.length>2) |
811 | | - params.put("minHits",tokens[2]); |
812 | | - |
813 | | - if(tokens.length>3 && verbose) |
814 | | - System.out.println("Unrecognized suggest parameters in ("+role+")"); |
815 | | - |
816 | | - dbroles.put(type,params); |
817 | | - } else if(type.equals("spell_titles")){ |
818 | | - // all params are optional, if absent default will be used |
819 | | - if(tokens.length>1) |
820 | 814 | params.put("wordsMinFreq",tokens[1]); |
821 | 815 | if(tokens.length>2) |
822 | 816 | params.put("phrasesMinFreq",tokens[2]); |
823 | | - if(tokens.length>3) |
824 | | - params.put("minHits",tokens[3]); |
825 | 817 | |
826 | | - if(tokens.length>4 && verbose) |
| 818 | + if(tokens.length>3 && verbose) |
827 | 819 | System.out.println("Unrecognized suggest parameters in ("+role+")"); |
828 | 820 | |
829 | 821 | dbroles.put(type,params); |
— | — | @@ -1102,6 +1094,19 @@ |
1103 | 1095 | GlobalConfiguration.verbose = verbose; |
1104 | 1096 | } |
1105 | 1097 | |
| 1098 | + public NamespaceFilter getDefaultNamespace(IndexId iid){ |
| 1099 | + return getDefaultNamespace(iid.getDBname()); |
| 1100 | + } |
| 1101 | + public NamespaceFilter getDefaultNamespace(String dbname){ |
| 1102 | + if(wgDefaultSearch != null){ |
| 1103 | + if(wgDefaultSearch.containsKey(dbname)) |
| 1104 | + return wgDefaultSearch.get(dbname); |
| 1105 | + else if(wgDefaultSearch.containsKey("default")) |
| 1106 | + return wgDefaultSearch.get("default"); |
| 1107 | + } |
| 1108 | + return new NamespaceFilter(0); |
| 1109 | + } |
1106 | 1110 | |
| 1111 | + |
1107 | 1112 | |
1108 | 1113 | } |
\ No newline at end of file |
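The new getDefaultNamespace() lookup falls back from a per-database entry to the global "default" entry, and finally to the main namespace. A short sketch of the behavior (dbname is illustrative):

    // Resolve the default search namespaces for a database.
    GlobalConfiguration global = GlobalConfiguration.getInstance();
    NamespaceFilter def = global.getDefaultNamespace("entest");
    // With no "entest" and no "default" entry in wgNamespacesToBeSearchedDefault,
    // this returns new NamespaceFilter(0), i.e. the main namespace only.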
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | /** If true, this machine is an indexer for this index */ |
60 | 60 | protected boolean myIndex; |
61 | 61 | |
62 | | - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL_WORDS, SPELL_TITLES }; |
| 62 | + protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT, SPELL, LINK_ANALYSIS }; |
63 | 63 | |
64 | 64 | /** Type of index, enumeration */ |
65 | 65 | protected IndexType type; |
— | — | @@ -156,10 +156,10 @@ |
157 | 157 | this.type = IndexType.SPLIT; |
158 | 158 | else if(type.equals("nssplit")) |
159 | 159 | this.type = IndexType.NSSPLIT; |
160 | | - else if(type.equals("spell_words")) |
161 | | - this.type = IndexType.SPELL_WORDS; |
162 | | - else if(type.equals("spell_titles")) |
163 | | - this.type = IndexType.SPELL_TITLES; |
| 160 | + else if(type.equals("spell")) |
| 161 | + this.type = IndexType.SPELL; |
| 162 | + else if(type.equals("link_analysis")) |
| 163 | + this.type = IndexType.LINK_ANALYSIS; |
164 | 164 | |
165 | 165 | // parts |
166 | 166 | String[] parts = dbrole.split("\\."); |
— | — | @@ -251,18 +251,14 @@ |
252 | 252 | public boolean isNssplit(){ |
253 | 253 | return type == IndexType.NSSPLIT; |
254 | 254 | } |
255 | | - /** If this is the spell-check index for words */ |
256 | | - public boolean isSpellWords(){ |
257 | | - return type == IndexType.SPELL_WORDS; |
| 255 | + /** If this is the spell-check index */ |
| 256 | + public boolean isSpell(){ |
| 257 | + return type == IndexType.SPELL; |
258 | 258 | } |
259 | | - /** It this is the spell-check index for phrases and words from titles */ |
260 | | - public boolean isSpellTitles(){ |
261 | | - return type == IndexType.SPELL_TITLES; |
| 259 | + /** If this is the link-analysis index */ |
| 260 | + public boolean isLinkAnalysis(){ |
| 261 | + return type == IndexType.LINK_ANALYSIS; |
262 | 262 | } |
263 | | - /** If this is one of the spell-check indexes */ |
264 | | - public boolean isSpellCheck(){ |
265 | | - return isSpellWords() || isSpellTitles(); |
266 | | - } |
267 | 263 | |
268 | 264 | /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */ |
269 | 265 | public int getPartNum() { |
— | — | @@ -374,7 +370,6 @@ |
375 | 371 | return tempPath; |
376 | 372 | } |
377 | 373 | |
378 | | - |
379 | 374 | /** Get search path with resolved symlinks */ |
380 | 375 | public String getCanonicalSearchPath(){ |
381 | 376 | try { |
— | — | @@ -411,7 +406,7 @@ |
412 | 407 | |
413 | 408 | /** get all hosts that search db this iid belongs to */ |
414 | 409 | public HashSet<String> getDBSearchHosts(){ |
415 | | - if(isSingle() || isSpellWords() || isSpellTitles()) |
| 410 | + if(isSingle() || isSpell() || isLinkAnalysis()) |
416 | 411 | return searchHosts; |
417 | 412 | else{ |
418 | 413 | // add all hosts that search: dbname and all parts |
— | — | @@ -462,7 +457,7 @@ |
463 | 458 | */ |
464 | 459 | public HashSet<String> getPhysicalIndexes() { |
465 | 460 | HashSet<String> ret = new HashSet<String>(); |
466 | | - if(isSingle() || isSpellWords() || isSpellTitles()) |
| 461 | + if(isSingle() || isSpell() || isLinkAnalysis()) |
467 | 462 | ret.add(dbrole); |
468 | 463 | else if(isMainsplit() || isSplit() || isNssplit()){ |
469 | 464 | for(String p : splitParts) |
— | — | @@ -534,13 +529,13 @@ |
535 | 530 | } |
536 | 531 | |
 537 | 532 | /** Get the corresponding spell-check iid */ |
538 | | - public IndexId getSpellWords() { |
539 | | - return get(dbname+".spell_words"); |
| 533 | + public IndexId getSpell() { |
| 534 | + return get(dbname+".spell"); |
540 | 535 | } |
541 | 536 | |
542 | | - /** Get the coresponding spell titles iid */ |
543 | | - public IndexId getSpellTitles() { |
544 | | - return get(dbname+".spell_titles"); |
| 537 | + /** Get the link analysis iid */ |
| 538 | + public IndexId getLinkAnalysis() { |
| 539 | + return get(dbname+".link_analysis"); |
545 | 540 | } |
546 | 541 | |
547 | 542 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java |
— | — | @@ -91,6 +91,16 @@ |
92 | 92 | return included.get(namespace); |
93 | 93 | } |
94 | 94 | |
| 95 | + /** Set bit for namespace to true */ |
| 96 | + public void set(int namespace){ |
| 97 | + included.set(namespace); |
| 98 | + } |
| 99 | + |
| 100 | + /** Set bit for namespace to false */ |
| 101 | + public void unset(int namespace){ |
| 102 | + included.set(namespace,false); |
| 103 | + } |
| 104 | + |
95 | 105 | public boolean contains(int namespace){ |
96 | 106 | return included.get(namespace); |
97 | 107 | } |
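The new mutators complement the existing contains() accessor; a small sketch of toggling bits on the BitSet-backed filter (values are illustrative):

    NamespaceFilter nsf = new NamespaceFilter();
    nsf.set(0);                        // include the main namespace
    nsf.set(14);                       // include categories
    nsf.unset(14);                     // drop categories again
    boolean searched = nsf.contains(0); // true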
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -40,11 +40,9 @@ |
41 | 41 | global = GlobalConfiguration.getInstance(); |
42 | 42 | |
43 | 43 | Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup"); |
44 | | - if(iid.isSpellCheck()){ |
 | 44 | + if(iid.isSpell()); // no warmup for spell-checkers |
| 45 | + else if(warmup == null){ |
45 | 46 | makeNamespaceFilters(is,iid); |
46 | | - log.info("Warmed up spell-check index "+iid); |
47 | | - } else if(warmup == null){ |
48 | | - makeNamespaceFilters(is,iid); |
49 | 47 | simpleWarmup(is,iid); |
50 | 48 | log.info("Warmed up "+iid); |
51 | 49 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -40,7 +40,7 @@ |
41 | 41 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
42 | 42 | import org.wikimedia.lsearch.config.IndexId; |
43 | 43 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
44 | | -import org.wikimedia.lsearch.spell.api.TitleIndexer; |
| 44 | +import org.wikimedia.lsearch.spell.api.SpellCheckIndexer; |
45 | 45 | import org.wikimedia.lsearch.util.Localization; |
46 | 46 | |
47 | 47 | /** |
— | — | @@ -370,15 +370,6 @@ |
371 | 371 | boolean succ = succAdd; // it's OK if articles cannot be deleted |
372 | 372 | trans.commit(); |
373 | 373 | |
374 | | - // if there is a titles spell-check index, update it |
375 | | - if(iid.getSpellTitles() != null){ |
376 | | - TitleIndexer spell = new TitleIndexer(iid); |
377 | | - trans = new Transaction(iid.getSpellTitles()); |
378 | | - trans.begin(); |
379 | | - spell.update(updateRecords); |
380 | | - trans.commit(); |
381 | | - } |
382 | | - |
383 | 374 | // send reports back to the main indexer host |
384 | 375 | RMIMessengerClient messenger = new RMIMessengerClient(); |
385 | 376 | if(modifier.reportQueue.size() != 0) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SplitAnalyzer.java |
— | — | @@ -2,39 +2,36 @@ |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.io.Reader; |
| 6 | +import java.util.Iterator; |
6 | 7 | |
7 | 8 | import org.apache.lucene.analysis.Analyzer; |
8 | 9 | import org.apache.lucene.analysis.Token; |
9 | 10 | import org.apache.lucene.analysis.TokenStream; |
10 | 11 | import org.apache.lucene.analysis.Tokenizer; |
| 12 | +import org.wikimedia.lsearch.ranks.StringList; |
11 | 13 | |
12 | 14 | /** Split the text by some specific char */ |
13 | 15 | public class SplitAnalyzer extends Analyzer { |
14 | 16 | class SplitTokenStream extends Tokenizer { |
15 | | - String[] tokens; |
| 17 | + Iterator<String> it = null; |
16 | 18 | int in = 0; |
17 | 19 | int start = 0; |
18 | | - SplitTokenStream(String inputStr){ |
19 | | - tokens = inputStr.split(""+splitChar); |
| 20 | + SplitTokenStream(String input){ |
| 21 | + it = new StringList(input).iterator(); |
20 | 22 | } |
21 | 23 | @Override |
22 | 24 | public Token next() throws IOException { |
23 | | - if(in >= tokens.length) |
| 25 | + if(!it.hasNext()) |
24 | 26 | return null; |
25 | 27 | else{ |
| 28 | + String str = it.next(); |
26 | 29 | int s = start; |
27 | | - int e = start + tokens[in].length(); |
28 | | - start = e + 1; |
29 | | - return new Token(tokens[in++],s,e); |
30 | | - } |
 | 30 | + int e = start + str.length(); |
 | | + start = e + 1; // advance past the implicit delimiter so offsets stay distinct |
 | 31 | + return new Token(str,s,e); |
| 32 | + } |
31 | 33 | } |
32 | 34 | } |
33 | | - char splitChar; |
34 | 35 | |
35 | | - public SplitAnalyzer(char splitChar){ |
36 | | - this.splitChar = splitChar; |
37 | | - } |
38 | | - |
39 | 36 | @Override |
40 | 37 | public TokenStream tokenStream(String fieldName, String text) { |
41 | 38 | return new SplitTokenStream(text); |
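After this change the analyzer consumes a StringList serialization ('\0'-delimited) instead of splitting on a caller-supplied character. A usage sketch (field name and values are illustrative; exception handling omitted):

    // Tokenize a StringList-serialized value with the reworked analyzer.
    String stored = new StringList(Arrays.asList("0:Foo", "0:Bar")).toString();
    TokenStream ts = new SplitAnalyzer().tokenStream("redirect", stored);
    Token t;
    while((t = ts.next()) != null)
        System.out.println(t.termText());   // "0:Foo", then "0:Bar"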
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java |
— | — | @@ -69,7 +69,7 @@ |
70 | 70 | } |
71 | 71 | public void writeEndPage() throws IOException { |
72 | 72 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect(), |
73 | | - references,redirects,new ArrayList<RelatedTitle>()); // references and related titles are set correctly later (in incremental updater) |
| 73 | + references,redirects,new ArrayList<RelatedTitle>(), new ArrayList<String>()); // references and related titles are set correctly later (in incremental updater) |
74 | 74 | log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); |
75 | 75 | records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE)); |
76 | 76 | log.debug(iid+": Update for "+article); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java |
— | — | @@ -31,6 +31,8 @@ |
32 | 32 | import org.wikimedia.lsearch.ranks.OldLinks; |
33 | 33 | import org.wikimedia.lsearch.ranks.Related; |
34 | 34 | import org.wikimedia.lsearch.ranks.RelatedTitle; |
| 35 | +import org.wikimedia.lsearch.storage.ArticleAnalytics; |
| 36 | +import org.wikimedia.lsearch.storage.LinkAnalysisStorage; |
35 | 37 | import org.wikimedia.lsearch.storage.Storage; |
36 | 38 | import org.wikimedia.lsearch.util.Localization; |
37 | 39 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
— | — | @@ -284,7 +286,6 @@ |
285 | 287 | } |
286 | 288 | |
287 | 289 | protected static void fetchReferencesAndRelated(ArrayList<IndexUpdateRecord> records, String dbname) throws IOException { |
288 | | - Storage store = Storage.getInstance(); |
289 | 290 | ArrayList<Title> titles = new ArrayList<Title>(); |
290 | 291 | for(IndexUpdateRecord rec : records){ |
291 | 292 | if(rec.isDelete()) |
— | — | @@ -298,23 +299,31 @@ |
299 | 300 | } |
300 | 301 | } |
301 | 302 | // fetch |
302 | | - OldLinks links = new OldLinks(store.getPageReferences(titles,dbname)); |
303 | | - HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname); |
| 303 | + LinkAnalysisStorage store = new LinkAnalysisStorage(IndexId.get(dbname)); |
| 304 | + //OldLinks links = new OldLinks(store.getPageReferences(titles,dbname)); |
| 305 | + //HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname); |
304 | 306 | // update |
305 | 307 | for(IndexUpdateRecord rec : records){ |
306 | 308 | if(rec.isDelete()) |
307 | 309 | continue; |
308 | 310 | Article ar = rec.getArticle(); |
309 | 311 | Title t = ar.makeTitle(); |
| 312 | + ArticleAnalytics aa = store.getAnalitics(t.getKey()); |
| 313 | + ArrayList<String> anchors = new ArrayList<String>(); |
| 314 | + anchors.addAll(aa.getAnchorText()); |
310 | 315 | // set references |
311 | | - ar.setReferences(links.getLinks(t.getKey())); |
| 316 | + ar.setReferences(aa.getReferences()); |
312 | 317 | if(ar.getRedirects() != null){ |
313 | 318 | for(Redirect r : ar.getRedirects()){ |
314 | | - r.setReferences(links.getLinks(r.makeTitle().getKey())); |
| 319 | + ArticleAnalytics raa = store.getAnalitics(r.makeTitle().getKey()); |
| 320 | + r.setReferences(raa.getReferences()); |
| 321 | + anchors.addAll(raa.getAnchorText()); |
315 | 322 | } |
316 | 323 | } |
| 324 | + // set anchors |
| 325 | + ar.setAnchorText(anchors); |
317 | 326 | // set related |
318 | | - ArrayList<RelatedTitle> rt = rel.get(t.getKey()); |
| 327 | + /*ArrayList<RelatedTitle> rt = rel.get(t.getKey()); |
319 | 328 | if(rt != null){ |
320 | 329 | Collections.sort(rt,new Comparator<RelatedTitle>() { |
321 | 330 | public int compare(RelatedTitle o1, RelatedTitle o2){ |
— | — | @@ -325,7 +334,7 @@ |
326 | 335 | } |
327 | 336 | }); |
328 | 337 | ar.setRelated(rt); |
329 | | - } |
| 338 | + }*/ |
330 | 339 | } |
331 | 340 | } |
332 | 341 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/PHPParser.java |
— | — | @@ -12,6 +12,8 @@ |
13 | 13 | import java.util.regex.Matcher; |
14 | 14 | import java.util.regex.Pattern; |
15 | 15 | |
| 16 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
| 17 | + |
16 | 18 | /** |
17 | 19 | * Extract some variable from MediaWiki MessageXX.php files. In particular, |
18 | 20 | * the localized namespace names (needed for proper parsing of wiki code). |
— | — | @@ -160,6 +162,58 @@ |
161 | 163 | return servers; |
162 | 164 | } |
163 | 165 | |
| 166 | + /** Get wgNamespacesToBeSearchedDefault from InitialiseSettings */ |
| 167 | + public Hashtable<String,NamespaceFilter> getDefaultSearch(String text){ |
| 168 | + text = text.replaceAll("(#.*)",""); // strip comments |
| 169 | + Hashtable<String,NamespaceFilter> ret = new Hashtable<String,NamespaceFilter>(); |
| 170 | + |
| 171 | + int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL; |
| 172 | + //Pattern wgns = Pattern.compile("[\"']wgNamespacesToBeSearchedDefault[\"']\\s*=>\\s*array\\s*\\(((.*?\\(.*?\\).*?)+)\\)",flags); |
| 173 | + Pattern db = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags); |
| 174 | + Pattern entry = Pattern.compile("(-?[0-9]+)\\s*=>\\s*([01])",flags); |
| 175 | + String t = fetchArray(text,"'wgNamespacesToBeSearchedDefault'"); |
| 176 | + Matcher md = db.matcher(t); |
| 177 | + while(md.find()){ |
| 178 | + String dbname = md.group(1); |
| 179 | + NamespaceFilter nsf = new NamespaceFilter(); |
| 180 | + Matcher me = entry.matcher(md.group(2)); |
| 181 | + while(me.find()){ |
| 182 | + if(!me.group(2).equals("0")) |
| 183 | + nsf.set(Integer.parseInt(me.group(1))); |
| 184 | + } |
| 185 | + ret.put(dbname,nsf); |
| 186 | + } |
| 187 | + return ret; |
| 188 | + } |
| 189 | + |
 | 190 | + /** Fetch an array body by balancing parentheses */ |
| 191 | + public String fetchArray(String text, String var){ |
| 192 | + int start = text.indexOf(var); |
| 193 | + if(start == -1) |
| 194 | + return null; |
| 195 | + char[] t = text.toCharArray(); |
| 196 | + int level = 0; boolean ret = false; |
| 197 | + boolean comment = false; |
| 198 | + for(int i=start+var.length();i<t.length;i++){ |
| 199 | + if(level == 0 && ret) |
| 200 | + return new String(t,start+var.length(),i-start-var.length()); |
| 201 | + if(comment){ |
 | 203 | + if(t[i] == '\n') // a comment runs to end of line |
| 203 | + comment = false; |
| 204 | + else |
| 205 | + continue; |
| 206 | + } |
| 207 | + if(t[i] == '('){ |
| 208 | + ret = true; |
| 209 | + level ++; |
| 210 | + } else if(t[i] == ')') |
| 211 | + level--; |
| 212 | + else if(t[i] == '#') |
| 213 | + comment = true; |
| 214 | + } |
| 215 | + return null; |
| 216 | + } |
| 217 | + |
164 | 218 | public String readFile(String path){ |
165 | 219 | char buffer[] = new char[32768]; |
166 | 220 | String text = ""; |
— | — | @@ -221,6 +275,7 @@ |
222 | 276 | String initset = p.readURL(new URL("file:///home/rainman/Desktop/InitialiseSettings.php")); |
223 | 277 | System.out.println(p.getLanguages(initset)); |
224 | 278 | System.out.println(p.getServer(initset)); |
| 279 | + System.out.println(p.getDefaultSearch(initset)); |
225 | 280 | |
226 | 281 | |
227 | 282 | } |
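For reference, getDefaultSearch() expects the array shape used in InitialiseSettings. A hypothetical fragment and its outcome (the PHP text and dbnames are made up; PHPParser's default constructor is assumed):

    String text =
        "'wgNamespacesToBeSearchedDefault' => array(\n" +
        "  'default' => array( 0 => 1 ),\n" +
        "  'entest'  => array( 0 => 1, 14 => 1, 2 => 0 ),\n" +
        ");";
    Hashtable<String,NamespaceFilter> def = new PHPParser().getDefaultSearch(text);
    // def.get("entest") has namespace bits 0 and 14 set; the 2 => 0 entry stays unset.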
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/RankBuilder.java |
— | — | @@ -4,16 +4,20 @@ |
5 | 5 | import java.io.InputStream; |
6 | 6 | import java.util.ArrayList; |
7 | 7 | import java.util.Arrays; |
| 8 | +import java.util.BitSet; |
8 | 9 | import java.util.Collection; |
9 | 10 | import java.util.Collections; |
10 | 11 | import java.util.Comparator; |
11 | 12 | import java.util.HashMap; |
12 | 13 | import java.util.HashSet; |
| 14 | +import java.util.Iterator; |
13 | 15 | import java.util.PriorityQueue; |
14 | 16 | import java.util.Map.Entry; |
15 | 17 | |
16 | 18 | import org.apache.log4j.Logger; |
17 | 19 | import org.apache.lucene.document.Field.Store; |
| 20 | +import org.apache.lucene.index.Term; |
| 21 | +import org.apache.lucene.index.TermDocs; |
18 | 22 | import org.mediawiki.dumper.ProgressFilter; |
19 | 23 | import org.mediawiki.dumper.Tools; |
20 | 24 | import org.mediawiki.importer.XmlDumpReader; |
— | — | @@ -24,6 +28,10 @@ |
25 | 29 | import org.wikimedia.lsearch.config.IndexId; |
26 | 30 | import org.wikimedia.lsearch.index.IndexThread; |
27 | 31 | import org.wikimedia.lsearch.spell.SuggestResult; |
| 32 | +import org.wikimedia.lsearch.spell.api.Dictionary; |
| 33 | +import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
| 34 | +import org.wikimedia.lsearch.storage.ArticleAnalytics; |
| 35 | +import org.wikimedia.lsearch.storage.LinkAnalysisStorage; |
28 | 36 | import org.wikimedia.lsearch.storage.Storage; |
29 | 37 | import org.wikimedia.lsearch.util.Localization; |
30 | 38 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
— | — | @@ -44,6 +52,7 @@ |
45 | 53 | public static void main(String[] args) throws IOException { |
46 | 54 | String inputfile = null; |
47 | 55 | String dbname = null; |
| 56 | + boolean useExistingTemp = false; |
48 | 57 | |
49 | 58 | System.out.println("MediaWiki Lucene search indexer - build rank info from xml dumps.\n"); |
50 | 59 | |
— | — | @@ -51,17 +60,26 @@ |
52 | 61 | log = Logger.getLogger(RankBuilder.class); |
53 | 62 | |
54 | 63 | if(args.length < 2){ |
55 | | - System.out.println("Syntax: java RankBuilder <inputfile> <dbname>"); |
| 64 | + System.out.println("Syntax: java RankBuilder [-t] <inputfile> <dbname>"); |
| 65 | + System.out.println("Options:"); |
| 66 | + System.out.println(" -t - use existing temporary ranking index"); |
56 | 67 | return; |
57 | 68 | } |
58 | | - inputfile = args[0]; |
59 | | - dbname = args[1]; |
| 69 | + for(int i=0;i<args.length;i++){ |
| 70 | + if(args[i].equals("-t")) |
| 71 | + useExistingTemp = true; |
| 72 | + else if(inputfile == null) |
| 73 | + inputfile = args[i]; |
| 74 | + else if(dbname == null) |
| 75 | + dbname = args[i]; |
| 76 | + } |
60 | 77 | if(inputfile == null || dbname == null){ |
61 | 78 | System.out.println("Please specify both input xml file and database name"); |
62 | 79 | return; |
63 | 80 | } |
64 | 81 | |
65 | 82 | String langCode = GlobalConfiguration.getInstance().getLanguage(dbname); |
| 83 | + IndexId iid = IndexId.get(dbname); |
66 | 84 | // preload |
67 | 85 | UnicodeDecomposer.getInstance(); |
68 | 86 | Localization.readLocalization(langCode); |
— | — | @@ -69,19 +87,51 @@ |
70 | 88 | |
71 | 89 | long start = System.currentTimeMillis(); |
72 | 90 | |
73 | | - // regenerate link info |
74 | | - OldLinks links = processLinks(inputfile,getTitles(inputfile,langCode),langCode,LinkReader.NO_REDIRECTS); |
75 | | - links.compactAll(); |
76 | | - Storage store = Storage.getInstance(); |
77 | | - store.storePageReferences(links.getAll(),dbname); |
78 | | - storeRelated(store,links,dbname); |
| 91 | + // link info |
| 92 | + Links links = null; |
| 93 | + if(useExistingTemp) |
| 94 | + links = Links.openExisting(iid); |
| 95 | + else |
| 96 | + links = processLinks(inputfile,getTitles(inputfile,langCode,iid),langCode); |
| 97 | + //links.cacheInLinks(); |
| 98 | + /*log.info("Creating ref count cache"); |
| 99 | + HashMap<String,Integer> refCount = new HashMap<String,Integer>(); |
| 100 | + HashMap<Integer,String> keyCache = new HashMap<Integer,String>(); |
| 101 | + Word w; Dictionary d = links.getKeys(); |
| 102 | + while((w = d.next()) != null){ |
| 103 | + String key = w.getWord(); |
| 104 | + refCount.put(key,links.getNumInLinks(key)); |
| 105 | + keyCache.put(links.getDocId(key),key); |
| 106 | + } */ |
| 107 | + storeLinkAnalysis(links,iid); |
| 108 | + //Storage store = Storage.getInstance(); |
| 109 | + //store.storePageReferences(links.getAll(),dbname); |
| 110 | + //storeRelated(store,links,dbname); |
79 | 111 | |
80 | 112 | long end = System.currentTimeMillis(); |
81 | 113 | |
82 | 114 | System.out.println("Finished generating ranks in "+formatTime(end-start)); |
83 | 115 | } |
84 | 116 | |
85 | | - public static OldLinks processLinks(String inputfile, OldLinks links, String langCode, boolean readRedirects) { |
| 117 | + public static void storeLinkAnalysis(Links links, IndexId iid) throws IOException{ |
| 118 | + log.info("Storing link analysis data"); |
| 119 | + LinkAnalysisStorage store = new LinkAnalysisStorage(iid); |
| 120 | + Word w; |
| 121 | + Dictionary keys = links.getKeys(); |
| 122 | + while((w = keys.next()) != null){ |
| 123 | + String key = w.getWord(); |
| 124 | + int ref = links.getNumInLinks(key); |
| 125 | + String redirectTarget = links.getRedirectTarget(key); |
| 126 | + ArrayList<String> anchor = links.getAnchors(key); |
| 127 | + ArrayList<Related> related = new ArrayList<Related>(); //FIXME: too slow getRelated(key,links,refCount,keyCache); |
| 128 | + ArrayList<String> redirect = links.getRedirectsTo(key); |
| 129 | + store.addAnalitics(new ArticleAnalytics(key,ref,redirectTarget,anchor,related,redirect)); |
| 130 | + } |
| 131 | + store.snapshot(); |
| 132 | + |
| 133 | + } |
| 134 | + |
| 135 | + public static Links processLinks(String inputfile, Links links, String langCode) { |
86 | 136 | log.info("Second pass, calculating article links..."); |
87 | 137 | InputStream input = null; |
88 | 138 | // second pass - calculate page ranks |
— | — | @@ -92,10 +142,11 @@ |
93 | 143 | return null; |
94 | 144 | } |
95 | 145 | // calculate ranks |
96 | | - LinkReader rr = new LinkReader(links,langCode,readRedirects); |
| 146 | + LinkReader rr = new LinkReader(links,langCode); |
97 | 147 | XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000)); |
98 | 148 | try { |
99 | 149 | reader.readDump(); |
| 150 | + links.flush(); |
100 | 151 | } catch (IOException e) { |
101 | 152 | log.fatal("I/O error reading dump while calculating ranks for from "+inputfile); |
102 | 153 | return null; |
— | — | @@ -103,7 +154,7 @@ |
104 | 155 | return links; |
105 | 156 | } |
106 | 157 | |
107 | | - public static OldLinks getTitles(String inputfile,String langCode) { |
| 158 | + public static Links getTitles(String inputfile,String langCode,IndexId iid) { |
108 | 159 | log.info("First pass, getting a list of valid articles..."); |
109 | 160 | InputStream input = null; |
110 | 161 | try { |
— | — | @@ -112,49 +163,56 @@ |
113 | 164 | log.fatal("I/O error opening "+inputfile); |
114 | 165 | return null; |
115 | 166 | } |
116 | | - // first pass, get titles |
117 | | - TitleReader tr = new TitleReader(langCode); |
118 | | - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
119 | 167 | try { |
| 168 | + // first pass, get titles |
| 169 | + TitleReader tr = new TitleReader(langCode,iid); |
| 170 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000)); |
120 | 171 | reader.readDump(); |
121 | 172 | input.close(); |
| 173 | + Links links = tr.getLinks(); |
| 174 | + links.flush(); |
| 175 | + return links; |
122 | 176 | } catch (IOException e) { |
123 | 177 | log.fatal("I/O error reading dump while getting titles from "+inputfile); |
124 | 178 | return null; |
125 | 179 | } |
126 | | - return tr.getTitles(); |
| 180 | + |
127 | 181 | } |
128 | 182 | |
129 | | - public static void storeRelated(Storage store, OldLinks links, String dbname) throws IOException{ |
130 | | - int num = 0; |
131 | | - int total = links.getAll().size(); |
132 | | - ArrayList<Related> buf = new ArrayList<Related>(); |
133 | | - for(CompactArticleLinks cs : links.getAll()){ |
134 | | - num++; |
135 | | - log.debug("["+num+"/"+total+" - "+cs.linksInIndex+"] "+cs.toString()); |
136 | | - buf.addAll(getRelated(cs,links)); |
137 | | - if(buf.size() > 10000){ |
138 | | - store.storeRelatedPages(buf,dbname); |
139 | | - buf.clear(); |
140 | | - } |
141 | | - } |
142 | | - } |
143 | | - |
144 | 183 | /** |
145 | 184 | * Get related articles, sorted descending by score |
| 185 | + * @throws IOException |
146 | 186 | */ |
147 | | - public static ArrayList<Related> getRelated(CompactArticleLinks cs, OldLinks links){ |
| 187 | + public static ArrayList<Related> getRelated(String key, Links links, HashMap<String,Integer> refCache, HashMap<Integer,String> keyCache) throws IOException{ |
148 | 188 | ArrayList<Related> ret = new ArrayList<Related>(); |
149 | 189 | |
150 | | - HashSet<CompactArticleLinks> ll = new HashSet<CompactArticleLinks>(); |
151 | | - if(cs.linksIn != null){ |
152 | | - for(CompactArticleLinks csl : cs.linksIn) |
153 | | - ll.add(csl); |
| 190 | + HashMap<String,Integer> map = new HashMap<String,Integer>(); |
| 191 | + int i = 1; |
| 192 | + ArrayList<String> inLinks = links.getInLinks(key,keyCache); |
| 193 | + for(String in : inLinks){ |
| 194 | + map.put(in,i++); |
154 | 195 | } |
155 | | - for(CompactArticleLinks from : ll){ |
156 | | - double score = relatedScore(cs,ll,from); |
| 196 | + HashSet<Long> internal = new HashSet<Long>(); |
| 197 | + for(Entry<String,Integer> e : map.entrySet()){ |
| 198 | + String from = e.getKey(); |
| 199 | + long inx = e.getValue(); |
| 200 | + long offset = inx << 32; |
| 201 | + StringList sl = links.getOutLinks(from); |
| 202 | + Iterator<String> it = sl.iterator(); |
| 203 | + while(it.hasNext()){ |
| 204 | + Integer inx2 = map.get(it.next()); |
| 205 | + if(inx2 != null){ |
| 206 | + internal.add(offset + inx2); |
| 207 | + } |
| 208 | + } |
| 209 | + } |
| 210 | + for(Entry<String,Integer> e : map.entrySet()){ |
| 211 | + String from = e.getKey(); |
| 212 | + int inx = e.getValue(); |
| 213 | + //double score = relatedScore(links,in,from,refCount); |
| 214 | + double score = relatedScore(inx,internal,inLinks,refCache); |
157 | 215 | if(score != 0) |
158 | | - ret.add(new Related(cs,from,score)); |
| 216 | + ret.add(new Related(key,from,score)); |
159 | 217 | } |
160 | 218 | Collections.sort(ret,new Comparator<Related>() { |
161 | 219 | public int compare(Related o1, Related o2){ |
— | — | @@ -171,7 +229,7 @@ |
172 | 230 | * Get related titles (RelatedTitle is used in Article) |
173 | 231 | */ |
174 | 232 | public static ArrayList<RelatedTitle> getRelatedTitles(CompactArticleLinks cs, OldLinks links){ |
175 | | - ArrayList<Related> rel = getRelated(cs,links); |
| 233 | + ArrayList<Related> rel = null; // getRelated(cs,links); |
176 | 234 | ArrayList<RelatedTitle> ret = new ArrayList<RelatedTitle>(); |
177 | 235 | for(Related r : rel){ |
178 | 236 | ret.add(new RelatedTitle(new Title(r.relates.toString()),r.score)); |
— | — | @@ -186,23 +244,49 @@ |
187 | 245 | return d; |
188 | 246 | } |
189 | 247 | |
190 | | - public static double relatedScore(CompactArticleLinks p, HashSet<CompactArticleLinks> ll, CompactArticleLinks q){ |
| 248 | + //public static double relatedScore(Links links, HashSet<String> inLinks, String from, HashMap<String,Integer> refCount) throws IOException{ |
| 249 | + public static double relatedScore(long q, HashSet<Long> internal, ArrayList<String> inLinks, HashMap<String,Integer> refCache){ |
| 250 | + //Collection<String> qInLinks = links.getInLinksFromCache(from); |
| 251 | + //Collection<String> qOutLinks = links.getOutLinks(from).toCollection(); |
191 | 252 | double score = 0; |
| 253 | + for(Long l : internal){ |
| 254 | + long l1 = l >> 32; |
| 255 | + long l2 = l - (l1 << 32); |
| 256 | + if(l1 == q && l2 == q) |
| 257 | + continue; |
| 258 | + else if(l1 == q) |
| 259 | + score += 1.0/norm(refCache.get(inLinks.get((int) (l2 - 1)))); |
| 260 | + else if(l2 == q) |
| 261 | + score += 1.0/norm(refCache.get(inLinks.get((int) (l1 - 1)))); |
| 262 | + } |
| 263 | + /*for(int i=1;i<=inLinks.size();i++){ |
| 264 | + if(i!=q && internal.contains(i*q)){ |
| 265 | + score += 1.0/norm(refCache.get(inLinks.get(i-1))); |
| 266 | + } |
| 267 | + } */ |
| 268 | + |
192 | 269 | // all r that links to q |
193 | | - for(int i=0;i<q.linksInIndex;i++){ |
194 | | - CompactArticleLinks r = q.linksIn[i]; |
195 | | - if(r != q && r.links != 0 && ll.contains(r)){ |
196 | | - score += 1.0/norm(r.links); |
| 270 | + /*for(String r : qInLinks){ |
| 271 | + if(!refCount.containsKey(r)) |
| 272 | + System.out.println("ERROR for key "+r); |
| 273 | + //int ref = links.getNumInLinks(r); |
| 274 | + int ref = refCount.get(r); |
| 275 | + if(!r.equals(from) && ref != 0 && inLinks.contains(r)){ |
| 276 | + score += 1.0/norm(ref); |
197 | 277 | } |
198 | 278 | |
199 | 279 | } |
200 | 280 | // all r that q links to |
201 | | - for(int i=0;i<q.linksOutIndex;i++){ |
202 | | - CompactArticleLinks r = q.linksOut[i]; |
203 | | - if(r != q && r.links!=0 && ll.contains(r)){ |
204 | | - score += 1.0/norm(r.links); |
| 281 | + for(String r : qOutLinks){ |
| 282 | + //int ref = links.getNumInLinks(r); |
| 283 | + if(!refCount.containsKey(r)) |
| 284 | + System.out.println("ERROR for key "+r); |
| 285 | + int ref = refCount.get(r); |
| 286 | + if(!r.equals(from) && ref != 0 && inLinks.contains(r)){ |
| 287 | + score += 1.0/norm(ref); |
205 | 288 | } |
206 | | - } |
| 289 | + |
| 290 | + } */ |
207 | 291 | return score; |
208 | 292 | } |
209 | 293 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/StringList.java |
— | — | @@ -0,0 +1,128 @@ |
| 2 | +package org.wikimedia.lsearch.ranks; |
| 3 | + |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
| 6 | +import java.util.Iterator; |
| 7 | + |
| 8 | +/** |
 | 9 | + * Maintain a list of strings, with emphasis on |
| 10 | + * efficient serialization and deserialization. |
| 11 | + * |
 | 12 | + * Note: the parse buffer starts at BUFFER_SIZE chars and grows as needed |
| 13 | + * |
| 14 | + * @author rainman |
| 15 | + * |
| 16 | + */ |
| 17 | +public class StringList { |
| 18 | + public static final int BUFFER_SIZE = 300; |
| 19 | + /** delimiter used during serialization/deserialization */ |
| 20 | + public static final char DELIMITER = '\0'; |
| 21 | + |
| 22 | + protected char[] serialized = null; |
| 23 | + protected Collection<String> collection = null; |
| 24 | + protected String serializedStr = null; |
| 25 | + |
| 26 | + |
 | 27 | + /** Build a list from a serialized input string */ |
| 28 | + public StringList(String serialized){ |
| 29 | + if(serialized == null) |
| 30 | + this.serialized = new char[0]; |
| 31 | + else |
| 32 | + this.serialized = serialized.toCharArray(); |
| 33 | + } |
| 34 | + |
 | 35 | + /** Build from a collection of strings */ |
| 36 | + public StringList(Collection<String> inputCollection){ |
| 37 | + this.collection = inputCollection; |
| 38 | + } |
| 39 | + |
| 40 | + public Iterator<String> iterator(){ |
| 41 | + if(collection != null) |
| 42 | + return collection.iterator(); |
| 43 | + else if(serialized != null) |
| 44 | + return new StringListIterator(); |
| 45 | + else |
| 46 | + return null; |
| 47 | + } |
| 48 | + |
| 49 | + public Collection<String> toCollection(){ |
| 50 | + if(collection != null) |
| 51 | + return collection; |
| 52 | + Iterator<String> it = iterator(); |
| 53 | + collection = new ArrayList<String>(); |
| 54 | + if(it != null){ |
| 55 | + while(it.hasNext()) |
| 56 | + collection.add(it.next()); |
| 57 | + } |
| 58 | + return collection; |
| 59 | + |
| 60 | + } |
| 61 | + |
| 62 | + public String serialize(){ |
| 63 | + if(serialized != null) |
| 64 | + return new String(serialized); |
| 65 | + else if(serializedStr != null) |
| 66 | + return serializedStr; |
| 67 | + else if(collection == null) |
| 68 | + throw new RuntimeException("String list to be serialized is null"); |
| 69 | + StringBuilder sb = new StringBuilder(); |
| 70 | + boolean first = true; |
| 71 | + for(String s : collection){ |
| 72 | + if(!first) |
| 73 | + sb.append(DELIMITER); |
| 74 | + sb.append(s); |
| 75 | + first = false; |
| 76 | + } |
| 77 | + serializedStr = sb.toString(); |
| 78 | + return serializedStr; |
| 79 | + } |
| 80 | + |
| 81 | + |
| 82 | + |
| 83 | + @Override |
| 84 | + public String toString() { |
| 85 | + return serialize(); |
| 86 | + } |
| 87 | + |
| 88 | + |
| 89 | + |
| 90 | + class StringListIterator implements Iterator<String> { |
| 91 | + char[] buffer = new char[BUFFER_SIZE]; |
| 92 | + int length = 0; |
| 93 | + int pos = 0; // position in serialized[] |
| 94 | + |
| 95 | + public boolean hasNext() { |
| 96 | + if(pos < serialized.length) |
| 97 | + return true; |
| 98 | + else |
| 99 | + return false; |
| 100 | + } |
| 101 | + |
| 102 | + public String next() { |
| 103 | + if(!hasNext()) |
| 104 | + return null; |
| 105 | + length = 0; |
| 106 | + for(;pos<serialized.length;pos++){ |
| 107 | + if(serialized[pos] == DELIMITER){ |
| 108 | + pos++; // position on first char of next string |
| 109 | + break; |
| 110 | + } |
| 111 | + if(length >= buffer.length){ // should never happen with wiki-titles |
| 112 | + char[] newbuf = new char[buffer.length*2]; |
| 113 | + System.arraycopy(buffer,0,newbuf,0,buffer.length); |
| 114 | + buffer = newbuf; |
| 115 | + } |
| 116 | + buffer[length++] = serialized[pos]; |
| 117 | + } |
| 118 | + return new String(buffer,0,length); |
| 119 | + } |
| 120 | + |
| 121 | + public void remove() { |
| 122 | + throw new UnsupportedOperationException(); |
| 123 | + } |
| 124 | + |
| 125 | + } |
| 126 | + |
| 127 | + |
| 128 | + |
| 129 | +} |
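A round-trip sketch for the new class (assumes java.util.Arrays and java.util.Iterator; values are illustrative):

    // Serialize a list into one stored field value, then iterate it back.
    StringList out = new StringList(Arrays.asList("0:Apple", "0:Banana"));
    String stored = out.serialize();       // "0:Apple" + '\0' + "0:Banana"
    Iterator<String> it = new StringList(stored).iterator();
    while(it.hasNext())
        System.out.println(it.next());     // "0:Apple", then "0:Banana"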
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/TitleReader.java |
— | — | @@ -12,6 +12,8 @@ |
13 | 13 | import org.mediawiki.importer.Revision; |
14 | 14 | import org.mediawiki.importer.Siteinfo; |
15 | 15 | import org.wikimedia.lsearch.beans.ArticleLinks; |
| 16 | +import org.wikimedia.lsearch.beans.Title; |
| 17 | +import org.wikimedia.lsearch.config.IndexId; |
16 | 18 | import org.wikimedia.lsearch.util.Localization; |
17 | 19 | |
18 | 20 | /** |
— | — | @@ -23,11 +25,12 @@ |
24 | 26 | public class TitleReader implements DumpWriter{ |
25 | 27 | Page page; |
26 | 28 | Revision revision; |
27 | | - OldLinks links = new OldLinks(); |
| 29 | + Links links; |
28 | 30 | protected String langCode; |
29 | 31 | |
30 | | - public TitleReader(String langCode){ |
| 32 | + public TitleReader(String langCode, IndexId iid) throws IOException{ |
31 | 33 | this.langCode = langCode; |
| 34 | + this.links = Links.createNew(iid); |
32 | 35 | } |
33 | 36 | |
34 | 37 | public void writeRevision(Revision revision) throws IOException { |
— | — | @@ -38,9 +41,9 @@ |
39 | 42 | } |
40 | 43 | public void writeEndPage() throws IOException { |
41 | 44 | String key = page.Title.Namespace+":"+page.Title.Text; |
42 | | - links.add(key,0); |
| 45 | + links.addTitle(new Title(key)); |
43 | 46 | } |
44 | | - public OldLinks getTitles() { |
| 47 | + public Links getLinks() { |
45 | 48 | return links; |
46 | 49 | } |
47 | 50 | public void close() throws IOException { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java |
— | — | @@ -32,21 +32,12 @@ |
33 | 33 | Revision revision; |
34 | 34 | Siteinfo siteinfo; |
35 | 35 | /** ns:title -> number of referring articles */ |
36 | | - OldLinks links; |
| 36 | + Links links; |
37 | 37 | HashSet<String> interwiki; |
38 | 38 | String langCode; |
39 | | - boolean readRedirects; |
40 | | - |
41 | | - public static final boolean READ_REDIRECTS = true; |
42 | | - public static final boolean NO_REDIRECTS = false; |
43 | 39 | |
44 | | - public LinkReader(OldLinks links, String langCode){ |
45 | | - this(links,langCode,false); |
46 | | - } |
47 | | - |
48 | | - public LinkReader(OldLinks links, String langCode, boolean readRedirects){ |
| 40 | + public LinkReader(Links links, String langCode){ |
49 | 41 | this.links = links; |
50 | | - this.readRedirects = readRedirects; |
51 | 42 | if(langCode == null || langCode.equals("")) |
52 | 43 | langCode = "en"; |
53 | 44 | this.langCode = langCode; |
— | — | @@ -59,113 +50,8 @@ |
60 | 51 | this.page = page; |
61 | 52 | } |
62 | 53 | public void writeEndPage() throws IOException { |
63 | | - CompactArticleLinks p = links.get(page.Title.Namespace+":"+page.Title.Text); |
64 | | - // register redirect |
65 | | - Title redirect = Localization.getRedirectTitle(revision.Text,langCode); |
66 | | - if(redirect != null && readRedirects){ |
67 | | - CompactArticleLinks cs = findArticleLinks(redirect.getNamespace(),redirect.getTitle()); |
68 | | - if(cs != null){ |
69 | | - links.setRedirect(page.Title.Namespace+":"+page.Title.Text,cs); |
70 | | - } |
71 | | - } else if(redirect == null){ // process only non-redirects |
72 | | - processLinks(p,revision.Text,page.Title.Namespace); |
73 | | - } |
74 | | - } |
75 | | - |
76 | | - /** Find the links object for the ns:title key */ |
77 | | - protected CompactArticleLinks findArticleLinks(int ns, String title){ |
78 | | - String key; |
79 | | - CompactArticleLinks rank; |
80 | | - if(title.length() == 0) |
81 | | - return null; |
82 | | - // try exact match |
83 | | - key = ns+":"+title; |
84 | | - rank = links.get(key); |
85 | | - if(rank != null) |
86 | | - return rank; |
87 | | - // try lowercase |
88 | | - key = ns+":"+title.toLowerCase(); |
89 | | - rank = links.get(key); |
90 | | - if(rank != null) |
91 | | - return rank; |
92 | | - // try lowercase with first letter upper case |
93 | | - if(title.length()==1) |
94 | | - key = ns+":"+title.toUpperCase(); |
95 | | - else |
96 | | - key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase(); |
97 | | - rank = links.get(key); |
98 | | - if(rank != null) |
99 | | - return rank; |
100 | | - // try title case |
101 | | - key = ns+":"+WordUtils.capitalize(title); |
102 | | - rank = links.get(key); |
103 | | - if(rank != null) |
104 | | - return rank; |
105 | | - // try upper case |
106 | | - key = ns+":"+title.toUpperCase(); |
107 | | - rank = links.get(key); |
108 | | - if(rank != null) |
109 | | - return rank; |
110 | | - // try capitalizing at word breaks |
111 | | - key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
112 | | - rank = links.get(key); |
113 | | - if(rank != null) |
114 | | - return rank; |
115 | | - |
116 | | - return null; |
117 | | - } |
118 | | - |
119 | | - /** Extract all links from this page, and increment ref count for linked pages */ |
120 | | - protected void processLinks(CompactArticleLinks p, String text, int namespace) { |
121 | | - Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
122 | | - Matcher matcher = linkPat.matcher(text); |
123 | | - int ns; String title; |
124 | | - boolean escaped; |
125 | | - |
126 | | - HashSet<CompactArticleLinks> pagelinks = new HashSet<CompactArticleLinks>(); |
127 | | - while(matcher.find()){ |
128 | | - String link = matcher.group(1); |
129 | | - int fragment = link.lastIndexOf('#'); |
130 | | - if(fragment != -1) |
131 | | - link = link.substring(0,fragment); |
132 | | - //System.out.println("Got link "+link); |
133 | | - if(link.startsWith(":")){ |
134 | | - escaped = true; |
135 | | - link = link.substring(1); |
136 | | - } else escaped = false; |
137 | | - ns = 0; |
138 | | - title = link; |
139 | | - // check for ns:title syntax |
140 | | - String[] parts = link.split(":",2); |
141 | | - if(parts.length == 2 && parts[0].length() > 1){ |
142 | | - Integer inx = siteinfo.Namespaces.getIndex(parts[0].substring(0,1).toUpperCase()+parts[0].substring(1).toLowerCase()); |
143 | | - if(!escaped && (parts[0].equalsIgnoreCase("category") || (inx!=null && inx==14))) |
144 | | - continue; // categories, ignore |
145 | | - if(inx!=null && inx < 0) |
146 | | - continue; // special pages, ignore |
147 | | - if(inx != null){ |
148 | | - ns = inx; |
149 | | - title = parts[1]; |
150 | | - } |
151 | | - |
152 | | - // ignore interwiki links |
153 | | - if(interwiki.contains(parts[0])) |
154 | | - continue; |
155 | | - } |
156 | | - if(ns == 0 && namespace!=0) |
157 | | - continue; // skip links from other namespaces into the main namespace |
158 | | - // register as link |
159 | | - CompactArticleLinks target = findArticleLinks(ns,title); |
160 | | - if(target != null) |
161 | | - pagelinks.add(target); |
162 | | - } |
163 | | - // increment page ranks |
164 | | - for(CompactArticleLinks rank : pagelinks){ |
165 | | - rank.links++; |
166 | | - rank.addInLink(p); |
167 | | - p.addOutLink(rank); |
168 | | - } |
169 | | - } |
| 54 | + links.addArticleInfo(revision.Text,new Title(page.Title.Namespace,page.Title.Text)); |
| 55 | + } |
170 | 56 | public void writeSiteinfo(Siteinfo info) throws IOException { |
171 | 57 | siteinfo = info; |
172 | 58 | } |
— | — | @@ -178,7 +64,7 @@ |
179 | 65 | public void writeStartWiki() throws IOException { |
180 | 66 | // nop |
181 | 67 | } |
182 | | - public OldLinks getRanks() { |
| 68 | + public Links getLinks() { |
183 | 69 | return links; |
184 | 70 | } |
185 | 71 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -10,7 +10,9 @@ |
11 | 11 | import java.util.regex.Pattern; |
12 | 12 | |
13 | 13 | import org.apache.commons.lang.WordUtils; |
| 14 | +import org.apache.log4j.Logger; |
14 | 15 | import org.apache.lucene.analysis.Analyzer; |
| 16 | +import org.apache.lucene.analysis.SimpleAnalyzer; |
15 | 17 | import org.apache.lucene.document.Document; |
16 | 18 | import org.apache.lucene.document.Field; |
17 | 19 | import org.apache.lucene.index.IndexReader; |
— | — | @@ -18,15 +20,21 @@ |
19 | 21 | import org.apache.lucene.index.Term; |
20 | 22 | import org.apache.lucene.index.TermDocs; |
21 | 23 | import org.apache.lucene.index.TermEnum; |
| 24 | +import org.apache.lucene.store.Directory; |
| 25 | +import org.apache.lucene.store.RAMDirectory; |
22 | 26 | import org.wikimedia.lsearch.analyzers.SplitAnalyzer; |
23 | 27 | import org.wikimedia.lsearch.beans.Article; |
24 | 28 | import org.wikimedia.lsearch.beans.Title; |
25 | 29 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
26 | 30 | import org.wikimedia.lsearch.config.IndexId; |
27 | 31 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
| 32 | +import org.wikimedia.lsearch.spell.api.Dictionary; |
| 33 | +import org.wikimedia.lsearch.spell.api.LuceneDictionary; |
| 34 | +import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
28 | 35 | import org.wikimedia.lsearch.util.Localization; |
29 | 36 | |
30 | 37 | public class Links { |
| 38 | + static Logger log = Logger.getLogger(Links.class); |
31 | 39 | protected IndexId iid; |
32 | 40 | protected String langCode; |
33 | 41 | protected IndexWriter writer = null; |
— | — | @@ -34,24 +42,61 @@ |
35 | 43 | protected HashSet<String> interwiki = new HashSet<String>(); |
36 | 44 | protected IndexReader reader = null; |
37 | 45 | protected String path; |
| 46 | + protected enum State { MODIFIED_TITLES, FLUSHED, MODIFIED_ARTICLES, READ }; |
| 47 | + protected State state; |
| 48 | + protected Directory directory; |
38 | 49 | |
39 | | - public static final char DELIMITER = '\0'; |
40 | | - |
41 | 50 | private Links(IndexId iid){ |
42 | 51 | this.iid = iid; |
43 | 52 | this.langCode = GlobalConfiguration.getInstance().getLanguage(iid); |
44 | 53 | } |
45 | 54 | |
| 55 | + public static Links openExisting(IndexId iid) throws IOException{ |
| 56 | + Links links = new Links(iid); |
| 57 | + links.path = iid.getTempPath(); |
| 58 | + log.info("Using index at "+links.path); |
| 59 | + links.writer = WikiIndexModifier.openForWrite(links.path,false); |
| 60 | + initWriter(links.writer); |
| 61 | + links.reader = IndexReader.open(links.path); |
| 62 | + links.nsmap = Localization.getLocalizedNamespaces(links.langCode); |
| 63 | + links.interwiki = Localization.getInterwiki(); |
| 64 | + links.state = State.FLUSHED; |
| 65 | + links.directory = links.writer.getDirectory(); |
| 66 | + return links; |
| 67 | + } |
| 68 | + |
| 69 | + private static void initWriter(IndexWriter writer) { |
| 70 | + writer.setMergeFactor(20); |
| 71 | + writer.setMaxBufferedDocs(500); |
| 72 | + writer.setUseCompoundFile(true); |
| 73 | + } |
| 74 | + |
46 | 75 | public static Links createNew(IndexId iid) throws IOException{ |
47 | 76 | Links links = new Links(iid); |
48 | 77 | links.path = iid.getTempPath(); |
| 78 | + log.info("Making index at "+links.path); |
49 | 79 | links.writer = WikiIndexModifier.openForWrite(links.path,true); |
50 | 80 | links.reader = IndexReader.open(links.path); |
51 | 81 | links.nsmap = Localization.getLocalizedNamespaces(links.langCode); |
52 | 82 | links.interwiki = Localization.getInterwiki(); |
| 83 | + links.state = State.FLUSHED; |
| 84 | + links.directory = links.writer.getDirectory(); |
53 | 85 | return links; |
54 | 86 | } |
55 | 87 | |
| 88 | + public static Links createNewInMemory(IndexId iid) throws IOException{ |
| 89 | + Links links = new Links(iid); |
| 90 | + links.path = iid.getTempPath(); |
| 91 | + log.info("Making index at "+links.path); |
| 92 | + links.writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true); |
| 93 | + links.reader = IndexReader.open(links.path); |
| 94 | + links.nsmap = Localization.getLocalizedNamespaces(links.langCode); |
| 95 | + links.interwiki = Localization.getInterwiki(); |
| 96 | + links.state = State.FLUSHED; |
| 97 | + links.directory = links.writer.getDirectory(); |
| 98 | + return links; |
| 99 | + } |
| 100 | + |
56 | 101 | /** Add more entries to namespace mapping (ns_name -> ns_index) */ |
57 | 102 | public void addToNamespaceMap(HashMap<String,Integer> map){ |
58 | 103 | for(Entry<String,Integer> e : map.entrySet()){ |
— | — | @@ -64,11 +109,31 @@ |
65 | 110 | public void flush() throws IOException{ |
66 | 111 | // close & optimize |
67 | 112 | reader.close(); |
| 113 | + if(writer != null){ |
| 114 | + writer.optimize(); |
| 115 | + writer.close(); |
| 116 | + } |
| 117 | + // reopen |
| 118 | + writer = new IndexWriter(directory, new SimpleAnalyzer(), false); |
| 119 | + initWriter(writer); |
| 120 | + reader = IndexReader.open(path); |
| 121 | + state = State.FLUSHED; |
| 122 | + } |
| 123 | + |
| 124 | + /** |
| 125 | + * Flush and stop using this instance for writing.
| 126 | + * The instance can still be used for reading.
| 127 | + * @throws IOException |
| 128 | + */ |
| 129 | + public void flushForRead() throws IOException{ |
| 130 | + // close & optimize |
| 131 | + reader.close(); |
68 | 132 | writer.optimize(); |
69 | 133 | writer.close(); |
70 | 134 | // reopen |
71 | | - writer = WikiIndexModifier.openForWrite(path,false); |
72 | | - reader = IndexReader.open(path); |
| 135 | + reader = IndexReader.open(path); |
| 136 | + writer = null; |
| 137 | + state = State.READ; |
73 | 138 | } |
74 | 139 | |
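The rewritten flush() reopens the writer on the saved Directory so indexing can continue, while the new flushForRead() optimizes, drops the writer, and leaves the instance read-only; the State enum tracks which phase the index is in. A hedged usage sketch (iid and wikitext are placeholders; every call shown is defined in this diff):

void buildLinksIndex(IndexId iid, String wikitext) throws IOException {
    Links links = Links.createNew(iid);                     // State.FLUSHED
    links.addTitle(new Title(0, "Paris"));                  // State.MODIFIED_TITLES
    links.addArticleInfo(wikitext, new Title(0, "Paris"));  // auto-flushes titles first
    links.flushForRead();                                   // optimize, close writer, State.READ
    int backlinks = links.getNumInLinks("0:Paris");         // read-only queries from here on
}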
75 | 140 | /** Add a title to enable proper link analysis when adding articles |
— | — | @@ -77,21 +142,23 @@ |
78 | 143 | Document doc = new Document(); |
79 | 144 | doc.add(new Field("namespace",Integer.toString(t.getNamespace()),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
80 | 145 | doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
81 | | - doc.add(new Field("key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
82 | | - writer.addDocument(doc); |
| 146 | + doc.add(new Field("title_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 147 | + writer.addDocument(doc); |
| 148 | + state = State.MODIFIED_TITLES; |
83 | 149 | } |
84 | 150 | |
85 | 151 | /** Add links and other info from article |
86 | 152 | * @throws IOException */ |
87 | | - public void addArticleInfo(Article a) throws IOException{ |
| 153 | + public void addArticleInfo(String text, Title t) throws IOException{ |
| 154 | + if(state == State.MODIFIED_TITLES) |
| 155 | + flush(); |
88 | 156 | Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]"); |
89 | | - String text = a.getContents(); |
90 | | - int namespace = Integer.parseInt(a.getNamespace()); |
| 157 | + int namespace = t.getNamespace(); |
91 | 158 | Matcher matcher = linkPat.matcher(text); |
92 | 159 | int ns; String title; |
93 | 160 | boolean escaped; |
94 | 161 | HashSet<String> pagelinks = new HashSet<String>(); |
95 | | - HashSet<String> linkkeys = new HashSet<String>(); |
| 162 | + HashSet<String> linkkeys = new HashSet<String>(); |
96 | 163 | |
97 | 164 | Title redirect = Localization.getRedirectTitle(text,langCode); |
98 | 165 | String redirectsTo = null; |
— | — | @@ -101,10 +168,12 @@ |
102 | 169 | while(matcher.find()){ |
103 | 170 | String link = matcher.group(1); |
104 | 171 | String anchor = matcher.group(2); |
| 172 | + if(anchor != null && anchor.length()>1 && anchor.substring(1).equalsIgnoreCase(title(link))) |
| 173 | + anchor = null; // anchor same as link text |
105 | 174 | int fragment = link.lastIndexOf('#'); |
106 | 175 | if(fragment != -1) |
107 | 176 | link = link.substring(0,fragment); |
108 | | - System.out.println("Got link "+link+"|"+anchor); |
| 177 | + //System.out.println("Got link "+link+anchor); |
109 | 178 | if(link.startsWith(":")){ |
110 | 179 | escaped = true; |
111 | 180 | link = link.substring(1); |
— | — | @@ -132,40 +201,33 @@ |
133 | 202 | continue; // skip links from other namespaces into the main namespace |
134 | 203 | String target = findTargetLink(ns,title); |
135 | 204 | if(target != null){ |
| 205 | + //System.out.println("Found "+link); |
136 | 206 | linkkeys.add(target); // for outlink storage |
137 | 207 | pagelinks.add(target+"|"); // for backlinks |
138 | | - if(anchor != null && !"".equals(anchor)) |
139 | | - pagelinks.add(target+"|"+anchor); // for efficient anchortext extraction |
| 208 | + if(anchor != null && !"|".equals(anchor)) |
| 209 | + pagelinks.add(target+anchor); // for efficient anchortext extraction |
140 | 210 | } |
141 | 211 | } |
142 | 212 | } |
143 | | - // index article |
144 | | - Analyzer an = new SplitAnalyzer(DELIMITER); |
| 213 | + // index article |
| 214 | + StringList sl = new StringList(pagelinks); |
| 215 | + StringList lk = new StringList(linkkeys); |
| 216 | + Analyzer an = new SplitAnalyzer(); |
145 | 217 | Document doc = new Document(); |
146 | | - doc.add(new Field("namespace",a.getNamespace(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
147 | | - doc.add(new Field("title",a.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
148 | | - doc.add(new Field("article_key",a.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 218 | + doc.add(new Field("namespace",t.getNamespaceAsString(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 219 | + doc.add(new Field("title",t.getTitle(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 220 | + doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
149 | 221 | if(redirectsTo != null) |
150 | 222 | doc.add(new Field("redirect",redirectsTo,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
151 | | - else |
152 | | - doc.add(new Field("links",join(pagelinks,DELIMITER),Field.Store.NO,Field.Index.TOKENIZED)); |
153 | | - doc.add(new Field("links_stored",join(linkkeys,DELIMITER),Field.Store.YES,Field.Index.NO)); |
| 223 | + else{ |
| 224 | + doc.add(new Field("links",sl.toString(),Field.Store.NO,Field.Index.TOKENIZED)); |
| 225 | + doc.add(new Field("links_stored",lk.toString(),Field.Store.YES,Field.Index.TOKENIZED)); |
| 226 | + } |
154 | 227 | |
155 | 228 | writer.addDocument(doc,an); |
| 229 | + state = State.MODIFIED_ARTICLES; |
156 | 230 | } |
157 | 231 | |
158 | | - protected String join(Collection<String> strs, char join){ |
159 | | - StringBuilder sb = new StringBuilder(); |
160 | | - boolean first = true; |
161 | | - for(String s : strs){ |
162 | | - if(!first) |
163 | | - sb.append(join); |
164 | | - sb.append(s); |
165 | | - first = false; |
166 | | - } |
167 | | - return sb.toString(); |
168 | | - } |
169 | | - |
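The removed join() helper and the DELIMITER constant are superseded by the StringList class used in addArticleInfo() above. StringList itself is not part of this diff, so the stand-in below is an assumption about its shape: a round trip between a collection and a single stored-field string, with an invented newline delimiter.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;

public class StringListSketch {
    /** Join a collection into one string for a stored Lucene field */
    public static String toString(Collection<String> items) {
        StringBuilder sb = new StringBuilder();
        for (String s : items) {
            if (sb.length() > 0) sb.append('\n');
            sb.append(s);
        }
        return sb.toString();
    }
    /** Parse the stored string back into a collection */
    public static Collection<String> toCollection(String serialized) {
        return new ArrayList<String>(Arrays.asList(serialized.split("\n")));
    }
    public static void main(String[] args) {
        String s = toString(Arrays.asList("0:Paris", "0:France"));
        System.out.println(toCollection(s));  // [0:Paris, 0:France]
    }
}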
170 | 232 | 	/** Find the target key (ns:title) to which the link points
171 | 233 | * @throws IOException */ |
172 | 234 | protected String findTargetLink(int ns, String title) throws IOException{ |
— | — | @@ -174,55 +236,102 @@ |
175 | 237 | return null; |
176 | 238 | // try exact match |
177 | 239 | key = ns+":"+title; |
178 | | - if(reader.docFreq(new Term("key",key)) != 0) |
| 240 | + if(reader.docFreq(new Term("title_key",key)) != 0) |
179 | 241 | return key; |
180 | 242 | // try lowercase |
181 | 243 | key = ns+":"+title.toLowerCase(); |
182 | | - if(reader.docFreq(new Term("key",key)) != 0) |
| 244 | + if(reader.docFreq(new Term("title_key",key)) != 0) |
183 | 245 | return key; |
184 | 246 | // try lowercase with first letter upper case |
185 | 247 | if(title.length()==1) |
186 | 248 | key = ns+":"+title.toUpperCase(); |
187 | 249 | else |
188 | 250 | key = ns+":"+title.substring(0,1).toUpperCase()+title.substring(1).toLowerCase(); |
189 | | - if(reader.docFreq(new Term("key",key)) != 0) |
| 251 | + if(reader.docFreq(new Term("title_key",key)) != 0) |
190 | 252 | return key; |
191 | 253 | // try title case |
192 | 254 | key = ns+":"+WordUtils.capitalize(title); |
193 | | - if(reader.docFreq(new Term("key",key)) != 0) |
| 255 | + if(reader.docFreq(new Term("title_key",key)) != 0) |
194 | 256 | return key; |
195 | 257 | // try upper case |
196 | 258 | key = ns+":"+title.toUpperCase(); |
197 | | - if(reader.docFreq(new Term("key",key)) != 0) |
| 259 | + if(reader.docFreq(new Term("title_key",key)) != 0) |
198 | 260 | return key; |
199 | 261 | // try capitalizing at word breaks |
200 | 262 | key = ns+":"+WordUtils.capitalize(title,new char[] {' ','-','(',')','}','{','.',',','?','!'}); |
201 | | - if(reader.docFreq(new Term("key",key)) != 0) |
| 263 | + if(reader.docFreq(new Term("title_key",key)) != 0) |
202 | 264 | return key; |
203 | 265 | |
204 | 266 | return null; |
205 | 267 | } |
206 | 268 | |
207 | 269 | /** Get number of backlinks to this title */ |
208 | | - public int getNumInLinks(Title t) throws IOException{ |
209 | | - return reader.docFreq(new Term("links",t.getKey()+"|")); |
| 270 | + public int getNumInLinks(String key) throws IOException{ |
| 271 | + return reader.docFreq(new Term("links",key+"|")); |
210 | 272 | } |
211 | 273 | |
212 | 274 | /** Get all article titles that redirect to given title */ |
213 | | - public ArrayList<Title> getRedirectsTo(Title t) throws IOException{ |
214 | | - ArrayList<Title> ret = new ArrayList<Title>(); |
215 | | - TermDocs td = reader.termDocs(new Term("redirect",t.getKey())); |
| 275 | + public ArrayList<String> getRedirectsTo(String key) throws IOException{ |
| 276 | + ArrayList<String> ret = new ArrayList<String>(); |
| 277 | + TermDocs td = reader.termDocs(new Term("redirect",key)); |
216 | 278 | while(td.next()){ |
217 | | - ret.add(new Title(reader.document(td.doc()).get("article_key"))); |
| 279 | + ret.add(reader.document(td.doc()).get("article_key")); |
218 | 280 | } |
219 | 281 | return ret; |
220 | 282 | } |
221 | 283 | |
| 284 | + protected void ensureRead() throws IOException { |
| 285 | + if(state != State.READ) |
| 286 | + flushForRead(); |
| 287 | + } |
| 288 | + |
| 289 | + |
| 290 | + 	/** Check whether the article with the given key is a redirect
| 291 | + * @throws IOException */ |
| 292 | + public boolean isRedirect(String key) throws IOException{ |
| 293 | + ensureRead(); |
| 294 | + TermDocs td = reader.termDocs(new Term("article_key",key)); |
| 295 | + if(td.next()){ |
| 296 | + if(reader.document(td.doc()).get("redirect")!=null) |
| 297 | + return true; |
| 298 | + } |
| 299 | + return false; |
| 300 | + } |
| 301 | + |
| 302 | + 	/** If the article is a redirect, return its target key, else null */
| 303 | + public String getRedirectTarget(String key) throws IOException{ |
| 304 | + ensureRead(); |
| 305 | + TermDocs td = reader.termDocs(new Term("article_key",key)); |
| 306 | + if(td.next()){ |
| 307 | + return reader.document(td.doc()).get("redirect"); |
| 308 | + } |
| 309 | + return null; |
| 310 | + } |
| 311 | + |
| 312 | + /** Get only anchors without frequency */ |
| 313 | + public ArrayList<String> getAnchors(String key) throws IOException{ |
| 314 | + ensureRead(); |
| 315 | + ArrayList<String> ret = new ArrayList<String>(); |
| 316 | + TermEnum te = reader.terms(new Term("links",key+"|")); |
| 317 | + while(te.next()){ |
| 318 | + String t = te.term().text(); |
| 319 | + if(!t.startsWith(key) || !te.term().field().equals("links")) |
| 320 | + break; |
| 321 | + ret.add(t.substring(key.length()+1)); |
| 322 | + } |
| 323 | + return ret; |
| 324 | + } |
| 325 | + |
| 326 | + /** Get title part of the key (ns:title) */ |
| 327 | + private String title(String key) { |
| 328 | + return key.substring(key.indexOf(':')+1); |
| 329 | + } |
| 330 | + |
222 | 331 | /** Get anchor texts for given title |
223 | 332 | * @throws IOException */ |
224 | | - public ArrayList<AnchorText> getAnchorText(Title t) throws IOException{ |
| 333 | + public ArrayList<AnchorText> getAnchorText(String key) throws IOException{ |
| 334 | + ensureRead(); |
225 | 335 | ArrayList<AnchorText> ret = new ArrayList<AnchorText>(); |
226 | | - String key = t.getKey(); |
227 | 336 | TermEnum te = reader.terms(new Term("links",key+"|")); |
228 | 337 | while(te.next()){ |
229 | 338 | if(!te.term().text().startsWith(key) || !te.term().field().equals("links")) |
— | — | @@ -233,7 +342,7 @@ |
234 | 343 | } |
235 | 344 | |
236 | 345 | static public class AnchorText { |
237 | | - public String text; |
| 346 | + 		public String text; /* ns:title */
238 | 347 | public int freq; |
239 | 348 | public AnchorText(String text, int freq) { |
240 | 349 | this.text = text; |
— | — | @@ -243,25 +352,81 @@ |
244 | 353 | |
245 | 354 | /** Get all article titles linking to given title |
246 | 355 | * @throws IOException */ |
247 | | - public ArrayList<Title> getInLinks(Title t) throws IOException{ |
248 | | - ArrayList<Title> ret = new ArrayList<Title>(); |
249 | | - TermDocs td = reader.termDocs(new Term("links",t.getKey()+"|")); |
| 356 | + public ArrayList<String> getInLinks(String key, HashMap<Integer,String> keyCache) throws IOException{ |
| 357 | + ensureRead(); |
| 358 | + ArrayList<String> ret = new ArrayList<String>(); |
| 359 | + TermDocs td = reader.termDocs(new Term("links",key+"|")); |
250 | 360 | while(td.next()){ |
251 | | - ret.add(new Title(reader.document(td.doc()).get("article_key"))); |
| 361 | + //ret.add(keyCache.get(td.doc())); |
| 362 | + ret.add(reader.document(td.doc()).get("article_key")); |
252 | 363 | } |
253 | 364 | return ret; |
254 | 365 | } |
255 | 366 | |
256 | 367 | /** Get links from this article to other articles */ |
257 | | - public ArrayList<Title> getOutLinks(Title t) throws IOException{ |
258 | | - ArrayList<Title> ret = new ArrayList<Title>(); |
259 | | - TermDocs td = reader.termDocs(new Term("article_key",t.getKey())); |
| 368 | + public StringList getOutLinks(String key) throws IOException{ |
| 369 | + ensureRead(); |
| 370 | + TermDocs td = reader.termDocs(new Term("article_key",key)); |
260 | 371 | if(td.next()){ |
261 | | - String links = reader.document(td.doc()).get("links_stored"); |
262 | | - for(String key : links.split(""+DELIMITER)){ |
263 | | - ret.add(new Title(key)); |
| 372 | + return new StringList(reader.document(td.doc()).get("links_stored")); |
| 373 | + } |
| 374 | + return null; |
| 375 | + } |
| 376 | + |
| 377 | + public Dictionary getKeys() throws IOException{ |
| 378 | + ensureRead(); |
| 379 | + return new LuceneDictionary(reader,"article_key"); |
| 380 | + } |
| 381 | + @Deprecated |
| 382 | + protected void cacheInLinks() throws IOException{ |
| 383 | + if(state != State.FLUSHED) |
| 384 | + flush(); |
| 385 | + log.info("Caching in-links"); |
| 386 | + int count = 0; |
| 387 | + // docid -> key |
| 388 | + HashMap<Integer,String> keyCache = new HashMap<Integer,String>(); |
| 389 | + Dictionary dict = new LuceneDictionary(reader,"article_key"); |
| 390 | + Word w; |
| 391 | + // build key cache |
| 392 | + while((w = dict.next()) != null){ |
| 393 | + String key = w.getWord(); |
| 394 | + TermDocs td = reader.termDocs(new Term("article_key",key)); |
| 395 | + if(td.next()){ |
| 396 | + keyCache.put(td.doc(),key); |
| 397 | + } else |
| 398 | + log.error("Cannot find article for key "+key); |
| 399 | + } |
| 400 | + |
| 401 | + // get inlinks |
| 402 | + for(String key : keyCache.values()){ |
| 403 | + ArrayList<String> in = getInLinks(key,keyCache); |
| 404 | + Document doc = new Document(); |
| 405 | + doc.add(new Field("inlinks_key",key,Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 406 | + doc.add(new Field("inlinks",new StringList(in).toString(),Field.Store.YES,Field.Index.UN_TOKENIZED)); |
| 407 | + writer.addDocument(doc); |
| 408 | + count ++; |
| 409 | + if(count % 1000 == 0){ |
| 410 | + System.out.println("Cached inlinks for "+count); |
264 | 411 | } |
265 | 412 | } |
266 | | - return ret; |
267 | 413 | } |
| 414 | + |
| 415 | + /** Get all article titles linking to given title (from inlinks cache) |
| 416 | + * @throws IOException */ |
| 417 | + public Collection<String> getInLinksFromCache(String key) throws IOException{ |
| 418 | + ensureRead(); |
| 419 | + TermDocs td = reader.termDocs(new Term("inlinks_key",key)); |
| 420 | + while(td.next()){ |
| 421 | + return new StringList(reader.document(td.doc()).get("inlinks")).toCollection(); |
| 422 | + } |
| 423 | + return new ArrayList<String>(); |
| 424 | + } |
| 425 | + |
| 426 | + public Integer getDocId(String key) throws IOException { |
| 427 | + TermDocs td = reader.termDocs(new Term("article_key",key)); |
| 428 | + if(td.next()){ |
| 429 | + return td.doc(); |
| 430 | + } |
| 431 | + return null; |
| 432 | + } |
268 | 433 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Related.java |
— | — | @@ -1,22 +1,55 @@ |
2 | 2 | package org.wikimedia.lsearch.ranks; |
3 | 3 | |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
| 6 | + |
4 | 7 | public class Related { |
5 | | - protected CompactArticleLinks title; |
6 | | - protected CompactArticleLinks relates; |
| 8 | + protected String title; |
| 9 | + protected String relates; |
7 | 10 | protected double score; |
8 | | - public Related(CompactArticleLinks title, CompactArticleLinks relates, double score) { |
| 11 | + public Related(String title, String relates, double score) { |
9 | 12 | this.title = title; |
10 | 13 | this.relates = relates; |
11 | 14 | this.score = score; |
12 | 15 | } |
| 16 | + |
| 17 | + public Related(String serialized) { |
| 18 | + this.title = null; |
| 19 | + int i = serialized.indexOf(' '); |
| 20 | + this.score = Double.parseDouble(serialized.substring(0,i)); |
| 21 | + this.relates = serialized.substring(i+1); |
| 22 | + } |
| 23 | + |
13 | 24 | @Override |
14 | 25 | public String toString() { |
15 | 26 | return title+"->"+relates+" : "+score; |
16 | 27 | } |
17 | | - public CompactArticleLinks getRelates() { |
| 28 | + |
| 29 | + |
| 30 | + public static ArrayList<String> convertToStringList(Collection<Related> rel){ |
| 31 | + ArrayList<String> ret = new ArrayList<String>(); |
| 32 | + for(Related r : rel){ |
| 33 | + ret.add(r.serialize()); |
| 34 | + } |
| 35 | + return ret; |
| 36 | + } |
| 37 | + |
| 38 | + public static ArrayList<Related> convertToRelatedList(Collection<String> sl){ |
| 39 | + ArrayList<Related> ret = new ArrayList<Related>(); |
| 40 | + for(String s : sl){ |
| 41 | + ret.add(new Related(s)); |
| 42 | + } |
| 43 | + return ret; |
| 44 | + } |
| 45 | + |
| 46 | + public String serialize(){ |
| 47 | + return score+" "+relates; |
| 48 | + } |
| 49 | + |
| 50 | + public String getRelates() { |
18 | 51 | return relates; |
19 | 52 | } |
20 | | - public void setRelates(CompactArticleLinks relates) { |
| 53 | + public void setRelates(String relates) { |
21 | 54 | this.relates = relates; |
22 | 55 | } |
23 | 56 | public double getScore() { |
— | — | @@ -25,10 +58,10 @@ |
26 | 59 | public void setScore(double score) { |
27 | 60 | this.score = score; |
28 | 61 | } |
29 | | - public CompactArticleLinks getTitle() { |
| 62 | + public String getTitle() { |
30 | 63 | return title; |
31 | 64 | } |
32 | | - public void setTitle(CompactArticleLinks title) { |
| 65 | + public void setTitle(String title) { |
33 | 66 | this.title = title; |
34 | 67 | } |
35 | 68 | |
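Related now serializes to a plain "score relates" string. The title is deliberately not written out, so it comes back null after parsing; presumably it is implied by whatever record the string is stored under. A round-trip sketch with invented values (all methods shown are defined above):

Related r = new Related("0:Paris", "0:France", 0.75);
String s = r.serialize();        // "0.75 0:France"
Related back = new Related(s);   // back.getTitle() == null
// back.getRelates() -> "0:France", back.getScore() -> 0.75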
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java |
— | — | @@ -26,6 +26,8 @@ |
27 | 27 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
28 | 28 | import org.wikimedia.lsearch.ranks.OldLinks; |
29 | 29 | import org.wikimedia.lsearch.ranks.RelatedTitle; |
| 30 | +import org.wikimedia.lsearch.storage.ArticleAnalytics; |
| 31 | +import org.wikimedia.lsearch.storage.LinkAnalysisStorage; |
30 | 32 | import org.wikimedia.lsearch.util.Localization; |
31 | 33 | |
32 | 34 | /** |
— | — | @@ -40,11 +42,13 @@ |
41 | 43 | Revision revision; |
42 | 44 | CleanIndexWriter writer; |
43 | 45 | String langCode; |
| 46 | + LinkAnalysisStorage las; |
44 | 47 | |
45 | 48 | public CleanIndexImporter(IndexId iid, String langCode) throws IOException{ |
46 | 49 | Configuration.open(); // make sure configuration is loaded |
47 | 50 | this.writer = new CleanIndexWriter(iid); |
48 | 51 | this.langCode = langCode; |
| 52 | + this.las = new LinkAnalysisStorage(iid); |
49 | 53 | } |
50 | 54 | public void writeRevision(Revision revision) throws IOException { |
51 | 55 | this.revision = revision; |
— | — | @@ -53,22 +57,27 @@ |
54 | 58 | this.page = page; |
55 | 59 | } |
56 | 60 | public void writeEndPage() throws IOException { |
| 61 | + String key = page.Title.Namespace+":"+page.Title.Text; |
| 62 | + ArticleAnalytics aa = las.getAnalitics(key); |
| 63 | + int references = aa.getReferences(); |
| 64 | + boolean isRedirect = aa.isRedirect(); |
| 65 | + |
| 66 | + // make list of redirects |
57 | 67 | ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
58 | | - boolean isRedirect = Localization.getRedirectTarget(revision.Text,langCode) != null; |
59 | | - ArrayList<RelatedTitle> related = new ArrayList<RelatedTitle>(); |
| 68 | + ArrayList<String> anchors = new ArrayList<String>(); |
| 69 | + anchors.addAll(aa.getAnchorText()); |
| 70 | + for(String rk : aa.getRedirectKeys()){ |
| 71 | + 			String[] parts = rk.split(":",2);
| 72 | + ArticleAnalytics raa = las.getAnalitics(rk); |
| 73 | + redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],raa.getReferences())); |
| 74 | + anchors.addAll(raa.getAnchorText()); |
| 75 | + } |
60 | 76 | // make article |
61 | | - Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects,related); |
62 | | - //if(page.Title.Namespace != 0) |
63 | | - // article.setContents(""); |
| 77 | + Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect, |
| 78 | + references,redirects,new ArrayList<RelatedTitle>(),anchors); |
| 79 | + // Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,0,redirects,related); |
64 | 80 | |
65 | | - writer.addMainArticle(article); |
66 | | - //writer.addAllArticle(article); |
67 | | - // generate phrases |
68 | | - /* FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(page.Title.Text,langCode,false); |
69 | | - ArrayList<Token> tokens = parser.parse(); |
70 | | - for(int i=0;i<tokens.size()-1;i++){ |
71 | | - phrases.addPhrase(tokens.get(i).termText(),tokens.get(i+1).termText()); |
72 | | - } */ |
| 81 | + writer.addArticle(article); |
73 | 82 | } |
74 | 83 | |
75 | 84 | public void close() throws IOException { |
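writeEndPage() now keys everything by the "ns:title" convention and folds anchor text contributed by redirects into the article's own anchors. One detail worth keeping in mind is the limit-2 split when a redirect key is decoded, which preserves colons inside the title itself:

String rk = "4:Help:Contents";        // invented example key
String[] parts = rk.split(":", 2);
int ns = Integer.parseInt(parts[0]);  // 4
String title = parts[1];              // "Help:Contents", inner colon intact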
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -8,8 +8,6 @@ |
9 | 9 | import java.util.HashMap; |
10 | 10 | import java.util.HashSet; |
11 | 11 | import java.util.Iterator; |
12 | | -import java.util.LinkedList; |
13 | | -import java.util.List; |
14 | 12 | import java.util.Set; |
15 | 13 | import java.util.WeakHashMap; |
16 | 14 | import java.util.Map.Entry; |
— | — | @@ -40,7 +38,6 @@ |
41 | 39 | import org.wikimedia.lsearch.search.NamespaceFilter; |
42 | 40 | import org.wikimedia.lsearch.search.NamespaceFilterWrapper; |
43 | 41 | import org.wikimedia.lsearch.search.SearcherCache; |
44 | | -import org.wikimedia.lsearch.spell.api.NamespaceFreq; |
45 | 42 | import org.wikimedia.lsearch.spell.api.NgramIndexer; |
46 | 43 | import org.wikimedia.lsearch.spell.dist.DoubleMetaphone; |
47 | 44 | import org.wikimedia.lsearch.spell.dist.EditDistance; |
— | — | @@ -48,11 +45,8 @@ |
49 | 46 | public class Suggest { |
50 | 47 | static Logger log = Logger.getLogger(Suggest.class); |
51 | 48 | protected IndexId iid; |
52 | | - protected IndexSearcher words; |
53 | | - protected IndexSearcher titles; |
54 | | - protected IndexReader titlesReader; |
55 | | - protected int minHitsWords; |
56 | | - protected int minHitsTitles; |
| 49 | + protected IndexSearcher searcher; |
| 50 | + protected IndexReader reader; |
57 | 51 | protected static WeakHashMap<IndexSearcher,Set<String>> stopWordsIndexes = new WeakHashMap<IndexSearcher,Set<String>>(); |
58 | 52 | protected Set<String> stopWords; |
59 | 53 | |
— | — | @@ -100,26 +94,23 @@ |
101 | 95 | SearcherCache cache = SearcherCache.getInstance(); |
102 | 96 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
103 | 97 | this.iid = iid; |
104 | | - this.words = cache.getLocalSearcher(iid.getSpellWords()); |
105 | | - this.titles = cache.getLocalSearcher(iid.getSpellTitles()); |
106 | | - this.titlesReader = titles.getIndexReader(); |
107 | | - this.minHitsWords = global.getIntDBParam(iid.getDBname(),"spell_words","minHits",20); |
108 | | - this.minHitsTitles = global.getIntDBParam(iid.getDBname(),"spell_titles","minHits",20); |
| 98 | + this.searcher = cache.getLocalSearcher(iid.getSpell()); |
| 99 | + this.reader = searcher.getIndexReader(); |
109 | 100 | |
110 | 101 | synchronized(stopWordsIndexes){ |
111 | | - if(!stopWordsIndexes.containsKey(titles)){ |
| 102 | + if(!stopWordsIndexes.containsKey(searcher)){ |
112 | 103 | Set<String> s = Collections.synchronizedSet(new HashSet<String>()); |
113 | | - stopWordsIndexes.put(titles,s); |
114 | | - TermDocs d = titles.getIndexReader().termDocs(new Term("metadata_key","stopWords")); |
| 104 | + stopWordsIndexes.put(searcher,s); |
| 105 | + TermDocs d = searcher.getIndexReader().termDocs(new Term("metadata_key","stopWords")); |
115 | 106 | if(d.next()){ |
116 | | - String val = titles.doc(d.doc()).get("metadata_value"); |
| 107 | + String val = searcher.doc(d.doc()).get("metadata_value"); |
117 | 108 | for(String sw : val.split(" ")){ |
118 | 109 | s.add(sw); |
119 | 110 | } |
120 | 111 | } |
121 | 112 | } |
122 | 113 | this.stopWords = new HashSet<String>(); |
123 | | - this.stopWords.addAll(stopWordsIndexes.get(titles)); |
| 114 | + this.stopWords.addAll(stopWordsIndexes.get(searcher)); |
124 | 115 | log.info("Using stop words "+stopWords); |
125 | 116 | } |
126 | 117 | } |
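The constructor now shares one searcher/reader pair and caches stop words per IndexSearcher in the WeakHashMap, reading them from a metadata document stored inside the spell-check index itself. Stripped of the caching, the lookup boils down to the following fragment (reader is any open IndexReader on that index; field names as above):

TermDocs d = reader.termDocs(new Term("metadata_key", "stopWords"));
Set<String> stop = new HashSet<String>();
if (d.next()) {
    for (String sw : reader.document(d.doc()).get("metadata_value").split(" "))
        stop.add(sw);
}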
— | — | @@ -224,7 +215,7 @@ |
225 | 216 | continue; |
226 | 217 | |
227 | 218 | String phrase = w+gap+w2; |
228 | | - if(titlesReader.docFreq(new Term("phrase",phrase)) != 0){ |
| 219 | + if(reader.docFreq(new Term("phrase",phrase)) != 0){ |
229 | 220 | correctPhrases.add(i); |
230 | 221 | correctPhrases.add(i2); |
231 | 222 | } else if(correctWords.contains(w) && correctWords.contains(w2)){ |
— | — | @@ -258,9 +249,9 @@ |
259 | 250 | // suggest word |
260 | 251 | ArrayList<SuggestResult> sug; |
261 | 252 | if(correctWords.contains(w)) |
262 | | - sug = suggestWordsFromTitle(w,w,nsf,POOL/2,POOL/2); |
| 253 | + sug = suggestWords(w,w,nsf,POOL/2,POOL/2); |
263 | 254 | else |
264 | | - sug = suggestWordsFromTitle(w,nsf,POOL); |
| 255 | + sug = suggestWords(w,nsf,POOL); |
265 | 256 | if(sug.size() > 0){ |
266 | 257 | wordSug.add(sug); |
267 | 258 | SuggestResult maybeStopWord = null; |
— | — | @@ -287,7 +278,7 @@ |
288 | 279 | possibleStopWords.add(null); |
289 | 280 | } |
290 | 281 | // suggest split |
291 | | - SuggestResult split = suggestSplitFromTitle(w,nsf,minFreq); |
| 282 | + SuggestResult split = suggestSplit(w,nsf,minFreq); |
292 | 283 | if(split != null){ |
293 | 284 | Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT); |
294 | 285 | sc.substitutes.put(i,split.word.replace("_"," ")); |
— | — | @@ -297,7 +288,7 @@ |
298 | 289 | if(i-1 >= 0 |
299 | 290 | && (wordSug.get(i-1)==null || wordSug.get(i-1).get(0).dist!=0) |
300 | 291 | && (wordSug.get(i)==null || wordSug.get(i).get(0).dist!=0)){ |
301 | | - SuggestResult join = suggestJoinFromTitle(tokens.get(i-1).termText(),w,nsf,minFreq); |
| 292 | + SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,nsf,minFreq); |
302 | 293 | if(join != null){ |
303 | 294 | Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN); |
304 | 295 | sc.substitutes.put(i-1,""); |
— | — | @@ -356,12 +347,12 @@ |
357 | 348 | String phrase = s1.word+gap+s2.word; |
358 | 349 | int freq = 0; |
359 | 350 | boolean inTitle = false; |
360 | | - TermDocs td = titlesReader.termDocs(new Term("phrase",phrase)); |
| 351 | + TermDocs td = reader.termDocs(new Term("phrase",phrase)); |
361 | 352 | if(td.next()){ |
362 | 353 | int docid = td.doc(); |
363 | | - String f = titlesReader.document(docid).get("freq"); |
| 354 | + String f = reader.document(docid).get("freq"); |
364 | 355 | freq = Integer.parseInt(f.substring(2,f.length()-1)); |
365 | | - String it = titlesReader.document(docid).get("intitle"); |
| 356 | + String it = reader.document(docid).get("intitle"); |
366 | 357 | if(it!=null && it.equals("1")) |
367 | 358 | inTitle = true; |
368 | 359 | |
— | — | @@ -413,7 +404,7 @@ |
414 | 405 | } |
415 | 406 | if(madeChanges){ |
416 | 407 | // check if some title exactly matches the spell-checked query |
417 | | - if(titlesReader.docFreq(new Term("title",title.toLowerCase())) != 0){ |
| 408 | + if(reader.docFreq(new Term("title",title.toLowerCase())) != 0){ |
418 | 409 | log.info("Found title match for "+title); |
419 | 410 | return new SuggestQuery(tidy(title),tidy(formated)); |
420 | 411 | } |
— | — | @@ -431,7 +422,7 @@ |
432 | 423 | if(r.getDist() > maxdist) |
433 | 424 | break; |
434 | 425 | String title = r.getWord(); |
435 | | - if(titlesReader.docFreq(new Term("title",title.toLowerCase())) != 0){ |
| 426 | + if(reader.docFreq(new Term("title",title.toLowerCase())) != 0){ |
436 | 427 | log.info("Found title match for "+title); |
437 | 428 | return new SuggestQuery(tidy(title),tidy(markSuggestion(searchterm,t,title))); |
438 | 429 | } |
— | — | @@ -492,48 +483,6 @@ |
493 | 484 | } |
494 | 485 | |
495 | 486 | return null; |
496 | | - } |
497 | | - |
498 | | - protected boolean addPhraseSuggestion(ArrayList<Token> tokens, int i1, int i2, ArrayList<Change> suggestions, NamespaceFilter nsf, int minFreq) { |
499 | | - Token t1 = tokens.get(i1); |
500 | | - Token t2 = tokens.get(i2); |
501 | | - if(t2.type().equals(t1.type())){ |
502 | | - String word1 = t1.termText(); |
503 | | - String word2 = t2.termText(); |
504 | | - if(stopWords.contains(word1) || stopWords.contains(word2)) |
505 | | - return false; |
506 | | - log.info("spell-check phrase \""+word1+" "+word2+"\""); |
507 | | - // phrase |
508 | | - ArrayList<SuggestResult> r = suggestPhraseFromTitle(word1,word2,1,nsf,minFreq); |
509 | | - if(r.size() > 0){ |
510 | | - SuggestResult res = r.get(0); |
511 | | - String[] ph = res.word.split("_"); |
512 | | - if(ph.length == 2){ |
513 | | - // figure out which words need to be changed |
514 | | - Change sc = new Change(res.dist,res.frequency,Change.Type.PHRASE); |
515 | | - if(!ph[0].equals(word1)) |
516 | | - sc.substitutes.put(i1,ph[0]); |
517 | | - else |
518 | | - sc.preserves.put(i1,ph[0]); |
519 | | - if(!ph[1].equals(word2)) |
520 | | - sc.substitutes.put(i2,ph[1]); |
521 | | - else |
522 | | - sc.preserves.put(i2,ph[1]); |
523 | | - suggestions.add(sc); |
524 | | - } else |
525 | | - log.error("Unexpected phrase in suggest result "+res); |
526 | | - } |
527 | | - // join |
528 | | - SuggestResult join = suggestJoinFromTitle(word1,word2,nsf,minFreq); |
529 | | - if(join != null){ |
530 | | - Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN); |
531 | | - sc.substitutes.put(i1,""); |
532 | | - sc.substitutes.put(i2,join.word); |
533 | | - suggestions.add(sc); |
534 | | - } |
535 | | - return true; |
536 | | - } |
537 | | - return false; |
538 | 487 | } |
539 | 488 | |
540 | 489 | protected String markSuggestion(String formated, Token t, String newWord){ |
— | — | @@ -616,59 +565,12 @@ |
617 | 566 | return new Object[] {proposedChanges, preservedWords}; |
618 | 567 | } |
619 | 568 | |
620 | | - /** Suggest some words from the words index */ |
621 | | - public ArrayList<SuggestResult> suggestWords(String word, int num){ |
622 | | - Metric metric = new Metric(word); |
623 | | - BooleanQuery bq = new BooleanQuery(); |
624 | | - addQuery(bq,"metaphone1",metric.meta1,2); |
625 | | - addQuery(bq,"metaphone2",metric.meta2,2); |
626 | | - bq.add(makeWordQuery(word,""),BooleanClause.Occur.SHOULD); |
627 | | - |
628 | | - try { |
629 | | - TopDocs docs = words.search(bq,null,POOL); |
630 | | - ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
631 | | - int minfreq = -1; |
632 | | - // fetch results, calculate various edit distances |
633 | | - for(ScoreDoc sc : docs.scoreDocs){ |
634 | | - Document d = words.doc(sc.doc); |
635 | | - String w = d.get("word"); |
636 | | - SuggestResult r = new SuggestResult(w, |
637 | | - Integer.parseInt(d.get("freq")), |
638 | | - metric); |
639 | | - if(word.equals(r.word)){ |
640 | | - minfreq = r.frequency; |
641 | | - } |
642 | | - if(acceptWord(r,metric)) |
643 | | - res.add(r); |
644 | | - } |
645 | | - // filter out |
646 | | - if(minfreq != -1){ |
647 | | - for(int i=0;i<res.size();){ |
648 | | - if(res.get(i).frequency < minfreq ){ |
649 | | - res.remove(i); |
650 | | - } else |
651 | | - i++; |
652 | | - } |
653 | | - } |
654 | | - // sort |
655 | | - Collections.sort(res,new SuggestResult.Comparator()); |
656 | | - ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>(); |
657 | | - for(int i=0;i<num && i<res.size();i++) |
658 | | - ret.add(res.get(i)); |
659 | | - return ret; |
660 | | - } catch (IOException e) { |
661 | | - log.error("Cannot get suggestions for "+word+" at "+iid+" : "+e.getMessage()); |
662 | | - e.printStackTrace(); |
663 | | - return new ArrayList<SuggestResult>(); |
664 | | - } |
665 | | - } |
666 | | - |
667 | | - public ArrayList<SuggestResult> suggestWordsFromTitle(String word, NamespaceFilter nsf, int num){ |
668 | | - ArrayList<SuggestResult> r1 = suggestWordsFromTitle(word,word,nsf,POOL,POOL); |
| 569 | + public ArrayList<SuggestResult> suggestWords(String word, NamespaceFilter nsf, int num){ |
| 570 | + ArrayList<SuggestResult> r1 = suggestWords(word,word,nsf,POOL,POOL); |
669 | 571 | if(r1 != null && r1.size() > 0){ |
670 | 572 | if(r1.get(0).dist == 0) |
671 | 573 | return r1; |
672 | | - ArrayList<SuggestResult> r2 = suggestWordsFromTitle(word,r1.get(0).word,nsf,POOL/2,POOL/2); |
| 574 | + ArrayList<SuggestResult> r2 = suggestWords(word,r1.get(0).word,nsf,POOL/2,POOL/2); |
673 | 575 | if(r2 != null && r2.size() > 0){ |
674 | 576 | HashSet<SuggestResult> hr = new HashSet<SuggestResult>(); |
675 | 577 | hr.addAll(r1); hr.addAll(r2); |
— | — | @@ -682,54 +584,27 @@ |
683 | 585 | return r1; |
684 | 586 | } |
685 | 587 | |
686 | | - public ArrayList<SuggestResult> suggestWordsFromTitle(String word, String searchword, NamespaceFilter nsf, int num, int pool_size){ |
| 588 | + public ArrayList<SuggestResult> suggestWords(String word, String searchword, NamespaceFilter nsf, int num, int pool_size){ |
687 | 589 | Metric metric = new Metric(word); |
688 | 590 | BooleanQuery bq = new BooleanQuery(); |
689 | 591 | bq.add(makeWordQuery(searchword,"word"),BooleanClause.Occur.SHOULD); |
690 | 592 | |
691 | 593 | try { |
692 | | - TopDocs docs = titles.search(bq,new NamespaceFilterWrapper(nsf),pool_size); |
| 594 | + TopDocs docs = searcher.search(bq,new NamespaceFilterWrapper(nsf),pool_size); |
693 | 595 | ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
694 | | - int minfreq = -1; |
695 | 596 | // fetch results, calculate various edit distances |
696 | 597 | for(ScoreDoc sc : docs.scoreDocs){ |
697 | | - Document d = titles.doc(sc.doc); |
| 598 | + Document d = searcher.doc(sc.doc); |
698 | 599 | String w = d.get("word"); |
699 | 600 | String f = d.get("freq"); |
700 | 601 | String meta1 = d.get("meta1"); |
701 | 602 | String meta2 = d.get("meta2"); |
702 | 603 | SuggestResult r = new SuggestResult(w, // new NamespaceFreq(d.get("freq")).getFrequency(nsf), |
703 | 604 | Integer.parseInt(f.substring(2,f.length()-1)), |
704 | | - metric, meta1, meta2); |
705 | | - if(word.equals(r.word)){ |
706 | | - minfreq = r.frequency; |
707 | | - } |
| 605 | + metric, meta1, meta2); |
708 | 606 | if(acceptWord(r,metric)) |
709 | 607 | res.add(r); |
710 | 608 | } |
711 | | - // filter out |
712 | | - /*if(minfreq != -1){ |
713 | | - for(int i=0;i<res.size();){ |
714 | | - if(res.get(i).frequency < minfreq ){ |
715 | | - res.remove(i); |
716 | | - } else |
717 | | - i++; |
718 | | - } |
719 | | - } */ |
720 | | - // suggest simple inversion since it probably won't be found |
721 | | - /* if(word.length() == 2){ |
722 | | - String inv = NgramIndexer.reverse(word); |
723 | | - TermDocs td = titlesReader.termDocs(new Term("word",inv)); |
724 | | - int freq = 0; |
725 | | - if(td.next()){ |
726 | | - freq = new NamespaceFreq(titlesReader.document(td.doc()).get("freq")).getFrequency(nsf); |
727 | | - SuggestResult r = new SuggestResult(inv, |
728 | | - freq, |
729 | | - metric); |
730 | | - //if(acceptWord(r,metric)) |
731 | | - res.add(r); |
732 | | - } |
733 | | - } */ |
734 | 609 | // sort |
735 | 610 | Collections.sort(res,new SuggestResult.Comparator()); |
736 | 611 | ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>(); |
— | — | @@ -785,22 +660,22 @@ |
786 | 661 | } |
787 | 662 | |
788 | 663 | /** Try to split word into 2 words which make up a phrase */ |
789 | | - public SuggestResult suggestSplitFromTitle(String word, NamespaceFilter nsf, int minFreq){ |
| 664 | + public SuggestResult suggestSplit(String word, NamespaceFilter nsf, int minFreq){ |
790 | 665 | int freq = 0; |
791 | 666 | Hits hits; |
792 | 667 | ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
793 | 668 | try { |
794 | 669 | // find frequency |
795 | | - hits = titles.search(new TermQuery(new Term("word",word)),new NamespaceFilterWrapper(nsf)); |
| 670 | + hits = searcher.search(new TermQuery(new Term("word",word)),new NamespaceFilterWrapper(nsf)); |
796 | 671 | if(hits.length() == 1) |
797 | | - freq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf); |
| 672 | + freq = Integer.parseInt(hits.doc(0).get("freq")); |
798 | 673 | |
799 | 674 | // try different splits |
800 | 675 | for(int i=1;i<word.length()-1;i++){ |
801 | 676 | String phrase = word.substring(0,i) + "_" + word.substring(i); |
802 | | - hits = titles.search(new TermQuery(new Term("phrase",phrase)),new NamespaceFilterWrapper(nsf)); |
| 677 | + hits = searcher.search(new TermQuery(new Term("phrase",phrase)),new NamespaceFilterWrapper(nsf)); |
803 | 678 | if(hits.length() > 0){ |
804 | | - int pfreq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf); |
| 679 | + int pfreq = Integer.parseInt(hits.doc(0).get("freq")); |
805 | 680 | if(pfreq >= freq && pfreq > minFreq) |
806 | 681 | res.add(new SuggestResult(phrase,pfreq,2)); |
807 | 682 | } |
— | — | @@ -817,11 +692,11 @@ |
818 | 693 | } |
819 | 694 | |
820 | 695 | /** Returns suggestion if joining words makes sense */ |
821 | | - public SuggestResult suggestJoinFromTitle(String word1, String word2, NamespaceFilter nsf, int minFreq){ |
| 696 | + public SuggestResult suggestJoin(String word1, String word2, NamespaceFilter nsf, int minFreq){ |
822 | 697 | try { |
823 | | - Hits hits = titles.search(new TermQuery(new Term("word",word1+word2)),new NamespaceFilterWrapper(nsf)); |
| 698 | + Hits hits = searcher.search(new TermQuery(new Term("word",word1+word2)),new NamespaceFilterWrapper(nsf)); |
824 | 699 | if(hits.length() > 0){ |
825 | | - int freq = new NamespaceFreq(hits.doc(0).get("freq")).getFrequency(nsf); |
| 700 | + int freq = Integer.parseInt(hits.doc(0).get("freq")); |
826 | 701 | if(freq >= minFreq) |
827 | 702 | return new SuggestResult(word1+word2,freq,1); |
828 | 703 | } |
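suggestSplit() and suggestJoin() (renamed from their ...FromTitle variants) now read plain integer frequencies instead of decoding NamespaceFreq strings. Both decide purely on corpus counts: a split qualifies only if the two-word phrase is at least as frequent as the unsplit word and clears minFreq; a join only if the concatenation clears minFreq. A distilled, index-free sketch of the split rule, with the stored-field lookup replaced by a map (the real method collects and sorts all qualifying splits; picking the most frequent one here is a simplification):

static String bestSplit(String word, java.util.Map<String, Integer> phraseFreq,
                        int wordFreq, int minFreq) {
    String best = null;
    int bestFreq = -1;
    for (int i = 1; i < word.length() - 1; i++) {
        String phrase = word.substring(0, i) + "_" + word.substring(i);
        Integer pfreq = phraseFreq.get(phrase);  // stands in for the "freq" stored field
        if (pfreq != null && pfreq >= wordFreq && pfreq > minFreq && pfreq > bestFreq) {
            best = phrase;
            bestFreq = pfreq;
        }
    }
    return best;  // e.g. "base_ball" if that phrase is frequent enough
}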
— | — | @@ -832,55 +707,6 @@ |
833 | 708 | return null; |
834 | 709 | } |
835 | 710 | |
836 | | - /** Suggest phrase from a titles index, if the phrase is correct will return it as first result */ |
837 | | - public ArrayList<SuggestResult> suggestPhraseFromTitle(String word1, String word2, int num, NamespaceFilter nsf, int minFreq){ |
838 | | - String phrase = word1+"_"+word2; |
839 | | - Query q = makeWordQuery(phrase,"phrase"); |
840 | | - Metric m1 = new Metric(word1); |
841 | | - Metric m2 = new Metric(word2); |
842 | | - Metric metric = new Metric(phrase); |
843 | | - try { |
844 | | - TopDocs docs = titles.search(q,new NamespaceFilterWrapper(nsf),POOL/2); |
845 | | - ArrayList<SuggestResult> res = new ArrayList<SuggestResult>(); |
846 | | - int minfreq = (minFreq == 0)? -1 : minFreq; |
847 | | - // fetch results |
848 | | - for(ScoreDoc sc : docs.scoreDocs){ |
849 | | - Document d = titles.doc(sc.doc); |
850 | | - String p = d.get("phrase"); |
851 | | - int freq = new NamespaceFreq(d.get("freq")).getFrequency(nsf); |
852 | | - SuggestResult r = new SuggestResult(p,freq,metric); |
853 | | - if(phrase.equals(r.word) && minfreq == -1){ |
854 | | - minfreq = r.frequency; |
855 | | - } |
856 | | - String[] words = p.split("_"); |
857 | | - SuggestResult r1 = new SuggestResult(words[0],freq,m1); |
858 | | - SuggestResult r2 = new SuggestResult(words[1],freq,m2); |
859 | | - if(r.dist < phrase.length() / 2 && acceptWord(r1,m1) && acceptWord(r2,m2)) // don't add if it will change more than half of the phrase |
860 | | - res.add(r); |
861 | | - } |
862 | | - // filter out |
863 | | - if(minfreq != -1){ |
864 | | - for(int i=0;i<res.size();){ |
865 | | - if(res.get(i).frequency < minfreq ){ |
866 | | - res.remove(i); |
867 | | - } else |
868 | | - i++; |
869 | | - } |
870 | | - } |
871 | | - // sort |
872 | | - Collections.sort(res,new SuggestResult.Comparator()); |
873 | | - // get first num results |
874 | | - while(res.size() > num){ |
875 | | - res.remove(res.size()-1); |
876 | | - } |
877 | | - return res; |
878 | | - } catch (IOException e) { |
879 | | - log.error("Cannot get suggestions for "+phrase+" at "+iid+" : "+e.getMessage()); |
880 | | - e.printStackTrace(); |
881 | | - return new ArrayList<SuggestResult>(); |
882 | | - } |
883 | | - } |
884 | | - |
885 | 711 | /** check if two words have same stemmed variants */ |
886 | 712 | public boolean stemsToSame(String word1, String word2, FilterFactory filters){ |
887 | 713 | if(!filters.hasStemmer()) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java |
— | — | @@ -62,19 +62,14 @@ |
63 | 63 | if(text.length()>=2){ |
64 | 64 | System.out.println("METAPHONES: "+dmeta.doubleMetaphone(text)+", "+dmeta.doubleMetaphone(text,true)); |
65 | 65 | System.out.println("SUGGEST: "); |
66 | | - for(SuggestResult r : sc.suggestWords(text,10)){ |
| 66 | + for(SuggestResult r : sc.suggestWords(text,new NamespaceFilter(ns),10)){ |
67 | 67 | System.out.println(r); |
68 | 68 | } |
69 | | - System.out.println("SUGGEST_TITLE: "); |
70 | | - for(SuggestResult r : sc.suggestWordsFromTitle(text,new NamespaceFilter(ns),10)){ |
71 | | - System.out.println(r); |
72 | | - } |
73 | 69 | |
74 | | - System.out.println("SPLIT: "+sc.suggestSplitFromTitle(text,new NamespaceFilter(ns),0)); |
| 70 | + System.out.println("SPLIT: "+sc.suggestSplit(text,new NamespaceFilter(ns),0)); |
75 | 71 | } |
76 | 72 | if(last != null){ |
77 | | - System.out.println("JOIN: "+sc.suggestJoinFromTitle(last,text,new NamespaceFilter(ns),0)); |
78 | | - System.out.println("PHRASE: "+sc.suggestPhraseFromTitle(last,text,2,new NamespaceFilter(ns),0)); |
| 73 | + System.out.println("JOIN: "+sc.suggestJoin(last,text,new NamespaceFilter(ns),0)); |
79 | 74 | } |
80 | 75 | last = text; |
81 | 76 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
22 | 22 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
23 | 23 | import org.wikimedia.lsearch.index.WikiSimilarity; |
| 24 | +import org.wikimedia.lsearch.search.NamespaceFilter; |
24 | 25 | import org.wikimedia.lsearch.util.HighFreqTerms; |
25 | 26 | |
26 | 27 | /** |
— | — | @@ -32,37 +33,85 @@ |
33 | 34 | public class CleanIndexWriter { |
34 | 35 | static Logger log = Logger.getLogger(CleanIndexWriter.class); |
35 | 36 | protected IndexId iid; |
36 | | - protected IndexWriter writerMain; |
37 | | - protected IndexWriter writerAll; |
| 37 | + protected IndexWriter writer; |
38 | 38 | protected FieldBuilder builder; |
39 | 39 | protected String langCode; |
| 40 | + protected NamespaceFilter nsf; |
40 | 41 | |
41 | 42 | public static final String[] ENGLISH_STOP_WORDS = { |
42 | | - "a", "an", "and", "are", "as", "at", "be", "but", "by", |
43 | | - "for", "if", "in", "into", "is", "it", |
44 | | - "no", "not", "of", "on", "or", "such", |
45 | | - "that", "the", "their", "then", "there", "these", |
46 | | - "they", "this", "to", "was", "will", "with" |
47 | | - }; |
| 43 | + "a", "an", "and", "are", "as", "at", "be", "but", "by", |
| 44 | + "for", "if", "in", "into", "is", "it", |
| 45 | + "no", "not", "of", "on", "or", "such", |
| 46 | + "that", "the", "their", "then", "there", "these", |
| 47 | + "they", "this", "to", "was", "will", "with" |
| 48 | + }; |
| 49 | + |
| 50 | + public final static String[] FRENCH_STOP_WORDS = { |
| 51 | + "a", "afin", "ai", "ainsi", "apres", "attendu", "au", "aujourd", "auquel", "aussi", |
| 52 | + "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir", |
| 53 | + "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain", |
| 54 | + "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci", |
| 55 | + "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout", |
| 56 | + "dedans", "dehors", "dela", "depuis", "derriere", "des", "desormais", "desquelles", |
| 57 | + "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse", |
| 58 | + "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "des", "elle", "elles", |
| 59 | + "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepte", "hormis", |
| 60 | + "hors", "helas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle", |
| 61 | + "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "la", |
| 62 | + "ma", "mais", "malgre", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi", |
| 63 | + "moins", "mon", "moyennant", "meme", "memes", "n", "ne", "ni", "non", "nos", "notre", |
| 64 | + "nous", "neanmoins", "notre", "notres", "on", "ont", "ou", "outre", "ou", "par", "parmi", |
| 65 | + "partant", "pas", "passe", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi", |
| 66 | + "proche", "pres", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels", |
| 67 | + "qui", "quoi", "quoique", "revoici", "revoila", "s", "sa", "sans", "sauf", "se", "selon", |
| 68 | + "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit", |
| 69 | + "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes", |
| 70 | + "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers", |
| 71 | + "voici", "voila", "vos", "votre", "vous", "vu", "votre", "votres", "y", "a", "ca", "es", |
| 72 | + "ete", "etre", "o" |
| 73 | + }; |
48 | 74 | |
| 75 | + public final static String[] GERMAN_STOP_WORDS = { |
| 76 | + "einer", "eine", "eines", "einem", "einen", |
| 77 | + "der", "die", "das", "dass", "daß", |
| 78 | + "du", "er", "sie", "es", |
| 79 | + "was", "wer", "wie", "wir", |
| 80 | + "und", "oder", "ohne", "mit", |
| 81 | + "am", "im", "in", "aus", "auf", |
| 82 | + "ist", "sein", "war", "wird", |
| 83 | + "ihr", "ihre", "ihres", |
| 84 | + "als", "für", "von", "mit", |
| 85 | + "dich", "dir", "mich", "mir", |
| 86 | + "mein", "sein", "kein", |
| 87 | + "durch", "wegen", "wird" |
| 88 | + }; |
| 89 | + |
49 | 90 | public CleanIndexWriter(IndexId iid) throws IOException{ |
| 91 | + GlobalConfiguration global = GlobalConfiguration.getInstance(); |
50 | 92 | this.iid = iid; |
51 | 93 | this.builder = new FieldBuilder("",FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK); |
52 | | - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
| 94 | + this.langCode = global.getLanguage(iid.getDBname()); |
53 | 95 | HashSet<String> stopWords = new HashSet<String>(); |
54 | | - if(langCode.equals("en")){ |
55 | | - for(String w : ENGLISH_STOP_WORDS) |
56 | | - stopWords.add(w); |
| 96 | + String[] words = null; |
| 97 | + if(langCode.equals("en")) |
| 98 | + words = ENGLISH_STOP_WORDS; |
| 99 | + else if(langCode.equals("de")) |
| 100 | + words = GERMAN_STOP_WORDS; |
| 101 | + else if(langCode.equals("fr")) |
| 102 | + words = FRENCH_STOP_WORDS; |
| 103 | + |
| 104 | + if(words != null){ |
| 105 | + for(String w : words) |
| 106 | + stopWords.add(w); |
57 | 107 | } else{ |
58 | 108 | stopWords.addAll(HighFreqTerms.getHighFreqTerms(iid.getDB(),"contents",20)); |
59 | 109 | } |
60 | 110 | log.info("Using phrase stopwords: "+stopWords); |
61 | 111 | builder.getBuilder().getFilters().setStopWords(stopWords); |
62 | | - String pathMain = iid.getSpellWords().getTempPath(); |
63 | | - //String pathAll = iid.getSpellTitles().getTempPath(); |
64 | | - writerMain = open(pathMain); |
65 | | - //writerAll = open(pathAll); |
66 | | - addMetadata(writerMain,"stopWords",stopWords); |
| 112 | + String path = iid.getSpell().getTempPath(); |
| 113 | + writer = open(path); |
| 114 | + addMetadata(writer,"stopWords",stopWords); |
| 115 | + nsf = global.getDefaultNamespace(iid); |
67 | 116 | } |
68 | 117 | |
69 | 118 | protected IndexWriter open(String path) throws IOException { |
— | — | @@ -88,16 +137,12 @@ |
89 | 138 | return writer; |
90 | 139 | } |
91 | 140 | |
92 | | - /** Add to index used for spell_words */ |
93 | | - public void addMainArticle(Article a){ |
94 | | - if(a.getNamespace().equals("0")) |
95 | | - addArticle(a,writerMain); |
| 141 | + /** Add to index used for spell-check */ |
| 142 | + public void addArticle(Article a){ |
| 143 | + if(nsf.contains(Integer.parseInt(a.getNamespace()))) |
| 144 | + addArticle(a,writer); |
96 | 145 | } |
97 | | - /** Add to inde used for spell_titles */ |
98 | | - public void addAllArticle(Article a){ |
99 | | - //addArticle(a,writerAll); |
100 | | - } |
101 | | - |
| 146 | + |
102 | 147 | /** Add single article */ |
103 | 148 | protected void addArticle(Article a, IndexWriter writer){ |
104 | 149 | if(!WikiIndexModifier.checkAddPreconditions(a,langCode)) |
— | — | @@ -121,12 +166,10 @@ |
122 | 167 | * @throws IOException */ |
123 | 168 | public void close() throws IOException{ |
124 | 169 | try{ |
125 | | - writerMain.optimize(); |
126 | | - writerMain.close(); |
127 | | - //writerAll.optimize(); |
128 | | - //writerAll.close(); |
| 170 | + writer.optimize(); |
| 171 | + writer.close(); |
129 | 172 | } catch(IOException e){ |
130 | | - log.warn("I/O error optimizing/closing index at "+iid.getTempPath()); |
| 173 | + log.error("I/O error optimizing/closing index at "+iid.getTempPath()+" : "+e.getMessage()); |
131 | 174 | throw e; |
132 | 175 | } |
133 | 176 | } |
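CleanIndexWriter now ships built-in stop-word lists for English, German and French and falls back to the 20 most frequent contents terms for other languages. The German list repeats a few entries ("mit", "sein", "wird"), which is harmless since everything lands in a HashSet. The selection logic condenses to the fragment below (Arrays.asList is an equivalent shortcut for the loop above):

String[] words = null;
if (langCode.equals("en"))      words = ENGLISH_STOP_WORDS;
else if (langCode.equals("de")) words = GERMAN_STOP_WORDS;
else if (langCode.equals("fr")) words = FRENCH_STOP_WORDS;

HashSet<String> stopWords = new HashSet<String>();
if (words != null)
    stopWords.addAll(java.util.Arrays.asList(words));
else
    stopWords.addAll(HighFreqTerms.getHighFreqTerms(iid.getDB(), "contents", 20));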
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestBuilder.java |
— | — | @@ -2,46 +2,18 @@ |
3 | 3 | |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.io.InputStream; |
6 | | -import java.util.ArrayList; |
7 | | -import java.util.HashMap; |
8 | | -import java.util.HashSet; |
9 | | -import java.util.Hashtable; |
10 | | -import java.util.Map.Entry; |
11 | 6 | |
12 | 7 | import org.apache.log4j.Logger; |
13 | | -import org.apache.lucene.analysis.Token; |
14 | | -import org.apache.lucene.document.Document; |
15 | | -import org.apache.lucene.index.IndexReader; |
16 | | -import org.apache.lucene.index.Term; |
17 | | -import org.apache.lucene.search.CachingWrapperFilter; |
18 | | -import org.apache.lucene.search.Filter; |
19 | | -import org.apache.lucene.search.Hits; |
20 | | -import org.apache.lucene.search.IndexSearcher; |
21 | | -import org.apache.lucene.search.PhraseQuery; |
22 | | -import org.apache.lucene.search.QueryFilter; |
23 | | -import org.apache.lucene.search.TermQuery; |
24 | | -import org.apache.lucene.store.FSDirectory; |
25 | 8 | import org.mediawiki.dumper.ProgressFilter; |
26 | 9 | import org.mediawiki.dumper.Tools; |
27 | 10 | import org.mediawiki.importer.XmlDumpReader; |
28 | | -import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
29 | | -import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
30 | 11 | import org.wikimedia.lsearch.config.Configuration; |
31 | 12 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
32 | 13 | import org.wikimedia.lsearch.config.IndexId; |
33 | | -import org.wikimedia.lsearch.config.IndexRegistry; |
34 | | -import org.wikimedia.lsearch.importer.DumpImporter; |
35 | 14 | import org.wikimedia.lsearch.index.IndexThread; |
36 | | -import org.wikimedia.lsearch.search.NamespaceFilter; |
37 | | -import org.wikimedia.lsearch.spell.api.LuceneDictionary; |
38 | | -import org.wikimedia.lsearch.spell.api.NamespaceFreq; |
39 | | -import org.wikimedia.lsearch.spell.api.TitleIndexer; |
40 | | -import org.wikimedia.lsearch.spell.api.WordsIndexer; |
41 | | -import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
| 15 | +import org.wikimedia.lsearch.spell.api.SpellCheckIndexer; |
42 | 16 | import org.wikimedia.lsearch.util.Localization; |
43 | | -import org.wikimedia.lsearch.util.StringCounter; |
44 | 17 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
45 | | -import org.wikimedia.lsearch.util.StringCounter.Count; |
46 | 18 | |
47 | 19 | /** |
48 | 20 | * Build suggest (did you mean...) indexes |
— | — | @@ -55,12 +27,12 @@ |
56 | 28 | String inputfile = null; |
57 | 29 | String dbname = null; |
58 | 30 | |
59 | | - System.out.println("MediaWiki Lucene search indexer - build suggestions index.\n"); |
| 31 | + System.out.println("MediaWiki Lucene search indexer - build spelling suggestion index.\n"); |
60 | 32 | |
61 | 33 | Configuration.open(); |
62 | 34 | |
63 | 35 | if(args.length !=1 && args.length != 2){ |
64 | | - System.out.println("Syntax: java SpellCheckBuilder <dbname> [<dumpfile>]"); |
| 36 | + System.out.println("Syntax: java SuggestBuilder <dbname> [<dumpfile>]"); |
65 | 37 | return; |
66 | 38 | } |
67 | 39 | inputfile = args.length>1? args[1] : null; |
— | — | @@ -75,10 +47,9 @@ |
76 | 48 | |
77 | 49 | long start = System.currentTimeMillis(); |
78 | 50 | IndexId iid = IndexId.get(dbname); |
79 | | - IndexId words = iid.getSpellWords(); |
80 | | - IndexId titles = iid.getSpellTitles(); |
81 | | - if(words == null || titles == null){ |
82 | | - log.fatal("Index "+iid+" doesn't have both spell-check indexes assigned. Enable them in global configuration."); |
| 51 | + IndexId spell = iid.getSpell(); |
| 52 | + if(spell == null){ |
| 53 | + log.fatal("Index "+iid+" doesn't have a spell-check index assigned. Enable it in the global configuration."); |
83 | 54 | return; |
84 | 55 | } |
85 | 56 | |
— | — | @@ -95,7 +66,7 @@ |
96 | 67 | |
97 | 68 | // make fresh clean index |
98 | 69 | try { |
99 | | - CleanIndexImporter importer = new CleanIndexImporter(words,langCode); |
| 70 | + CleanIndexImporter importer = new CleanIndexImporter(spell,langCode); |
100 | 71 | XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(importer, 1000)); |
101 | 72 | reader.readDump(); |
102 | 73 | importer.closeIndex(); |
— | — | @@ -106,36 +77,19 @@ |
107 | 78 | } |
108 | 79 | } |
109 | 80 | } |
110 | | - // make words index |
111 | | - /*log.info("Making words index"); |
112 | | - try { |
113 | | - LuceneDictionary dict = new LuceneDictionary(IndexReader.open(words.getTempPath()),"contents"); |
114 | | - WordsIndexer writer = new WordsIndexer(words.getImportPath(),(dbname.equals("wikilucene")? 3 : 50)); |
115 | | - writer.createIndex(); |
116 | | - Word word; |
117 | | - while((word = dict.next()) != null){ |
118 | | - writer.addWord(word); |
119 | | - } |
120 | | - writer.closeAndOptimze(); |
121 | | - } catch (IOException e) { |
122 | | - log.fatal("Cannot open clean dictionary for "+words+" : "+e.getMessage()); |
123 | | - e.printStackTrace(); |
124 | | - return; |
125 | | - }*/ |
126 | 81 | |
127 | | - log.info("Making suggest title index"); |
| 82 | + log.info("Making spell-check index"); |
128 | 83 | // make phrase index |
129 | 84 | |
130 | | - TitleIndexer tInx = new TitleIndexer(titles); |
| 85 | + SpellCheckIndexer tInx = new SpellCheckIndexer(spell); |
131 | 86 | tInx.createFromTempIndex(); |
132 | 87 | |
133 | 88 | long end = System.currentTimeMillis(); |
134 | 89 | |
135 | 90 | // make snapshots |
136 | | - //IndexThread.makeIndexSnapshot(words,words.getImportPath()); |
137 | | - IndexThread.makeIndexSnapshot(titles,titles.getImportPath()); |
| 91 | + IndexThread.makeIndexSnapshot(spell,spell.getImportPath()); |
138 | 92 | |
139 | | - System.out.println("Finished making suggest index in "+formatTime(end-start)); |
| 93 | + System.out.println("Finished making spell-check index in "+formatTime(end-start)); |
140 | 94 | } |
141 | 95 | |
142 | 96 | private static String formatTime(long l) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/WordsIndexer.java |
— | — | @@ -1,58 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.spell.api; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | - |
6 | | -import org.apache.log4j.Logger; |
7 | | -import org.apache.lucene.analysis.SimpleAnalyzer; |
8 | | -import org.apache.lucene.document.Document; |
9 | | -import org.apache.lucene.document.Field; |
10 | | -import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
11 | | -import org.wikimedia.lsearch.spell.dist.DoubleMetaphone; |
12 | | - |
13 | | -/** |
14 | | - * Create the index with words. Overview: |
15 | | - * - 1 word = 1 document |
16 | | - * - split the word into ngrams and index those |
17 | | - * |
18 | | - * @author rainman |
19 | | - * |
20 | | - */ |
21 | | -public class WordsIndexer { |
22 | | - static Logger log = Logger.getLogger(WordsIndexer.class); |
23 | | - protected DoubleMetaphone dmeta; |
24 | | - /** If word occurs less that minFreq times, it will be discarded */ |
25 | | - protected int minFreq; |
26 | | - protected NgramIndexer indexer; |
27 | | - String path; |
28 | | - |
29 | | - public WordsIndexer(String path, int minFreq) throws IOException { |
30 | | - this.path = path; |
31 | | - this.minFreq = minFreq; |
32 | | - this.dmeta = new DoubleMetaphone(); |
33 | | - this.indexer = new NgramIndexer(); |
34 | | - } |
35 | | - |
36 | | - public void createIndex() throws IOException{ |
37 | | - indexer.createIndex(path, new SimpleAnalyzer()); |
38 | | - } |
39 | | - |
40 | | - /** Add word to the index, make sure index is open */ |
41 | | - public void addWord(Word word){ |
42 | | - if(word.frequency < minFreq) |
43 | | - return; |
44 | | - if(word.getWord().length() < 2) |
45 | | - return; |
46 | | - Document doc = new Document(); |
47 | | - indexer.createNgramFields(doc,"",word.word); |
48 | | - doc.add(new Field("word",word.word, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
49 | | - doc.add(new Field("freq",Integer.toString(word.frequency), Field.Store.YES, Field.Index.NO)); |
50 | | - doc.add(new Field("metaphone1",dmeta.doubleMetaphone(word.word), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
51 | | - doc.add(new Field("metaphone2",dmeta.doubleMetaphone(word.word,true), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
52 | | - |
53 | | - indexer.addDocument(doc); |
54 | | - } |
55 | | - |
56 | | - public void closeAndOptimze() throws IOException{ |
57 | | - indexer.closeAndOptimize(); |
58 | | - } |
59 | | -} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/TitleIndexer.java |
— | — | @@ -1,527 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.spell.api; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | -import java.util.ArrayList; |
6 | | -import java.util.Collection; |
7 | | -import java.util.HashMap; |
8 | | -import java.util.HashSet; |
9 | | -import java.util.Map.Entry; |
10 | | - |
11 | | -import org.apache.log4j.Logger; |
12 | | -import org.apache.lucene.analysis.SimpleAnalyzer; |
13 | | -import org.apache.lucene.analysis.Token; |
14 | | -import org.apache.lucene.document.Document; |
15 | | -import org.apache.lucene.document.Field; |
16 | | -import org.apache.lucene.index.IndexReader; |
17 | | -import org.apache.lucene.index.Term; |
18 | | -import org.apache.lucene.index.TermDocs; |
19 | | -import org.apache.lucene.search.Hits; |
20 | | -import org.apache.lucene.search.IndexSearcher; |
21 | | -import org.apache.lucene.search.MultiSearcher; |
22 | | -import org.apache.lucene.search.PhraseQuery; |
23 | | -import org.apache.lucene.search.Query; |
24 | | -import org.apache.lucene.search.SearchableMul; |
25 | | -import org.apache.lucene.search.Searcher; |
26 | | -import org.apache.lucene.search.TermQuery; |
27 | | -import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
28 | | -import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
29 | | -import org.wikimedia.lsearch.config.GlobalConfiguration; |
30 | | -import org.wikimedia.lsearch.config.IndexId; |
31 | | -import org.wikimedia.lsearch.config.IndexRegistry; |
32 | | -import org.wikimedia.lsearch.index.IndexUpdateRecord; |
33 | | -import org.wikimedia.lsearch.index.WikiIndexModifier; |
34 | | -import org.wikimedia.lsearch.search.IndexSearcherMul; |
35 | | -import org.wikimedia.lsearch.search.WikiSearcher; |
36 | | -import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
37 | | -import org.wikimedia.lsearch.spell.dist.DoubleMetaphone; |
38 | | -import org.wikimedia.lsearch.util.HighFreqTerms; |
39 | | - |
40 | | -/** |
41 | | - * Index words and phrases from article titles. |
42 | | - * |
43 | | - * Fields: |
44 | | - * * word - word from title |
45 | | - * * phrase - phrase like douglas_adams |
46 | | - * * freq - stored serialized NamespaceFreq (ns:frequency, e.g. 0:234 1:12 14:3) |
47 | | - * * namespace - namespaces where the word/phrase is present |
48 | | - * |
49 | | - * @author rainman |
50 | | - * |
51 | | - */ |
52 | | -public class TitleIndexer { |
53 | | - static Logger log = Logger.getLogger(TitleIndexer.class); |
54 | | - protected NgramIndexer ngramWriter; |
55 | | - public static final boolean NEW_INDEX = true; |
56 | | - protected boolean createNew; |
57 | | - protected int minWordFreq, minPhraseFreq; |
58 | | - protected IndexId iid,titles; |
59 | | - protected String langCode; |
60 | | - protected IndexRegistry registry; |
61 | | - protected DoubleMetaphone dmeta = new DoubleMetaphone(); |
62 | | - |
63 | | - public TitleIndexer(IndexId iid){ |
64 | | - this(iid,false); |
65 | | - } |
66 | | - |
67 | | - public TitleIndexer(IndexId titles, boolean createNew){ |
68 | | - this.titles = titles; |
69 | | - this.iid = titles.getDB(); |
70 | | - GlobalConfiguration global = GlobalConfiguration.getInstance(); |
71 | | - this.minWordFreq = global.getIntDBParam(iid.getDBname(),"spell_titles","wordsMinFreq",3); |
72 | | - this.minPhraseFreq = global.getIntDBParam(iid.getDBname(),"spell_titles","phrasesMinFreq",1); |
73 | | - this.createNew = createNew; |
74 | | - this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
75 | | - this.ngramWriter = new NgramIndexer(); |
76 | | - this.registry = IndexRegistry.getInstance(); |
77 | | - } |
78 | | - |
79 | | - protected Searcher makeSearcher(IndexId main) throws IOException{ |
80 | | - if(main.isSingle()) |
81 | | - return new IndexSearcherMul(registry.getLatestSnapshot(main).path); |
82 | | - else{ |
83 | | - ArrayList<IndexSearcherMul> searchers = new ArrayList<IndexSearcherMul>(); |
84 | | - for(String part : main.getPhysicalIndexes()){ |
85 | | - searchers.add(new IndexSearcherMul(registry.getLatestSnapshot(IndexId.get(part)).path)); |
86 | | - } |
87 | | - return new MultiSearcher(searchers.toArray(new SearchableMul[]{})); |
88 | | - } |
89 | | - } |
90 | | - |
91 | | - /** Returns {NamespaceFreq, HashSet<Integer>} */ |
92 | | - protected Object[] getFreqAndNamespaces(Searcher searcher, int[] namespaces, int[] ranks, Query q) throws IOException { |
93 | | - Hits hits = searcher.search(q); |
94 | | - NamespaceFreq wnf = new NamespaceFreq(); |
95 | | - HashSet<Integer> ns = new HashSet<Integer>(); |
96 | | - for(int i=0;i<hits.length();i++){ |
97 | | - /*Document d = hits.doc(i); |
98 | | - int n = Integer.parseInt(d.get("namespace")); |
99 | | - String rr = d.get("rank"); |
100 | | - int r = rr==null? 0 : Integer.parseInt(d.get("rank")); */ |
101 | | - int id = hits.id(i); |
102 | | - int n = namespaces[id]; |
103 | | - int r = ranks[id]; |
104 | | - wnf.incFrequency(n,r); |
105 | | - ns.add(n); |
106 | | - } |
107 | | - return new Object[] {wnf,ns}; |
108 | | - } |
109 | | - |
110 | | - protected Object[] getFreqAndNamespaces(Searcher searcher, int[] ns, int[] ranks, String word) throws IOException { |
111 | | - return getFreqAndNamespaces(searcher,ns,ranks,new TermQuery(new Term("title",word))); |
112 | | - } |
113 | | - |
114 | | - protected Object[] getFreqAndNamespaces(Searcher searcher, int[] ns, int[] ranks, String[] phrase) throws IOException{ |
115 | | - PhraseQuery pq = new PhraseQuery(); |
116 | | - for(String p : phrase){ |
117 | | - pq.add(new Term("title",p)); |
118 | | - } |
119 | | - return getFreqAndNamespaces(searcher,ns,ranks,pq); |
120 | | - } |
121 | | - |
122 | | - protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, Query q) throws IOException{ |
123 | | - Hits hits = searcher.search(q); |
124 | | - NamespaceFreq wnf = new NamespaceFreq(); |
125 | | - //wnf.setFrequency(-10,hits.length()); |
126 | | - for(int j=0;j<hits.length();j++){ |
127 | | - wnf.incFrequency(namespaces[hits.id(j)]); |
128 | | - } |
129 | | - return wnf; |
130 | | - } |
131 | | - |
132 | | - /** Get frequency for a single word */ |
133 | | - protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String word) throws IOException{ |
134 | | - return getFrequency(searcher,namespaces,new TermQuery(new Term("contents",word))); |
135 | | - } |
136 | | - |
137 | | - /** Get frequency of phrase (invidual words as array) */ |
138 | | - protected NamespaceFreq getFrequency(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{ |
139 | | - PhraseQuery pq = new PhraseQuery(); |
140 | | - for(String p : phrase){ |
141 | | - pq.add(new Term("contents",p)); |
142 | | - } |
143 | | - return getFrequency(searcher,namespaces,pq); |
144 | | - } |
145 | | - |
146 | | - /** Get namespaces where word appears in title */ |
147 | | - protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, Query q) throws IOException{ |
148 | | - Hits hits = searcher.search(q); |
149 | | - HashSet<Integer> ns = new HashSet<Integer>(); |
150 | | - for(int j=0;j<hits.length();j++){ |
151 | | - ns.add(namespaces[hits.id(j)]); |
152 | | - } |
153 | | - return ns; |
154 | | - } |
155 | | - |
156 | | - protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String word) throws IOException{ |
157 | | - return getNamespaces(searcher,namespaces,new TermQuery(new Term("title",word))); |
158 | | - } |
159 | | - |
160 | | - protected Collection<Integer> getNamespaces(Searcher searcher, int[] namespaces, String[] phrase) throws IOException{ |
161 | | - PhraseQuery pq = new PhraseQuery(); |
162 | | - for(String p : phrase){ |
163 | | - pq.add(new Term("title",p)); |
164 | | - } |
165 | | - return getNamespaces(searcher,namespaces,pq); |
166 | | - } |
167 | | - |
168 | | - /** |
169 | | - * Returns the namespace for each doc_id |
170 | | - * @throws IOException |
171 | | - * @FIXME: assumes optimized index |
172 | | - */ |
173 | | - protected Object[] makeNamespaceMap(Searcher searcher) throws IOException{ |
174 | | - log.debug("Making namespace map..."); |
175 | | - int[] namespaces = new int[searcher.maxDoc()]; |
176 | | - int[] ranks = new int[searcher.maxDoc()]; |
177 | | - for(int i=0;i<namespaces.length;i++){ |
178 | | - namespaces[i] = -100; |
179 | | - Document doc = searcher.doc(i); |
180 | | - if(doc != null){ |
181 | | - namespaces[i] = Integer.parseInt(doc.get("namespace")); |
182 | | - String rr = doc.get("rank"); |
183 | | - ranks[i] = rr==null? 0 : Integer.parseInt(rr); |
184 | | - } |
185 | | - } |
186 | | - log.debug("Done making namespace map"); |
187 | | - return new Object[] {namespaces,ranks}; |
188 | | - } |
189 | | - |
190 | | - /** |
191 | | - * Create new index from an index *snapshot* by reading all terms in the index. |
192 | | - * Index will be created in the import directory. |
193 | | - */ |
194 | | - @SuppressWarnings("unchecked") |
195 | | - public void createFromSnapshot(){ |
196 | | - String path = titles.getImportPath(); // dest where to put index |
197 | | - try{ |
198 | | - log.debug("Creating new suggest index"); |
199 | | - ngramWriter.createIndex(path,new SimpleAnalyzer()); |
200 | | - Searcher searcher = makeSearcher(iid); |
201 | | - //IndexSearcher searcher = new IndexSearcherMul(iid.getSpellTitles().getTempPath()); |
202 | | - // map doc_id -> namespace |
203 | | - //int[] namespaces = makeNamespaceMap(searcher); |
204 | | - Object[] nsr = makeNamespaceMap(searcher); |
205 | | - int[] namespaces = (int[]) nsr[0]; |
206 | | - int[] ranks = (int[]) nsr[1]; |
207 | | - int totalAdded = 0, lastReport=0; |
208 | | - |
209 | | - for(String dbrole : iid.getPhysicalIndexes()){ |
210 | | - log.info("Processing index "+dbrole); |
211 | | - if(!ngramWriter.isOpen()) // if we closed the index previously |
212 | | - ngramWriter.reopenIndex(path,new SimpleAnalyzer()); |
213 | | - |
214 | | - IndexId part = IndexId.get(dbrole); |
215 | | - //IndexReader ir = searcher.getIndexReader(); |
216 | | - IndexReader ir = IndexReader.open(registry.getLatestSnapshot(part).path); |
217 | | - LuceneDictionary dict = new LuceneDictionary(ir,"title"); |
218 | | - IndexSearcher ngramSearcher = new IndexSearcher(path); |
219 | | - Word word; |
220 | | - // get all words, and all phrases beginning with word |
221 | | - while((word = dict.next()) != null){ |
222 | | - log.debug("Processing word "+word); |
223 | | - String w = word.getWord(); |
224 | | - |
225 | | - // check if word is already in the index |
226 | | - if(ngramSearcher.docFreq(new Term("word",w)) != 0) |
227 | | - continue; |
228 | | - |
229 | | - int freq = searcher.docFreq(new Term("contents",w)); |
230 | | - if(freq > minWordFreq){ |
231 | | - // index word |
232 | | - Object[] ret = getFreqAndNamespaces(searcher,namespaces,ranks,w); |
233 | | - NamespaceFreq wnf = (NamespaceFreq) ret[0]; |
234 | | - Collection<Integer> wns = (Collection<Integer>) ret[1]; |
235 | | - //NamespaceFreq wnf = getFrequency(searcher,namespaces,w); |
236 | | - if(wnf.getFrequency() > minWordFreq){ |
237 | | - //Collection<Integer> wns = getNamespaces(searcher,namespaces,w); |
238 | | - addWord(w,wnf,wns); |
239 | | - } |
240 | | - } |
241 | | - if(freq > minPhraseFreq){ |
242 | | - // index phrases |
243 | | - HashSet<String> phrases = new HashSet<String>(); |
244 | | - Hits hits = searcher.search(new TermQuery(new Term("title",w))); |
245 | | - // from titles find phrases beginning with word |
246 | | - for(int i=0;i<hits.length();i++){ |
247 | | - Document doc = hits.doc(i); |
248 | | - // tokenize to make phrases |
249 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(doc.get("title"),langCode,false); |
250 | | - ArrayList<Token> tokens = parser.parse(); |
251 | | - for(int j=0;j<tokens.size()-1;j++){ |
252 | | - Token t = tokens.get(j); |
253 | | - // ignore aliases |
254 | | - if(t.getPositionIncrement() == 0) |
255 | | - continue; |
256 | | - // find phrases beginning with the target word |
257 | | - if(w.equals(t.termText())){ |
258 | | - phrases.add(t.termText()+"_"+tokens.get(j+1).termText()); |
259 | | - } |
260 | | - } |
261 | | - } |
262 | | - log.debug("Adding "+phrases.size()+" phrases "+phrases); |
263 | | - // index phrases |
264 | | - for(String phrase : phrases){ |
265 | | - Object[] ret = getFreqAndNamespaces(searcher,namespaces,ranks,phrase.split("_")); |
266 | | - NamespaceFreq nf = (NamespaceFreq) ret[0]; |
267 | | - Collection<Integer> pns = (Collection<Integer>) ret[1]; |
268 | | - //NamespaceFreq nf = getFrequency(searcher,namespaces,phrase.split("_")); |
269 | | - if(nf.getFrequency() > minPhraseFreq){ |
270 | | - //Collection<Integer> pns = getNamespaces(searcher,namespaces,phrase.split("_")); |
271 | | - addPhrase(phrase,nf,pns,false); |
272 | | - } |
273 | | - } |
274 | | - totalAdded += phrases.size(); |
275 | | - if(totalAdded - lastReport > 1000){ |
276 | | - log.info("Processed "+totalAdded+" phrases"); |
277 | | - lastReport = totalAdded; |
278 | | - } |
279 | | - } |
280 | | - } |
281 | | - log.debug("Finished index "+iid+", closing/optimizing."); |
282 | | - ir.close(); |
283 | | - ngramSearcher.close(); |
284 | | - ngramWriter.closeAndOptimize(); |
285 | | - } |
286 | | - searcher.close(); |
287 | | - } catch (IOException e) { |
288 | | - log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage()); |
289 | | - e.printStackTrace(); |
290 | | - return; |
291 | | - } |
292 | | - } |
293 | | - |
294 | | - public void createFromTempIndex(){ |
295 | | - String path = titles.getImportPath(); // dest where to put index |
296 | | - FieldNameFactory fields = new FieldNameFactory(); |
297 | | - final String title = fields.title(); |
298 | | - final String contents = fields.contents(); |
299 | | - final String alttitle = fields.alttitle(); |
300 | | - try { |
301 | | - ngramWriter.createIndex(path,new SimpleAnalyzer()); |
302 | | - IndexReader ir = IndexReader.open(iid.getSpellWords().getTempPath()); |
303 | | - HashSet<String> stopWords = new HashSet<String>(); |
304 | | - TermDocs td = ir.termDocs(new Term("metadata_key","stopWords")); |
305 | | - if(td.next()){ |
306 | | - for(String s : ir.document(td.doc()).get("metadata_value").split(" ")) |
307 | | - stopWords.add(s); |
308 | | - } |
309 | | - addMetadata("stopWords",stopWords); |
310 | | - |
311 | | - // add all titles |
312 | | - for(int i=0;i<ir.maxDoc();i++){ |
313 | | - if(ir.isDeleted(i)) |
314 | | - continue; |
315 | | - String titleText = ir.document(i).get(title); |
316 | | - if(titleText != null) |
317 | | - addTitle(titleText); |
318 | | - // FIXME: alttitle fiels is not generated! |
319 | | - for(int j=0;j<WikiIndexModifier.ALT_TITLES;j++){ |
320 | | - String altTitleText = ir.document(i).get(alttitle+j); |
321 | | - if(altTitleText != null) |
322 | | - addTitle(altTitleText); |
323 | | - } |
324 | | - } |
325 | | - |
326 | | - LuceneDictionary dict = new LuceneDictionary(ir,contents); |
327 | | - Word word; |
328 | | - while((word = dict.next()) != null){ |
329 | | - String w = word.getWord(); |
330 | | - int freq = word.getFrequency(); |
331 | | - if(w.contains("_")){ // phrase |
332 | | - String[] words = w.split("_+"); |
333 | | - if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1])) |
334 | | - continue; |
335 | | - boolean allowed = true; |
336 | | - for(String ww : words){ |
337 | | - // allow only those phrases consisting of title words |
338 | | - if(ir.docFreq(new Term(title,ww)) == 0){ |
339 | | - allowed = false; |
340 | | - break; |
341 | | - } |
342 | | - } |
343 | | - if(allowed && freq > minPhraseFreq){ |
344 | | - boolean inTitle = ir.docFreq(new Term(title,w))!= 0; |
345 | | - NamespaceFreq nsf = new NamespaceFreq(); |
346 | | - nsf.setFrequency(0,freq); |
347 | | - ArrayList<Integer> nss = new ArrayList<Integer>(); |
348 | | - nss.add(0); |
349 | | - addPhrase(w,nsf,nss,inTitle); |
350 | | - } |
351 | | - } else{ |
352 | | - if(freq > minWordFreq){ |
353 | | - NamespaceFreq nsf = new NamespaceFreq(); |
354 | | - nsf.setFrequency(0,freq); |
355 | | - ArrayList<Integer> nss = new ArrayList<Integer>(); |
356 | | - nss.add(0); |
357 | | - addWord(w,nsf,nss); |
358 | | - } |
359 | | - } |
360 | | - } |
361 | | - //ngramWriter.closeAndOptimize(); |
362 | | - //ngramWriter.reopenIndex(path,new SimpleAnalyzer()); |
363 | | - //IndexReader ngramReader = ngramWriter.getReader(); |
364 | | - // add stuff from titles with stop words |
365 | | - dict = new LuceneDictionary(ir,title); |
366 | | - while((word = dict.next()) != null){ |
367 | | - String w = word.getWord(); |
368 | | - if(w.contains("_")){ // phrase |
369 | | - String[] words = w.split("_+"); |
370 | | - if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1])){ |
371 | | - int freq = ir.docFreq(new Term("contents",w)); |
372 | | - NamespaceFreq nsf = new NamespaceFreq(); |
373 | | - nsf.setFrequency(0,freq); |
374 | | - ArrayList<Integer> nss = new ArrayList<Integer>(); |
375 | | - nss.add(0); |
376 | | - addPhrase(w,nsf,nss,true); |
377 | | - } |
378 | | - } |
379 | | - } |
380 | | - ngramWriter.closeAndOptimize(); |
381 | | - ir.close(); |
382 | | - |
383 | | - } catch (IOException e) { |
384 | | - log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage()); |
385 | | - e.printStackTrace(); |
386 | | - return; |
387 | | - } |
388 | | - |
389 | | - } |
390 | | - |
391 | | - /** |
392 | | - * Register a title in the index, without tokenization, just lowercase. |
393 | | - * |
394 | | - * @param title |
395 | | - */ |
396 | | - public void addTitle(String title){ |
397 | | - Document doc = new Document(); |
398 | | - doc.add(new Field("title", title.toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
399 | | - ngramWriter.addDocument(doc); |
400 | | - } |
401 | | - /** |
402 | | - * Add phrase to index |
403 | | - * |
404 | | - * @param phrase - 2+ words joined with underscore |
405 | | - * @param nf - frequencies of phrase in various namespaces |
406 | | - * @param namespaces - namespaces where phrase appears in title |
407 | | - */ |
408 | | - public void addPhrase(String phrase, NamespaceFreq nf, Collection<Integer> namespaces, boolean inTitle){ |
409 | | - String freq = nf.serialize(minPhraseFreq); |
410 | | - if(freq.length() == 0) |
411 | | - return; |
412 | | - if(phrase.length() <= 2){ |
413 | | - log.warn("Invalid phrase: "+phrase); |
414 | | - return; |
415 | | - } |
416 | | - Document doc = new Document(); |
417 | | - //ngramWriter.createNgramFields(doc,"phrase",phrase); |
418 | | - doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
419 | | - doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO)); |
420 | | - for(Integer ns : namespaces){ |
421 | | - doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED)); |
422 | | - } |
423 | | - if(inTitle) |
424 | | - doc.add(new Field("intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED)); |
425 | | - |
426 | | - ngramWriter.addDocument(doc); |
427 | | - } |
428 | | - |
429 | | - /** |
430 | | - * Add into metadata_key and metadata_value. |
431 | | - * Collection is assumed to contain words (without spaces) |
432 | | - */ |
433 | | - public void addMetadata(String key, Collection<String> values){ |
434 | | - StringBuilder sb = new StringBuilder(); |
435 | | - // serialize by joining with spaces |
436 | | - for(String val : values){ |
437 | | - if(sb.length() != 0) |
438 | | - sb.append(" "); |
439 | | - sb.append(val); |
440 | | - } |
441 | | - Document doc = new Document(); |
442 | | - doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
443 | | - doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO)); |
444 | | - |
445 | | - ngramWriter.addDocument(doc); |
446 | | - } |
447 | | - |
448 | | - /** Add ordinary word to the index |
449 | | - * |
450 | | - * @param word - word to add |
451 | | - * @param nf - frequencies in namespaces |
452 | | - * @param namespaces - namespaces where word appears in title |
453 | | - */ |
454 | | - public void addWord(String word, NamespaceFreq nf, Collection<Integer> namespaces){ |
455 | | - if(word.length() < 2) |
456 | | - return; |
457 | | - String freq = nf.serialize(); |
458 | | - if(freq.length() == 0) |
459 | | - return; |
460 | | - Document doc = new Document(); |
461 | | - ngramWriter.createNgramFields(doc,"word",word); |
462 | | - doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
463 | | - doc.add(new Field("freq",freq, Field.Store.YES, Field.Index.NO)); |
464 | | - doc.add(new Field("meta1",dmeta.doubleMetaphone(word), Field.Store.YES, Field.Index.NO)); |
465 | | - doc.add(new Field("meta2",dmeta.doubleMetaphone(word,true), Field.Store.YES, Field.Index.NO)); |
466 | | - for(Integer ns : namespaces){ |
467 | | - doc.add(new Field("namespace",ns.toString(),Field.Store.NO, Field.Index.UN_TOKENIZED)); |
468 | | - } |
469 | | - |
470 | | - ngramWriter.addDocument(doc); |
471 | | - } |
472 | | - |
473 | | - /** Update the index */ |
474 | | - public void update(Collection<IndexUpdateRecord> records){ |
475 | | - /*String path = iid.getIndexPath(); |
476 | | - try{ |
477 | | - log.info("Updating suggest index for "+iid+" with "+records.size()); |
478 | | - IndexReader ir = IndexReader.open(path); |
479 | | - Searcher searcher = makeSearcher(iid.getDB()); |
480 | | - // TODO: don't use namespaces, but fetch fields, it's likely to be more efficient for small updates |
481 | | - int[] namespaces = makeNamespaceMap(searcher); |
482 | | - // get all words and phrases |
483 | | - HashSet<String> words = new HashSet<String>(); |
484 | | - HashSet<String> phrases = new HashSet<String>(); |
485 | | - for(IndexUpdateRecord rec : records){ |
486 | | - String title = rec.getArticle().getTitle(); |
487 | | - ArrayList<Token> tokens = new FastWikiTokenizerEngine(title,langCode,false).parse(); |
488 | | - String last = null; |
489 | | - // register word/phrases |
490 | | - for(Token t : tokens){ |
491 | | - String w = t.termText(); |
492 | | - words.add(w); |
493 | | - if(last != null){ |
494 | | - phrases.add(last+"_"+w); |
495 | | - } |
496 | | - last = w; |
497 | | - } |
498 | | - } |
499 | | - searcher.close(); |
500 | | - |
501 | | - // batch delete old values |
502 | | - for(String word : words){ |
503 | | - ir.deleteDocuments(new Term("word",word)); |
504 | | - } |
505 | | - for(String phrase : phrases){ |
506 | | - ir.deleteDocuments(new Term("phrase",phrase)); |
507 | | - } |
508 | | - ir.close(); |
509 | | - ngramWriter.reopenIndex(path,new SimpleAnalyzer()); |
510 | | - |
511 | | - // batch add new stuff |
512 | | - for(String word : words){ |
513 | | - addWord(word,getFrequency(searcher,namespaces,word),getNamespaces(searcher,namespaces,word)); |
514 | | - } |
515 | | - for(String phrase : phrases){ |
516 | | - String[] ph = phrase.split("_"); |
517 | | - addPhrase(phrase,getFrequency(searcher,namespaces,ph),getNamespaces(searcher,namespaces,ph)); |
518 | | - } |
519 | | - |
520 | | - ngramWriter.close(); |
521 | | - } catch(IOException e){ |
522 | | - log.error("Cannot update index for "+iid+" : "+e.getMessage()); |
523 | | - e.printStackTrace(); |
524 | | - return; |
525 | | - }*/ |
526 | | - } |
527 | | - |
528 | | -} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/NamespaceFreq.java |
— | — | @@ -1,117 +0,0 @@ |
2 | | -package org.wikimedia.lsearch.spell.api; |
3 | | - |
4 | | -import java.util.BitSet; |
5 | | -import java.util.HashMap; |
6 | | -import java.util.Set; |
7 | | -import java.util.Map.Entry; |
8 | | - |
9 | | -import org.wikimedia.lsearch.search.NamespaceFilter; |
10 | | - |
11 | | -/** Mapping from namespaces to frequencies */ |
12 | | -public class NamespaceFreq { |
13 | | - class IntWrap{ |
14 | | - int val = 0; |
15 | | - IntWrap() {} |
16 | | - IntWrap(int value){ val = value; } |
17 | | - IntWrap(String value){ val = Integer.parseInt(value); } |
18 | | - public String toString(){ return ""+val; } |
19 | | - } |
20 | | - /** namespace -> frequency */ |
21 | | - protected HashMap<Integer,IntWrap> nsmap = new HashMap<Integer,IntWrap>(); |
22 | | - |
23 | | - /** Construct from serialized field value */ |
24 | | - public NamespaceFreq(String field){ |
25 | | - String[] pairs = field.split(" "); |
26 | | - for(String pair : pairs){ |
27 | | - if(pair.length() == 0) |
28 | | - continue; |
29 | | - String[] nsf = pair.split(":"); |
30 | | - if(nsf.length == 2) |
31 | | - nsmap.put(Integer.parseInt(nsf[0]),new IntWrap(nsf[1])); |
32 | | - else { |
33 | | - throw new RuntimeException("Bad syntax for namespace-frequency pairs : "+field); |
34 | | - } |
35 | | - } |
36 | | - } |
37 | | - |
38 | | - public NamespaceFreq() { |
39 | | - } |
40 | | - |
41 | | - /** Get frequency of term for one namespace */ |
42 | | - public int getFrequency(int namespace){ |
43 | | - if(nsmap.containsKey(-10)) |
44 | | - return nsmap.get(-10).val; |
45 | | - else if(nsmap.containsKey(namespace)) |
46 | | - return nsmap.get(namespace).val; |
47 | | - else |
48 | | - return 0; |
49 | | - } |
50 | | - |
51 | | - /** Get frequency of term over some set of namespaces */ |
52 | | - public int getFrequency(NamespaceFilter nsf){ |
53 | | - if(nsmap.containsKey(-10)) |
54 | | - return nsmap.get(-10).val; |
55 | | - int sum = 0; |
56 | | - BitSet ns = nsf.getIncluded(); |
57 | | - for(int i=ns.nextSetBit(0); i>=0; i=ns.nextSetBit(i+1)){ |
58 | | - sum += getFrequency(i); |
59 | | - } |
60 | | - return sum; |
61 | | - } |
62 | | - |
63 | | - /** Get total frequency of term over all namespaces */ |
64 | | - public int getFrequency(){ |
65 | | - if(nsmap.containsKey(-10)) |
66 | | - return nsmap.get(-10).val; |
67 | | - int sum = 0; |
68 | | - for(IntWrap i : nsmap.values()){ |
69 | | - sum += i.val; |
70 | | - } |
71 | | - return sum; |
72 | | - } |
73 | | - |
74 | | - /** Serialize only if total frequency is at least minFreq */ |
75 | | - public String serialize(int minFreq){ |
76 | | - StringBuilder sb = new StringBuilder(); |
77 | | - int sum = 0; |
78 | | - for(Entry<Integer,IntWrap> e : nsmap.entrySet()){ |
79 | | - sum += e.getValue().val; |
80 | | - sb.append(e.getKey()); |
81 | | - sb.append(":"); |
82 | | - sb.append(e.getValue()); |
83 | | - sb.append(" "); |
84 | | - } |
85 | | - if(sum < minFreq) |
86 | | - return ""; |
87 | | - return sb.toString(); |
88 | | - } |
89 | | - |
90 | | - /** Serialize into a field format: ns:freq ns2:freq2 ... */ |
91 | | - public String serialize(){ |
92 | | - return serialize(0); |
93 | | - } |
94 | | - |
95 | | - /** Modify frequency value for some namespace */ |
96 | | - public void setFrequency(int namespace, int frequency){ |
97 | | - nsmap.put(namespace,new IntWrap(frequency)); |
98 | | - } |
99 | | - |
100 | | - /** Incremental term frequency in namespace */ |
101 | | - public void incFrequency(int namespace){ |
102 | | - incFrequency(namespace,1); |
103 | | - } |
104 | | - |
105 | | - /** Incremental term frequency in namespace */ |
106 | | - public void incFrequency(int namespace, int inc){ |
107 | | - if(nsmap.containsKey(namespace)){ |
108 | | - nsmap.get(namespace).val+=inc; |
109 | | - } else |
110 | | - nsmap.put(namespace,new IntWrap(inc)); |
111 | | - } |
112 | | - |
113 | | - /** Get all namespaces where term has nonzero frequency */ |
114 | | - public Set<Integer> getNamespaces(){ |
115 | | - return nsmap.keySet(); |
116 | | - } |
117 | | - |
118 | | -} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java |
— | — | @@ -0,0 +1,230 @@ |
| 2 | +package org.wikimedia.lsearch.spell.api; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 6 | +import java.util.Collection; |
| 8 | +import java.util.HashSet; |
| 10 | + |
| 11 | +import org.apache.log4j.Logger; |
| 12 | +import org.apache.lucene.analysis.SimpleAnalyzer; |
| 14 | +import org.apache.lucene.document.Document; |
| 15 | +import org.apache.lucene.document.Field; |
| 16 | +import org.apache.lucene.index.IndexReader; |
| 17 | +import org.apache.lucene.index.Term; |
| 18 | +import org.apache.lucene.index.TermDocs; |
| 28 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
| 29 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 30 | +import org.wikimedia.lsearch.config.IndexId; |
| 31 | +import org.wikimedia.lsearch.config.IndexRegistry; |
| 33 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
| 36 | +import org.wikimedia.lsearch.spell.api.Dictionary.Word; |
| 37 | +import org.wikimedia.lsearch.spell.dist.DoubleMetaphone; |
| 39 | + |
| 40 | +/** |
| 41 | + * Index words and phrases from article titles. |
| 42 | + * |
| 43 | + * Fields: |
| 44 | + * * word - word from title |
| 45 | + * * phrase - phrase like douglas_adams |
| 46 | + * freq - stored word/phrase frequency (a plain integer) |
| 47 | + * intitle - set to "1" for phrases that also occur in a title |
| 48 | + * |
| 49 | + * @author rainman |
| 50 | + * |
| 51 | + */ |
| 52 | +public class SpellCheckIndexer { |
| 53 | + static Logger log = Logger.getLogger(SpellCheckIndexer.class); |
| 54 | + protected NgramIndexer ngramWriter; |
| 55 | + public static final boolean NEW_INDEX = true; |
| 56 | + protected boolean createNew; |
| 57 | + protected int minWordFreq, minPhraseFreq; |
| 58 | + protected IndexId iid,titles; |
| 59 | + protected String langCode; |
| 60 | + protected IndexRegistry registry; |
| 61 | + protected DoubleMetaphone dmeta = new DoubleMetaphone(); |
| 62 | + |
| 63 | + public SpellCheckIndexer(IndexId iid){ |
| 64 | + this(iid,false); |
| 65 | + } |
| 66 | + |
| 67 | + public SpellCheckIndexer(IndexId titles, boolean createNew){ |
| 68 | + this.titles = titles; |
| 69 | + this.iid = titles.getDB(); |
| 70 | + GlobalConfiguration global = GlobalConfiguration.getInstance(); |
| 71 | + this.minWordFreq = global.getIntDBParam(iid.getDBname(),"spell","wordsMinFreq",3); |
| 72 | + this.minPhraseFreq = global.getIntDBParam(iid.getDBname(),"spell","phrasesMinFreq",1); |
| 73 | + this.createNew = createNew; |
| 74 | + this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
| 75 | + this.ngramWriter = new NgramIndexer(); |
| 76 | + this.registry = IndexRegistry.getInstance(); |
| 77 | + } |
| 78 | + |
| 79 | + public void createFromTempIndex(){ |
| 80 | + String path = titles.getImportPath(); // dest where to put index |
| 81 | + FieldNameFactory fields = new FieldNameFactory(); |
| 82 | + final String title = fields.title(); |
| 83 | + final String contents = fields.contents(); |
| 84 | + final String alttitle = fields.alttitle(); |
| 85 | + try { |
| 86 | + ngramWriter.createIndex(path,new SimpleAnalyzer()); |
| 87 | + IndexReader ir = IndexReader.open(iid.getSpell().getTempPath()); |
| 88 | + HashSet<String> stopWords = new HashSet<String>(); |
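| | + // the temp index stores the stop-word list as a metadata document |
| | + // (metadata_key "stopWords", words space-joined in metadata_value) |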
| 89 | + TermDocs td = ir.termDocs(new Term("metadata_key","stopWords")); |
| 90 | + if(td.next()){ |
| 91 | + for(String s : ir.document(td.doc()).get("metadata_value").split(" ")) |
| 92 | + stopWords.add(s); |
| 93 | + } |
| 94 | + addMetadata("stopWords",stopWords); |
| 95 | + |
| 96 | + log.info("Adding titles"); |
| 97 | + // add all titles |
| 98 | + for(int i=0;i<ir.maxDoc();i++){ |
| 99 | + if(ir.isDeleted(i)) |
| 100 | + continue; |
| 101 | + String titleText = ir.document(i).get(title); |
| 102 | + if(titleText != null) |
| 103 | + addTitle(titleText); |
| 104 | + for(int j=0;j<WikiIndexModifier.ALT_TITLES;j++){ |
| 105 | + String altTitleText = ir.document(i).get(alttitle+j); |
| 106 | + if(altTitleText != null) |
| 107 | + addTitle(altTitleText); |
| 108 | + } |
| 109 | + } |
| 110 | + log.info("Adding words and phrases"); |
| 111 | + LuceneDictionary dict = new LuceneDictionary(ir,contents); |
| 112 | + Word word; |
| 113 | + while((word = dict.next()) != null){ |
| 114 | + String w = word.getWord(); |
| 115 | + int freq = word.getFrequency(); |
| 116 | + if(w.contains("_")){ // phrase |
| 117 | + String[] words = w.split("_+"); |
| 118 | + if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1])) |
| 119 | + continue; |
| 120 | + boolean allowed = true; |
| 121 | + for(String ww : words){ |
| 122 | + // allow only those phrases consisting of title words |
| 123 | + if(ir.docFreq(new Term(title,ww)) == 0){ |
| 124 | + allowed = false; |
| 125 | + break; |
| 126 | + } |
| 127 | + } |
| 128 | + if(allowed && freq > minPhraseFreq){ |
| 129 | + boolean inTitle = ir.docFreq(new Term(title,w))!= 0; |
| 130 | + addPhrase(w,freq,inTitle); |
| 131 | + } |
| 132 | + } else{ |
| 133 | + if(freq > minWordFreq){ |
| 134 | + addWord(w,freq); |
| 135 | + } |
| 136 | + } |
| 137 | + } |
| 138 | + log.info("Adding phrases with stop words from titles"); |
| 139 | + // add stuff from titles with stop words |
| 140 | + dict = new LuceneDictionary(ir,title); |
| 141 | + while((word = dict.next()) != null){ |
| 142 | + String w = word.getWord(); |
| 143 | + if(w.contains("_")){ // phrase |
| 144 | + String[] words = w.split("_+"); |
| 145 | + if(stopWords.contains(words[0]) || stopWords.contains(words[words.length-1])){ |
| 146 | + int freq = ir.docFreq(new Term("contents",w)); |
| 147 | + addPhrase(w,freq,true); |
| 148 | + } |
| 149 | + } |
| 150 | + } |
| 151 | + ngramWriter.closeAndOptimize(); |
| 152 | + ir.close(); |
| 153 | + |
| 154 | + } catch (IOException e) { |
| 155 | + log.fatal("Cannot build titles suggest index for "+iid+" : "+e.getMessage()); |
| 156 | + e.printStackTrace(); |
| 157 | + return; |
| 158 | + } |
| 159 | + |
| 160 | + } |
| 161 | + |
| 162 | + /** |
| 163 | + * Register a title in the index, without tokenization, just lowercase. |
| 164 | + * |
| 165 | + * @param title |
| 166 | + */ |
| 167 | + public void addTitle(String title){ |
| 168 | + Document doc = new Document(); |
| 169 | + doc.add(new Field("title", title.toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED)); |
| 170 | + ngramWriter.addDocument(doc); |
| 171 | + } |
| 172 | + /** |
| 173 | + * Add phrase to index |
| 174 | + * |
| 175 | + * @param phrase - 2+ words joined with underscore |
| 176 | + * @param freq - frequency of the phrase in article text |
| 177 | + * @param inTitle - true if the phrase also occurs in a title |
| 178 | + */ |
| 179 | + public void addPhrase(String phrase, int freq, boolean inTitle){ |
| 180 | + if(phrase.length() <= 2){ |
| 181 | + log.warn("Invalid phrase: "+phrase); |
| 182 | + return; |
| 183 | + } |
| 184 | + Document doc = new Document(); |
| 185 | + //ngramWriter.createNgramFields(doc,"phrase",phrase); |
| 186 | + doc.add(new Field("phrase",phrase, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 187 | + doc.add(new Field("freq",Integer.toString(freq), Field.Store.YES, Field.Index.NO)); |
| 188 | + if(inTitle) |
| 189 | + doc.add(new Field("intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 190 | + |
| 191 | + ngramWriter.addDocument(doc); |
| 192 | + } |
| 193 | + |
| 194 | + /** |
| 195 | + * Add into metadata_key and metadata_value. |
| 196 | + * Collection is assumed to contain words (without spaces) |
| 197 | + */ |
| 198 | + public void addMetadata(String key, Collection<String> values){ |
| 199 | + StringBuilder sb = new StringBuilder(); |
| 200 | + // serialize by joining with spaces |
| 201 | + for(String val : values){ |
| 202 | + if(sb.length() != 0) |
| 203 | + sb.append(" "); |
| 204 | + sb.append(val); |
| 205 | + } |
| 206 | + Document doc = new Document(); |
| 207 | + doc.add(new Field("metadata_key",key, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 208 | + doc.add(new Field("metadata_value",sb.toString(), Field.Store.YES, Field.Index.NO)); |
| 209 | + |
| 210 | + ngramWriter.addDocument(doc); |
| 211 | + } |
| 212 | + |
| 213 | + /** Add ordinary word to the index |
| 214 | + * |
| 215 | + * @param word - word to add |
| 216 | + * @param freq - frequency of the word in article text |
| 218 | + */ |
| 219 | + public void addWord(String word, int freq){ |
| 220 | + if(word.length() < 2) |
| 221 | + return; |
| 222 | + Document doc = new Document(); |
| 223 | + ngramWriter.createNgramFields(doc,"word",word); |
| 224 | + doc.add(new Field("word",word, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| 225 | + doc.add(new Field("freq",Integer.toString(freq), Field.Store.YES, Field.Index.NO)); |
| 226 | + doc.add(new Field("meta1",dmeta.doubleMetaphone(word), Field.Store.YES, Field.Index.NO)); |
| 227 | + doc.add(new Field("meta2",dmeta.doubleMetaphone(word,true), Field.Store.YES, Field.Index.NO)); |
| 228 | + |
| 229 | + ngramWriter.addDocument(doc); |
| 230 | + } |
| 231 | +} |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/LuceneDictionary.java |
— | — | @@ -39,6 +39,7 @@ |
40 | 40 | private TermEnum termEnum; |
41 | 41 | private int count = 0; |
42 | 42 | private String field; |
| 43 | + private boolean first = true; |
43 | 44 | |
44 | 45 | public LuceneDictionary(IndexReader reader, String field) { |
45 | 46 | try { |
— | — | @@ -55,10 +56,14 @@ |
56 | 57 | } |
57 | 58 | try { |
58 | 59 | while(true){ |
59 | | - if(!termEnum.next()) |
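| | + // the constructor is assumed to leave termEnum positioned on the |
| | + // first term of the field, so the first call must not advance it; |
| | + // terms are ordered by field, so hitting another field ends the scan |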
| 60 | + if(first){ |
| 61 | + first = false; |
| 62 | + break; |
| 63 | + } |
| 64 | + else if(!termEnum.next()) |
60 | 65 | return null; |
61 | 66 | else if(!termEnum.term().field().equals(field)) |
62 | | - continue; // skip terms that are not from the desired field |
| 67 | + return null; // end of our field |
63 | 68 | |
64 | 69 | break; |
65 | 70 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -66,7 +66,7 @@ |
67 | 67 | showTokens(text); |
68 | 68 | text = "Dž (Dž), dž (dž), d' (ď), l' (ľ), t' (ť), IJ (IJ), ij (ij), LJ (LJ), Lj (Lj), lj (lj). NJ (NJ), Nj (Nj), nj (nj). All characters in parentheses are the single-unicode form; those not in parentheses are component character forms. There's also the issue of searching for AE (Æ), ae (æ), OE (Œ), & oe (œ)."; |
69 | 69 | showTokens(text); |
70 | | - text = "Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d contains רוּחַ should be treated as though it contained "; |
| 70 | + text = "ça Алекса́ндр Серге́евич Пу́шкин Đ đViệt Nam Đ/đ ↔ D/d contains רוּחַ should be treated as though it contained "; |
71 | 71 | showTokens(text); |
72 | 72 | text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]"; |
73 | 73 | showTokens(text); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java |
— | — | @@ -39,7 +39,7 @@ |
40 | 40 | int bad=0; |
41 | 41 | long start = System.currentTimeMillis(); |
42 | 42 | for(String[] m : DATA){ |
43 | | - ArrayList<SuggestResult> res = sc.suggestWordsFromTitle(m[0],new NamespaceFilter(0),5); |
| 43 | + ArrayList<SuggestResult> res = sc.suggestWords(m[0],new NamespaceFilter(0),5); |
44 | 44 | if(res.size() > 0){ |
45 | 45 | SuggestResult r = res.get(0); |
46 | 46 | if(r.getWord().equals(m[1])) |
— | — | @@ -48,7 +48,7 @@ |
49 | 49 | && res.get(1).getWord().equals(m[1])) |
50 | 50 | good++; |
51 | 51 | else if(r.getDist() > 1){ |
52 | | - SuggestResult split = sc.suggestSplitFromTitle(m[0],new NamespaceFilter(0),0); |
| 52 | + SuggestResult split = sc.suggestSplit(m[0],new NamespaceFilter(0),0); |
53 | 53 | if(split!=null && m[1].equals(split.getWord())) |
54 | 54 | good++; |
55 | 55 | else{ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -145,7 +145,7 @@ |
146 | 146 | assertEquals("detest",sir[3]); |
147 | 147 | assertEquals("rutest",sir[4]); |
148 | 148 | assertEquals("frtest",sir[5]); |
149 | | - assertEquals(8,sir.length); |
| 149 | + assertEquals(11,sir.length); |
150 | 150 | |
151 | 151 | // indexLocation |
152 | 152 | Hashtable indexLocation = testgc.getIndexLocation(); |
— | — | @@ -191,14 +191,9 @@ |
192 | 192 | assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki")); |
193 | 193 | |
194 | 194 | // test suggest tag |
195 | | - Hashtable<String,String> sug = testgc.getDBParams("entest","spell_words"); |
196 | | - assertEquals("3",sug.get("minFreq")); |
197 | | - assertEquals("20",sug.get("minHits")); |
198 | | - |
199 | | - sug = testgc.getDBParams("entest","spell_titles"); |
| 195 | + Hashtable<String,String> sug = testgc.getDBParams("entest","spell"); |
200 | 196 | assertEquals("1",sug.get("wordsMinFreq")); |
201 | 197 | assertEquals("2",sug.get("phrasesMinFreq")); |
202 | | - assertEquals("20",sug.get("minHits")); |
203 | 198 | |
204 | 199 | } catch (MalformedURLException e) { |
205 | 200 | e.printStackTrace(); |
— | — | @@ -272,10 +267,10 @@ |
273 | 268 | assertEquals(2,njawiki2.getPartNum()); |
274 | 269 | assertEquals("[192.168.0.1]",njawiki2.getSearchHosts().toString()); |
275 | 270 | |
276 | | - IndexId sug = IndexId.get("entest.spell_words"); |
277 | | - assertTrue(sug.isSpellWords()); |
| 271 | + IndexId sug = IndexId.get("entest.spell"); |
| 272 | + assertTrue(sug.isSpell()); |
278 | 273 | assertFalse(sug.isLogical()); |
279 | | - assertEquals(sug,sug.getSpellWords()); |
| 274 | + assertEquals(sug,sug.getSpell()); |
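| | + // a spell part is its own spell index, so getSpell() returns it unchanged |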
280 | 275 | |
281 | 276 | } |
282 | 277 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Title.java |
— | — | @@ -99,5 +99,14 @@ |
100 | 100 | public void setTitle(java.lang.String title) { |
101 | 101 | this.title = title; |
102 | 102 | } |
| 103 | + |
| 104 | + /** |
| 105 | + * Get string representation of namespace |
| 106 | + * |
| 107 | + * @return |
| 108 | + */ |
| 109 | + public String getNamespaceAsString(){ |
| 110 | + return Integer.toString(namespace); |
| 111 | + } |
103 | 112 | |
104 | 113 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -51,6 +51,8 @@ |
52 | 52 | private transient int rank; |
53 | 53 | /** names of articles that relate to this article */ |
54 | 54 | private ArrayList<RelatedTitle> related; |
| 55 | + /** anchor text of links pointing to this article */ |
| 56 | + private ArrayList<String> anchorText; |
55 | 57 | |
56 | 58 | public Article(){ |
57 | 59 | namespace=""; |
— | — | @@ -61,6 +63,7 @@ |
62 | 64 | references = 0; |
63 | 65 | redirects=new ArrayList<Redirect>(); |
64 | 66 | related = new ArrayList<RelatedTitle>(); |
| 67 | + anchorText = new ArrayList<String>(); |
65 | 68 | } |
66 | 69 | |
67 | 70 | public Article(long pageId, Title title, String text, boolean redirect, int references) { |
— | — | @@ -72,6 +75,7 @@ |
73 | 76 | this.references = references; |
74 | 77 | this.redirects = new ArrayList<Redirect>(); |
75 | 78 | this.related = new ArrayList<RelatedTitle>(); |
| 79 | + this.anchorText = new ArrayList<String>(); |
76 | 80 | } |
77 | 81 | |
78 | 82 | public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references) { |
— | — | @@ -83,9 +87,11 @@ |
84 | 88 | this.references = references; |
85 | 89 | this.redirects = new ArrayList<Redirect>(); |
86 | 90 | this.related = new ArrayList<RelatedTitle>(); |
| 91 | + this.anchorText = new ArrayList<String>(); |
87 | 92 | } |
88 | 93 | |
89 | | - public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references, ArrayList<Redirect> redirects, ArrayList<RelatedTitle> related) { |
| 94 | + public Article(long pageId, int namespace, String titleText, String text, boolean redirect, int references, |
| 95 | + ArrayList<Redirect> redirects, ArrayList<RelatedTitle> related, ArrayList<String> anchorText) { |
90 | 96 | this.namespace = Integer.toString(namespace); |
91 | 97 | this.title = titleText; |
92 | 98 | contents = text; |
— | — | @@ -94,6 +100,7 @@ |
95 | 101 | this.references = references; |
96 | 102 | this.redirects = redirects; |
97 | 103 | this.related = related; |
| 104 | + this.anchorText = anchorText; |
98 | 105 | } |
99 | 106 | |
100 | 107 | public boolean isRedirect() { |
— | — | @@ -216,11 +223,14 @@ |
217 | 224 | |
218 | 225 | public void setRelated(ArrayList<RelatedTitle> related) { |
219 | 226 | this.related = related; |
| 227 | + } |
| 228 | + |
| 229 | + public ArrayList<String> getAnchorText() { |
| 230 | + return anchorText; |
| 231 | + } |
| 232 | + |
| 233 | + public void setAnchorText(ArrayList<String> anchorText) { |
| 234 | + this.anchorText = anchorText; |
220 | 235 | } |
221 | 236 | |
222 | | - |
223 | | - |
224 | | - |
225 | | - |
226 | | - |
227 | 237 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -17,8 +17,10 @@ |
18 | 18 | import org.wikimedia.lsearch.config.IndexId; |
19 | 19 | import org.wikimedia.lsearch.index.IndexThread; |
20 | 20 | import org.wikimedia.lsearch.ranks.LinkReader; |
| 21 | +import org.wikimedia.lsearch.ranks.Links; |
21 | 22 | import org.wikimedia.lsearch.ranks.OldLinks; |
22 | 23 | import org.wikimedia.lsearch.ranks.RankBuilder; |
| 24 | +import org.wikimedia.lsearch.storage.LinkAnalysisStorage; |
23 | 25 | import org.wikimedia.lsearch.storage.Storage; |
24 | 26 | import org.wikimedia.lsearch.util.Localization; |
25 | 27 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
— | — | @@ -42,7 +44,7 @@ |
43 | 45 | Boolean optimize = null; |
44 | 46 | Integer mergeFactor = null, maxBufDocs = null; |
45 | 47 | boolean newIndex = true, makeSnapshot = false; |
46 | | - boolean snapshotDb = false; boolean updateReferences=false; |
| 48 | + boolean snapshotDb = false, useOldLinkAnalysis = false; |
47 | 49 | |
48 | 50 | System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n"); |
49 | 51 | |
— | — | @@ -50,11 +52,11 @@ |
51 | 53 | log = Logger.getLogger(Importer.class); |
52 | 54 | |
53 | 55 | if(args.length < 2){ |
54 | | - System.out.println("Syntax: java Importer [-n] [-s] [-r] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>"); |
| 56 | + System.out.println("Syntax: java Importer [-a] [-n] [-s] [-la] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>"); |
55 | 57 | System.out.println("Options: "); |
56 | 58 | System.out.println(" -a - don't create new index, append to old"); |
57 | 59 | System.out.println(" -s - make index snapshot when finished"); |
58 | | - System.out.println(" -r - update references info on storage backend"); |
| 60 | + System.out.println(" -la - use earlier link analysis index, don't recalculate"); |
59 | 61 | System.out.println(" -l limit_num - add at most limit_num articles"); |
60 | 62 | System.out.println(" -o optimize - true/false overrides optimization param from global settings"); |
61 | 63 | System.out.println(" -m mergeFactor - overrides param from global settings"); |
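| | + // hypothetical invocation: java Importer -s -la pages-articles.xml wikilucene |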
— | — | @@ -73,8 +75,8 @@ |
74 | 76 | maxBufDocs = Integer.parseInt(args[++i]); |
75 | 77 | else if(args[i].equals("-a")) |
76 | 78 | newIndex = false; |
77 | | - else if(args[i].equals("-r")) |
78 | | - updateReferences = true; |
| 79 | + else if(args[i].equals("-la")) |
| 80 | + useOldLinkAnalysis = true; |
79 | 81 | else if(args[i].equals("-s")) |
80 | 82 | makeSnapshot = true; |
81 | 83 | else if(args[i].equals("--snapshot")){ |
— | — | @@ -95,6 +97,7 @@ |
96 | 98 | } |
97 | 99 | |
98 | 100 | String langCode = GlobalConfiguration.getInstance().getLanguage(dbname); |
| 101 | + IndexId iid = IndexId.get(dbname); |
99 | 102 | // preload |
100 | 103 | UnicodeDecomposer.getInstance(); |
101 | 104 | Localization.readLocalization(langCode); |
— | — | @@ -102,19 +105,16 @@ |
103 | 106 | |
104 | 107 | long start = System.currentTimeMillis(); |
105 | 108 | |
106 | | - // regenerate link and redirect information |
107 | | - OldLinks links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,LinkReader.READ_REDIRECTS); |
108 | | - |
109 | | - if(updateReferences){ |
| 109 | + if(!useOldLinkAnalysis){ |
| 110 | + // regenerate link and redirect information |
| 111 | + Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode,iid),langCode); |
110 | 112 | try { |
111 | | - Storage.getInstance().storePageReferences(links.getAll(),dbname); |
| 113 | + RankBuilder.storeLinkAnalysis(links,iid); |
112 | 114 | } catch (IOException e) { |
113 | | - log.error("Failed to update references info: "+e.getMessage()); |
| 115 | + log.fatal("Cannot store link analytics: "+e.getMessage()); |
| 116 | + return; |
114 | 117 | } |
115 | 118 | } |
116 | | - links.generateRedirectLists(); |
117 | | - links.compactAll(); |
118 | | - |
119 | 119 | log.info("Third pass, indexing articles..."); |
120 | 120 | |
121 | 121 | // open |
— | — | @@ -125,9 +125,9 @@ |
126 | 126 | log.fatal("I/O error opening "+inputfile); |
127 | 127 | return; |
128 | 128 | } |
129 | | - |
| 129 | + LinkAnalysisStorage las = new LinkAnalysisStorage(iid); |
130 | 130 | // read |
131 | | - DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,links,langCode); |
| 131 | + DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex,las,langCode); |
132 | 132 | XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 1000)); |
133 | 133 | try { |
134 | 134 | reader.readDump(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -21,9 +21,12 @@ |
22 | 22 | import org.wikimedia.lsearch.config.Configuration; |
23 | 23 | import org.wikimedia.lsearch.config.IndexId; |
24 | 24 | import org.wikimedia.lsearch.ranks.CompactArticleLinks; |
| 25 | +import org.wikimedia.lsearch.ranks.Links; |
25 | 26 | import org.wikimedia.lsearch.ranks.OldLinks; |
26 | 27 | import org.wikimedia.lsearch.ranks.RankBuilder; |
27 | 28 | import org.wikimedia.lsearch.ranks.RelatedTitle; |
| 29 | +import org.wikimedia.lsearch.storage.ArticleAnalytics; |
| 30 | +import org.wikimedia.lsearch.storage.LinkAnalysisStorage; |
28 | 31 | import org.wikimedia.lsearch.util.Localization; |
29 | 32 | |
30 | 33 | public class DumpImporter implements DumpWriter { |
— | — | @@ -32,15 +35,15 @@ |
33 | 36 | Revision revision; |
34 | 37 | SimpleIndexWriter writer; |
35 | 38 | int count = 0, limit; |
36 | | - OldLinks links; |
| 39 | + LinkAnalysisStorage las; |
37 | 40 | String langCode; |
38 | 41 | |
39 | 42 | public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor, |
40 | | - Integer maxBufDocs, boolean newIndex, OldLinks ranks, String langCode){ |
| 43 | + Integer maxBufDocs, boolean newIndex, LinkAnalysisStorage las, String langCode){ |
41 | 44 | Configuration.open(); // make sure configuration is loaded |
42 | 45 | writer = new SimpleIndexWriter(IndexId.get(dbname), optimize, mergeFactor, maxBufDocs, newIndex); |
43 | 46 | this.limit = limit; |
44 | | - this.links = ranks; |
| 47 | + this.las = las; |
45 | 48 | this.langCode = langCode; |
46 | 49 | } |
47 | 50 | public void writeRevision(Revision revision) throws IOException { |
— | — | @@ -50,28 +53,25 @@ |
51 | 54 | this.page = page; |
52 | 55 | } |
53 | 56 | public void writeEndPage() throws IOException { |
54 | | - // get reference count |
55 | 57 | String key = page.Title.Namespace+":"+page.Title.Text; |
56 | | - CompactArticleLinks r = links.get(key); |
57 | | - int references; |
58 | | - boolean isRedirect = r.redirectsTo != null; |
59 | | - if(r == null){ |
60 | | - references = 0; |
61 | | - log.error("Reference count for "+key+" is undefined, which should never happen."); |
62 | | - } else |
63 | | - references = r.links; |
| 58 | + ArticleAnalytics aa = las.getAnalitics(key); |
| 59 | + int references = aa.getReferences(); |
| 60 | + boolean isRedirect = aa.isRedirect(); |
| 61 | + |
64 | 62 | // make list of redirects |
65 | 63 | ArrayList<Redirect> redirects = new ArrayList<Redirect>(); |
66 | | - if(r.redirected != null){ |
67 | | - for(CompactArticleLinks rk : r.redirected){ |
68 | | - String[] parts = rk.toString().split(":",2); |
69 | | - redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],rk.links)); |
70 | | - } |
| 64 | + ArrayList<String> anchors = new ArrayList<String>(); |
| 65 | + anchors.addAll(aa.getAnchorText()); |
| 66 | + for(String rk : aa.getRedirectKeys()){ |
| 67 | + String[] parts = rk.split(":",2); |
| 68 | + ArticleAnalytics raa = las.getAnalitics(rk); |
| 69 | + redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],raa.getReferences())); |
| 70 | + anchors.addAll(raa.getAnchorText()); |
71 | 71 | } |
72 | | - ArrayList<RelatedTitle> related = RankBuilder.getRelatedTitles(r,links); |
| 72 | + //TODO: ArrayList<RelatedTitle> related = RankBuilder.getRelatedTitles(r,links); |
73 | 73 | // make article |
74 | 74 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect, |
75 | | - references,redirects,related); |
| 75 | + references,redirects,new ArrayList<RelatedTitle>(),anchors); |
76 | 76 | writer.addArticle(article); |
77 | 77 | count++; |
78 | 78 | if(limit >= 0 && count > limit) |
Index: branches/lucene-search-2.1/lsearch-global.conf |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | wikidev : (single) (language,sr) |
19 | 19 | wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[]) |
20 | 20 | wikilucene : (language,en) (warmup,10) |
21 | | -wikilucene : (spell_words,10,2) (spell_titles,3,1,2) |
| 21 | +wikilucene : (spell,3,1) |
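| | +# spell syntax: (spell,<wordsMinFreq>,<phrasesMinFreq>), as exercised by GlobalConfigurationTest |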
22 | 22 | |
23 | 23 | # Search groups |
24 | 24 | # Index parts of a split index are always taken from the node's group |
— | — | @@ -56,7 +56,7 @@ |
57 | 57 | |
58 | 58 | # suffix for databases that should also have exact-case index built |
59 | 59 | # note: this will also turn off stemming! |
60 | | -ExactCase.suffix=wiktionary wikilucene |
| 60 | +ExactCase.suffix=wiktionary |
61 | 61 | |
62 | 62 | # wmf-style init file, attempt to read wgserver (for oai) and lang info |
63 | 63 | # for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html |