Index: branches/lucene-search-2.1/lib/dict/wordnet-en.txt.gz |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalMultiQuery.java |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | protected PositionalOptions options; |
22 | 22 | protected int stopWordCount = 0; |
23 | 23 | protected ArrayList<ArrayList<Float>> boosts = new ArrayList<ArrayList<Float>>(); |
| 24 | + protected boolean scaledBoosts = false; |
24 | 25 | |
25 | 26 | public PositionalMultiQuery(PositionalOptions options){ |
26 | 27 | this.options = options; |
— | — | @@ -109,18 +110,17 @@ |
110 | 111 | av /= terms.length; |
111 | 112 | idf += av; |
112 | 113 | |
113 | | - // rescale boosts to reinstall right idfs per term |
114 | | - ArrayList<Float> fb = boosts.get(count); |
115 | | - for(int j=0; j<idfs.length; j++){ |
116 | | - fb.set(j,fb.get(j)*(idfs[j]/av)); |
117 | | - } |
| 114 | + if(!scaledBoosts){ |
| 115 | + // rescale boosts to reinstall right idfs per term |
| 116 | + ArrayList<Float> fb = boosts.get(count); |
| 117 | + for(int j=0; j<idfs.length; j++){ |
| 118 | + fb.set(j,fb.get(j)*(idfs[j]/av)); |
| 119 | + } |
| 120 | + } |
118 | 121 | count++; |
119 | 122 | } |
| 123 | + scaledBoosts = true; |
120 | 124 | } |
121 | | - |
122 | | - private final float sq(float x){ |
123 | | - return x*x; |
124 | | - } |
125 | 125 | |
126 | 126 | public Scorer scorer(IndexReader reader) throws IOException { |
127 | 127 | if (termArrays.size() == 0) // optimize zero-term case |
— | — | @@ -224,7 +224,8 @@ |
225 | 225 | } |
226 | 226 | |
227 | 227 | public Query rewrite(IndexReader reader) { |
228 | | - if (termArrays.size() == 1) { // optimize one-term case |
| 228 | + // optimize one-term case |
| 229 | + if (termArrays.size() == 1 && (options==null || !options.takeMaxScore)) { |
229 | 230 | Term[] terms = (Term[])termArrays.get(0); |
230 | 231 | ArrayList<Float> boost = boosts.get(0); |
231 | 232 | if(terms.length == 1){ |
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java |
— | — | @@ -67,6 +67,7 @@ |
68 | 68 | public Alttitle(){ |
69 | 69 | aggregateMeta = new AggregateInfoImpl(); |
70 | 70 | takeMaxScore = true; |
| 71 | + //exactBoost = 2; |
71 | 72 | //wholeBoost = 10; |
72 | 73 | } |
73 | 74 | } |
— | — | @@ -96,6 +97,7 @@ |
97 | 98 | public Related(){ |
98 | 99 | aggregateMeta = new AggregateInfoImpl(); |
99 | 100 | takeMaxScore = true; |
| 101 | + //exactBoost = 2; |
100 | 102 | } |
101 | 103 | } |
102 | 104 | |
— | — | @@ -157,6 +159,14 @@ |
158 | 160 | } |
159 | 161 | } |
160 | 162 | |
| 163 | + /** Near-match phrases, used when more than 50% of non-stopwords are matched */ |
| 164 | + public static class AlttitleNearMatch extends PositionalOptions { |
| 165 | + public AlttitleNearMatch(){ |
| 166 | + aggregateMeta = new AggregateInfoImpl(); |
| 167 | + takeMaxScore = true; |
| 168 | + } |
| 169 | + } |
| 170 | + |
161 | 171 | public abstract static class NamespaceBoost implements Serializable { |
162 | 172 | public abstract float getBoost(int namespace); |
163 | 173 | |
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java |
— | — | @@ -444,7 +444,6 @@ |
445 | 445 | */ |
446 | 446 | protected final float phraseFreq() throws IOException { |
447 | 447 | int end = initPhrasePositionsBoost(); |
448 | | - |
449 | 448 | float freq = 0.0f; |
450 | 449 | boolean done = (end<0); |
451 | 450 | while (!done) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -305,7 +305,7 @@ |
306 | 306 | } |
307 | 307 | if(templateLevel == 0 && tableLevel == 0) |
308 | 308 | keywordTokens+=gap; // inc by gap (usually 1, can be more before paragraphs and sections) |
309 | | - |
| 309 | + |
310 | 310 | // add exact token |
311 | 311 | Token exact; |
312 | 312 | if(options.exactCase) |
— | — | @@ -322,6 +322,14 @@ |
323 | 323 | exact.setType("titlecase"); |
324 | 324 | } |
325 | 325 | addToTokens(exact); |
| 326 | + |
| 327 | + // extra uppercase token, prevent exact-matches for titles |
| 328 | + if(options.extraUpperCaseToken && allUpperCase){ |
| 329 | + Token t = makeToken(new String(buffer, 0, length), start, start + length, false); |
| 330 | + t.setPositionIncrement(0); |
| 331 | + t.setType(exact.type()); |
| 332 | + addToTokens(t); |
| 333 | + } |
326 | 334 | |
327 | 335 | if(!options.noAliases){ |
328 | 336 | // add decomposed token to stream |
— | — | @@ -650,8 +658,7 @@ |
651 | 659 | prefixLen = 0; |
652 | 660 | semicolonInx = -1; |
653 | 661 | break; |
654 | | - } |
655 | | - if(Character.isLetter(lc)){ |
| 662 | + } else{ |
656 | 663 | prefixBuf[ prefixLen++ ] = Character.toLowerCase(lc); |
657 | 664 | } |
658 | 665 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -1187,7 +1187,7 @@ |
1188 | 1188 | |
1189 | 1189 | BooleanQuery wrap = new BooleanQuery(true); |
1190 | 1190 | wrap.add(full,Occur.SHOULD); |
1191 | | - wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD); |
| 1191 | + wrap.add(makeComplete(expandedWordsTitle,expandedBoostTitle,expandedTypes),Occur.SHOULD); |
1192 | 1192 | if(forbidden != null) |
1193 | 1193 | wrap.add(forbidden,Occur.MUST_NOT); |
1194 | 1194 | |
— | — | @@ -1197,7 +1197,7 @@ |
1198 | 1198 | AgeScaling age = iid.getAgeScaling(); |
1199 | 1199 | if(age != AgeScaling.NONE){ |
1200 | 1200 | switch(age){ |
1201 | | - case STRONG: scale = new ArticleScaling.SqrtScale(0.3f,1); break; |
| 1201 | + case STRONG: scale = new ArticleScaling.StepScale(0.3f,1); break; |
1202 | 1202 | case MEDIUM: scale = new ArticleScaling.StepScale(0.6f,1); break; |
1203 | 1203 | case WEAK: scale = new ArticleScaling.StepScale(0.9f,1); break; |
1204 | 1204 | default: throw new RuntimeException("Unsupported age scaling "+age); |
— | — | @@ -1231,14 +1231,15 @@ |
1232 | 1232 | } |
1233 | 1233 | |
1234 | 1234 | /** Make alternate "complete" query that will match redirects not in contents like los angles -> los angeles */ |
1235 | | - private Query makeComplete(ArrayList<ArrayList<String>> expanded) { |
1236 | | - PositionalQuery pq = new PositionalQuery(new PositionalOptions.RedirectComplete()); |
| 1235 | + private Query makeComplete(ArrayList<ArrayList<String>> expanded, ArrayList<ArrayList<Float>> boosts, ArrayList<ExpandedType> types) { |
| 1236 | + return makePositionalMulti(expanded,boosts,types,fields.alttitle(),new PositionalOptions.RedirectComplete(),0,1); |
| 1237 | + /* PositionalQuery pq = new PositionalQuery(new PositionalOptions.RedirectComplete()); |
1237 | 1238 | for(int i=0;i<expanded.size();i++){ |
1238 | 1239 | for(String w : expanded.get(i)){ |
1239 | 1240 | pq.add(new Term(fields.alttitle(),w),i,stopWords.contains(w)); |
1240 | 1241 | } |
1241 | 1242 | } |
1242 | | - return pq; |
| 1243 | + return pq; */ |
1243 | 1244 | } |
1244 | 1245 | |
1245 | 1246 | private ArrayList<String> cleanupWords(ArrayList<String> words) { |
— | — | @@ -1475,8 +1476,12 @@ |
1476 | 1477 | return query; |
1477 | 1478 | BooleanQuery bq = new BooleanQuery(true); |
1478 | 1479 | bq.add(query,Occur.SHOULD); |
1479 | | - for(Query q : additional) |
1480 | | - bq.add(q,Occur.SHOULD); |
| 1480 | + for(Query q : additional){ |
| 1481 | + if(q != null) |
| 1482 | + bq.add(q,Occur.SHOULD); |
| 1483 | + } |
| 1484 | + if(bq.clauses().size()==1) |
| 1485 | + return query; |
1481 | 1486 | return bq; |
1482 | 1487 | } |
1483 | 1488 | |
— | — | @@ -1637,6 +1642,15 @@ |
1638 | 1643 | return bq; |
1639 | 1644 | } |
1640 | 1645 | |
| 1646 | + private int countNonStopWords(ArrayList<String> words){ |
| 1647 | + int count = 0; |
| 1648 | + for(String w : words){ |
| 1649 | + if(!stopWords.contains(w)) |
| 1650 | + count++; |
| 1651 | + } |
| 1652 | + return count; |
| 1653 | + } |
| 1654 | + |
1641 | 1655 | /** Make query with short subphrases anchored in non-stop words */ |
1642 | 1656 | protected Query makeAnchoredQueryMulti(ArrayList<ArrayList<String>> words, ArrayList<ArrayList<Float>> boosts, ArrayList<ExpandedType> types, |
1643 | 1657 | String field, PositionalOptions options, PositionalOptions whole, PositionalOptions wholeSloppy, |
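The countNonStopWords helper presumably supports the "more than 50% of non-stopwords matched" threshold described on the new AlttitleNearMatch options. A hypothetical wiring, for illustration only (matched and makeQuery are assumed names, not from the patch):

    // hypothetical: only build the near-match alttitle query when more than
    // half of the non-stopwords in the query were matched
    if (matched > countNonStopWords(words) / 2)
        q = makeQuery(words, new PositionalOptions.AlttitleNearMatch());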
— | — | @@ -1768,9 +1782,7 @@ |
1769 | 1783 | |
1770 | 1784 | Query q = parseRaw(queryText); |
1771 | 1785 | |
1772 | | - ArrayList<String> words = wordsFromParser; |
1773 | | - if(words == null || words.size() == 0) |
1774 | | - return q; |
| 1786 | + ArrayList<String> words = wordsFromParser; |
1775 | 1787 | |
1776 | 1788 | this.builder = oldBuilder; |
1777 | 1789 | this.defaultField = oldDefaultField; |
— | — | @@ -1786,31 +1798,33 @@ |
1787 | 1799 | BooleanQuery full = new BooleanQuery(true); |
1788 | 1800 | full.add(q,Occur.MUST); |
1789 | 1801 | |
1790 | | - // main relevance |
1791 | | - Query redirects = makeAlttitleForRedirects(words,20,1); |
1792 | | - if(redirects != null) |
1793 | | - full.add(redirects,Occur.SHOULD); |
| 1802 | + /*if(words != null && words.size() > 0){ |
| 1803 | + // main relevance |
| 1804 | + Query redirects = makeAlttitleForRedirects(words,20,1); |
| 1805 | + if(redirects != null) |
| 1806 | + full.add(redirects,Occur.SHOULD); |
| 1807 | + |
| 1808 | + // singular words |
| 1809 | + ArrayList<String> singularWords = makeSingularWords(words); |
| 1810 | + if(singularWords != null){ |
| 1811 | + Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f); |
| 1812 | + if(redirectsSing != null) |
| 1813 | + full.add(redirectsSing,Occur.SHOULD); |
| 1814 | + } |
| 1815 | + } */ |
1794 | 1816 | |
1795 | | - // singular words |
1796 | | - ArrayList<String> singularWords = makeSingularWords(words); |
1797 | | - if(singularWords != null){ |
1798 | | - Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f); |
1799 | | - if(redirectsSing != null) |
1800 | | - full.add(redirectsSing,Occur.SHOULD); |
1801 | | - } |
1802 | | - |
1803 | 1817 | // fuzzy & wildcards |
1804 | 1818 | // NOTE: for these to work parseForTitles needs to called after parse() |
1805 | | - if(hasWildcards() || hasFuzzy()){ |
1806 | | - Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f); |
1807 | | - if(redirectsMulti != null) |
1808 | | - full.add(redirectsMulti,Occur.SHOULD); |
1809 | | - } |
| 1819 | + //if(hasWildcards() || hasFuzzy()){ |
| 1820 | + Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f); |
| 1821 | + if(redirectsMulti != null) |
| 1822 | + full.add(redirectsMulti,Occur.SHOULD); |
| 1823 | + //} |
1810 | 1824 | |
1811 | 1825 | // add another for complete matches |
1812 | 1826 | BooleanQuery wrap = new BooleanQuery(true); |
1813 | 1827 | wrap.add(full,Occur.SHOULD); |
1814 | | - wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD); |
| 1828 | + wrap.add(makeComplete(expandedWordsTitle,expandedBoostTitle,expandedTypes),Occur.SHOULD); |
1815 | 1829 | if(forbidden != null) |
1816 | 1830 | wrap.add(forbidden,Occur.MUST_NOT); |
1817 | 1831 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -35,6 +35,7 @@ |
36 | 36 | protected ArrayList<Class> additionalFilters = null; |
37 | 37 | protected Singular singular = null; |
38 | 38 | protected boolean hasCanonicalFilter = false; |
| 39 | + protected boolean hasLanguageVariants = false; |
39 | 40 | |
40 | 41 | protected FilterFactory noStemmerFilterFactory=null; |
41 | 42 | protected Set<String> stopWords; |
— | — | @@ -146,6 +147,9 @@ |
147 | 148 | if(lang.equals("sr")) |
148 | 149 | hasCanonicalFilter = true; |
149 | 150 | |
| 151 | + // variants (TODO: add zh) |
| 152 | + if(lang.equals("sr")) |
| 153 | + hasLanguageVariants = true; |
150 | 154 | } |
151 | 155 | |
152 | 156 | public static boolean isCJKLanguage(String lang){ |
— | — | @@ -368,4 +372,14 @@ |
369 | 373 | public boolean isSpellCheck(){ |
370 | 374 | return type == Type.SPELL_CHECK; |
371 | 375 | } |
| 376 | + |
| 377 | + /** Convert a word into its language variants, if any */ |
| 378 | + public ArrayList<String> getVariants(String word){ |
| 379 | + if(!hasLanguageVariants) |
| 380 | + return null; |
| 381 | + if(lang.equals("sr")){ |
| 382 | + return SerbianFilter.getVariants(word); |
| 383 | + } else |
| 384 | + return null; |
| 385 | + } |
372 | 386 | } |
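A usage sketch for getVariants (outputs follow from the SerbianFilter tables added below; only "sr" is wired up so far, zh is still a TODO):

    // assuming iid is the IndexId of a Serbian-language wiki, as in Links.java
    FilterFactory filters = new FilterFactory(iid.getDB());
    ArrayList<String> variants = filters.getVariants("Beograd");
    // -> ["Beograd", "Београд"]; returns null for languages without variants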
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java |
— | — | @@ -28,6 +28,8 @@ |
29 | 29 | boolean extendedTrailing = false; |
30 | 30 | /** if to split tokens with apostrophes and points in them */ |
31 | 31 | boolean split = true; |
| 32 | + /** generate extra original token if the word is in upper case */ |
| 33 | + boolean extraUpperCaseToken = false; |
32 | 34 | |
33 | 35 | public TokenizerOptions(boolean exactCase){ |
34 | 36 | this.exactCase = exactCase; |
— | — | @@ -53,10 +55,11 @@ |
54 | 56 | relocationParsing = false; |
55 | 57 | noCaseDetection = true; |
56 | 58 | extendedTrailing = true; |
| 59 | + extraUpperCaseToken = true; |
57 | 60 | } |
58 | 61 | } |
59 | 62 | |
60 | | - public static class TitleNoSplit extends Title{ |
| 63 | + public static class TitleNoSplit extends Title { |
61 | 64 | public TitleNoSplit(boolean exactCase){ |
62 | 65 | super(exactCase); |
63 | 66 | this.split = false; |
— | — | @@ -111,6 +114,7 @@ |
112 | 115 | super(false); |
113 | 116 | noAliases = true; |
114 | 117 | noTrailing = true; |
| 118 | + extraUpperCaseToken = false; |
115 | 119 | } |
116 | 120 | } |
117 | 121 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/SerbianFilter.java |
— | — | @@ -1,6 +1,8 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.BitSet; |
5 | 7 | import java.util.HashMap; |
6 | 8 | |
7 | 9 | import org.apache.lucene.analysis.Token; |
— | — | @@ -20,10 +22,13 @@ |
21 | 23 | private final char[] buffer = new char[MAX_WORD_LEN+1]; |
22 | 24 | private int length; |
23 | 25 | protected static String[] conv = null; |
| 26 | + protected static String[] toLatin = null, toCyrillic = null; |
| 27 | + protected static HashMap<String,String> toCyrillicMap = null; |
| 28 | + protected static BitSet toCyrillicTwo = null; // pairs of two chars |
24 | 29 | protected Token nextToken; |
25 | 30 | protected boolean aliasDiff; |
26 | 31 | |
27 | | - public void init(){ |
| 32 | + public static synchronized void init(){ |
28 | 33 | conv = new String[65536]; |
29 | 34 | |
30 | 35 | for(int i=0;i<65536;i++) |
— | — | @@ -46,6 +51,110 @@ |
47 | 52 | conv['đ'] = "dj"; conv['Đ']="Dj"; |
48 | 53 | } |
49 | 54 | |
| 55 | + public static synchronized void initVariants(){ |
| 56 | + toLatin = new String[65536]; |
| 57 | + toCyrillic = new String[65536]; |
| 58 | + toCyrillicMap = new HashMap<String,String>(); |
| 59 | + toCyrillicTwo = new BitSet(); |
| 60 | + |
| 61 | + for(int i=0;i<65536;i++){ |
| 62 | + toLatin[i] = null; |
| 63 | + toCyrillic[i] = null; |
| 64 | + } |
| 65 | + |
| 66 | + toLatin['а']="a"; toLatin['б']="b"; toLatin['в']="v"; toLatin['г']="g"; toLatin['д']="d"; |
| 67 | + toLatin['ђ']="đ"; toLatin['е']="e"; toLatin['ж']="ž"; toLatin['з']="z"; toLatin['и']="i"; |
| 68 | + toLatin['ј']="j"; toLatin['к']="k"; toLatin['л']="l"; toLatin['љ']="lj"; toLatin['м']="m"; |
| 69 | + toLatin['н']="n"; toLatin['њ']="nj"; toLatin['о']="o"; toLatin['п']="p"; toLatin['р']="r"; |
| 70 | + toLatin['с']="s"; toLatin['т']="t"; toLatin['ћ']="ć"; toLatin['у']="u"; toLatin['ф']="f"; |
| 71 | + toLatin['х']="h"; toLatin['ц']="c"; toLatin['ч']="č"; toLatin['џ']="dž"; toLatin['ш']="š"; |
| 72 | + |
| 73 | + toLatin['А']="A"; toLatin['Б']="B"; toLatin['В']="V"; toLatin['Г']="G"; toLatin['Д']="D"; |
| 74 | + toLatin['Ђ']="Đ"; toLatin['Е']="E"; toLatin['Ж']="Ž"; toLatin['З']="Z"; toLatin['И']="I"; |
| 75 | + toLatin['Ј']="J"; toLatin['К']="K"; toLatin['Л']="L"; toLatin['Љ']="Lj"; toLatin['М']="M"; |
| 76 | + toLatin['Н']="N"; toLatin['Њ']="Nj"; toLatin['О']="O"; toLatin['П']="P"; toLatin['Р']="R"; |
| 77 | + toLatin['С']="S"; toLatin['Т']="T"; toLatin['Ћ']="Ć"; toLatin['У']="U"; toLatin['Ф']="F"; |
| 78 | + toLatin['Х']="H"; toLatin['Ц']="C"; toLatin['Ч']="Č"; toLatin['Џ']="Dž"; toLatin['Ш']="Š"; |
| 79 | + |
| 80 | + toCyrillic['a']="а"; toCyrillic['b']="б"; toCyrillic['c']="ц"; toCyrillic['č']="ч"; toCyrillic['ć']="ћ"; |
| 81 | + toCyrillic['d']="д"; toCyrillic['đ']="ђ"; toCyrillic['e']="е"; toCyrillic['f']="ф"; |
| 82 | + toCyrillic['g']="г"; toCyrillic['h']="х"; toCyrillic['i']="и"; toCyrillic['j']="ј"; toCyrillic['k']="к"; |
| 83 | + toCyrillic['l']="л"; toCyrillic['m']="м"; toCyrillic['n']="н"; |
| 84 | + toCyrillic['o']="о"; toCyrillic['p']="п"; toCyrillic['r']="р"; toCyrillic['s']="с"; toCyrillic['š']="ш"; |
| 85 | + toCyrillic['t']="т"; toCyrillic['u']="у"; toCyrillic['v']="в"; toCyrillic['z']="з"; toCyrillic['ž']="ж"; |
| 86 | + |
| 87 | + toCyrillic['A']="А"; toCyrillic['B']="Б"; toCyrillic['C']="Ц"; toCyrillic['Č']="Ч"; toCyrillic['Ć']="Ћ"; |
| 88 | + toCyrillic['D']="Д"; toCyrillic['Đ']="Ђ"; toCyrillic['E']="Е"; toCyrillic['F']="Ф"; |
| 89 | + toCyrillic['G']="Г"; toCyrillic['H']="Х"; toCyrillic['I']="И"; toCyrillic['J']="Ј"; toCyrillic['K']="К"; |
| 90 | + toCyrillic['L']="Л"; toCyrillic['M']="М"; toCyrillic['N']="Н"; |
| 91 | + toCyrillic['O']="О"; toCyrillic['P']="П"; toCyrillic['R']="Р"; toCyrillic['S']="С"; toCyrillic['Š']="Ш"; |
| 92 | + toCyrillic['T']="Т"; toCyrillic['U']="У"; toCyrillic['V']="В"; toCyrillic['Z']="З"; toCyrillic['Ž']="Ж"; |
| 93 | + |
| 94 | + toCyrillicMap.put("DŽ","Џ"); toCyrillicMap.put("Lj","Љ"); toCyrillicMap.put("Nj","Њ"); |
| 95 | + toCyrillicMap.put("LJ","Љ"); toCyrillicMap.put("Dž","Џ"); toCyrillicMap.put("nj","њ"); |
| 96 | + toCyrillicMap.put("dž","џ"); toCyrillicMap.put("lj","љ"); toCyrillicMap.put("NJ","Њ"); |
| 97 | + |
| 98 | + toCyrillicTwo.set('D'); toCyrillicTwo.set('d'); toCyrillicTwo.set('Ž'); toCyrillicTwo.set('ž'); |
| 99 | + toCyrillicTwo.set('L'); toCyrillicTwo.set('l'); toCyrillicTwo.set('J'); toCyrillicTwo.set('j'); |
| 100 | + toCyrillicTwo.set('N'); toCyrillicTwo.set('n'); |
| 101 | + } |
| 102 | + |
| 103 | + /** Get Latin and Cyrillic variants of the text */ |
| 104 | + public static ArrayList<String> getVariants(String text){ |
| 105 | + if(toLatin == null || toCyrillic==null) |
| 106 | + initVariants(); |
| 107 | + if(text.length() == 0) |
| 108 | + return null; |
| 109 | + else if(text.length() == 1){ |
| 110 | + ArrayList<String> ret = new ArrayList<String>(); |
| 111 | + String l = toLatin[text.charAt(0)]; |
| 112 | + if(l != null) |
| 113 | + ret.add(l); |
| 114 | + String c = toCyrillic[text.charAt(0)]; |
| 115 | + if(c != null) |
| 116 | + ret.add(c); |
| 117 | + return ret; |
| 118 | + } |
| 119 | + StringBuilder lat = new StringBuilder(); |
| 120 | + StringBuilder cyr = new StringBuilder(); |
| 121 | + char c='\0', c1=text.charAt(0); |
| 122 | + for(int i=1;i<text.length()+1;i++){ |
| 123 | + c = c1; |
| 124 | + c1 = i<text.length()? text.charAt(i) : '\0'; |
| 125 | + String l = toLatin[c]; |
| 126 | + if(l != null) |
| 127 | + lat.append(l); |
| 128 | + else |
| 129 | + lat.append(c); |
| 130 | + } |
| 131 | + |
| 132 | + c='\0'; c1=text.charAt(0); |
| 133 | + for(int i=1;i<text.length()+1;i++){ |
| 134 | + c = c1; |
| 135 | + c1 = i<text.length()? text.charAt(i) : '\0'; |
| 136 | + String cl = null; |
| 137 | + // quick check if we should try the two-letter map |
| 138 | + if(toCyrillicTwo.get(c) && toCyrillicTwo.get(c1)) |
| 139 | + cl = toCyrillicMap.get(""+c+c1); |
| 140 | + |
| 141 | + if(cl != null){ |
| 142 | + i++; |
| 143 | + c = c1; |
| 144 | + c1 = i<text.length()? text.charAt(i) : '\0'; |
| 145 | + } else // single letter map |
| 146 | + cl = toCyrillic[c]; |
| 147 | + if(cl != null) |
| 148 | + cyr.append(cl); |
| 149 | + else |
| 150 | + cyr.append(c); |
| 151 | + } |
| 152 | + ArrayList<String> ret = new ArrayList<String>(); |
| 153 | + ret.add(lat.toString()); |
| 154 | + ret.add(cyr.toString()); |
| 155 | + return ret; |
| 156 | + } |
| 157 | + |
| 158 | + /** Convert to ASCII */ |
50 | 159 | public String convert(String text){ |
51 | 160 | length = 0; |
52 | 161 | String cv; |
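A worked example of the two-pass getVariants above: the toCyrillicTwo BitSet is just a cheap pre-check on the two-character window before consulting toCyrillicMap, so the digraphs lj/nj/dž collapse to single Cyrillic letters instead of being transliterated letter by letter.

    // getVariants("Njegoš"), following the tables above:
    //   Latin pass:    no Cyrillic characters, text passes through  -> "Njegoš"
    //   Cyrillic pass: "Nj" hits toCyrillicMap -> "Њ", then
    //                  e->е, g->г, o->о, š->ш                       -> "Његош"
    //   result: ["Njegoš", "Његош"]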
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java |
— | — | @@ -116,6 +116,9 @@ |
117 | 117 | if(isStub()){ |
118 | 118 | try { |
119 | 119 | setTermText(new String(serialized,termTextStart,termTextEnd-termTextStart,"utf-8")); |
| 120 | + // check if this is a cjk token |
| 121 | + if(termText().length()>0 && type==Type.TEXT && CJKFilter.isCJKChar(termText().codePointAt(0))) |
| 122 | + setType("cjk"); |
120 | 123 | unstubOriginal(); |
121 | 124 | } catch (UnsupportedEncodingException e) { |
122 | 125 | e.printStackTrace(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/CJKFilter.java |
— | — | @@ -5,6 +5,7 @@ |
6 | 6 | import org.apache.lucene.analysis.Token; |
7 | 7 | import org.apache.lucene.analysis.TokenFilter; |
8 | 8 | import org.apache.lucene.analysis.TokenStream; |
| 9 | +import org.wikimedia.lsearch.analyzers.ExtToken.Type; |
9 | 10 | |
10 | 11 | /** |
11 | 12 | * Simple CJK (Chinese Japanese Korean) token filter. |
— | — | @@ -24,10 +25,16 @@ |
25 | 26 | if(buffer.size()!=0) |
26 | 27 | return buffer.removeFirst(); |
27 | 28 | |
28 | | - Token token = input.next(); |
29 | | - if(token == null) |
30 | | - return null; |
| 29 | + Token token; |
| 30 | + do{ |
| 31 | + token = input.next(); |
| 32 | + if(token == null) |
| 33 | + return null; |
| 34 | + } while(token.getPositionIncrement()==0); // discard aliases |
31 | 35 | |
| 36 | + if(token instanceof ExtToken && ((ExtToken)token).getType()!=Type.TEXT) |
| 37 | + return token; |
| 38 | + |
32 | 39 | String text = token.termText(); |
33 | 40 | |
34 | 41 | int i,offset,c; |
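Note: position increment 0 marks alias tokens (stemmed or transliterated duplicates stacked on the same position), so the new do-while keeps only original tokens for bigramming; non-TEXT ExtTokens (markup, glue) are passed through unchanged. A hypothetical stream, for illustration:

    // position:   1      1 (alias, posIncr = 0)      2
    // tokens:     東京    tokyo                        都
    // after the do-while only 東京 and 都 reach the bigram logic below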
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Utf8Set.java |
— | — | @@ -63,9 +63,13 @@ |
64 | 64 | protected Utf8String str = new Utf8String(); |
65 | 65 | |
66 | 66 | public Utf8Set(Set<String> words){ |
67 | | - for(String w : words){ |
68 | | - lookup[w.charAt(0)&MASK] = true; |
69 | | - set.add(new Utf8String(w)); |
| 67 | + try{ |
| 68 | + for(String w : words){ |
| 69 | + lookup[w.getBytes("utf-8")[0]&MASK] = true; |
| 70 | + set.add(new Utf8String(w)); |
| 71 | + } |
| 72 | + } catch(Exception e){ |
| 73 | + e.printStackTrace(); |
70 | 74 | } |
71 | 75 | } |
72 | 76 | |
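The fix above matters because the lookup table is presumably probed later with the first UTF-8 byte of the candidate (hence Utf8String): seeding it with the UTF-16 char meant any word starting with a non-ASCII character landed in the wrong slot and could never match. Sketch of the difference, with an illustrative word:

    String w = "šuma";
    int utf16Slot = w.charAt(0) & MASK;             // 0x0161 & MASK - old, wrong slot
    int utf8Slot = w.getBytes("utf-8")[0] & MASK;   // 0xC5 & MASK - slot probed at query
                                                    // time (throws checked exception,
                                                    // caught in the patch)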
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java |
— | — | @@ -38,6 +38,7 @@ |
39 | 39 | import org.apache.lucene.search.TermQuery; |
40 | 40 | import org.apache.lucene.store.Directory; |
41 | 41 | import org.apache.lucene.store.RAMDirectory; |
| 42 | +import org.wikimedia.lsearch.analyzers.FilterFactory; |
42 | 43 | import org.wikimedia.lsearch.analyzers.PrefixAnalyzer; |
43 | 44 | import org.wikimedia.lsearch.analyzers.SplitAnalyzer; |
44 | 45 | import org.wikimedia.lsearch.beans.Article; |
— | — | @@ -74,6 +75,7 @@ |
75 | 76 | protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly; |
76 | 77 | protected boolean optimized = false; |
77 | 78 | protected boolean autoOptimize = false; |
| 79 | + protected FilterFactory filters = null; |
78 | 80 | |
79 | 81 | private Links(IndexId iid, String path, IndexWriter writer, boolean autoOptimize) throws CorruptIndexException, IOException{ |
80 | 82 | this.writer = writer; |
— | — | @@ -93,6 +95,7 @@ |
94 | 96 | redirectOnly = makeSelector("redirect"); |
95 | 97 | contextOnly = makeSelector("context"); |
96 | 98 | linksOnly = makeSelector("links"); |
| 99 | + filters = new FilterFactory(iid.getDB()); |
97 | 100 | } |
98 | 101 | |
99 | 102 | protected FieldSelector makeSelector(String field){ |
— | — | @@ -259,7 +262,6 @@ |
260 | 263 | if(redirect != null){ |
261 | 264 | redirectsTo = findTargetLink(redirect.getNamespace(),redirect.getTitle(),exactCase); |
262 | 265 | } else { |
263 | | - HashSet<String> contextLinks = new HashSet<String>(); |
264 | 266 | ContextParser.Context curContext = null; |
265 | 267 | while(true){ |
266 | 268 | boolean hasNext = matcher.find(); |
— | — | @@ -275,7 +277,6 @@ |
276 | 278 | curContext = context; |
277 | 279 | else if(curContext!=context){ |
278 | 280 | pagelinks.add(""); |
279 | | - contextLinks.clear(); |
280 | 281 | curContext = context; |
281 | 282 | } |
282 | 283 | } |
— | — | @@ -314,13 +315,10 @@ |
315 | 316 | continue; // skip links from other namespaces into the main namespace |
316 | 317 | String target = findTargetLink(ns,title,exactCase); |
317 | 318 | if(target != null){ |
318 | | - int targetNs = Integer.parseInt(target.substring(0,target.indexOf(':'))); |
| 319 | + ArrayList<String> variants = filters.getVariants(target); |
319 | 320 | pagelinks.add(target); |
320 | | - // register context of this link |
321 | | - if(context != null && nsf.contains(targetNs)){ |
322 | | - contextLinks.add(target); |
323 | | - } |
324 | | - |
| 321 | + if(variants != null) |
| 322 | + pagelinks.addAll(variants); |
325 | 323 | } |
326 | 324 | } |
327 | 325 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java |
— | — | @@ -157,8 +157,8 @@ |
158 | 158 | QueryParser parser = new QueryParser("contents",new CJKAnalyzer()); |
159 | 159 | Query q = parser.parse("プロサッカークラブをつくろう"); |
160 | 160 | System.out.println("Japanese in standard analyzer: "+q); |
161 | | - displayTokens(new CJKAnalyzer(),"『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
162 | | - displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
| 161 | + displayTokens(new CJKAnalyzer(),"は、工学者、大学教授、工学博士。『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
| 162 | + displayTokens(Analyzers.getHighlightAnalyzer(IndexId.get("jawiki"),false),"鈴木 孝治(すずき こうじ、1954年 - )『パンツぱんくろう』というタイトルは、阪本牙城の漫画『タンクタンクロー』が元ネタになっているといわれる。ただし、このアニメと『タンクタンクロー』に内容的な直接の関係は全く無い。"); |
163 | 163 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"『パンツぱんくろう』というタjavaイトルはbalaton"); |
164 | 164 | displayTokens(Analyzers.getSearcherAnalyzer(IndexId.get("jawiki")),"パ ン"); |
165 | 165 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | |
26 | 26 | public class FastWikiTokenizerTest { |
27 | 27 | public static void displayTokensForParser(String text) { |
28 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions.Highlight(true)); |
| 28 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions.Highlight(false)); |
29 | 29 | Token[] tokens = parser.parse().toArray(new Token[] {}); |
30 | 30 | for (int i = 0; i < tokens.length; i++) { |
31 | 31 | Token token = tokens[i]; |
— | — | @@ -70,11 +70,13 @@ |
71 | 71 | |
72 | 72 | public static void main(String args[]) throws Exception{ |
73 | 73 | Configuration.open(); |
74 | | - String text = "''italic'' text bre! <nowiki><!-- see--></nowiki> <!-- nosee --> (ant) and some. it's stupid it's something and 5\"6' or more, links abacus"; |
| 74 | + String text = "ATA, [[:link]] [[zh-min-nan:Something]] [[zh-min-nana:Something]] str_replace"; |
75 | 75 | showTokens(text); |
| 76 | + text = "''italic'' text bre! <nowiki><!-- see--></nowiki> <!-- nosee --> (ant) and some. it's stupid it's something and 5\"6' or more, links abacus"; |
| 77 | + showTokens(text); |
76 | 78 | text = ":''This article is about the humorist. For the [[Indo-Europeanist]] see [[Douglas Q. Adams]].''\n{{Infobox writer <!-- for more information see [[:Template:Infobox writer]] -->\n| name = Douglas Adams\n| image = Douglas adams cropped.jpg\n| caption = Douglas Adams signing books at ApacheCon 2000\n| birthdate = {{birth date|1952|3|11|df=yes}}\n| birthplace = [[Cambridge]], [[England]]\n| deathdate = {{Death date and age|2001|5|11|1952|3|11|df=yes}}\n| deathplace = [[Santa Barbara, California]], [[United States|U.S.]]\n| occupation = comedy writer, novelist, dramatist, fantasist\n| genre = [[Science fiction]], [[Comedy]]\n| movement =\n| influences = [[Richard Dawkins]] <ref>[http://www.bbc.co.uk/cult/hitchhikers/metaguide/radio.shtml Interview extract (in RealAudio format)] where Adams states the influences on his work.</ref>, [[Monty Python]], [[Neil Gaiman]], [[Robert Sheckley]], [[Kurt Vonnegut]], <br/>[[P. G. Wodehouse]]\n| influenced =\n| website = http://www.douglasadams.com/\n}} And now text"; |
77 | 79 | showTokens(text); |
78 | | - text = "klarinet3.jpg Also, I think that the syntax could be changed to\n <nowiki>[[category:''category_name''|''sort_key''|''display_text'']]</nowiki>\nwith ''sort_key'' and ''display_text'' defaulting to ''category_name''."; |
| 80 | + text = "メインページ klarinet3.jpg Also, I think that the syntax could be changed to\n <nowiki>[[category:''category_name''|''sort_key''|''display_text'']]</nowiki>\nwith ''sort_key'' and ''display_text'' defaulting to ''category_name''."; |
79 | 81 | showTokens(text); |
80 | 82 | text = "[[meta:jao]] L.A. W. B.M.W and This. is a '''list of [[African]] countries and dependencies by [[population]]'''.\n\n{| border=\"1\" cellpadding=\"2\" cellspacing=\"0\" style=\"border-collapse:collapse; text-align:right;\"\n|- style=\"text-align:center; background:#efefef\"\n!Pos !! Country !! Population\n|-\n| align=\"left\" |-\n| align=\"left\" |'''Africa''' || 934,283,426\n|-\n"; |
81 | 83 | showTokens(text); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java |
— | — | @@ -116,6 +116,7 @@ |
117 | 117 | {"Douglas Adams's Guide to The Hitch-Hiker's Guide to the Galaxy",""}, |
118 | 118 | {"bethlem jesus","bethlehem jesus"}, |
119 | 119 | {"los angles gardens","los angeles gardens"}, |
| 120 | + {"huston we have a problem","houston we have a problem"}, |
120 | 121 | |
121 | 122 | }; |
122 | 123 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -49,6 +49,7 @@ |
50 | 50 | |
51 | 51 | public class Suggest { |
52 | 52 | static Logger log = Logger.getLogger(Suggest.class); |
| 53 | + protected static GlobalConfiguration global=null; |
53 | 54 | protected IndexId iid; |
54 | 55 | protected IndexSearcher searcher; |
55 | 56 | protected IndexReader reader; |
— | — | @@ -58,6 +59,7 @@ |
59 | 60 | protected HashMap<String,Boolean> wordExistCache = new HashMap<String,Boolean>(); |
60 | 61 | protected enum Filtering { STRONG, WEAK }; |
61 | 62 | protected boolean useLogging = true; |
| 63 | + protected int minWordFreq = 0; |
62 | 64 | |
63 | 65 | /** Distance an metaphone metrics */ |
64 | 66 | static public class Metric { |
— | — | @@ -175,10 +177,13 @@ |
176 | 178 | this.iid = iid; |
177 | 179 | if(searcher == null) |
178 | 180 | searcher = cache.getLocalSearcher(iid.getSpell()); |
| 181 | + if(global == null) |
| 182 | + global = GlobalConfiguration.getInstance(); |
179 | 183 | this.searcher = searcher; |
180 | 184 | this.reader = searcher.getIndexReader(); |
181 | 185 | this.defaultNs = iid.getDefaultNamespace(); |
182 | 186 | this.useLogging = useLogging; |
| 187 | + this.minWordFreq = global.getIntDBParam(iid.getDBname(),"spell","wordsMinFreq",3); |
183 | 188 | |
184 | 189 | synchronized(stopWordsIndexes){ |
185 | 190 | if(!stopWordsIndexes.containsKey(searcher)){ |
— | — | @@ -397,7 +402,7 @@ |
398 | 403 | continue; |
399 | 404 | } |
400 | 405 | // words found within context should be spell-checked only if they are not valid words |
401 | | - if(info.foundInContext.contains(w) && wordExists(w,ns)){ |
| 406 | + if(info.foundInContext.contains(w) && wordExists(w,ns) && wordFrequency(w,ns)>minWordFreq*100){ |
402 | 407 | addCorrectWord(w,wordSug,possibleStopWords); |
403 | 408 | continue; |
404 | 409 | } |
— | — | @@ -544,7 +549,7 @@ |
545 | 550 | if(s1.word.equals(w1)) |
546 | 551 | c.preserves.put(i,w1); |
547 | 552 | else if((!good1 && !info.foundInTitles.contains(w1)) |
548 | | - || ((inTitle||inContext) && diff1 <=2 && !info.foundInContext.contains(w1)) ) |
| 553 | + || ((inTitle||inContext) && diff1 <=2 && !info.foundInTitles.contains(w1)) ) |
549 | 554 | c.substitutes.put(i,s1.word); |
550 | 555 | else |
551 | 556 | accept = false; |
— | — | @@ -552,7 +557,7 @@ |
553 | 558 | if(s2.word.equals(w2)) |
554 | 559 | c.preserves.put(i2,w2); |
555 | 560 | else if((!good2 && !info.foundInTitles.contains(w2)) |
556 | | - || ((inTitle||inContext) && diff2 <= 2 && !info.foundInContext.contains(w2)) ) |
| 561 | + || ((inTitle||inContext) && diff2 <= 2 && !info.foundInTitles.contains(w2)) ) |
557 | 562 | c.substitutes.put(i2,s2.word); |
558 | 563 | else |
559 | 564 | accept = false; |
— | — | @@ -1205,6 +1210,7 @@ |
1206 | 1211 | * @return |
1207 | 1212 | */ |
1208 | 1213 | public ArrayList<SuggestResult> suggestWords(String word, int num, Namespaces namespaces, Filtering filter){ |
| 1214 | + log.debug("Suggesting words for "+word); |
1209 | 1215 | if(namespaces == null) // default |
1210 | 1216 | return suggestWordsOnNamespaces(word,word,num,num,null,filter); |
1211 | 1217 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIServer.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.lucene.search.RemoteSearchableMul; |
12 | 12 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
13 | 13 | import org.wikimedia.lsearch.config.IndexId; |
| 14 | +import org.wikimedia.lsearch.config.IndexRegistry; |
14 | 15 | import org.wikimedia.lsearch.search.SearcherCache; |
15 | 16 | |
16 | 17 | /** Starts the RMI registry and binds all RMI objects */ |
— | — | @@ -17,6 +18,7 @@ |
18 | 19 | protected static org.apache.log4j.Logger log = Logger.getLogger(RMIServer.class); |
19 | 20 | |
20 | 21 | protected static SearcherCache cache = null; |
| 22 | + protected static IndexRegistry indexes = null; |
21 | 23 | |
22 | 24 | public static void register(Remote engine, String name){ |
23 | 25 | try { |
— | — | @@ -41,18 +43,24 @@ |
42 | 44 | } |
43 | 45 | |
44 | 46 | /** After updating the local copy of iid, rebind its RMI object */ |
45 | | - public static void rebind(IndexId iid){ |
| 47 | + public static boolean rebind(IndexId iid){ |
46 | 48 | if(cache == null) |
47 | 49 | cache = SearcherCache.getInstance(); |
| 50 | + if(indexes == null) |
| 51 | + indexes = IndexRegistry.getInstance(); |
48 | 52 | String name = "RemoteSearchable<"+iid+">"; |
49 | 53 | try { |
50 | | - RemoteSearchableMul rs = new RemoteSearchableMul(cache.getLocalSearcher(iid)); |
51 | | - register(rs,name); |
| 54 | + if(indexes.getCurrentSearch(iid) != null){ |
| 55 | + RemoteSearchableMul rs = new RemoteSearchableMul(cache.getLocalSearcher(iid)); |
| 56 | + register(rs,name); |
| 57 | + return true; |
| 58 | + } |
52 | 59 | } catch (RemoteException e) { |
53 | 60 | log.warn("Error making remote searchable for "+name); |
54 | 61 | } catch(Exception e){ |
55 | 62 | // do nothing, error is logged by some other class (possible SearchCache) |
56 | 63 | } |
| 64 | + return false; |
57 | 65 | } |
58 | 66 | |
59 | 67 | /** Bind all RMI objects (Messenger, RemoteSeachables and RMIIndexDaemon) */ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java |
— | — | @@ -45,6 +45,8 @@ |
46 | 46 | public class PrefixIndexBuilder { |
47 | 47 | static Logger log = Logger.getLogger(PrefixIndexBuilder.class); |
48 | 48 | |
| 49 | + public static float EXACT_BOOST = 25; |
| 50 | + |
49 | 51 | protected IndexId iid, prefixIid, pre; |
50 | 52 | protected FilterFactory filters; |
51 | 53 | protected Links links=null; |
— | — | @@ -187,7 +189,7 @@ |
188 | 190 | } |
189 | 191 | |
190 | 192 | if(key.equalsIgnoreCase(prefix)) |
191 | | - ref *= 100; // boost for exact match |
| 193 | + ref *= EXACT_BOOST; // boost for exact match |
192 | 194 | refs.put(key,ref); |
193 | 195 | } |
194 | 196 | ArrayList<Entry<String,Double>> sorted = new ArrayList<Entry<String,Double>>(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/BuildAll.java |
— | — | @@ -31,7 +31,7 @@ |
32 | 32 | static org.apache.log4j.Logger log = null; |
33 | 33 | |
34 | 34 | protected static void printHelp(){ |
35 | | - System.out.println("Syntax: BuildAll [-f <file>] [-lt] [-i] [-sc] [dbname] [dump file]"); |
| 35 | + System.out.println("Syntax: BuildAll [-f <file>] [-lt] [-i] [-sc] [dump file] [dbname]"); |
36 | 36 | System.out.println("Options:"); |
37 | 37 | System.out.println(" -f <file> - use a file with a list of pairs <dbname> <dump file>"); |
38 | 38 | System.out.println(" -lt - leave titles - don't delete old titles indexes"); |
— | — | @@ -55,10 +55,14 @@ |
56 | 56 | importOnly = true; |
57 | 57 | else if(args[i].equals("-sc")) |
58 | 58 | noSpellcheck = true; |
| 59 | + else if(args[i].startsWith("-")){ |
| 60 | + System.out.println("Unrecognized option "+args[i]); |
| 61 | + printHelp(); |
| 62 | + return; |
| 63 | + } else if(dump == null) |
| 64 | + dump = args[i]; |
59 | 65 | else if(dbname == null) |
60 | 66 | dbname = args[i]; |
61 | | - else if(dump == null) |
62 | | - dump = args[i]; |
63 | 67 | else if(args[i].equals("--help")){ |
64 | 68 | printHelp(); |
65 | 69 | return; |
— | — | @@ -145,7 +149,7 @@ |
146 | 150 | } |
147 | 151 | } |
148 | 152 | } |
149 | | - System.out.println("Finished building in "+ProgressReport.formatTime(System.currentTimeMillis()-start)); |
| 153 | + System.out.println("Finished build in "+ProgressReport.formatTime(System.currentTimeMillis()-start)); |
150 | 154 | } |
151 | 155 | |
152 | 156 | protected static void copy(String from, String to) throws IOException{ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -218,7 +218,7 @@ |
219 | 219 | IndexThread.makeIndexSnapshot(p,p.getImportPath()); |
220 | 220 | } |
221 | 221 | } |
222 | | - if(makeTitles){ |
| 222 | + if(makeTitles && iid.hasTitlesIndex()){ |
223 | 223 | for(IndexId p : iid.getTitlesIndex().getPhysicalIndexIds()){ |
224 | 224 | if(snapshotDb) |
225 | 225 | IndexThread.optimizeIndex(p,p.getImportPath(),IndexId.Transaction.IMPORT); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -899,5 +899,10 @@ |
900 | 900 | public IndexId getTitleNgram(){ |
901 | 901 | return IndexId.get(dbname+".title_ngram"); |
902 | 902 | } |
| 903 | + |
| 904 | + /** Whether this iid is in a CJK (Chinese/Japanese/Korean) language */ |
| 905 | + public boolean isCJK(){ |
| 906 | + return FilterFactory.isCJKLanguage(getLangCode()); |
| 907 | + } |
903 | 908 | |
904 | 909 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java |
— | — | @@ -254,9 +254,9 @@ |
255 | 255 | // update registry, cache, rmi object |
256 | 256 | registry.refreshUpdates(iid); |
257 | 257 | warmupAndDeploy(pool,li,type); |
| 258 | + registry.refreshCurrent(li); |
258 | 259 | if(type != RebuildType.STANDALONE) |
259 | 260 | RMIServer.rebind(iid); |
260 | | - registry.refreshCurrent(li); |
261 | 261 | |
262 | 262 | // notify all remote searchers of change |
263 | 263 | messenger.notifyIndexUpdated(iid,iid.getDBSearchHosts()); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java |
— | — | @@ -91,6 +91,7 @@ |
92 | 92 | searcher = new IndexSearcherMul(path); |
93 | 93 | searcher.setSimilarity(new WikiSimilarity()); |
94 | 94 | } catch (IOException e) { |
| 95 | + e.printStackTrace(); |
95 | 96 | // tell registry this is not a good index |
96 | 97 | IndexRegistry.getInstance().invalidateCurrent(iid); |
97 | 98 | log.error("I/O Error opening index at path "+iid.getCanonicalSearchPath()+" : "+e.getMessage()); |
— | — | @@ -257,18 +258,21 @@ |
258 | 259 | |
259 | 260 | /** Warmup all local IndexSearcher (create if necessary) */ |
260 | 261 | public void warmupLocalCache(){ |
| 262 | + IndexRegistry registry = IndexRegistry.getInstance(); |
261 | 263 | HashSet<IndexId> mys = global.getMySearch(); |
262 | 264 | for(IndexId iid : mys){ |
263 | 265 | try { |
264 | 266 | if(iid.isLogical()) |
265 | 267 | continue; |
266 | | - IndexSearcherMul[] pool = getSearcherPool(iid); |
267 | | - for(IndexSearcherMul is : pool) |
268 | | - Warmup.warmupIndexSearcher(is,iid,false); |
269 | | - |
270 | | - Warmup.waitForAggregate(pool); |
| 268 | + if(registry.getCurrentSearch(iid) != null){ |
| 269 | + IndexSearcherMul[] pool = getSearcherPool(iid); |
| 270 | + for(IndexSearcherMul is : pool) |
| 271 | + Warmup.warmupIndexSearcher(is,iid,false); |
| 272 | + |
| 273 | + Warmup.waitForAggregate(pool); |
| 274 | + } |
271 | 275 | } catch (IOException e) { |
272 | | - log.warn("I/O error warming index for "+iid); |
| 276 | + log.warn("I/O error warming index for "+iid+" : "+e.getMessage()); |
273 | 277 | } |
274 | 278 | } |
275 | 279 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/ArticleMeta.java |
— | — | @@ -56,6 +56,7 @@ |
57 | 57 | protected SimpleDateFormat isoDate; |
58 | 58 | protected long now = 0; |
59 | 59 | protected NamespaceFilter subpages; |
| 60 | + protected boolean isOptimized; |
60 | 61 | |
61 | 62 | protected class CachingThread extends Thread { |
62 | 63 | public void run(){ |
— | — | @@ -67,12 +68,16 @@ |
68 | 69 | subpage = new boolean[reader.maxDoc()]; |
69 | 70 | daysOld = new float[reader.maxDoc()]; |
70 | 71 | for(int i=0;i<reader.maxDoc();i++){ |
| 72 | + if(!isOptimized && reader.isDeleted(i)) |
| 73 | + continue; |
71 | 74 | try{ |
72 | | - subpage[i] = resolveSubpage(i); |
73 | | - daysOld[i] = resolveDaysOld(i); |
| 75 | + Document d = reader.document(i); |
| 76 | + subpage[i] = resolveSubpage(d); |
| 77 | + daysOld[i] = resolveDaysOld(d); |
74 | 78 | } catch(Exception e2){ |
75 | 79 | e2.printStackTrace(); |
76 | 80 | log.error("Error reading article meta for docid="+i+" : "+e2.getMessage()); |
| 81 | + throw e2; |
77 | 82 | } |
78 | 83 | } |
79 | 84 | log.info("Finished caching article info for "+reader.directory()); |
— | — | @@ -90,10 +95,7 @@ |
91 | 96 | /** See if article is a subpage |
92 | 97 | * @throws IOException |
93 | 98 | * @throws CorruptIndexException */ |
94 | | - protected final boolean resolveSubpage(int docid) throws IOException{ |
95 | | - if(reader.isDeleted(docid)) |
96 | | - return false; |
97 | | - Document d = reader.document(docid); |
| 99 | + protected final boolean resolveSubpage(Document d) throws IOException{ |
98 | 100 | String ns = d.get("namespace"); |
99 | 101 | if(ns == null) |
100 | 102 | return false; |
— | — | @@ -107,10 +109,7 @@ |
108 | 110 | return false; |
109 | 111 | } |
110 | 112 | /** Calculate how old the indexed article is */ |
111 | | - protected final float resolveDaysOld(int docid) throws IOException { |
112 | | - if(reader.isDeleted(docid)) |
113 | | - return 0; |
114 | | - Document d = reader.document(docid); |
| 113 | + protected final float resolveDaysOld(Document d) throws IOException { |
115 | 114 | String dateStr = d.get("date"); |
116 | 115 | if(dateStr == null) |
117 | 116 | return 0; |
— | — | @@ -141,6 +140,7 @@ |
142 | 141 | this.subpages = subpages; |
143 | 142 | isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); |
144 | 143 | isoDate.setTimeZone(TimeZone.getTimeZone("GMT")); |
| 144 | + this.isOptimized = reader.isOptimized(); |
145 | 145 | |
146 | 146 | // run background caching |
147 | 147 | new CachingThread().start(); |
— | — | @@ -148,14 +148,14 @@ |
149 | 149 | |
150 | 150 | public final boolean isSubpage(int docid) throws IOException { |
151 | 151 | if(!finishedCaching) |
152 | | - return resolveSubpage(docid); |
| 152 | + return resolveSubpage(reader.document(docid)); |
153 | 153 | |
154 | 154 | return subpage[docid]; |
155 | 155 | } |
156 | 156 | |
157 | 157 | public float daysOld(int docid) throws IOException { |
158 | 158 | if(!finishedCaching) |
159 | | - return resolveDaysOld(docid); |
| 159 | + return resolveDaysOld(reader.document(docid)); |
160 | 160 | |
161 | 161 | return daysOld[docid]; |
162 | 162 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java |
— | — | @@ -81,6 +81,7 @@ |
82 | 82 | protected IndexReader reader = null; |
83 | 83 | protected String field; |
84 | 84 | protected boolean cachingFinished = false; |
| 85 | + protected boolean isOptimized; |
85 | 86 | |
86 | 87 | protected class CachingThread extends Thread { |
87 | 88 | public void run(){ |
— | — | @@ -105,7 +106,7 @@ |
106 | 107 | for(int i=0;i<maxdoc;i++){ |
107 | 108 | byte[] stored = null; |
108 | 109 | try{ |
109 | | - if(reader.isDeleted(i)) |
| 110 | + if(!isOptimized && reader.isDeleted(i)) |
110 | 111 | continue; |
111 | 112 | Document doc = reader.document(i); |
112 | 113 | stored = doc.getBinaryValue(field); |
— | — | @@ -134,6 +135,7 @@ |
135 | 136 | } catch(Exception e){ |
136 | 137 | log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage()); |
137 | 138 | e.printStackTrace(); |
| 139 | + throw e; |
138 | 140 | } |
139 | 141 | } |
140 | 142 | // compact arrays |
— | — | @@ -178,6 +180,7 @@ |
179 | 181 | protected AggregateMetaFieldSource(IndexReader reader, String fieldBase) throws IOException{ |
180 | 182 | this.reader = reader; |
181 | 183 | this.field = fieldBase+"_meta"; |
| 184 | + this.isOptimized = reader.isOptimized(); |
182 | 185 | Collection fields = reader.getFieldNames(FieldOption.ALL); |
183 | 186 | if(!fields.contains(field)){ |
184 | 187 | cachingFinished = true; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -76,6 +76,7 @@ |
77 | 77 | try{ |
78 | 78 | boolean waitForAggregate = Configuration.open().getString("Search","warmupaggregate","false").equalsIgnoreCase("true"); |
79 | 79 | if(waitForAggregate){ // wait for aggregate fields to be cached |
| 80 | + log.info("Wait for aggregate caches..."); |
80 | 81 | boolean wait; |
81 | 82 | do{ |
82 | 83 | wait = false; |
— | — | @@ -109,35 +110,46 @@ |
110 | 111 | |
111 | 112 | int count = getWarmupCount(iid); |
112 | 113 | |
113 | | - if(iid.isSpell() && count > 0){ |
114 | | - Terms terms = getTermsForLang(iid.getLangCode()); |
115 | | - Suggest sug = new Suggest(iid,is,false); |
116 | | - WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid)); |
117 | | - NamespaceFilter nsf = iid.getDefaultNamespace(); |
118 | | - for(int i=0;i<count;i++){ |
119 | | - String searchterm = terms.next(); |
120 | | - sug.suggest(searchterm,parser.tokenizeForSpellCheck(searchterm),new Suggest.ExtraInfo(),nsf); |
| 114 | + if(iid.isSpell()){ |
| 115 | + if(count > 0){ |
| 116 | + Terms terms = getTermsForLang(iid.getLangCode()); |
| 117 | + Suggest sug = new Suggest(iid,is,false); |
| 118 | + WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid)); |
| 119 | + NamespaceFilter nsf = iid.getDefaultNamespace(); |
| 120 | + for(int i=0;i<count;i++){ |
| 121 | + String searchterm = terms.next(); |
| 122 | + sug.suggest(searchterm,parser.tokenizeForSpellCheck(searchterm),new Suggest.ExtraInfo(),nsf); |
| 123 | + } |
121 | 124 | } |
122 | | - } else if(iid.isTitleNgram() && count > 0){ |
123 | | - Terms terms = getTermsForLang(iid.getLangCode()); |
124 | | - SuggestSimilar sim = new SuggestSimilar(iid,is); |
125 | | - for(int i=0;i<count;i++){ |
126 | | - sim.getSimilarTitles(terms.next(),new NamespaceFilter(),4); |
| 125 | + } else if(iid.isTitleNgram()){ |
| 126 | + if(count > 0){ |
| 127 | + Terms terms = getTermsForLang(iid.getLangCode()); |
| 128 | + SuggestSimilar sim = new SuggestSimilar(iid,is); |
| 129 | + for(int i=0;i<count;i++){ |
| 130 | + sim.getSimilarTitles(terms.next(),new NamespaceFilter(),4); |
| 131 | + } |
127 | 132 | } |
128 | | - } else if(iid.isPrefix() && count > 0){ |
129 | | - Terms terms = getTermsForLang(iid.getLangCode()); |
130 | | - SearchEngine search = new SearchEngine(); |
131 | | - for(int i=0;i<count;i++){ |
132 | | - String searchterm = terms.next(); |
133 | | - searchterm = searchterm.substring(0,(int)Math.min(8*Math.random()+1,searchterm.length())); |
134 | | - search.searchPrefixLocal(iid,searchterm,20,iid.getDefaultNamespace(),is); |
| 133 | + } else if(iid.isPrefix()){ |
| 134 | + if(count > 0){ |
| 135 | + Terms terms = getTermsForLang(iid.getLangCode()); |
| 136 | + SearchEngine search = new SearchEngine(); |
| 137 | + for(int i=0;i<count;i++){ |
| 138 | + String searchterm = terms.next(); |
| 139 | + searchterm = searchterm.substring(0,(int)Math.min(8*Math.random()+1,searchterm.length())); |
| 140 | + search.searchPrefixLocal(iid,searchterm,20,iid.getDefaultNamespace(),is); |
| 141 | + } |
135 | 142 | } |
136 | | - } else if((iid.isHighlight() || iid.isRelated()) && count > 0 && !iid.isTitlesBySuffix()){ |
137 | | - // NOTE: this might not warmup all caches, but should read stuff into memory buffers |
138 | | - for(int i=0;i<count;i++){ |
139 | | - int docid = (int)(Math.random()*is.maxDoc()); |
140 | | - reader.document(docid).get("key"); |
141 | | - } |
| 143 | + } else if((iid.isHighlight() || iid.isRelated()) && !iid.isTitlesBySuffix()){ |
| 144 | + if(count > 0){ |
| 145 | + // NOTE: this might not warmup all caches, but should read stuff into memory buffers |
| 146 | + for(int i=0;i<count;i++){ |
| 147 | + int docid = (int)(Math.random()*is.maxDoc()); |
| 148 | + reader.document(docid).get("key"); |
| 149 | + } |
| 150 | + } |
| 151 | + } else if(iid.isTitlesBySuffix()){ |
| 152 | + // just initiate meta field caching, we want to avoid caching unnecessary filters |
| 153 | + AggregateMetaField.getCachedSource(is.getIndexReader(),"alttitle"); |
142 | 154 | } else{ |
143 | 155 | // normal indexes |
144 | 156 | if(count == 0){ |
— | — | @@ -180,7 +192,7 @@ |
181 | 193 | log.error("Error warming up local IndexSearcherMul for "+iid); |
182 | 194 | } catch (Exception e) { |
183 | 195 | e.printStackTrace(); |
184 | | - log.error("Exception during warmup "+e.getMessage()); |
| 196 | + log.error("Exception during warmup of "+iid+" : "+e.getMessage()); |
185 | 197 | } |
186 | 198 | } |
187 | 199 | |
— | — | @@ -188,14 +200,9 @@ |
189 | 201 | protected static Terms getTermsForLang(String lang) { |
190 | 202 | String lib = Configuration.open().getLibraryPath(); |
191 | 203 | if("en".equals(lang) || "de".equals(lang) || "es".equals(lang) || "fr".equals(lang) || "it".equals(lang) || "pt".equals(lang)) |
192 | | - langTerms.put(lang,new WordTerms(lib+Configuration.PATH_SEP+"dict"+Configuration.PATH_SEP+"terms-"+lang+".txt.gz")); |
193 | | - if(lang.equals("sample")) |
194 | | - return new SampleTerms(); |
195 | | - |
196 | | - if(langTerms.containsKey(lang)) |
197 | | - return langTerms.get(lang); |
| 204 | + return new WordTerms(lib+Configuration.PATH_SEP+"dict"+Configuration.PATH_SEP+"terms-"+lang+".txt.gz"); |
198 | 205 | else |
199 | | - return langTerms.get("en"); |
| 206 | + return new SampleTerms(); |
200 | 207 | } |
201 | 208 | |
202 | 209 | /** Preload all predefined filters */ |
— | — | @@ -218,7 +225,7 @@ |
219 | 226 | try{ |
220 | 227 | FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder(); |
221 | 228 | WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null); |
222 | | - Query q = parser.parse("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons"); |
| 229 | + Query q = parser.parse("wikimedia foundation"); |
223 | 230 | is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0"))); |
224 | 231 | } catch (IOException e) { |
225 | 232 | log.error("Error warming up local IndexSearcherMul for "+iid); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java |
— | — | @@ -177,6 +177,7 @@ |
178 | 178 | return WildcardType.INVALID; |
179 | 179 | } |
180 | 180 | |
| 181 | + /** Get terms from a local searcher if available */ |
181 | 182 | public static ArrayList<String> getLocalTerms(IndexId iid, String wildcard, boolean exactCase) throws IOException { |
182 | 183 | if(searcherCache == null) |
183 | 184 | searcherCache = SearcherCache.getInstance(); |
— | — | @@ -218,6 +219,7 @@ |
219 | 220 | return list; |
220 | 221 | } |
221 | 222 | |
| 223 | + /** Fetch terms matching a wildcard pattern into the target collection */ |
222 | 224 | protected static void addTerms(Collection<String> ret, Term wildcardTerm, IndexReader reader, WildcardType type) throws IOException{ |
223 | 225 | Term t; |
224 | 226 | WildcardTermEnum te = new WildcardTermEnum(reader,wildcardTerm); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -45,6 +45,7 @@ |
46 | 46 | import org.wikimedia.lsearch.highlight.HighlightResult; |
47 | 47 | import org.wikimedia.lsearch.index.MessengerThread; |
48 | 48 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 49 | +import org.wikimedia.lsearch.prefix.PrefixIndexBuilder; |
49 | 50 | import org.wikimedia.lsearch.ranks.StringList; |
50 | 51 | import org.wikimedia.lsearch.related.Related; |
51 | 52 | import org.wikimedia.lsearch.related.RelatedTitle; |
— | — | @@ -371,12 +372,18 @@ |
372 | 373 | ArrayList<String> keys = new ArrayList<String>(); |
373 | 374 | if(prefixKey.startsWith("0:")){ |
374 | 375 | String title = prefixKey.substring(2); |
| 376 | + String alt = null; |
| 377 | + if(title.startsWith("\"") && title.length()>1) |
| 378 | + alt = title.substring(1); |
375 | 379 | for(Integer ns : nsf.getNamespacesOrdered()){ |
376 | 380 | keys.add(ns+":"+title); |
| 381 | + if(alt != null) |
| 382 | + keys.add(ns+":"+alt); |
377 | 383 | } |
| 384 | + |
378 | 385 | } else |
379 | 386 | keys.add(prefixKey); |
380 | | - |
| 387 | + |
381 | 388 | ArrayList<PrefixMatch> results = new ArrayList<PrefixMatch>(); |
382 | 389 | IndexReader reader = searcher.getIndexReader(); |
383 | 390 | |
— | — | @@ -403,7 +410,7 @@ |
404 | 411 | if(td1.next()){ |
405 | 412 | PrefixMatch m = new PrefixMatch(reader.document(td1.doc()).get("article")); |
406 | 413 | if(r.equals(key)) |
407 | | - m.score *= 100; // exact boost |
| 414 | + m.score *= PrefixIndexBuilder.EXACT_BOOST; // exact boost |
408 | 415 | results.add(m); |
409 | 416 | |
410 | 417 | } |
— | — | @@ -996,8 +1003,9 @@ |
997 | 1004 | } |
998 | 1005 | |
999 | 1006 | protected void sendStats(long delta){ |
1000 | | - boolean succ = delta < 10000; // we queries taking more than 10s as bad |
1001 | | - SearchServer.stats.add(succ, delta, SearchDaemon.getOpenCount()); |
| 1007 | + boolean succ = delta < 10000; // we treat queries taking more than 10s as bad |
| 1008 | + if(SearchServer.stats != null) |
| 1009 | + SearchServer.stats.add(succ, delta, SearchDaemon.getOpenCount()); |
1002 | 1010 | } |
1003 | 1011 | |
1004 | 1012 | protected void logRequest(IndexId iid, String what, String searchterm, Query query, int numhits, long start, Searchable searcher) { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/RawSnippet.java |
— | — | @@ -36,6 +36,7 @@ |
37 | 37 | protected Set<String> stopWords; |
38 | 38 | |
39 | 39 | protected boolean highlightAllStop = false; |
| 40 | + protected boolean isCJK = false; |
40 | 41 | |
41 | 42 | // for custom scoring |
42 | 43 | protected int textLength = 0; |
— | — | @@ -236,7 +237,7 @@ |
237 | 238 | // make snippet in range showBegin,showEnd |
238 | 239 | Snippet s = new Snippet(); |
239 | 240 | StringBuilder sb = new StringBuilder(); |
240 | | - int start=0, end=0; // range |
| 241 | + int start=0, end=0, mid=0; // range |
241 | 242 | if(showBegin > 0 && tokens.get(showBegin).getType() == ExtToken.Type.TEXT) |
242 | 243 | showBegin--; // always start with nontext token to catch " and ( |
243 | 244 | if(showEnd == tokens.size()) |
— | — | @@ -275,12 +276,42 @@ |
276 | 277 | continue; |
277 | 278 | } |
278 | 279 | if(t.getPositionIncrement() != 0){ |
| 280 | + if(isCJK && t.getType() == Type.TEXT && t.type().equals("cjk")){ |
| 281 | + boolean lastOnly = false; |
| 282 | + // reconstruct CJK tokens from stream C1C2 C2C3 C3C4 -> C1C2C3C4 |
| 283 | + if(mainToken != null && mainToken.getType()==Type.TEXT && mainToken.type().equals("cjk") && mid!=start){ |
| 284 | + start = mid; // C2C3 token, start of this token is "in the middle of last added token" |
| 285 | + lastOnly = true; |
| 286 | + } else |
| 287 | + start = getLength(sb); // C1C2 token |
| 288 | + |
| 289 | + // add current |
| 290 | + mid = start; |
| 291 | + String tt = t.getText(); |
| 292 | + int len = tt.length(); |
| 293 | + if(len>=2){ |
| 294 | + // not terminal, calculate new midpoint |
| 295 | + int point = len-1; |
| 296 | + if(Character.isSurrogatePair(tt.charAt(len-2),tt.charAt(len-1))) |
| 297 | + point = len-2; |
| 298 | + |
| 299 | + if(!lastOnly) |
| 300 | + sb.append(tt.substring(0,point)); |
| 301 | + mid = getLength(sb); |
| 302 | + sb.append(tt.substring(point)); |
| 303 | + } else |
| 304 | + sb.append(tt); |
| 305 | + |
| 306 | + end = getLength(sb); |
| 307 | + } else{ |
| 308 | + start = getLength(sb); |
| 309 | + sb.append(t.getText()); |
| 310 | + end = getLength(sb); |
| 311 | + } |
279 | 312 | mainToken = t; |
280 | | - start = getLength(sb); |
281 | | - sb.append(t.getText()); |
282 | | - end = getLength(sb); |
283 | 313 | } |
284 | 314 | if(highlight.contains(t.termText()) && !isolatedStopWords(t.termText(),i)){ |
| 315 | + // highlight part of the text |
285 | 316 | if(mainToken != null && mainToken!=t && (mainToken.termText().contains(".") || mainToken.termText().contains("'"))){ |
286 | 317 | Snippet.Range range = findSubRange(mainToken,t,start); |
287 | 318 | if(range != null) |
— | — | @@ -293,6 +324,7 @@ |
294 | 325 | if(alttitle != null) |
295 | 326 | s.setOriginalText(alttitle.getTitle()); |
296 | 327 | |
| 328 | + s.simplifyRanges(); |
297 | 329 | return s; |
298 | 330 | } |
299 | 331 | |
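
The new branch above rebuilds readable CJK text from a bigram token stream: the tokenizer emits overlapping pairs (C1C2, C2C3, C3C4), so when the previous token was also a CJK bigram only the tail of the current token is appended, and the highlight range of the current token starts at the recorded midpoint of the previous one. Below is a minimal sketch of just the string re-assembly, assuming contiguous bigrams and omitting the start/mid/end bookkeeping and position-increment checks of the real code; surrogate pairs are kept intact exactly as in the diff:

    import java.util.Arrays;
    import java.util.List;

    class CjkBigramReassembly {
        /** Rebuild "C1C2C3C4" from the overlapping bigrams "C1C2","C2C3","C3C4". */
        static String rebuild(List<String> bigrams) {
            StringBuilder sb = new StringBuilder();
            boolean continuing = false;
            for (String tt : bigrams) {
                int len = tt.length();
                if (!continuing || len < 2) {
                    sb.append(tt); // first bigram (or a 1-char token) is taken whole
                } else {
                    // later bigrams overlap the previous one, so append only
                    // the last char, or the last two if they form a surrogate pair
                    int point = len - 1;
                    if (Character.isSurrogatePair(tt.charAt(len - 2), tt.charAt(len - 1)))
                        point = len - 2;
                    sb.append(tt.substring(point));
                }
                continuing = true;
            }
            return sb.toString();
        }

        public static void main(String[] args) {
            // prints 日本語版
            System.out.println(rebuild(Arrays.asList("日本", "本語", "語版")));
        }
    }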
— | — | @@ -362,7 +394,9 @@ |
363 | 395 | } |
364 | 396 | } |
365 | 397 | |
366 | | - public RawSnippet(ArrayList<ExtToken> tokens, FragmentScore f, Set<String> highlight, Set<String> newTerms, Set<String> stopWords){ |
| 398 | + public RawSnippet(ArrayList<ExtToken> tokens, FragmentScore f, |
| 399 | + Set<String> highlight, Set<String> newTerms, Set<String> stopWords, |
| 400 | + boolean isCJK){ |
367 | 401 | this.tokens = new ArrayList<ExtToken>(); |
368 | 402 | // include initial nontext token |
369 | 403 | if(f.start > 0 && f.start < tokens.size() && tokens.get(f.start).getType()==ExtToken.Type.TEXT) |
— | — | @@ -385,6 +419,7 @@ |
386 | 420 | this.cur = f; |
387 | 421 | this.sequenceNum = f.sequenceNum; |
388 | 422 | this.stopWords = stopWords; |
| 423 | + this.isCJK = isCJK; |
389 | 424 | this.textLength = noAliasLength(); |
390 | 425 | if(stopWords!=null && stopWords.size()>0){ |
391 | 426 | highlightAllStop = true; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java |
— | — | @@ -100,6 +100,21 @@ |
101 | 101 | return getFormatted(); |
102 | 102 | } |
103 | 103 | |
 | 104 | + /** If consecutive words are being highlighted, merge ranges */ 
| 105 | + public void simplifyRanges(){ |
| 106 | + Range last = null; |
| 107 | + ArrayList<Range> simplified = new ArrayList<Range>(); |
| 108 | + for(Range r : highlighted){ |
| 109 | + if(last != null && last.end >= r.start) |
 | 110 | + last.end = Math.max(last.end, r.end); // don't shrink if r is nested in last 
| 111 | + else{ |
| 112 | + simplified.add(r); |
| 113 | + last = r; |
| 114 | + } |
| 115 | + } |
| 116 | + highlighted = simplified; |
| 117 | + } |
| 118 | + |
104 | 119 | /** Get default formatting with <b> and </b> tags */ |
105 | 120 | public String getFormatted(){ |
106 | 121 | return getFormatted("<b>","</b>"); |
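
simplifyRanges() is a single-pass interval merge over ranges already sorted by start: any range beginning at or before the current end is fused into it. This is what makes the CJK reconstruction above come out as one <b>...</b> span instead of one per overlapping bigram. A standalone sketch with a hypothetical Range pair (the real Range is an inner class of Snippet):

    import java.util.ArrayList;
    import java.util.List;

    class RangeMergeSketch {
        static class Range {
            int start, end;
            Range(int start, int end) { this.start = start; this.end = end; }
        }

        /** Merge overlapping/adjacent ranges; input must be sorted by start. */
        static List<Range> merge(List<Range> ranges) {
            ArrayList<Range> out = new ArrayList<Range>();
            Range last = null;
            for (Range r : ranges) {
                if (last != null && last.end >= r.start)
                    last.end = Math.max(last.end, r.end); // fuse into previous range
                else {
                    out.add(r);
                    last = r;
                }
            }
            return out;
        }
    }

For instance, (0,3) (2,5) (7,9) merges to (0,5) (7,9).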
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java |
— | — | @@ -111,7 +111,11 @@ |
112 | 112 | boolean foundAllInTitle = false, foundAllInAltTitle = false; |
113 | 113 | int firstHitRank = 0; |
114 | 114 | HashSet<String> inTitle = new HashSet<String>(); |
| 115 | + boolean isCJK = iid.getDB().isCJK(); |
115 | 116 | |
| 117 | + //System.out.println("Terms: "+Arrays.toString(terms)); |
| 118 | + //System.out.println("Words: "+words); |
| 119 | + |
116 | 120 | // terms weighted with idf |
117 | 121 | HashMap<String,Double> weightTerm = new HashMap<String,Double>(); |
118 | 122 | for(int i=0;i<terms.length;i++){ |
— | — | @@ -155,12 +159,12 @@ |
156 | 160 | firstHitRank = alttitles.getTitle().getRank(); |
157 | 161 | |
158 | 162 | HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles,wordIndex); |
159 | | - ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine ); |
160 | | - ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,words,wordIndex,1,true,stopWords,false,phrases,inContext,false,false); |
| 163 | + ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine, isCJK ); |
| 164 | + ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,words,wordIndex,1,true,stopWords,false,phrases,inContext,false,false,isCJK); |
161 | 165 | RawSnippet redirectSnippet = null; |
162 | 166 | // don't show redirect if we matched whole title |
163 | 167 | if(! (titleSnippets.size()>0 && titleSnippets.get(0).countPositions()==titleSnippets.get(0).noAliasLength())){ |
164 | | - redirectSnippet = getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext); |
| 168 | + redirectSnippet = getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext,isCJK); |
165 | 169 | } |
166 | 170 | RawSnippet sectionSnippet = null; |
167 | 171 | if(redirectSnippet == null){ |
— | — | @@ -169,7 +173,7 @@ |
170 | 174 | if(notInTitle.containsKey(s)) |
171 | 175 | notInTitle.remove(s); |
172 | 176 | } |
173 | | - sectionSnippet = getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext); |
| 177 | + sectionSnippet = getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,words,wordIndex,0,phrases,inContext,isCJK); |
174 | 178 | } |
175 | 179 | |
176 | 180 | HighlightResult hr = new HighlightResult(); |
— | — | @@ -182,7 +186,7 @@ |
183 | 187 | boolean addSection = true, added = true; |
184 | 188 | while(added && more(hr.textLength())){ |
185 | 189 | // add more snippets if there is still space |
186 | | - added = extendSnippet(raw,hr,raw.size()-1,tokens,addSection,stopWords); |
| 190 | + added = extendSnippet(raw,hr,raw.size()-1,tokens,addSection,stopWords,isCJK); |
187 | 191 | addSection = false; |
188 | 192 | } |
189 | 193 | } else if(textSnippets.size() >= 2){ |
— | — | @@ -203,13 +207,13 @@ |
204 | 208 | if(more(hr.textLength())){ |
205 | 209 | // first pass of snippet extension, extend shortest first |
206 | 210 | if(s1.length() < s2.length()){ |
207 | | - extendSnippet(raw,hr,0,tokens,true,stopWords); |
| 211 | + extendSnippet(raw,hr,0,tokens,true,stopWords,isCJK); |
208 | 212 | if(more(hr.textLength())) |
209 | | - extendSnippet(raw,hr,raw.size()-1,tokens,true,stopWords); |
| 213 | + extendSnippet(raw,hr,raw.size()-1,tokens,true,stopWords,isCJK); |
210 | 214 | } else { |
211 | | - extendSnippet(raw,hr,1,tokens,true,stopWords); |
| 215 | + extendSnippet(raw,hr,1,tokens,true,stopWords,isCJK); |
212 | 216 | if(more(hr.textLength())) |
213 | | - extendSnippet(raw,hr,0,tokens,true,stopWords); |
| 217 | + extendSnippet(raw,hr,0,tokens,true,stopWords,isCJK); |
214 | 218 | } |
215 | 219 | } |
216 | 220 | boolean added = true; |
— | — | @@ -219,7 +223,7 @@ |
220 | 224 | for(int i=0;i<hr.getText().size() && more(hr.textLength());i++){ |
221 | 225 | boolean addedNow = false; |
222 | 226 | if(hr.getText().get(i).isExtendable()){ |
223 | | - addedNow = extendSnippet(raw,hr,i,tokens,false,stopWords); |
| 227 | + addedNow = extendSnippet(raw,hr,i,tokens,false,stopWords,isCJK); |
224 | 228 | if(addedNow) |
225 | 229 | i++; |
226 | 230 | } |
— | — | @@ -337,14 +341,14 @@ |
338 | 342 | } |
339 | 343 | |
340 | 344 | private static boolean extendSnippet(ArrayList<RawSnippet> raw, HighlightResult hr, int index, |
341 | | - ArrayList<ExtToken> tokens, boolean addSection, HashSet<String> stopWords){ |
| 345 | + ArrayList<ExtToken> tokens, boolean addSection, HashSet<String> stopWords, boolean isCJK){ |
342 | 346 | Snippet curS = hr.getText().get(index); |
343 | 347 | RawSnippet curRs = raw.get(index); |
344 | 348 | int len = hr.textLength(); |
345 | 349 | boolean added = false; |
346 | 350 | // add section |
347 | 351 | if(addSection && more(len)){ |
348 | | - RawSnippet rs = sectionSnippet(curRs,curS,tokens,stopWords); |
| 352 | + RawSnippet rs = sectionSnippet(curRs,curS,tokens,stopWords,isCJK); |
349 | 353 | if(rs != null && !raw.contains(rs)){ |
350 | 354 | Snippet s = rs.makeSnippet(diff(len)); |
351 | 355 | setSuffix(s,rs); |
— | — | @@ -364,7 +368,7 @@ |
365 | 369 | } |
366 | 370 | // add next snippet |
367 | 371 | if(more(len)){ |
368 | | - RawSnippet rs = nextSnippet(curRs,curS,tokens,stopWords); |
| 372 | + RawSnippet rs = nextSnippet(curRs,curS,tokens,stopWords,isCJK); |
369 | 373 | if(rs != null && !raw.contains(rs)){ |
370 | 374 | Snippet s = rs.makeSnippet(diff(len)); |
371 | 375 | setSuffix(curS,curRs); |
— | — | @@ -378,17 +382,17 @@ |
379 | 383 | return added; |
380 | 384 | } |
381 | 385 | |
382 | | - protected static RawSnippet nextSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords){ |
| 386 | + protected static RawSnippet nextSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords, boolean isCJK){ |
383 | 387 | if(rs.next == null) |
384 | 388 | return null; |
385 | | - return new RawSnippet(tokens,rs.next,rs.highlight,new HashSet<String>(),stopWords); |
| 389 | + return new RawSnippet(tokens,rs.next,rs.highlight,new HashSet<String>(),stopWords,isCJK); |
386 | 390 | } |
387 | 391 | |
388 | | - protected static RawSnippet sectionSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords){ |
| 392 | + protected static RawSnippet sectionSnippet(RawSnippet rs, Snippet s, ArrayList<ExtToken> tokens, HashSet<String> stopWords, boolean isCJK){ |
389 | 393 | if(rs.section == null) |
390 | 394 | return null; |
391 | 395 | if(s.length() < SHORT_SNIPPET) |
392 | | - return new RawSnippet(tokens,rs.section,rs.highlight,new HashSet<String>(),stopWords); |
| 396 | + return new RawSnippet(tokens,rs.section,rs.highlight,new HashSet<String>(),stopWords,isCJK); |
393 | 397 | return null; |
394 | 398 | } |
395 | 399 | |
— | — | @@ -418,7 +422,7 @@ |
419 | 423 | /** Alttitle and sections highlighting */ |
420 | 424 | protected static RawSnippet getBestAltTitle(ArrayList<Alttitles.Info> altInfos, HashMap<String,Double> weightTerm, |
421 | 425 | HashMap<String,Double> notInTitle, HashSet<String> stopWords, ArrayList<String> words, HashMap<String,Integer> wordIndex, |
422 | | - int minAdditional, HashSet<String> phrases, HashSet<String> inContext){ |
| 426 | + int minAdditional, HashSet<String> phrases, HashSet<String> inContext, boolean isCJK){ |
423 | 427 | ArrayList<RawSnippet> res = new ArrayList<RawSnippet>(); |
424 | 428 | for(Alttitles.Info ainf : altInfos){ |
425 | 429 | double matched = 0, additionalScore = 0; |
— | — | @@ -445,7 +449,7 @@ |
446 | 450 | } |
447 | 451 | } |
448 | 452 | if(length == matchedPositions.size() || additional > minAdditional || (additional != 0 && additional == notInTitle.size())){ |
449 | | - ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false); |
| 453 | + ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, words, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false, isCJK); |
450 | 454 | if(snippets.size() > 0){ |
451 | 455 | RawSnippet snippet = snippets.get(0); |
452 | 456 | snippet.setAlttitle(ainf); |
— | — | @@ -520,7 +524,8 @@ |
521 | 525 | /** Highlight text */ |
522 | 526 | protected static ArrayList<RawSnippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms, |
523 | 527 | ArrayList<String> words, HashMap<String,Integer> wordIndex, int maxSnippets, boolean ignoreBreaks, HashSet<String> stopWords, |
524 | | - boolean showFirstIfNone, HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases, final boolean alwaysIncludeFirstLine) { |
| 528 | + boolean showFirstIfNone, HashSet<String> phrases, HashSet<String> foundInContext, |
| 529 | + final boolean sortByPhrases, final boolean alwaysIncludeFirstLine, final boolean isCJK) { |
525 | 530 | |
526 | 531 | // pieces of text to be highlighted 
527 | 532 | ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>(); |
— | — | @@ -593,7 +598,7 @@ |
594 | 599 | if(foundAllInFirst && beginLen > 2*MAX_CONTEXT && firstFragment!=null){ |
595 | 600 | // made enough snippets, return the first one |
596 | 601 | ArrayList<RawSnippet> res = new ArrayList<RawSnippet>(); |
597 | | - res.add(new RawSnippet(tokens,firstFragment,weightTerms.keySet(),firstFragment.found,stopWords)); |
| 602 | + res.add(new RawSnippet(tokens,firstFragment,weightTerms.keySet(),firstFragment.found,stopWords,isCJK)); |
598 | 603 | return res; |
599 | 604 | } |
600 | 605 | fs.next = new FragmentScore(fs.end, sequence++); // link into list |
— | — | @@ -759,7 +764,7 @@ |
760 | 765 | if(f.found != null) |
761 | 766 | termsFound.addAll(f.found); |
762 | 767 | adjustBest(f,tokens,weightTerms,words,wordIndex,newTerms); |
763 | | - RawSnippet s = new RawSnippet(tokens,f,wordHighlight,newTerms,stopWords); |
| 768 | + RawSnippet s = new RawSnippet(tokens,f,wordHighlight,newTerms,stopWords,isCJK); |
764 | 769 | res.add(s); |
765 | 770 | } else if(resNoNew.size() < maxSnippets) |
766 | 771 | resNoNew.add(f); |
— | — | @@ -768,7 +773,7 @@ |
769 | 774 | } |
770 | 775 | // if text doesn't match show some body text |
771 | 776 | if(showFirstIfNone && res.size() == 0 && fragmentsBeginning != null){ |
772 | | - res.add(new RawSnippet(tokens,fragmentsBeginning,wordHighlight,wordHighlight,stopWords)); |
| 777 | + res.add(new RawSnippet(tokens,fragmentsBeginning,wordHighlight,wordHighlight,stopWords,isCJK)); |
773 | 778 | } |
774 | 779 | // always show snippet that is before in the text first |
775 | 780 | Collections.sort(res, new Comparator<RawSnippet>() { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -787,6 +787,10 @@ |
788 | 788 | doc.add(new Field("date",isoDate.format(article.getDate()),Store.YES,Index.NO)); |
789 | 789 | |
790 | 790 | float rankBoost = transformRank(article.getRank()); |
| 791 | + |
 | 792 | + // add both title and redirects to the content, so queries matching partly in the title and partly in the body won't fail 
| 793 | + String contents = article.getContents(); |
| 794 | + contents = article.getTitle()+". "+contents+". "+serializeRedirects(article.getRedirectKeywords()); |
791 | 795 | |
792 | 796 | /** Following fields can be optionally case-dependent */ |
793 | 797 | for(FieldBuilder.BuilderSet bs : builder.getBuilders()){ |
— | — | @@ -797,7 +801,7 @@ |
798 | 802 | TokenizerOptions options = new TokenizerOptions(bs.isExactCase()); |
799 | 803 | if(filters.isSpellCheck()) |
800 | 804 | options = new TokenizerOptions.SpellCheck(); |
801 | | - WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,options); |
| 805 | + WikiTokenizer tokenizer = new WikiTokenizer(contents,iid,options); |
802 | 806 | tokenizer.tokenize(); |
803 | 807 | |
804 | 808 | // title |
— | — | @@ -844,6 +848,18 @@ |
845 | 849 | return doc; |
846 | 850 | } |
847 | 851 | |
 | 852 | + /** Serialize redirects that will be appended to the end of the article */ 
| 853 | + private static String serializeRedirects(ArrayList<String> redirectKeywords) { |
| 854 | + if(redirectKeywords.size()==0) |
| 855 | + return ""; |
| 856 | + StringBuilder sb = new StringBuilder(); |
| 857 | + for(String s : redirectKeywords){ |
| 858 | + sb.append(s); |
| 859 | + sb.append(". "); |
| 860 | + } |
| 861 | + return sb.toString(); |
| 862 | + } |
| 863 | + |
848 | 864 | /** Make the document that will be indexed as highlighting data */ |
849 | 865 | public static Document makeHighlightDocument(Article article, FieldBuilder builder, IndexId iid) throws IOException{ |
850 | 866 | WikiIndexModifier.transformArticleForIndexing(article); |
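
The indexing change above folds the title and all redirect keywords into the tokenized contents, so a query whose words are split between the title and the body (or a redirect) still matches the article. A sketch of how the indexed string is assembled, with a stand-in for the Article accessors used in the diff:

    import java.util.ArrayList;

    class IndexedContentsSketch {
        /** Mirror of serializeRedirects() above: ". " keeps phrases from crossing redirects. */
        static String serializeRedirects(ArrayList<String> redirectKeywords) {
            if (redirectKeywords.size() == 0)
                return "";
            StringBuilder sb = new StringBuilder();
            for (String s : redirectKeywords) {
                sb.append(s);
                sb.append(". ");
            }
            return sb.toString();
        }

        /** title + body + redirects, as fed to WikiTokenizer in the diff. */
        static String assemble(String title, String body, ArrayList<String> redirects) {
            return title + ". " + body + ". " + serializeRedirects(redirects);
        }
    }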
Index: branches/lucene-search-2.1/webinterface/lsweb.py |
— | — | @@ -6,7 +6,14 @@ |
7 | 7 | from urllib2 import URLError, HTTPError |
8 | 8 | |
9 | 9 | #search_host = { 'enwiki' : "srv79:8123", '<default>': 'srv79:8123' } |
10 | | -search_host = {'<default>' : 'localhost:8123', 'enwiki' : "srv79:8123", 'srwiki' : "srv79:8123" } |
| 10 | +search_host = {'<default>' : 'srv79:8123', |
| 11 | + 'jawiki' : "localhost:8123", |
| 12 | + 'frwiki' : "localhost:8123", |
| 13 | + 'dewiki' : "localhost:8123", |
| 14 | + 'itwiki' : "localhost:8123", |
| 15 | + 'jawikiquote' : "localhost:8123", |
| 16 | + 'wikilucene' : 'localhost:8123' } |
| 17 | +#search_host = {'<default>' : 'localhost:8123'} |
11 | 18 | |
12 | 19 | canon_namespaces = { 0 : '', 1: 'Talk', 2: 'User', 3: 'User_talk', |
13 | 20 | 4 : 'Project', 5 : 'Project_talk', 6 : 'Image', 7 : 'Image_talk', |
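
The search_host map routes each wiki to a search backend, with '<default>' as the fallback key; the edit above points most traffic at srv79 and the freshly set-up wikis at localhost. The lookup lsweb.py performs amounts to the following (sketched in Java for consistency with the other examples here; the real code is Python, and the exact lookup is an assumption from the dict layout):

    import java.util.HashMap;
    import java.util.Map;

    class HostRouting {
        /** Per-wiki host with '<default>' fallback. */
        static String hostFor(String dbname, Map<String, String> searchHost) {
            String host = searchHost.get(dbname);
            return host != null ? host : searchHost.get("<default>");
        }

        public static void main(String[] args) {
            Map<String, String> searchHost = new HashMap<String, String>();
            searchHost.put("<default>", "srv79:8123");
            searchHost.put("jawiki", "localhost:8123");
            System.out.println(hostFor("jawiki", searchHost)); // localhost:8123
            System.out.println(hostFor("enwiki", searchHost)); // srv79:8123
        }
    }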
Index: branches/lucene-search-2.1/webinterface/searchForm.html |
— | — | @@ -41,7 +41,7 @@ |
42 | 42 | </p> |
43 | 43 | |
44 | 44 | <p> |
45 | | -<strong>Status</strong> only en.wiki updated |
| 45 | +<strong>Status</strong> Up |
46 | 46 | </p> |
47 | 47 | <strong>Search:</strong> |
48 | 48 | <hr> |
— | — | @@ -53,25 +53,19 @@ |
54 | 54 | <option value="enwiki">enwiki</option> |
55 | 55 | <option value="dewiki">dewiki</option> |
56 | 56 | <option value="frwiki">frwiki</option> |
57 | | - <option value="mediawikiwiki">mediawikiwiki</option> |
58 | | - <option value="metawiki">metawiki</option> |
59 | | - <option value="wikilucene">wikilucene</option> |
60 | | - <option value="wikidev">wikidev</option> |
61 | | - <option value="enwiktionary">enwiktionary</option> |
62 | | - <option value="enwiktionary-exact">enwiktionary-exact</option> |
63 | | - <option value="enwikinews">enwikinews</option> |
64 | | - <option value="plwiki">plwiki</option> |
65 | 57 | <option value="jawiki">jawiki</option> |
66 | | - <option value="nlwiki">nlwiki</option> |
67 | 58 | <option value="itwiki">itwiki</option> |
68 | | - <option value="ptwiki">ptwiki</option> |
69 | | - <option value="eswiki">eswiki</option> |
70 | | - <option value="svwiki">svwiki</option> |
71 | | - <option value="ruwiki">ruwiki</option> |
72 | | - <option value="zhwiki">zhwiki</option> |
73 | | - <option value="fiwiki">fiwiki</option> |
74 | | - <option value="nowiki">nowiki</option> |
75 | | - <option value="srwiki">srwiki</option> |
| 59 | + <option value="srwiki">srwiki</option> |
| 60 | + <option value="enwiktionary">enwiktionary</option> |
| 61 | + <option value="enwikinews">enwikinews</option> |
| 62 | + <option value="enwikisource">enwikisource</option> |
| 63 | + <option value="enwikiquote">enwikiquote</option> |
| 64 | + <option value="enwikibooks">enwikibooks</option> |
| 65 | + <option value="enwikiversity">enwikiversity</option> |
| 66 | + <option value="enwiktionary-exact">enwiktionary-exact</option> |
| 67 | + <!--<option value="jawikiquote">jawikiquote</option> |
| 68 | + <option value="wikilucene">wikilucene</option> |
| 69 | + <option value="wikidev">wikidev</option> --> |
76 | 70 | </select> |
77 | 71 | |
78 | 72 | Search for <input type='text' name="query" value="" size="30" id="lsearchbox" /> |
Index: branches/lucene-search-2.1/lsearch-global.conf |
— | — | @@ -21,6 +21,7 @@ |
22 | 22 | wikiwiktionary, wikiwikisource : (single) (language,en) (prefix) |
23 | 23 | enwiki,viwiki,srwiki,eswiki,dewiki,mlwiki,zhwiki,jawiki,itwiki,thwiki : (single) |
24 | 24 | mediawikiwiki, metawiki : (single) (language,en) |
| 25 | +jawikiquote : (single) (prefix) |
25 | 26 | |
26 | 27 | # Titles grouped by interwiki, <all> is the general rule, exceptions can be explicitly set 
27 | 28 | [Database-Group] |
— | — | @@ -32,16 +33,7 @@ |
33 | 34 | # host : db1.part db2.part |
34 | 35 | # Multiple hosts can search multiple dbs (N-N mapping) 
35 | 36 | [Search-Group] |
36 | | -oblak : wikilucene* wikidev* |
37 | | -#oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links |
38 | | -#oblak : wikilucene.nspart1.sub1 wikilucene.nspart1.sub2 |
39 | | -#oblak : wikilucene.nspart1.sub1.hl wikilucene.nspart1.sub2.hl |
40 | | -#oblak : wikilucene.nspart2.hl wikilucene.nspart3.hl |
41 | | -#oblak : wikilucene.nspart2 wikilucene.nspart3 wikilucene.title_ngram |
42 | | -#oblak : wikilucene.prefix wikilucene.spell wikilucene.related wikilucene.links |
43 | | -#oblak : wikiwiktionary wikiwikisource wikiwiktionary.prefix |
44 | | -#oblak : wiki-titles wiki-titles.tspart1 wiki-titles.tspart2 |
45 | | -#oblak : wikidev.prefix wikidev.hl wikidev.spell |
| 37 | +oblak : wikilucene* wikidev* ja* |
46 | 38 | |
47 | 39 | # Index nodes |
48 | 40 | # host: db1.part db2.part |
— | — | @@ -66,7 +58,7 @@ |
67 | 59 | # Global properties 
68 | 60 | [Properties] |
69 | 61 | # suffixes to database name, the rest is assumed to be language code |
70 | | -Database.suffix=wiki wiktionary wikisource |
| 62 | +Database.suffix=wiki wiktionary wikisource wikiquote |
71 | 63 | |
72 | 64 | # use language codes as interwiki prefixes (useful tokenizer heuristics for WMF-style wiki farms) 
73 | 65 | Database.smartInterwiki=false |