Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -41,6 +41,7 @@ |
42 | 42 | Links links; |
43 | 43 | String langCode; |
44 | 44 | RelatedStorage related; |
| 45 | + boolean makeIndex, makeHighlight; |
45 | 46 | |
46 | 47 | public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor, |
47 | 48 | Integer maxBufDocs, boolean newIndex, Links links, String langCode, |
— | — | @@ -53,6 +54,8 @@ |
54 | 55 | highlightWriter = new SimpleIndexWriter(iid.getHighlight(), optimize, mergeFactor, maxBufDocs, newIndex); |
55 | 56 | this.limit = limit; |
56 | 57 | this.links = links; |
| 58 | + this.makeIndex = makeIndex; |
| 59 | + this.makeHighlight = makeHighlight; |
57 | 60 | this.langCode = langCode; |
58 | 61 | this.related = new RelatedStorage(iid); |
59 | 62 | if(!related.canRead()) |
— | — | @@ -86,7 +89,7 @@ |
87 | 90 | redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef)); |
88 | 91 | } |
89 | 92 | // related |
90 | | - if(related != null) |
| 93 | + if(makeIndex && related != null) |
91 | 94 | rel = related.getRelated(key); |
92 | 95 | // make article |
93 | 96 | Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect, |
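Note: the new makeIndex/makeHighlight flags let one dump pass build the search index, the highlight index, or both. A minimal invocation sketch; the tail of the constructor signature is elided in this hunk, so the position of the two new booleans is an assumption:

    // Sketch only: assumes the constructor tail gained (makeIndex, makeHighlight).
    DumpImporter imp = new DumpImporter("wikilucene", -1, true, null, null,
            true, links, "en", true /* makeIndex */, true /* makeHighlight */);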
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -46,7 +46,8 @@ |
47 | 47 | protected String langCode; |
48 | 48 | protected Links links; |
49 | 49 | protected Analyzer indexAnalyzer; |
50 | | - protected ReusableLanguageAnalyzer highlightAnalyzer; |
| 50 | + protected Analyzer highlightAnalyzer; |
| 51 | + protected ReusableLanguageAnalyzer highlightContentAnalyzer; |
51 | 52 | protected HashSet<String> stopWords; |
52 | 53 | |
53 | 54 | public SimpleIndexWriter(IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){ |
— | — | @@ -61,7 +62,8 @@ |
62 | 63 | builder = new FieldBuilder(iid,dCase); |
63 | 64 | indexes = new HashMap<String,IndexWriter>(); |
64 | 65 | indexAnalyzer = Analyzers.getIndexerAnalyzer(builder); |
65 | | - highlightAnalyzer = new ReusableLanguageAnalyzer(builder.getBuilder().getFilters(),false,true); |
| 66 | + highlightAnalyzer = Analyzers.getHighlightAnalyzer(iid); |
| 67 | + highlightContentAnalyzer = new ReusableLanguageAnalyzer(builder.getBuilder().getFilters(),false,true); |
66 | 68 | stopWords = StopWords.getPredefinedSet(iid); |
67 | 69 | // open all relevant indexes |
68 | 70 | for(IndexId part : iid.getPhysicalIndexIds()){ |
— | — | @@ -151,17 +153,12 @@ |
152 | 154 | IndexWriter writer = indexes.get(target.toString()); |
153 | 155 | if(writer == null) |
154 | 156 | return; |
155 | | - String key = a.getTitleObject().getKey(); |
156 | 157 | try { |
157 | | - // TODO: move to WikiIndexModifier? |
158 | | - Document doc = new Document(); |
159 | | - doc.add(new Field("key",key,Store.NO,Index.UN_TOKENIZED)); |
160 | | - doc.add(new Field("text",ExtToken.serialize(highlightAnalyzer.tokenStream("contents",a.getContents())),Store.COMPRESS)); |
161 | | - doc.add(new Field("alttitle",WikiIndexModifier.serializeAltTitle(a,iid,highlightAnalyzer.getWikiTokenizer().getHeadingText()),Store.COMPRESS)); |
| 158 | + Document doc = WikiIndexModifier.makeHighlightDocument(a,highlightAnalyzer,highlightContentAnalyzer,target); |
162 | 159 | addDocument(writer,doc,a,target); |
163 | 160 | } catch (IOException e) { |
164 | 161 | e.printStackTrace(); |
165 | | - log.error("Error adding document for key="+key+" : "+e.getMessage()); |
| 162 | + log.error("Error adding document for key="+a.getTitleObject().getKey()+" : "+e.getMessage()); |
166 | 163 | } |
167 | 164 | } |
168 | 165 | |
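The writer now holds two analyzers with distinct roles: highlightAnalyzer (per-field, from Analyzers.getHighlightAnalyzer) is passed down for alttitle serialization, while highlightContentAnalyzer produces the token stream stored for the article body. Condensed from the makeHighlightDocument hunk in WikiIndexModifier.java below:

    doc.add(new Field("key", key, Store.NO, Index.UN_TOKENIZED));
    doc.add(new Field("text",
        ExtToken.serialize(contentAnalyzer.tokenStream("contents", article.getContents())),
        Store.COMPRESS));
    doc.add(new Field("alttitle",
        Alttitles.serializeAltTitle(article, iid, sections, analyzer, "alttitle"),
        Store.COMPRESS));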
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java |
— | — | @@ -0,0 +1,107 @@ |
| 2 | +package org.wikimedia.lsearch.highlight; |
| 3 | + |
| 4 | +import java.io.Serializable; |
| 5 | +import java.util.ArrayList; |
| 6 | + |
| 7 | +import org.wikimedia.lsearch.analyzers.Alttitles; |
| 8 | + |
| 9 | +/** |
| 10 | + * Snippet of highlighted text. |
| 11 | + * |
| 12 | + * @author rainman |
| 13 | + * |
| 14 | + */ |
| 15 | +public class Snippet implements Serializable { |
| 16 | + public static class Range implements Serializable { |
| 17 | + public int start; |
| 18 | + public int end; |
| 19 | + |
| 20 | + public Range(int start, int end){ |
| 21 | + this.start = start; |
| 22 | + this.end = end; |
| 23 | + } |
| 24 | + |
| 25 | + @Override |
| 26 | + public int hashCode() { |
| 27 | + final int PRIME = 31; |
| 28 | + int result = 1; |
| 29 | + result = PRIME * result + end; |
| 30 | + result = PRIME * result + start; |
| 31 | + return result; |
| 32 | + } |
| 33 | + |
| 34 | + @Override |
| 35 | + public boolean equals(Object obj) { |
| 36 | + if (this == obj) |
| 37 | + return true; |
| 38 | + if (obj == null) |
| 39 | + return false; |
| 40 | + if (getClass() != obj.getClass()) |
| 41 | + return false; |
| 42 | + final Range other = (Range) obj; |
| 43 | + if (end != other.end) |
| 44 | + return false; |
| 45 | + if (start != other.start) |
| 46 | + return false; |
| 47 | + return true; |
| 48 | + } |
| 49 | + |
| 50 | + |
| 51 | + } |
| 52 | + protected String text = null; |
| 53 | + protected ArrayList<Range> highlighted = new ArrayList<Range>(); |
| 54 | + |
| 55 | + protected Alttitles.Info alttitle = null; |
| 56 | + |
| 57 | + public Snippet(){ |
| 58 | + |
| 59 | + } |
| 60 | + public Snippet(String text){ |
| 61 | + this.text = text; |
| 62 | + } |
| 63 | + |
| 64 | + public void addRange(Range r){ |
| 65 | + if(highlighted.size() != 0 && r.equals(highlighted.get(highlighted.size()-1))){ |
 | 66 | + return; // don't allow consecutive duplicates
| 67 | + } |
| 68 | + highlighted.add(r); |
| 69 | + } |
| 70 | + |
| 71 | + public ArrayList<Range> getHighlighted() { |
| 72 | + return highlighted; |
| 73 | + } |
| 74 | + |
| 75 | + public String getText() { |
| 76 | + return text; |
| 77 | + } |
| 78 | + |
| 79 | + public void setText(String text){ |
| 80 | + this.text = text; |
| 81 | + } |
| 82 | + |
| 83 | + public String getFormatted(){ |
| 84 | + StringBuilder sb = new StringBuilder(); |
| 85 | + int last = 0; |
| 86 | + for(Range r : highlighted){ |
| 87 | + sb.append(text.substring(last,r.start)); |
| 88 | + sb.append("<b>"); |
| 89 | + sb.append(text.substring(r.start,r.end)); |
| 90 | + sb.append("</b>"); |
| 91 | + last = r.end; |
| 92 | + } |
| 93 | + if(last != text.length()) |
| 94 | + sb.append(text.substring(last)); |
| 95 | + return sb.toString(); |
| 96 | + } |
| 97 | + public Alttitles.Info getAlttitle() { |
| 98 | + return alttitle; |
| 99 | + } |
| 100 | + public void setAlttitle(Alttitles.Info alttitle) { |
| 101 | + this.alttitle = alttitle; |
| 102 | + } |
| 103 | + public void setHighlighted(ArrayList<Range> highlighted) { |
| 104 | + this.highlighted = highlighted; |
| 105 | + } |
| 106 | + |
| 107 | + |
| 108 | +} |
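Usage sketch for the new class. Ranges are expected in left-to-right, non-overlapping order, since getFormatted() walks the text with a single cursor:

    Snippet s = new Snippet("quick brown fox");
    s.addRange(new Snippet.Range(0, 5));   // "quick"
    s.addRange(new Snippet.Range(12, 15)); // "fox"
    s.getFormatted();                      // -> "<b>quick</b> brown <b>fox</b>"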
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightResult.java |
— | — | @@ -0,0 +1,53 @@ |
| 2 | +package org.wikimedia.lsearch.highlight; |
| 3 | + |
| 4 | +import java.io.Serializable; |
| 5 | + |
| 6 | +/** |
 | 7 | + * Result of highlighting; contains
| 8 | + * snippets for title, redirect, sections, and text |
| 9 | + * @author rainman |
| 10 | + * |
| 11 | + */ |
| 12 | +public class HighlightResult implements Serializable { |
| 13 | + protected Snippet title = null; |
| 14 | + protected Snippet redirect = null; |
| 15 | + protected Snippet section = null; |
| 16 | + protected Snippet text = null; |
| 17 | + |
| 18 | + public HighlightResult(){ |
| 19 | + } |
| 20 | + |
| 21 | + public Snippet getRedirect() { |
| 22 | + return redirect; |
| 23 | + } |
| 24 | + |
| 25 | + public void setRedirect(Snippet redirect) { |
| 26 | + this.redirect = redirect; |
| 27 | + } |
| 28 | + |
| 29 | + public Snippet getSection() { |
| 30 | + return section; |
| 31 | + } |
| 32 | + |
| 33 | + public void setSection(Snippet section) { |
| 34 | + this.section = section; |
| 35 | + } |
| 36 | + |
| 37 | + public Snippet getText() { |
| 38 | + return text; |
| 39 | + } |
| 40 | + |
| 41 | + public void setText(Snippet text) { |
| 42 | + this.text = text; |
| 43 | + } |
| 44 | + |
| 45 | + public Snippet getTitle() { |
| 46 | + return title; |
| 47 | + } |
| 48 | + |
| 49 | + public void setTitle(Snippet title) { |
| 50 | + this.title = title; |
| 51 | + } |
| 52 | + |
| 53 | + |
| 54 | +} |
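Assembly sketch; the bean is Serializable, presumably so a whole per-article result can be shipped across the searcher boundary in one object (the snippet variables here are hypothetical):

    HighlightResult hr = new HighlightResult();
    hr.setTitle(new Snippet("Apache Lucene"));
    hr.setText(bestTextSnippet);         // hypothetical: top result of getBestTextSnippets
    hr.setRedirect(bestRedirectSnippet); // hypothetical: result of getBestAltTitle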
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java |
— | — | @@ -0,0 +1,324 @@ |
| 2 | +package org.wikimedia.lsearch.highlight; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Arrays; |
| 7 | +import java.util.Collections; |
| 8 | +import java.util.Comparator; |
| 9 | +import java.util.HashMap; |
| 10 | +import java.util.HashSet; |
| 11 | +import java.util.Set; |
| 12 | + |
| 13 | +import org.apache.log4j.Logger; |
| 14 | +import org.apache.lucene.document.Document; |
| 15 | +import org.apache.lucene.index.IndexReader; |
| 16 | +import org.apache.lucene.index.Term; |
| 17 | +import org.apache.lucene.index.TermDocs; |
| 18 | +import org.wikimedia.lsearch.analyzers.Alttitles; |
| 19 | +import org.wikimedia.lsearch.analyzers.ExtToken; |
| 20 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
| 21 | +import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
| 22 | +import org.wikimedia.lsearch.analyzers.ExtToken.Position; |
| 23 | +import org.wikimedia.lsearch.analyzers.ExtToken.Type; |
| 24 | +import org.wikimedia.lsearch.config.IndexId; |
| 25 | +import org.wikimedia.lsearch.search.SearcherCache; |
| 26 | + |
| 27 | +public class Highlight { |
| 28 | + protected static SearcherCache cache = null; |
| 29 | + static Logger log = Logger.getLogger(Highlight.class); |
| 30 | + |
| 31 | + public static final int SLOP = WikiQueryParser.MAINPHRASE_SLOP; |
| 32 | + /** maximal length of text that surrounds highlighted words */ |
| 33 | + public static final int MAX_CONTEXT = 75; |
| 34 | + |
| 35 | + public static final double PHRASE_BOOST = 1; |
| 36 | + |
 | 37 | + /** boost (preference) factors for various parts of the text */
| 38 | + public static final HashMap<Position,Double> BOOST = new HashMap<Position,Double>(); |
| 39 | + static { |
| 40 | + BOOST.put(Position.FIRST_SECTION,5.0); |
| 41 | + BOOST.put(Position.HEADING,2.0); |
| 42 | + BOOST.put(Position.NORMAL,1.0); |
| 43 | + BOOST.put(Position.TEMPLATE,0.1); |
| 44 | + BOOST.put(Position.IMAGE_CAT_IW,0.01); |
| 45 | + BOOST.put(Position.EXT_LINK,0.5); |
| 46 | + BOOST.put(Position.REFERENCE,0.5); |
| 47 | + } |
| 48 | + /** |
| 49 | + * |
 | 50 | + * @param hits - keys of articles that need to be highlighted
 | 51 | + * @param iid - search index (its highlight part is resolved internally)
 | 52 | + * @param terms - terms to highlight
 | 53 | + * @param df - their document frequencies
 | 54 | + * @param maxDoc - total number of documents (for idf)
 | 55 | + * @param words - in-order words (from the main phrase)
 | 56 | + * @param exactCase - whether these are results from an exact-case search
 | 57 | + * @throws IOException
 | 58 | + * @return map: key -> what to highlight
| 58 | + */ |
| 59 | + @SuppressWarnings("unchecked") |
| 60 | + public static HashMap<String,HighlightResult> highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, HashSet<String> stopWords) throws IOException{ |
| 61 | + if(cache == null) |
| 62 | + cache = SearcherCache.getInstance(); |
| 63 | + |
| 64 | + System.out.println("Highlighting: "+Arrays.toString(terms)); |
| 65 | + |
| 66 | + FieldNameFactory fields = new FieldNameFactory(exactCase); |
| 67 | + |
| 68 | + if(stopWords == null) |
| 69 | + stopWords = new HashSet<String>(); |
| 70 | + |
| 71 | + // terms weighted with idf |
| 72 | + HashMap<String,Double> weightTerm = new HashMap<String,Double>(); |
| 73 | + for(int i=0;i<terms.length;i++){ |
| 74 | + Term t = terms[i]; |
| 75 | + if(t.field().equals(fields.contents())){ |
| 76 | + double idf = idf(df[i],maxDoc); |
| 77 | + weightTerm.put(t.text(),idf); |
| 78 | + } |
| 79 | + } |
| 80 | + // position within main phrase |
| 81 | + HashMap<String,Integer> wordIndex = new HashMap<String,Integer>(); |
| 82 | + for(int i=0;i<words.size();i++) |
| 83 | + wordIndex.put(words.get(i),i); |
| 84 | + |
| 85 | + // process requested documents |
| 86 | + IndexReader reader = cache.getLocalSearcher(iid.getHighlight()).getIndexReader(); |
| 87 | + HashMap<String,HighlightResult> res = new HashMap<String,HighlightResult>(); |
| 88 | + for(String key : hits){ |
| 89 | + Object[] ret = getTokens(reader,key); |
| 90 | + if(ret == null) |
| 91 | + continue; |
| 92 | + ArrayList<ExtToken> tokens = (ArrayList<ExtToken>) ret[0]; |
| 93 | + Alttitles alttitles = (Alttitles) ret[1]; |
| 94 | + HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles); |
| 95 | + |
 | 96 | + HighlightResult hr = new HighlightResult();
 | 97 | + ArrayList<Snippet> textSnippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 2);
 | 98 | + if(textSnippets.size() > 0) hr.setText(textSnippets.get(0));
 | | + hr.setRedirect(getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,1));
 | | + hr.setSection(getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,0));
 | | + res.put(key,hr);
| 99 | + |
| 100 | + } |
| 101 | + return res; |
| 102 | + } |
| 103 | + |
| 104 | + /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ |
| 105 | + protected static double idf(int docFreq, int numDocs) { |
| 106 | + return Math.log(numDocs/(double)(docFreq+1)) + 1.0; |
| 107 | + } |
| 108 | + |
| 109 | + @SuppressWarnings("unchecked") |
| 110 | + protected static HashMap<String,Double> getTermsNotInTitle(HashMap<String,Double> weightTerm, Alttitles alttitles){ |
| 111 | + Alttitles.Info info = alttitles.getTitle(); |
| 112 | + ArrayList<ExtToken> tokens = info.getTokens(); |
| 113 | + HashMap<String,Double> ret = (HashMap<String, Double>) weightTerm.clone(); |
| 114 | + // delete all terms from title |
| 115 | + for(ExtToken t : tokens){ |
| 116 | + if(ret.containsKey(t.termText())) |
| 117 | + ret.remove(t.termText()); |
| 118 | + } |
| 119 | + return ret; |
| 120 | + |
| 121 | + } |
| 122 | + |
| 123 | + /** Alttitle and sections highlighting */ |
| 124 | + |
| 125 | + protected static class ScoredSnippet { |
| 126 | + Snippet snippet = null; |
| 127 | + double score = 0; |
| 128 | + public ScoredSnippet(Snippet snippet, double score) { |
| 129 | + this.snippet = snippet; |
| 130 | + this.score = score; |
| 131 | + } |
| 132 | + |
| 133 | + } |
| 134 | + |
| 135 | + protected static Snippet getBestAltTitle(ArrayList<Alttitles.Info> altInfos, HashMap<String,Double> weightTerm, |
| 136 | + HashMap<String,Double> notInTitle, HashSet<String> stopWords, int minAdditional){ |
| 137 | + ArrayList<ScoredSnippet> res = new ArrayList<ScoredSnippet>(); |
| 138 | + for(Alttitles.Info ainf : altInfos){ |
| 139 | + double matched = 0, additional=0; |
| 140 | + ArrayList<ExtToken> tokens = ainf.getTokens(); |
| 141 | + boolean completeMatch=true; |
| 142 | + for(int i=0;i<tokens.size();i++){ |
| 143 | + ExtToken t = tokens.get(i); |
| 144 | + if(t.getPositionIncrement() == 0) |
| 145 | + continue; // skip aliases |
| 146 | + |
| 147 | + if(weightTerm.containsKey(t.termText())) |
| 148 | + matched += weightTerm.get(t.termText()); |
| 149 | + else if(!stopWords.contains(t.termText())) |
| 150 | + completeMatch = false; |
| 151 | + |
| 152 | + if(notInTitle.containsKey(t.termText())) |
| 153 | + additional += notInTitle.get(t.termText()); |
| 154 | + } |
| 155 | + if((completeMatch && additional >= minAdditional) || additional >= minAdditional+1 || additional == notInTitle.size()){ |
| 156 | + Snippet snippet = makeSnippet(tokens,0,tokens.size(),weightTerm.keySet()); |
| 157 | + snippet.setAlttitle(ainf); |
| 158 | + res.add(new ScoredSnippet(snippet,matched+additional)); |
| 159 | + } |
| 160 | + } |
| 161 | + if(res.size() > 0){ |
| 162 | + if(res.size() == 1){ |
| 163 | + return res.get(0).snippet; |
| 164 | + } else{ |
| 165 | + // get snippet with best score |
| 166 | + Collections.sort(res, new Comparator<ScoredSnippet>() { |
| 167 | + public int compare(ScoredSnippet o1, ScoredSnippet o2) { |
| 168 | + double d = o2.score - o1.score; |
| 169 | + if(d > 0) |
| 170 | + return 1; |
| 171 | + else if(d == 0) |
| 172 | + return 0; |
| 173 | + else return -1; |
| 174 | + }}); |
| 175 | + return res.get(0).snippet; |
| 176 | + } |
| 177 | + } |
| 178 | + return null; |
| 179 | + } |
| 180 | + |
| 181 | + /** Text highlighting */ |
| 182 | + |
| 183 | + protected static class FragmentScore { |
| 184 | + int start = 0; |
| 185 | + int end = 0; |
| 186 | + double score = 0; |
| 187 | + // best match in this fragment |
| 188 | + int bestStart = -1; |
| 189 | + int bestEnd = -1; |
| 190 | + double bestScore = 0; |
| 191 | + |
| 192 | + FragmentScore(int start){ |
| 193 | + this.start = start; |
| 194 | + } |
| 195 | + |
| 196 | + public String toString(){ |
| 197 | + return "start="+start+", end="+end+", score="+score+", bestStart="+bestStart+", bestEnd="+bestEnd; |
| 198 | + } |
| 199 | + } |
| 200 | + |
| 201 | + /** Highlight text */ |
| 202 | + protected static ArrayList<Snippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms, |
| 203 | + HashMap<String,Integer> wordIndex, int maxSnippets) { |
| 204 | + |
 | 205 | + // pieces of text to be highlighted
| 206 | + ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>(); |
| 207 | + |
| 208 | + //System.out.println("TOKENS: "+tokens); |
| 209 | + |
| 210 | + FragmentScore fs = null; |
| 211 | + ExtToken last = null; |
| 212 | + // next three are for in-order matched phrases |
| 213 | + Integer lastWord = null; // pointer to the last word found in sequence |
| 214 | + int lastIndex = 0; |
| 215 | + Double lastWeight = null; |
| 216 | + // indicator for first sentence |
| 217 | + boolean seenFirstSentence = false; |
| 218 | + for(int i=0;i<=tokens.size();i++){ |
| 219 | + ExtToken t = null; |
| 220 | + if(i < tokens.size()) |
| 221 | + t = tokens.get(i); |
| 222 | + if(last == null){ |
| 223 | + fs = new FragmentScore(i); |
| 224 | + } else if(t==null || t.getPosition() != last.getPosition() || t.getType() == Type.SENTENCE_BREAK){ |
| 225 | + Position pos = last.getPosition(); |
| 226 | + // finalize fragment |
| 227 | + |
| 228 | + if(t == null || t.getType() != Type.SENTENCE_BREAK) |
| 229 | + fs.end = i; |
| 230 | + else |
| 231 | + fs.end = i + 1; |
| 232 | + fs.score *= BOOST.get(pos); |
| 233 | + fragments.add(fs); |
| 234 | + if(pos == Position.FIRST_SECTION && !seenFirstSentence){ |
| 235 | + // boost for first sentence |
| 236 | + fs.score *= 4; |
| 237 | + seenFirstSentence = true; |
| 238 | + } |
| 239 | + fs = new FragmentScore(fs.end); |
| 240 | + } |
| 241 | + if(t == null) |
| 242 | + break; |
| 243 | + |
| 244 | + Double weight = weightTerms.get(t.termText()); |
| 245 | + if(weight != null){ |
| 246 | + fs.score += weight; |
| 247 | + Integer inx = wordIndex.get(t.termText()); |
| 248 | + if(lastWord != null && inx != null && lastWord == inx - 1 && i-lastIndex <= 2){ |
| 249 | + double phraseScore = (weight + lastWeight) * PHRASE_BOOST; |
| 250 | + fs.score += phraseScore; |
| 251 | + if(phraseScore > fs.bestScore){ |
| 252 | + fs.bestStart = lastIndex; |
| 253 | + fs.bestEnd = i; |
| 254 | + fs.bestScore = phraseScore; |
| 255 | + } |
| 256 | + } |
| 257 | + lastWord = inx; |
| 258 | + lastWeight = weight; |
| 259 | + lastIndex = i; |
| 260 | + } |
| 261 | + |
| 262 | + last = t; |
| 263 | + } |
| 264 | + |
| 265 | + // find fragments with best score |
| 266 | + Collections.sort(fragments, new Comparator<FragmentScore>() { |
| 267 | + public int compare(FragmentScore o1, FragmentScore o2) { |
| 268 | + double d = o2.score - o1.score; |
| 269 | + if(d > 0) |
| 270 | + return 1; |
| 271 | + else if(d == 0) |
| 272 | + return 0; |
| 273 | + else return -1; |
| 274 | + }}); |
| 275 | + |
| 276 | + ArrayList<Snippet> res = new ArrayList<Snippet>(); |
| 277 | + for(FragmentScore f : fragments){ |
| 278 | + if(f.score == 0) |
| 279 | + continue; |
| 280 | + Snippet s = makeSnippet(tokens,f,weightTerms.keySet()); |
| 281 | + res.add(s); |
| 282 | + System.out.println(f+" : "+s.getFormatted()); |
| 283 | + if(res.size() >= maxSnippets) |
| 284 | + break; |
| 285 | + } |
| 286 | + return res; |
| 287 | + } |
| 288 | + |
| 289 | + private static Snippet makeSnippet(ArrayList<ExtToken> tokens, FragmentScore f, Set<String> highlight) { |
| 290 | + return makeSnippet(tokens,f.start,f.end,highlight); |
| 291 | + } |
| 292 | + |
| 293 | + private static Snippet makeSnippet(ArrayList<ExtToken> tokens, int fromIndex, int toIndex, Set<String> highlight) { |
| 294 | + Snippet s = new Snippet(); |
| 295 | + StringBuilder sb = new StringBuilder(); |
| 296 | + int start=0, end=0; |
| 297 | + for(int i=fromIndex;i<toIndex;i++){ |
| 298 | + ExtToken t = tokens.get(i); |
| 299 | + if(t.getPositionIncrement() != 0){ |
| 300 | + start = sb.length(); |
| 301 | + sb.append(t.getText()); |
| 302 | + end = sb.length(); |
| 303 | + } |
| 304 | + if(highlight.contains(t.termText())){ |
| 305 | + s.addRange(new Snippet.Range(start,end)); |
| 306 | + } |
| 307 | + } |
| 308 | + s.setText(sb.toString()); |
| 309 | + return s; |
| 310 | + } |
| 311 | + |
 | 312 | + /** @return Object[]{ ArrayList<ExtToken> tokens, Alttitles alttitles } */
| 313 | + protected static Object[] getTokens(IndexReader reader, String key) throws IOException{ |
| 314 | + TermDocs td = reader.termDocs(new Term("key",key)); |
| 315 | + if(td.next()){ |
| 316 | + System.out.println("Found "+key); |
| 317 | + Document doc = reader.document(td.doc()); |
| 318 | + ArrayList<ExtToken> tokens = ExtToken.deserialize(doc.getBinaryValue("text")); |
| 319 | + // FIXME: wrong deserialization for alttitle ! |
| 320 | + Alttitles alttitles = Alttitles.deserializeAltTitle(doc.getBinaryValue("alttitle")); |
| 321 | + return new Object[] {tokens, alttitles}; |
| 322 | + } else |
| 323 | + return null; |
| 324 | + } |
| 325 | +} |
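Invocation sketch for the entry point, assuming terms, document frequencies, and the main-phrase words come from the query-parsing side (see the getWords() accessor added to WikiQueryParser below); hitKeys, terms, df, maxDoc, and parser are hypothetical caller-side variables:

    HashMap<String, HighlightResult> res = Highlight.highlight(
            hitKeys,               // ns:title keys of the top hits
            iid,                   // search index; its highlight part is resolved inside
            terms, df, maxDoc,     // query terms and their document frequencies
            parser.getWords(),     // in-order main-phrase words
            false,                 // exactCase
            StopWords.getPredefinedSet(iid));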
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java |
— | — | @@ -148,10 +148,17 @@ |
149 | 149 | } |
150 | 150 | |
151 | 151 | /** |
152 | | - * @return Returns the page key (via article) |
 | 152 | + * @return the page key -- page_id (via article)
153 | 153 | */ |
154 | 154 | public String getKey(){ |
155 | 155 | return article.getKey(); |
156 | 156 | } |
157 | 157 | |
| 158 | + /** |
| 159 | + * @return Highlight key -- ns:title |
| 160 | + */ |
| 161 | + public String getHighlightKey(){ |
| 162 | + return article.getTitleObject().getKey(); |
| 163 | + } |
| 164 | + |
158 | 165 | } |
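The two keys side by side (values illustrative): the search index is keyed by page_id, the highlight index by ns:title, which is why the delete path in WikiIndexModifier below picks the key based on iid.isHighlight():

    rec.getKey();          // e.g. "12345"       -- page_id key (search index)
    rec.getHighlightKey(); // e.g. "0:Main Page" -- ns:title key (highlight index)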
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -22,6 +22,8 @@ |
23 | 23 | import org.apache.lucene.analysis.SimpleAnalyzer; |
24 | 24 | import org.apache.lucene.document.Document; |
25 | 25 | import org.apache.lucene.document.Field; |
| 26 | +import org.apache.lucene.document.Field.Index; |
| 27 | +import org.apache.lucene.document.Field.Store; |
26 | 28 | import org.apache.lucene.index.IndexReader; |
27 | 29 | import org.apache.lucene.index.IndexWriter; |
28 | 30 | import org.apache.lucene.index.Term; |
— | — | @@ -29,9 +31,11 @@ |
30 | 32 | import org.apache.lucene.store.FSDirectory; |
31 | 33 | import org.wikimedia.lsearch.analyzers.Aggregate; |
32 | 34 | import org.wikimedia.lsearch.analyzers.AggregateAnalyzer; |
| 35 | +import org.wikimedia.lsearch.analyzers.Alttitles; |
33 | 36 | import org.wikimedia.lsearch.analyzers.Analyzers; |
34 | 37 | import org.wikimedia.lsearch.analyzers.CategoryAnalyzer; |
35 | 38 | import org.wikimedia.lsearch.analyzers.ContextAnalyzer; |
| 39 | +import org.wikimedia.lsearch.analyzers.ExtToken; |
36 | 40 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
37 | 41 | import org.wikimedia.lsearch.analyzers.FieldBuilder; |
38 | 42 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
— | — | @@ -39,6 +43,7 @@ |
40 | 44 | import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer; |
41 | 45 | import org.wikimedia.lsearch.analyzers.LanguageAnalyzer; |
42 | 46 | import org.wikimedia.lsearch.analyzers.RelatedAnalyzer; |
| 47 | +import org.wikimedia.lsearch.analyzers.ReusableLanguageAnalyzer; |
43 | 48 | import org.wikimedia.lsearch.analyzers.StopWords; |
44 | 49 | import org.wikimedia.lsearch.analyzers.TokenizerOptions; |
45 | 50 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
— | — | @@ -138,7 +143,11 @@ |
139 | 144 | } |
140 | 145 | for(IndexUpdateRecord rec : records){ |
141 | 146 | if(rec.doDelete()){ |
142 | | - int count = reader.deleteDocuments(new Term("key", rec.getKey())); |
| 147 | + int count = 0; |
| 148 | + if(iid.isHighlight()) |
| 149 | + count = reader.deleteDocuments(new Term("key", rec.getHighlightKey())); |
| 150 | + else // normal index |
| 151 | + count = reader.deleteDocuments(new Term("key", rec.getKey())); |
143 | 152 | if(count == 0) |
144 | 153 | nonDeleteDocuments.add(rec); |
145 | 154 | IndexReportCard card = getReportCard(rec); |
— | — | @@ -184,7 +193,14 @@ |
185 | 194 | writer.setMaxFieldLength(MAX_FIELD_LENGTH); |
186 | 195 | FieldBuilder.Case dCase = (exactCase)? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE; |
187 | 196 | FieldBuilder builder = new FieldBuilder(iid,dCase); |
188 | | - Analyzer analyzer = Analyzers.getIndexerAnalyzer(builder); |
| 197 | + Analyzer analyzer = null; |
| 198 | + ReusableLanguageAnalyzer highlightContentAnalyzer = null; |
| 199 | + if(iid.isHighlight()){ |
| 200 | + highlightContentAnalyzer = Analyzers.getReusableHighlightAnalyzer(builder.getBuilder(dCase).getFilters()); |
| 201 | + analyzer = Analyzers.getHighlightAnalyzer(iid); |
| 202 | + } else |
| 203 | + analyzer = Analyzers.getIndexerAnalyzer(builder); |
| 204 | + |
189 | 205 | HashSet<String> stopWords = StopWords.getPredefinedSet(iid); |
190 | 206 | for(IndexUpdateRecord rec : records){ |
191 | 207 | if(rec.doAdd()){ |
— | — | @@ -193,8 +209,13 @@ |
194 | 210 | if(!checkPreconditions(rec)) |
195 | 211 | continue; // article shouldn't be added for some reason |
196 | 212 | IndexReportCard card = getReportCard(rec); |
197 | | - Document doc = makeDocument(rec.getArticle(),builder,iid,stopWords); |
| 213 | + Document doc; |
198 | 214 | try { |
| 215 | + if(iid.isHighlight()) |
| 216 | + doc = makeHighlightDocument(rec.getArticle(),analyzer,highlightContentAnalyzer,iid); |
| 217 | + else // normal index |
| 218 | + doc = makeDocument(rec.getArticle(),builder,iid,stopWords); |
| 219 | + |
199 | 220 | writer.addDocument(doc,analyzer); |
200 | 221 | log.debug(iid+": Adding document "+rec.getKey()+" "+rec.getArticle()); |
201 | 222 | if(card != null) |
— | — | @@ -279,7 +300,7 @@ |
280 | 301 | * |
281 | 302 | * @param article |
282 | 303 | */ |
283 | | - protected static void transformArticleForIndexing(Article ar) { |
| 304 | + public static void transformArticleForIndexing(Article ar) { |
284 | 305 | ArrayList<Redirect> redirects = ar.getRedirects(); |
285 | 306 | // sort redirect by their rank |
286 | 307 | Collections.sort(redirects,new Comparator<Redirect>() { |
— | — | @@ -365,6 +386,18 @@ |
366 | 387 | } |
367 | 388 | |
368 | 389 | /** |
| 390 | + * Update both the search and highlight index for iid. |
| 391 | + * |
| 392 | + * @param iid |
| 393 | + * @param updateRecords |
| 394 | + */ |
| 395 | + public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
| 396 | + boolean index = updateDocumentsOn(iid,updateRecords); |
| 397 | + boolean highlight = updateDocumentsOn(iid.getHighlight(),updateRecords); |
| 398 | + return index && highlight; |
| 399 | + } |
| 400 | + |
| 401 | + /** |
369 | 402 | * Update all documents in the collection. If needed the request |
370 | 403 | * is forwarded to a remote object (i.e. if the part of the split |
371 | 404 | * index is indexed by another host). |
— | — | @@ -372,7 +405,7 @@ |
373 | 406 | * @param iid |
374 | 407 | * @param updateRecords |
375 | 408 | */ |
376 | | - public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
| 409 | + protected boolean updateDocumentsOn(IndexId iid, Collection<IndexUpdateRecord> updateRecords){ |
377 | 410 | long now = System.currentTimeMillis(); |
378 | 411 | log.info("Starting update of "+updateRecords.size()+" records on "+iid+", started at "+now); |
379 | 412 | boolean succ = true; |
— | — | @@ -521,6 +554,17 @@ |
522 | 555 | return doc; |
523 | 556 | } |
524 | 557 | |
| 558 | + /** Make the document that will be indexed as highlighting data */ |
| 559 | + public static Document makeHighlightDocument(Article article, Analyzer analyzer, ReusableLanguageAnalyzer contentAnalyzer, IndexId iid) throws IOException{ |
| 560 | + String key = article.getTitleObject().getKey(); |
| 561 | + Document doc = new Document(); |
| 562 | + doc.add(new Field("key",key,Store.NO,Index.UN_TOKENIZED)); |
| 563 | + doc.add(new Field("text",ExtToken.serialize(contentAnalyzer.tokenStream("contents",article.getContents())),Store.COMPRESS)); |
| 564 | + ArrayList<String> sections = contentAnalyzer.getWikiTokenizer().getHeadingText(); |
| 565 | + doc.add(new Field("alttitle",Alttitles.serializeAltTitle(article,iid,sections,analyzer,"alttitle"),Store.COMPRESS)); |
| 566 | + return doc; |
| 567 | + } |
| 568 | + |
525 | 569 | /** add related aggregate field */ |
526 | 570 | protected static void makeRelated(Document doc, String prefix, Article article, IndexId iid, HashSet<String> stopWords){ |
527 | 571 | ArrayList<Aggregate> items = new ArrayList<Aggregate>(); |
— | — | @@ -548,34 +592,9 @@ |
549 | 593 | addToItems(items, new Aggregate(title+" "+h,rankBoost*HEADINGS_BOOST,iid,exactCase,stopWords)); |
550 | 594 | } |
551 | 595 | makeAggregate(doc,prefix,items); |
552 | | - } |
| 596 | + } |
553 | 597 | |
554 | 598 | |
555 | | - public enum AlttitleTypes { TITLE, REDIRECT, HEADING }; |
556 | | - |
557 | | - public static byte[] serializeAltTitle(Article article, IndexId iid, ArrayList<String> headingText) throws IOException{ |
558 | | - WikiIndexModifier.transformArticleForIndexing(article); |
559 | | - Buffer b = new Buffer(); |
560 | | - |
561 | | - // add title |
562 | | - String title = article.getTitle(); |
563 | | - b.writeAggregate(title,new Aggregate(title,article.getRank(),iid),AlttitleTypes.TITLE.ordinal()); |
564 | | - // add all redirects |
565 | | - ArrayList<String> redirects = article.getRedirectKeywords(); |
566 | | - ArrayList<Integer> ranks = article.getRedirectKeywordRanks(); |
567 | | - for(int i=0;i<redirects.size();i++){ |
568 | | - b.writeAggregate(redirects.get(i),new Aggregate(redirects.get(i),ranks.get(i),iid),AlttitleTypes.REDIRECT.ordinal()); |
569 | | - } |
570 | | - // add section headings! |
571 | | - for(String h : headingText){ |
572 | | - b.writeAggregate(h,new Aggregate(h,article.getRank()*HEADINGS_BOOST,iid),AlttitleTypes.HEADING.ordinal()); |
573 | | - } |
574 | | - |
575 | | - return b.getBytes(); |
576 | | - } |
577 | | - |
578 | | - |
579 | | - |
580 | 599 | private static void addToItems(ArrayList<Aggregate> items, Aggregate a){ |
581 | 600 | if(a.length() != 0) |
582 | 601 | items.add(a); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java |
— | — | @@ -100,6 +100,14 @@ |
101 | 101 | this.inCase = inCase; |
102 | 102 | } |
103 | 103 | |
 | 104 | + /** Get the text: the original if available, otherwise termText() */
| 105 | + public String getText(){ |
| 106 | + if(original != null) |
| 107 | + return original; |
| 108 | + else |
| 109 | + return termText(); |
| 110 | + } |
| 111 | + |
104 | 112 | public String toString(){ |
105 | 113 | return "\""+termText()+"\",t="+type+",p="+pos+(original!=null? ",o={"+original+"}" : "")+",i="+getPositionIncrement(); |
106 | 114 | } |
— | — | @@ -153,10 +161,20 @@ |
154 | 162 | b.writeString(t.termText()); |
155 | 163 | } |
156 | 164 | } |
157 | | - // control 1: original word |
| 165 | + |
158 | 166 | if(t.getPositionIncrement() > 0 && t.original != null){ |
159 | | - b.writeControl(1); |
160 | | - b.writeStringWithLength(t.original); |
| 167 | + String w = t.termText(); |
| 168 | + if(t.original.equals(w.substring(0,1).toUpperCase()+w.substring(1))){ |
| 169 | + // control 6: original is title case |
| 170 | + b.writeControl(6); |
| 171 | + } else if(t.original.equals(w.toUpperCase())){ |
| 172 | + // control 7: original is upper case |
| 173 | + b.writeControl(7); |
| 174 | + } else{ |
| 175 | + // control 1: original word |
| 176 | + b.writeControl(1); |
| 177 | + b.writeStringWithLength(t.original); |
| 178 | + } |
161 | 179 | } |
162 | 180 | // control 2: alias |
163 | 181 | if(t.getPositionIncrement() == 0){ |
— | — | @@ -294,12 +312,18 @@ |
295 | 313 | throw new RuntimeException("Bad serialized data: trying to assign a sentence break to text");
296 | 314 | t.setType(Type.SENTENCE_BREAK); |
297 | 315 | break; |
298 | | - case 5: |
| 316 | + case 5: // url |
299 | 317 | { int len = serialized[cur++]; |
300 | 318 | ExtToken tt = new ExtToken(new String(serialized,cur,len,"utf-8"),cur,cur+len,Type.URL,Position.EXT_LINK); |
301 | 319 | tokens.add(tt); |
302 | 320 | cur += len; |
303 | 321 | break; } |
| 322 | + case 6: // original is title case |
| 323 | + t.setOriginal(t.termText().substring(0,1).toUpperCase()+t.termText().substring(1)); |
| 324 | + break; |
| 325 | + case 7: // original is upper case |
| 326 | + t.setOriginal(t.termText().toUpperCase()); |
| 327 | + break; |
304 | 328 | default: |
305 | 329 | throw new RuntimeException("Unknown control sequence "+control);
306 | 330 | } |
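Worked example of the new case controls, assuming term text is the lowercased form. When the original differs from the term only by title or upper casing, a one-byte control replaces the length-prefixed string:

    // termText "lucene",   original "Lucene"   -> control 6, no string payload
    // termText "nasa",     original "NASA"     -> control 7, no string payload
    // termText "mcdonald", original "McDonald" -> control 1 + "McDonald" written out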
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Alttitles.java |
— | — | @@ -0,0 +1,126 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collection; |
| 7 | + |
| 8 | +import org.apache.lucene.analysis.Analyzer; |
| 9 | +import org.wikimedia.lsearch.beans.Article; |
| 10 | +import org.wikimedia.lsearch.config.IndexId; |
| 11 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
| 12 | +import org.wikimedia.lsearch.util.Buffer; |
| 13 | + |
| 14 | +/** |
 | 15 | + * Titles, redirects and section headings: serialization/deserialization
 | 16 | + * for highlighting, etc.
| 17 | + * |
| 18 | + * @author rainman |
| 19 | + * |
| 20 | + */ |
| 21 | +public class Alttitles { |
| 22 | + protected Info title; |
| 23 | + protected ArrayList<Info> redirects = new ArrayList<Info>(); |
| 24 | + protected ArrayList<Info> sections = new ArrayList<Info>(); |
| 25 | + |
| 26 | + public static class Info { |
| 27 | + protected String title; |
| 28 | + protected int rank; |
| 29 | + protected ArrayList<ExtToken> tokens; |
| 30 | + public Info(String title, int rank, ArrayList<ExtToken> tokens){ |
| 31 | + this.title = title; |
| 32 | + this.rank = rank; |
| 33 | + this.tokens = tokens; |
| 34 | + } |
| 35 | + public int getRank() { |
| 36 | + return rank; |
| 37 | + } |
| 38 | + public void setRank(int rank) { |
| 39 | + this.rank = rank; |
| 40 | + } |
| 41 | + public String getTitle() { |
| 42 | + return title; |
| 43 | + } |
| 44 | + public void setTitle(String title) { |
| 45 | + this.title = title; |
| 46 | + } |
| 47 | + public ArrayList<ExtToken> getTokens() { |
| 48 | + return tokens; |
| 49 | + } |
| 50 | + public void setTokens(ArrayList<ExtToken> tokens) { |
| 51 | + this.tokens = tokens; |
| 52 | + } |
| 53 | + |
| 54 | + } |
| 55 | + |
| 56 | + public static byte[] serializeAltTitle(Article article, IndexId iid, Collection<String> sections, Analyzer analyzer, String field) throws IOException{ |
| 57 | + WikiIndexModifier.transformArticleForIndexing(article); |
| 58 | + Buffer b = new Buffer(); |
| 59 | + |
| 60 | + // add title |
| 61 | + String title = article.getTitle(); |
| 62 | + // type 0 : title |
| 63 | + b.writeAlttitleInfo(title,new Aggregate(title,article.getRank(),iid,analyzer,field),0); |
| 64 | + // add all redirects |
| 65 | + ArrayList<String> redirects = article.getRedirectKeywords(); |
| 66 | + ArrayList<Integer> ranks = article.getRedirectKeywordRanks(); |
| 67 | + for(int i=0;i<redirects.size();i++){ |
| 68 | + // type 1: redirect |
| 69 | + b.writeAlttitleInfo(redirects.get(i),new Aggregate(redirects.get(i),ranks.get(i),iid,analyzer,field),1); |
| 70 | + } |
| 71 | + |
| 72 | + // type 2: sections |
| 73 | + for(String s : sections){ |
| 74 | + b.writeAlttitleInfo(s,new Aggregate(s,1,iid,analyzer,field),2); |
| 75 | + } |
| 76 | + |
| 77 | + return b.getBytes(); |
| 78 | + } |
| 79 | + |
| 80 | + public static Alttitles deserializeAltTitle(byte[] serialized){ |
| 81 | + Buffer b = new Buffer(serialized); |
| 82 | + Alttitles t = new Alttitles(); |
| 83 | + while(b.hasMore()){ |
| 84 | + Object[] ret = b.readAlttitleInfo(); |
| 85 | + int type = (Integer)ret[0]; |
| 86 | + Info info = (Info)ret[1]; |
| 87 | + if(type == 0) |
| 88 | + t.title = info; |
| 89 | + else if(type == 1) |
| 90 | + t.redirects.add(info); |
| 91 | + else if(type == 2) |
| 92 | + t.sections.add(info); |
| 93 | + else |
| 94 | + throw new RuntimeException("Wrong type for serialized alttitle "+type); |
| 95 | + } |
| 96 | + return t; |
| 97 | + } |
| 98 | + |
| 99 | + public ArrayList<Info> getRedirects() { |
| 100 | + return redirects; |
| 101 | + } |
| 102 | + |
| 103 | + public void setRedirects(ArrayList<Info> redirects) { |
| 104 | + this.redirects = redirects; |
| 105 | + } |
| 106 | + |
| 107 | + public Info getTitle() { |
| 108 | + return title; |
| 109 | + } |
| 110 | + |
| 111 | + public void setTitle(Info title) { |
| 112 | + this.title = title; |
| 113 | + } |
| 114 | + |
| 115 | + public ArrayList<Info> getSections() { |
| 116 | + return sections; |
| 117 | + } |
| 118 | + |
| 119 | + public void setSections(ArrayList<Info> sections) { |
| 120 | + this.sections = sections; |
| 121 | + } |
| 122 | + |
| 123 | + |
| 124 | + |
| 125 | + |
| 126 | + |
| 127 | +} |
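Round-trip sketch, mirroring how Highlight.getTokens() consumes the stored alttitle field (article, iid, sections, and analyzer are assumed to be in scope):

    byte[] blob = Alttitles.serializeAltTitle(article, iid, sections, analyzer, "alttitle");
    Alttitles alt = Alttitles.deserializeAltTitle(blob);
    alt.getTitle();     // the single type-0 record
    alt.getRedirects(); // type-1 records
    alt.getSections();  // type-2 records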
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -83,6 +83,7 @@ |
84 | 84 | |
85 | 85 | private TokenStream tokenStream; |
86 | 86 | private ArrayList<Token> tokens; // tokens from analysis |
| 87 | + protected ArrayList<String> words; |
87 | 88 | |
88 | 89 | /** sometimes the fieldsubquery takes the bool modifier, to retrieve it, use this variable, |
89 | 90 | * this will always point to the last unused bool modifier */ |
— | — | @@ -1731,7 +1732,7 @@ |
1732 | 1733 | Object[] qtwords = makeTitleQuery(queryText); |
1733 | 1734 | // qt = title query, qp = title phrase query |
1734 | 1735 | Query qt = (Query) qtwords[0]; |
1735 | | - ArrayList<String> words = (ArrayList<String>) qtwords[1]; |
| 1736 | + words = (ArrayList<String>) qtwords[1]; |
1736 | 1737 | if(qc == null || qt == null) |
1737 | 1738 | return new BooleanQuery(); |
1738 | 1739 | if(qc.equals(qt)) |
— | — | @@ -1837,6 +1838,10 @@ |
1838 | 1839 | } |
1839 | 1840 | public void setBuilder(FieldBuilder.BuilderSet builder) { |
1840 | 1841 | this.builder = builder; |
1841 | | - } |
| 1842 | + } |
1842 | 1843 | |
| 1844 | + public ArrayList<String> getWords(){ |
| 1845 | + return words; |
| 1846 | + } |
| 1847 | + |
1843 | 1848 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -52,7 +52,7 @@ |
53 | 53 | * @param language |
54 | 54 | * @return |
55 | 55 | */ |
56 | | - public static Analyzer getReusableHighlightAnalyzer(FilterFactory filters){ |
| 56 | + public static ReusableLanguageAnalyzer getReusableHighlightAnalyzer(FilterFactory filters){ |
57 | 57 | return new ReusableLanguageAnalyzer(filters,false,true); |
58 | 58 | } |
59 | 59 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java |
— | — | @@ -1,9 +1,12 @@ |
2 | 2 | package org.wikimedia.lsearch.analyzers; |
3 | 3 | |
| 4 | +import java.io.IOException; |
4 | 5 | import java.util.ArrayList; |
5 | 6 | import java.util.HashSet; |
6 | 7 | |
| 8 | +import org.apache.lucene.analysis.Analyzer; |
7 | 9 | import org.apache.lucene.analysis.Token; |
| 10 | +import org.apache.lucene.analysis.TokenStream; |
8 | 11 | import org.wikimedia.lsearch.config.IndexId; |
9 | 12 | |
10 | 13 | /** |
— | — | @@ -20,6 +23,7 @@ |
21 | 24 | |
22 | 25 | /** Construct from arbitrary text that will be tokenized */ |
23 | 26 | public Aggregate(String text, float boost, IndexId iid, boolean exactCase, HashSet<String> stopWords){ |
| 27 | + // FIXME: hey, we should use an analyzer here!!! |
24 | 28 | TokenizerOptions options = new TokenizerOptions.NoRelocation(exactCase); |
25 | 29 | tokens = new FastWikiTokenizerEngine(text,iid,options).parse(); |
26 | 30 | this.boost = boost; |
— | — | @@ -33,14 +37,23 @@ |
34 | 38 | noStopWordsLength = tokens.size(); |
35 | 39 | } |
36 | 40 | |
37 | | - /** Construct for highlight */ |
38 | | - public Aggregate(String text, float boost, IndexId iid){ |
39 | | - TokenizerOptions options = new TokenizerOptions.Highlight(); |
40 | | - tokens = new FastWikiTokenizerEngine(text,iid,options).parse(); |
| 41 | + /** Construct with specific analyzer |
| 42 | + * @throws IOException */ |
| 43 | + public Aggregate(String text, float boost, IndexId iid, Analyzer analyzer, String field) throws IOException{ |
| 44 | + this.tokens = toTokenArray(analyzer.tokenStream(field,text)); |
41 | 45 | this.boost = boost; |
42 | 46 | this.noStopWordsLength = tokens.size(); |
43 | 47 | } |
44 | 48 | |
| 49 | + private ArrayList<Token> toTokenArray(TokenStream stream) throws IOException { |
| 50 | + ArrayList<Token> tt = new ArrayList<Token>(); |
| 51 | + Token t = null; |
| 52 | + while( (t = stream.next()) != null){ |
| 53 | + tt.add(t); |
| 54 | + } |
| 55 | + return tt; |
| 56 | + } |
| 57 | + |
45 | 58 | /** Number of tokens */ |
46 | 59 | public int length(){ |
47 | 60 | if(tokens != null) |
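The new constructor drains a Lucene 2.x TokenStream eagerly (stream.next() until null) instead of invoking FastWikiTokenizerEngine directly. A construction sketch using the highlight analyzer referenced elsewhere in this patch:

    Aggregate agg = new Aggregate(article.getTitle(), article.getRank(), iid,
            Analyzers.getHighlightAnalyzer(iid), "alttitle");
    agg.length(); // number of buffered tokens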
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Buffer.java |
— | — | @@ -1,9 +1,12 @@ |
2 | 2 | package org.wikimedia.lsearch.util; |
3 | 3 | |
| 4 | +import java.io.EOFException; |
4 | 5 | import java.io.IOException; |
5 | 6 | import java.io.UnsupportedEncodingException; |
| 7 | +import java.util.ArrayList; |
6 | 8 | |
7 | 9 | import org.wikimedia.lsearch.analyzers.Aggregate; |
| 10 | +import org.wikimedia.lsearch.analyzers.Alttitles; |
8 | 11 | import org.wikimedia.lsearch.analyzers.ExtToken; |
9 | 12 | import org.wikimedia.lsearch.analyzers.LanguageAnalyzer; |
10 | 13 | |
— | — | @@ -11,6 +14,63 @@ |
12 | 15 | public byte[] buf = new byte[256]; |
13 | 16 | public int len=0; |
14 | 17 | |
| 18 | + /** for writing */ |
| 19 | + public Buffer(){ |
| 20 | + } |
| 21 | + |
| 22 | + /** for reading */ |
| 23 | + public Buffer(byte[] buf){ |
| 24 | + this.buf = buf; |
| 25 | + } |
| 26 | + |
| 27 | + public byte read(){ |
| 28 | + return buf[len++]; |
| 29 | + } |
| 30 | + |
| 31 | + public int readInt(){ |
| 32 | + int ch1 = read() & 0xFF; |
| 33 | + int ch2 = read() & 0xFF; |
| 34 | + int ch3 = read() & 0xFF; |
| 35 | + int ch4 = read() & 0xFF; |
| 36 | + return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)); |
| 37 | + } |
| 38 | + |
| 39 | + public String readStringWithLength(){ |
| 40 | + int strlen = read(); |
| 41 | + String s; |
| 42 | + try { |
| 43 | + s = new String(buf,len,strlen,"utf-8"); |
| 44 | + len += strlen; |
| 45 | + return s; |
| 46 | + } catch (UnsupportedEncodingException e) { |
| 47 | + e.printStackTrace(); |
| 48 | + return null; |
| 49 | + } |
| 50 | + } |
| 51 | + |
| 52 | + public byte[] readBytesWithLength(){ |
| 53 | + int l = readInt(); |
| 54 | + byte[] b = new byte[l]; |
| 55 | + System.arraycopy(buf,len,b,0,l); |
| 56 | + len += l; |
| 57 | + return b; |
| 58 | + } |
| 59 | + |
 | 60 | + /** @return Object[]{ Integer type, Alttitles.Info info (title, rank, tokens) } */
| 61 | + public Object[] readAlttitleInfo(){ |
| 62 | + Integer type = (int)read(); |
| 63 | + Integer boost = readInt(); |
| 64 | + String title = readStringWithLength(); |
| 65 | + ArrayList<ExtToken> tokens = ExtToken.deserialize(readBytesWithLength()); |
| 66 | + return new Object[] { type, new Alttitles.Info(title,boost,tokens)}; |
| 67 | + } |
| 68 | + |
| 69 | + public boolean hasMore(){ |
| 70 | + return len < buf.length; |
| 71 | + } |
| 72 | + |
| 73 | + ///////// WRITE /////////// |
| 74 | + |
15 | 75 | public byte[] getBytes(){ |
16 | 76 | byte[] ret = new byte[len]; |
17 | 77 | System.arraycopy(buf,0,ret,0,len); |
— | — | @@ -82,7 +142,7 @@ |
83 | 143 | |
84 | 144 | /** Format: type (1b), rank (4b), text (string), size of serialized (4b), serialized (bytes) |
85 | 145 | * @throws IOException */ |
86 | | - public final void writeAggregate(String text, Aggregate a, int type) throws IOException{ |
| 146 | + public final void writeAlttitleInfo(String text, Aggregate a, int type) throws IOException{ |
87 | 147 | write(type); |
88 | 148 | writeInt((int)a.boost()); |
89 | 149 | writeStringWithLength(text); |
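The buffer is now bidirectional: the no-arg constructor opens it for writing (len is the bytes written), while Buffer(byte[]) opens it for reading (len doubles as the cursor). A symmetry sketch pairing the new readers with the existing writers:

    Buffer w = new Buffer();               // write mode
    w.writeInt(42);
    w.writeStringWithLength("redirect");
    Buffer r = new Buffer(w.getBytes());   // read mode
    int n = r.readInt();                   // 42
    String s = r.readStringWithLength();   // "redirect"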
Index: branches/lucene-search-2.1/build.xml |
— | — | @@ -61,6 +61,7 @@ |
62 | 62 | <zipfileset src="lib/snowball.jar" /> |
63 | 63 | <zipfileset src="lib/mwdumper.jar" /> |
64 | 64 | <zipfileset src="lib/mysql-connector-java-3.0.17-ga-bin.jar" /> |
| 65 | + <zipfileset src="lib/lucene-highlighter-2.2.0.jar" /> |
65 | 66 | </jar> |
66 | 67 | </target> |
67 | 68 | |
Index: branches/lucene-search-2.1/.classpath |
— | — | @@ -12,8 +12,8 @@ |
13 | 13 | <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> |
14 | 14 | <classpathentry kind="lib" path="lib/snowball.jar"/> |
15 | 15 | <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/> |
16 | | - <classpathentry kind="lib" path="lib/highlighter.jar" sourcepath="/sandbox/highlighter"/> |
17 | 16 | <classpathentry kind="lib" path="lib/mysql-connector-java-3.0.17-ga-bin.jar"/> |
18 | 17 | <classpathentry kind="lib" path="lib/lucene-core-2.2.0.jar" sourcepath="/lucene-2.2/src"/> |
| 18 | + <classpathentry kind="lib" path="lib/lucene-highlighter-2.2.0.jar" sourcepath="/lucene-2.2/contrib/highlighter/src/java"/> |
19 | 19 | <classpathentry kind="output" path="bin"/> |
20 | 20 | </classpath> |
Index: branches/lucene-search-2.1/lsearch-global.conf |
— | — | @@ -27,6 +27,7 @@ |
28 | 28 | [Search-Group] |
29 | 29 | oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links |
30 | 30 | oblak : wikilucene.nspart1.sub1 wikilucene.nspart1.sub2 |
| 31 | +oblak : wikilucene.nspart1.sub1.hl wikilucene.nspart1.sub2.hl |
31 | 32 | |
32 | 33 | # Index nodes |
33 | 34 | # host: db1.part db2.part |