r27370 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r27369 | r27370 | r27371 >
Date: 22:58, 10 November 2007
Author: rainman
Status: old
Tags:
Comment:
Highlighting, more work in progress:
* Basic highlighting algorithm in class Highlight; scores text fragments, redirects, and section names
* Untested incremental index updates for .hl indexes
* Added read operations to Buffer
Modified paths:
  • /branches/lucene-search-2.1/.classpath (modified) (history)
  • /branches/lucene-search-2.1/build.xml (modified) (history)
  • /branches/lucene-search-2.1/lsearch-global.conf (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Alttitles.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightResult.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Buffer.java (modified) (history)

Diff
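Before the per-file diffs, a hedged sketch of how the new entry point is meant to be called (signature from Highlight.java below). The inputs here are placeholder values, and in this work-in-progress revision the returned map is not yet filled in:

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.lucene.index.Term;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.highlight.Highlight;
import org.wikimedia.lsearch.highlight.HighlightResult;

public class HighlightSketch {
    // iid (the highlight index id) is assumed to come from the search config
    static HashMap<String, HighlightResult> run(IndexId iid) throws IOException {
        ArrayList<String> hits = new ArrayList<String>();   // "ns:title" keys of matched articles
        hits.add("0:Douglas Adams");
        Term[] terms = { new Term("contents", "guide") };   // terms to highlight
        int[] df = { 10 };                                  // their document frequencies
        ArrayList<String> words = new ArrayList<String>();  // in-order words of the main phrase
        words.add("guide");
        return Highlight.highlight(hits, iid, terms, df, 1000000, words,
                false, new HashSet<String>());
    }
}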

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -41,6 +41,7 @@
4242 Links links;
4343 String langCode;
4444 RelatedStorage related;
 45+ boolean makeIndex, makeHighlight;
4546
4647 public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor,
4748 Integer maxBufDocs, boolean newIndex, Links links, String langCode,
@@ -53,6 +54,8 @@
5455 highlightWriter = new SimpleIndexWriter(iid.getHighlight(), optimize, mergeFactor, maxBufDocs, newIndex);
5556 this.limit = limit;
5657 this.links = links;
 58+ this.makeIndex = makeIndex;
 59+ this.makeHighlight = makeHighlight;
5760 this.langCode = langCode;
5861 this.related = new RelatedStorage(iid);
5962 if(!related.canRead())
@@ -86,7 +89,7 @@
8790 redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef));
8891 }
8992 // related
90 - if(related != null)
 93+ if(makeIndex && related != null)
9194 rel = related.getRelated(key);
9295 // make article
9396 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -46,7 +46,8 @@
4747 protected String langCode;
4848 protected Links links;
4949 protected Analyzer indexAnalyzer;
50 - protected ReusableLanguageAnalyzer highlightAnalyzer;
 50+ protected Analyzer highlightAnalyzer;
 51+ protected ReusableLanguageAnalyzer highlightContentAnalyzer;
5152 protected HashSet<String> stopWords;
5253
5354 public SimpleIndexWriter(IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){
@@ -61,7 +62,8 @@
6263 builder = new FieldBuilder(iid,dCase);
6364 indexes = new HashMap<String,IndexWriter>();
6465 indexAnalyzer = Analyzers.getIndexerAnalyzer(builder);
65 - highlightAnalyzer = new ReusableLanguageAnalyzer(builder.getBuilder().getFilters(),false,true);
 66+ highlightAnalyzer = Analyzers.getHighlightAnalyzer(iid);
 67+ highlightContentAnalyzer = new ReusableLanguageAnalyzer(builder.getBuilder().getFilters(),false,true);
6668 stopWords = StopWords.getPredefinedSet(iid);
6769 // open all relevant indexes
6870 for(IndexId part : iid.getPhysicalIndexIds()){
@@ -151,17 +153,12 @@
152154 IndexWriter writer = indexes.get(target.toString());
153155 if(writer == null)
154156 return;
155 - String key = a.getTitleObject().getKey();
156157 try {
157 - // TODO: move to WikiIndexModifier?
158 - Document doc = new Document();
159 - doc.add(new Field("key",key,Store.NO,Index.UN_TOKENIZED));
160 - doc.add(new Field("text",ExtToken.serialize(highlightAnalyzer.tokenStream("contents",a.getContents())),Store.COMPRESS));
161 - doc.add(new Field("alttitle",WikiIndexModifier.serializeAltTitle(a,iid,highlightAnalyzer.getWikiTokenizer().getHeadingText()),Store.COMPRESS));
 158+ Document doc = WikiIndexModifier.makeHighlightDocument(a,highlightAnalyzer,highlightContentAnalyzer,target);
162159 addDocument(writer,doc,a,target);
163160 } catch (IOException e) {
164161 e.printStackTrace();
165 - log.error("Error adding document for key="+key+" : "+e.getMessage());
 162+ log.error("Error adding document for key="+a.getTitleObject().getKey()+" : "+e.getMessage());
166163 }
167164 }
168165
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java
@@ -0,0 +1,107 @@
 2+package org.wikimedia.lsearch.highlight;
 3+
 4+import java.io.Serializable;
 5+import java.util.ArrayList;
 6+
 7+import org.wikimedia.lsearch.analyzers.Alttitles;
 8+
 9+/**
 10+ * Snippet of highlighted text.
 11+ *
 12+ * @author rainman
 13+ *
 14+ */
 15+public class Snippet implements Serializable {
 16+ public static class Range implements Serializable {
 17+ public int start;
 18+ public int end;
 19+
 20+ public Range(int start, int end){
 21+ this.start = start;
 22+ this.end = end;
 23+ }
 24+
 25+ @Override
 26+ public int hashCode() {
 27+ final int PRIME = 31;
 28+ int result = 1;
 29+ result = PRIME * result + end;
 30+ result = PRIME * result + start;
 31+ return result;
 32+ }
 33+
 34+ @Override
 35+ public boolean equals(Object obj) {
 36+ if (this == obj)
 37+ return true;
 38+ if (obj == null)
 39+ return false;
 40+ if (getClass() != obj.getClass())
 41+ return false;
 42+ final Range other = (Range) obj;
 43+ if (end != other.end)
 44+ return false;
 45+ if (start != other.start)
 46+ return false;
 47+ return true;
 48+ }
 49+
 50+
 51+ }
 52+ protected String text = null;
 53+ protected ArrayList<Range> highlighted = new ArrayList<Range>();
 54+
 55+ protected Alttitles.Info alttitle = null;
 56+
 57+ public Snippet(){
 58+
 59+ }
 60+ public Snippet(String text){
 61+ this.text = text;
 62+ }
 63+
 64+ public void addRange(Range r){
 65+ if(highlighted.size() != 0 && r.equals(highlighted.get(highlighted.size()-1))){
 66+ return; // don't allow duplicates!
 67+ }
 68+ highlighted.add(r);
 69+ }
 70+
 71+ public ArrayList<Range> getHighlighted() {
 72+ return highlighted;
 73+ }
 74+
 75+ public String getText() {
 76+ return text;
 77+ }
 78+
 79+ public void setText(String text){
 80+ this.text = text;
 81+ }
 82+
 83+ public String getFormatted(){
 84+ StringBuilder sb = new StringBuilder();
 85+ int last = 0;
 86+ for(Range r : highlighted){
 87+ sb.append(text.substring(last,r.start));
 88+ sb.append("<b>");
 89+ sb.append(text.substring(r.start,r.end));
 90+ sb.append("</b>");
 91+ last = r.end;
 92+ }
 93+ if(last != text.length())
 94+ sb.append(text.substring(last));
 95+ return sb.toString();
 96+ }
 97+ public Alttitles.Info getAlttitle() {
 98+ return alttitle;
 99+ }
 100+ public void setAlttitle(Alttitles.Info alttitle) {
 101+ this.alttitle = alttitle;
 102+ }
 103+ public void setHighlighted(ArrayList<Range> highlighted) {
 104+ this.highlighted = highlighted;
 105+ }
 106+
 107+
 108+}
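A minimal usage sketch of the range bookkeeping above, assuming only the Snippet class from this diff:

import org.wikimedia.lsearch.highlight.Snippet;

public class SnippetDemo {
    public static void main(String[] args) {
        Snippet s = new Snippet("douglas adams wrote the guide");
        s.addRange(new Snippet.Range(0, 7));   // "douglas"
        s.addRange(new Snippet.Range(8, 13));  // "adams"
        s.addRange(new Snippet.Range(8, 13));  // duplicate of the last range, silently dropped
        // getFormatted() wraps each range in <b>..</b> and keeps the rest verbatim:
        // <b>douglas</b> <b>adams</b> wrote the guide
        System.out.println(s.getFormatted());
    }
}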
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightResult.java
@@ -0,0 +1,53 @@
 2+package org.wikimedia.lsearch.highlight;
 3+
 4+import java.io.Serializable;
 5+
 6+/**
 7+ * Result of highlighting; contains
 8+ * snippets for title, redirect, sections, and text.
 9+ * @author rainman
 10+ *
 11+ */
 12+public class HighlightResult implements Serializable {
 13+ protected Snippet title = null;
 14+ protected Snippet redirect = null;
 15+ protected Snippet section = null;
 16+ protected Snippet text = null;
 17+
 18+ public HighlightResult(){
 19+ }
 20+
 21+ public Snippet getRedirect() {
 22+ return redirect;
 23+ }
 24+
 25+ public void setRedirect(Snippet redirect) {
 26+ this.redirect = redirect;
 27+ }
 28+
 29+ public Snippet getSection() {
 30+ return section;
 31+ }
 32+
 33+ public void setSection(Snippet section) {
 34+ this.section = section;
 35+ }
 36+
 37+ public Snippet getText() {
 38+ return text;
 39+ }
 40+
 41+ public void setText(Snippet text) {
 42+ this.text = text;
 43+ }
 44+
 45+ public Snippet getTitle() {
 46+ return title;
 47+ }
 48+
 49+ public void setTitle(Snippet title) {
 50+ this.title = title;
 51+ }
 52+
 53+
 54+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java
@@ -0,0 +1,324 @@
 2+package org.wikimedia.lsearch.highlight;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Arrays;
 7+import java.util.Collections;
 8+import java.util.Comparator;
 9+import java.util.HashMap;
 10+import java.util.HashSet;
 11+import java.util.Set;
 12+
 13+import org.apache.log4j.Logger;
 14+import org.apache.lucene.document.Document;
 15+import org.apache.lucene.index.IndexReader;
 16+import org.apache.lucene.index.Term;
 17+import org.apache.lucene.index.TermDocs;
 18+import org.wikimedia.lsearch.analyzers.Alttitles;
 19+import org.wikimedia.lsearch.analyzers.ExtToken;
 20+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
 21+import org.wikimedia.lsearch.analyzers.WikiQueryParser;
 22+import org.wikimedia.lsearch.analyzers.ExtToken.Position;
 23+import org.wikimedia.lsearch.analyzers.ExtToken.Type;
 24+import org.wikimedia.lsearch.config.IndexId;
 25+import org.wikimedia.lsearch.search.SearcherCache;
 26+
 27+public class Highlight {
 28+ protected static SearcherCache cache = null;
 29+ static Logger log = Logger.getLogger(Highlight.class);
 30+
 31+ public static final int SLOP = WikiQueryParser.MAINPHRASE_SLOP;
 32+ /** maximal length of text that surrounds highlighted words */
 33+ public static final int MAX_CONTEXT = 75;
 34+
 35+ public static final double PHRASE_BOOST = 1;
 36+
 37+ /** boost (preference) factors for various parts of the text */
 38+ public static final HashMap<Position,Double> BOOST = new HashMap<Position,Double>();
 39+ static {
 40+ BOOST.put(Position.FIRST_SECTION,5.0);
 41+ BOOST.put(Position.HEADING,2.0);
 42+ BOOST.put(Position.NORMAL,1.0);
 43+ BOOST.put(Position.TEMPLATE,0.1);
 44+ BOOST.put(Position.IMAGE_CAT_IW,0.01);
 45+ BOOST.put(Position.EXT_LINK,0.5);
 46+ BOOST.put(Position.REFERENCE,0.5);
 47+ }
 48+ /**
 49+ *
 50+ * @param hits - keys of articles that need to be highlighted
 51+ * @param iid - highlight index
 52+ * @param terms - terms to highlight
 53+ * @param df - their document frequencies
 54+ * @param words - in order words (from main phrase)
 55+ * @param exactCase - if these are results from exactCase search
 56+ * @throws IOException
 57+ * @return map: key -> what to highlight
 58+ */
 59+ @SuppressWarnings("unchecked")
 60+ public static HashMap<String,HighlightResult> highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, HashSet<String> stopWords) throws IOException{
 61+ if(cache == null)
 62+ cache = SearcherCache.getInstance();
 63+
 64+ System.out.println("Highlighting: "+Arrays.toString(terms));
 65+
 66+ FieldNameFactory fields = new FieldNameFactory(exactCase);
 67+
 68+ if(stopWords == null)
 69+ stopWords = new HashSet<String>();
 70+
 71+ // terms weighted with idf
 72+ HashMap<String,Double> weightTerm = new HashMap<String,Double>();
 73+ for(int i=0;i<terms.length;i++){
 74+ Term t = terms[i];
 75+ if(t.field().equals(fields.contents())){
 76+ double idf = idf(df[i],maxDoc);
 77+ weightTerm.put(t.text(),idf);
 78+ }
 79+ }
 80+ // position within main phrase
 81+ HashMap<String,Integer> wordIndex = new HashMap<String,Integer>();
 82+ for(int i=0;i<words.size();i++)
 83+ wordIndex.put(words.get(i),i);
 84+
 85+ // process requested documents
 86+ IndexReader reader = cache.getLocalSearcher(iid.getHighlight()).getIndexReader();
 87+ HashMap<String,HighlightResult> res = new HashMap<String,HighlightResult>();
 88+ for(String key : hits){
 89+ Object[] ret = getTokens(reader,key);
 90+ if(ret == null)
 91+ continue;
 92+ ArrayList<ExtToken> tokens = (ArrayList<ExtToken>) ret[0];
 93+ Alttitles alttitles = (Alttitles) ret[1];
 94+ HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles);
 95+
 96+ getBestTextSnippets(tokens, weightTerm, wordIndex, 2);
 97+ getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,1);
 98+ getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,0);
 99+
 100+ }
 101+ return res;
 102+ }
 103+
 104+ /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
 105+ protected static double idf(int docFreq, int numDocs) {
 106+ return Math.log(numDocs/(double)(docFreq+1)) + 1.0;
 107+ }
 108+
 109+ @SuppressWarnings("unchecked")
 110+ protected static HashMap<String,Double> getTermsNotInTitle(HashMap<String,Double> weightTerm, Alttitles alttitles){
 111+ Alttitles.Info info = alttitles.getTitle();
 112+ ArrayList<ExtToken> tokens = info.getTokens();
 113+ HashMap<String,Double> ret = (HashMap<String, Double>) weightTerm.clone();
 114+ // delete all terms from title
 115+ for(ExtToken t : tokens){
 116+ if(ret.containsKey(t.termText()))
 117+ ret.remove(t.termText());
 118+ }
 119+ return ret;
 120+
 121+ }
 122+
 123+ /** Alttitle and section highlighting */
 124+
 125+ protected static class ScoredSnippet {
 126+ Snippet snippet = null;
 127+ double score = 0;
 128+ public ScoredSnippet(Snippet snippet, double score) {
 129+ this.snippet = snippet;
 130+ this.score = score;
 131+ }
 132+
 133+ }
 134+
 135+ protected static Snippet getBestAltTitle(ArrayList<Alttitles.Info> altInfos, HashMap<String,Double> weightTerm,
 136+ HashMap<String,Double> notInTitle, HashSet<String> stopWords, int minAdditional){
 137+ ArrayList<ScoredSnippet> res = new ArrayList<ScoredSnippet>();
 138+ for(Alttitles.Info ainf : altInfos){
 139+ double matched = 0, additional=0;
 140+ ArrayList<ExtToken> tokens = ainf.getTokens();
 141+ boolean completeMatch=true;
 142+ for(int i=0;i<tokens.size();i++){
 143+ ExtToken t = tokens.get(i);
 144+ if(t.getPositionIncrement() == 0)
 145+ continue; // skip aliases
 146+
 147+ if(weightTerm.containsKey(t.termText()))
 148+ matched += weightTerm.get(t.termText());
 149+ else if(!stopWords.contains(t.termText()))
 150+ completeMatch = false;
 151+
 152+ if(notInTitle.containsKey(t.termText()))
 153+ additional += notInTitle.get(t.termText());
 154+ }
 155+ if((completeMatch && additional >= minAdditional) || additional >= minAdditional+1 || additional == notInTitle.size()){
 156+ Snippet snippet = makeSnippet(tokens,0,tokens.size(),weightTerm.keySet());
 157+ snippet.setAlttitle(ainf);
 158+ res.add(new ScoredSnippet(snippet,matched+additional));
 159+ }
 160+ }
 161+ if(res.size() > 0){
 162+ if(res.size() == 1){
 163+ return res.get(0).snippet;
 164+ } else{
 165+ // get snippet with best score
 166+ Collections.sort(res, new Comparator<ScoredSnippet>() {
 167+ public int compare(ScoredSnippet o1, ScoredSnippet o2) {
 168+ double d = o2.score - o1.score;
 169+ if(d > 0)
 170+ return 1;
 171+ else if(d == 0)
 172+ return 0;
 173+ else return -1;
 174+ }});
 175+ return res.get(0).snippet;
 176+ }
 177+ }
 178+ return null;
 179+ }
 180+
 181+ /** Text highlighting */
 182+
 183+ protected static class FragmentScore {
 184+ int start = 0;
 185+ int end = 0;
 186+ double score = 0;
 187+ // best match in this fragment
 188+ int bestStart = -1;
 189+ int bestEnd = -1;
 190+ double bestScore = 0;
 191+
 192+ FragmentScore(int start){
 193+ this.start = start;
 194+ }
 195+
 196+ public String toString(){
 197+ return "start="+start+", end="+end+", score="+score+", bestStart="+bestStart+", bestEnd="+bestEnd;
 198+ }
 199+ }
 200+
 201+ /** Highlight text */
 202+ protected static ArrayList<Snippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms,
 203+ HashMap<String,Integer> wordIndex, int maxSnippets) {
 204+
 205+ // pieces of text to be highlighted
 206+ ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>();
 207+
 208+ //System.out.println("TOKENS: "+tokens);
 209+
 210+ FragmentScore fs = null;
 211+ ExtToken last = null;
 212+ // next three are for in-order matched phrases
 213+ Integer lastWord = null; // pointer to the last word found in sequence
 214+ int lastIndex = 0;
 215+ Double lastWeight = null;
 216+ // indicator for first sentence
 217+ boolean seenFirstSentence = false;
 218+ for(int i=0;i<=tokens.size();i++){
 219+ ExtToken t = null;
 220+ if(i < tokens.size())
 221+ t = tokens.get(i);
 222+ if(last == null){
 223+ fs = new FragmentScore(i);
 224+ } else if(t==null || t.getPosition() != last.getPosition() || t.getType() == Type.SENTENCE_BREAK){
 225+ Position pos = last.getPosition();
 226+ // finalize fragment
 227+
 228+ if(t == null || t.getType() != Type.SENTENCE_BREAK)
 229+ fs.end = i;
 230+ else
 231+ fs.end = i + 1;
 232+ fs.score *= BOOST.get(pos);
 233+ fragments.add(fs);
 234+ if(pos == Position.FIRST_SECTION && !seenFirstSentence){
 235+ // boost for first sentence
 236+ fs.score *= 4;
 237+ seenFirstSentence = true;
 238+ }
 239+ fs = new FragmentScore(fs.end);
 240+ }
 241+ if(t == null)
 242+ break;
 243+
 244+ Double weight = weightTerms.get(t.termText());
 245+ if(weight != null){
 246+ fs.score += weight;
 247+ Integer inx = wordIndex.get(t.termText());
 248+ if(lastWord != null && inx != null && lastWord == inx - 1 && i-lastIndex <= 2){
 249+ double phraseScore = (weight + lastWeight) * PHRASE_BOOST;
 250+ fs.score += phraseScore;
 251+ if(phraseScore > fs.bestScore){
 252+ fs.bestStart = lastIndex;
 253+ fs.bestEnd = i;
 254+ fs.bestScore = phraseScore;
 255+ }
 256+ }
 257+ lastWord = inx;
 258+ lastWeight = weight;
 259+ lastIndex = i;
 260+ }
 261+
 262+ last = t;
 263+ }
 264+
 265+ // find fragments with best score
 266+ Collections.sort(fragments, new Comparator<FragmentScore>() {
 267+ public int compare(FragmentScore o1, FragmentScore o2) {
 268+ double d = o2.score - o1.score;
 269+ if(d > 0)
 270+ return 1;
 271+ else if(d == 0)
 272+ return 0;
 273+ else return -1;
 274+ }});
 275+
 276+ ArrayList<Snippet> res = new ArrayList<Snippet>();
 277+ for(FragmentScore f : fragments){
 278+ if(f.score == 0)
 279+ continue;
 280+ Snippet s = makeSnippet(tokens,f,weightTerms.keySet());
 281+ res.add(s);
 282+ System.out.println(f+" : "+s.getFormatted());
 283+ if(res.size() >= maxSnippets)
 284+ break;
 285+ }
 286+ return res;
 287+ }
 288+
 289+ private static Snippet makeSnippet(ArrayList<ExtToken> tokens, FragmentScore f, Set<String> highlight) {
 290+ return makeSnippet(tokens,f.start,f.end,highlight);
 291+ }
 292+
 293+ private static Snippet makeSnippet(ArrayList<ExtToken> tokens, int fromIndex, int toIndex, Set<String> highlight) {
 294+ Snippet s = new Snippet();
 295+ StringBuilder sb = new StringBuilder();
 296+ int start=0, end=0;
 297+ for(int i=fromIndex;i<toIndex;i++){
 298+ ExtToken t = tokens.get(i);
 299+ if(t.getPositionIncrement() != 0){
 300+ start = sb.length();
 301+ sb.append(t.getText());
 302+ end = sb.length();
 303+ }
 304+ if(highlight.contains(t.termText())){
 305+ s.addRange(new Snippet.Range(start,end));
 306+ }
 307+ }
 308+ s.setText(sb.toString());
 309+ return s;
 310+ }
 311+
 312+ /** @return Object[] of { ArrayList<ExtToken> tokens, Alttitles alttitles } */
 313+ protected static Object[] getTokens(IndexReader reader, String key) throws IOException{
 314+ TermDocs td = reader.termDocs(new Term("key",key));
 315+ if(td.next()){
 316+ System.out.println("Found "+key);
 317+ Document doc = reader.document(td.doc());
 318+ ArrayList<ExtToken> tokens = ExtToken.deserialize(doc.getBinaryValue("text"));
 319+ // FIXME: wrong deserialization for alttitle !
 320+ Alttitles alttitles = Alttitles.deserializeAltTitle(doc.getBinaryValue("alttitle"));
 321+ return new Object[] {tokens, alttitles};
 322+ } else
 323+ return null;
 324+ }
 325+}
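Fragment weights come from the idf() helper above. A self-contained sketch of the same formula, with made-up document frequencies, shows why rare terms dominate a fragment's score:

public class IdfSketch {
    // same formula as Highlight.idf(): log(numDocs/(docFreq+1)) + 1
    static double idf(int docFreq, int numDocs) {
        return Math.log(numDocs / (double)(docFreq + 1)) + 1.0;
    }
    public static void main(String[] args) {
        int maxDoc = 1000000;
        System.out.println(idf(9, maxDoc));     // rare term:   ~12.5
        System.out.println(idf(99999, maxDoc)); // common term: ~3.3
    }
}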
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java
@@ -148,10 +148,17 @@
149149 }
150150
151151 /**
152 - * @return Returns the page key (via article)
 152+ * @return the page key -- page_id (via article)
153153 */
154154 public String getKey(){
155155 return article.getKey();
156156 }
157157
 158+ /**
 159+ * @return Highlight key -- ns:title
 160+ */
 161+ public String getHighlightKey(){
 162+ return article.getTitleObject().getKey();
 163+ }
 164+
158165 }
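The two getters matter because the main index is keyed by page_id while the highlight index is keyed by ns:title. A hedged sketch of the delete path, mirroring the WikiIndexModifier change below:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.wikimedia.lsearch.index.IndexUpdateRecord;

public class DeleteKeySketch {
    // returns the number of documents deleted, as IndexReader.deleteDocuments does
    static int delete(IndexReader reader, IndexUpdateRecord rec, boolean isHighlight)
            throws IOException {
        // the main index is keyed by page_id, the highlight index by ns:title
        String key = isHighlight ? rec.getHighlightKey() : rec.getKey();
        return reader.deleteDocuments(new Term("key", key));
    }
}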
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -22,6 +22,8 @@
2323 import org.apache.lucene.analysis.SimpleAnalyzer;
2424 import org.apache.lucene.document.Document;
2525 import org.apache.lucene.document.Field;
 26+import org.apache.lucene.document.Field.Index;
 27+import org.apache.lucene.document.Field.Store;
2628 import org.apache.lucene.index.IndexReader;
2729 import org.apache.lucene.index.IndexWriter;
2830 import org.apache.lucene.index.Term;
@@ -29,9 +31,11 @@
3032 import org.apache.lucene.store.FSDirectory;
3133 import org.wikimedia.lsearch.analyzers.Aggregate;
3234 import org.wikimedia.lsearch.analyzers.AggregateAnalyzer;
 35+import org.wikimedia.lsearch.analyzers.Alttitles;
3336 import org.wikimedia.lsearch.analyzers.Analyzers;
3437 import org.wikimedia.lsearch.analyzers.CategoryAnalyzer;
3538 import org.wikimedia.lsearch.analyzers.ContextAnalyzer;
 39+import org.wikimedia.lsearch.analyzers.ExtToken;
3640 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
3741 import org.wikimedia.lsearch.analyzers.FieldBuilder;
3842 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
@@ -39,6 +43,7 @@
4044 import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer;
4145 import org.wikimedia.lsearch.analyzers.LanguageAnalyzer;
4246 import org.wikimedia.lsearch.analyzers.RelatedAnalyzer;
 47+import org.wikimedia.lsearch.analyzers.ReusableLanguageAnalyzer;
4348 import org.wikimedia.lsearch.analyzers.StopWords;
4449 import org.wikimedia.lsearch.analyzers.TokenizerOptions;
4550 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
@@ -138,7 +143,11 @@
139144 }
140145 for(IndexUpdateRecord rec : records){
141146 if(rec.doDelete()){
142 - int count = reader.deleteDocuments(new Term("key", rec.getKey()));
 147+ int count = 0;
 148+ if(iid.isHighlight())
 149+ count = reader.deleteDocuments(new Term("key", rec.getHighlightKey()));
 150+ else // normal index
 151+ count = reader.deleteDocuments(new Term("key", rec.getKey()));
143152 if(count == 0)
144153 nonDeleteDocuments.add(rec);
145154 IndexReportCard card = getReportCard(rec);
@@ -184,7 +193,14 @@
185194 writer.setMaxFieldLength(MAX_FIELD_LENGTH);
186195 FieldBuilder.Case dCase = (exactCase)? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
187196 FieldBuilder builder = new FieldBuilder(iid,dCase);
188 - Analyzer analyzer = Analyzers.getIndexerAnalyzer(builder);
 197+ Analyzer analyzer = null;
 198+ ReusableLanguageAnalyzer highlightContentAnalyzer = null;
 199+ if(iid.isHighlight()){
 200+ highlightContentAnalyzer = Analyzers.getReusableHighlightAnalyzer(builder.getBuilder(dCase).getFilters());
 201+ analyzer = Analyzers.getHighlightAnalyzer(iid);
 202+ } else
 203+ analyzer = Analyzers.getIndexerAnalyzer(builder);
 204+
189205 HashSet<String> stopWords = StopWords.getPredefinedSet(iid);
190206 for(IndexUpdateRecord rec : records){
191207 if(rec.doAdd()){
@@ -193,8 +209,13 @@
194210 if(!checkPreconditions(rec))
195211 continue; // article shouldn't be added for some reason
196212 IndexReportCard card = getReportCard(rec);
197 - Document doc = makeDocument(rec.getArticle(),builder,iid,stopWords);
 213+ Document doc;
198214 try {
 215+ if(iid.isHighlight())
 216+ doc = makeHighlightDocument(rec.getArticle(),analyzer,highlightContentAnalyzer,iid);
 217+ else // normal index
 218+ doc = makeDocument(rec.getArticle(),builder,iid,stopWords);
 219+
199220 writer.addDocument(doc,analyzer);
200221 log.debug(iid+": Adding document "+rec.getKey()+" "+rec.getArticle());
201222 if(card != null)
@@ -279,7 +300,7 @@
280301 *
281302 * @param article
282303 */
283 - protected static void transformArticleForIndexing(Article ar) {
 304+ public static void transformArticleForIndexing(Article ar) {
284305 ArrayList<Redirect> redirects = ar.getRedirects();
285306 // sort redirect by their rank
286307 Collections.sort(redirects,new Comparator<Redirect>() {
@@ -365,6 +386,18 @@
366387 }
367388
368389 /**
 390+ * Update both the search and highlight index for iid.
 391+ *
 392+ * @param iid
 393+ * @param updateRecords
 394+ */
 395+ public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 396+ boolean index = updateDocumentsOn(iid,updateRecords);
 397+ boolean highlight = updateDocumentsOn(iid.getHighlight(),updateRecords);
 398+ return index && highlight;
 399+ }
 400+
 401+ /**
369402 * Update all documents in the collection. If needed the request
370403 * is forwarded to a remote object (i.e. if the part of the split
371404 * index is indexed by another host).
@@ -372,7 +405,7 @@
373406 * @param iid
374407 * @param updateRecords
375408 */
376 - public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 409+ protected boolean updateDocumentsOn(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
377410 long now = System.currentTimeMillis();
378411 log.info("Starting update of "+updateRecords.size()+" records on "+iid+", started at "+now);
379412 boolean succ = true;
@@ -521,6 +554,17 @@
522555 return doc;
523556 }
524557
 558+ /** Make the document that will be indexed as highlighting data */
 559+ public static Document makeHighlightDocument(Article article, Analyzer analyzer, ReusableLanguageAnalyzer contentAnalyzer, IndexId iid) throws IOException{
 560+ String key = article.getTitleObject().getKey();
 561+ Document doc = new Document();
 562+ doc.add(new Field("key",key,Store.NO,Index.UN_TOKENIZED));
 563+ doc.add(new Field("text",ExtToken.serialize(contentAnalyzer.tokenStream("contents",article.getContents())),Store.COMPRESS));
 564+ ArrayList<String> sections = contentAnalyzer.getWikiTokenizer().getHeadingText();
 565+ doc.add(new Field("alttitle",Alttitles.serializeAltTitle(article,iid,sections,analyzer,"alttitle"),Store.COMPRESS));
 566+ return doc;
 567+ }
 568+
525569 /** add related aggregate field */
526570 protected static void makeRelated(Document doc, String prefix, Article article, IndexId iid, HashSet<String> stopWords){
527571 ArrayList<Aggregate> items = new ArrayList<Aggregate>();
@@ -548,34 +592,9 @@
549593 addToItems(items, new Aggregate(title+" "+h,rankBoost*HEADINGS_BOOST,iid,exactCase,stopWords));
550594 }
551595 makeAggregate(doc,prefix,items);
552 - }
 596+ }
553597
554598
555 - public enum AlttitleTypes { TITLE, REDIRECT, HEADING };
556 -
557 - public static byte[] serializeAltTitle(Article article, IndexId iid, ArrayList<String> headingText) throws IOException{
558 - WikiIndexModifier.transformArticleForIndexing(article);
559 - Buffer b = new Buffer();
560 -
561 - // add title
562 - String title = article.getTitle();
563 - b.writeAggregate(title,new Aggregate(title,article.getRank(),iid),AlttitleTypes.TITLE.ordinal());
564 - // add all redirects
565 - ArrayList<String> redirects = article.getRedirectKeywords();
566 - ArrayList<Integer> ranks = article.getRedirectKeywordRanks();
567 - for(int i=0;i<redirects.size();i++){
568 - b.writeAggregate(redirects.get(i),new Aggregate(redirects.get(i),ranks.get(i),iid),AlttitleTypes.REDIRECT.ordinal());
569 - }
570 - // add section headings!
571 - for(String h : headingText){
572 - b.writeAggregate(h,new Aggregate(h,article.getRank()*HEADINGS_BOOST,iid),AlttitleTypes.HEADING.ordinal());
573 - }
574 -
575 - return b.getBytes();
576 - }
577 -
578 -
579 -
580599 private static void addToItems(ArrayList<Aggregate> items, Aggregate a){
581600 if(a.length() != 0)
582601 items.add(a);
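A hedged usage sketch of the new two-index update; the no-argument constructor is an assumption here, and iid.getHighlight() resolves the companion .hl index named in the lsearch-global.conf change below:

import java.util.Collection;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.index.IndexUpdateRecord;
import org.wikimedia.lsearch.index.WikiIndexModifier;

public class UpdateSketch {
    static boolean update(IndexId iid, Collection<IndexUpdateRecord> records) {
        // updateDocuments() now fans out to updateDocumentsOn() for the search
        // index and its .hl companion (resolved via iid.getHighlight())
        WikiIndexModifier modifier = new WikiIndexModifier(); // no-arg constructor assumed
        return modifier.updateDocuments(iid, records);
    }
}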
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java
@@ -100,6 +100,14 @@
101101 this.inCase = inCase;
102102 }
103103
 104+ /** Get the text: the original if available, otherwise the term text */
 105+ public String getText(){
 106+ if(original != null)
 107+ return original;
 108+ else
 109+ return termText();
 110+ }
 111+
104112 public String toString(){
105113 return "\""+termText()+"\",t="+type+",p="+pos+(original!=null? ",o={"+original+"}" : "")+",i="+getPositionIncrement();
106114 }
@@ -153,10 +161,20 @@
154162 b.writeString(t.termText());
155163 }
156164 }
157 - // control 1: original word
 165+
158166 if(t.getPositionIncrement() > 0 && t.original != null){
159 - b.writeControl(1);
160 - b.writeStringWithLength(t.original);
 167+ String w = t.termText();
 168+ if(t.original.equals(w.substring(0,1).toUpperCase()+w.substring(1))){
 169+ // control 6: original is title case
 170+ b.writeControl(6);
 171+ } else if(t.original.equals(w.toUpperCase())){
 172+ // control 7: original is upper case
 173+ b.writeControl(7);
 174+ } else{
 175+ // control 1: original word
 176+ b.writeControl(1);
 177+ b.writeStringWithLength(t.original);
 178+ }
161179 }
162180 // control 2: alias
163181 if(t.getPositionIncrement() == 0){
@@ -294,12 +312,18 @@
295313 throw new RuntimeException("Bad serialized data: trying to assign a sentence break to text");
296314 t.setType(Type.SENTENCE_BREAK);
297315 break;
298 - case 5:
 316+ case 5: // url
299317 { int len = serialized[cur++];
300318 ExtToken tt = new ExtToken(new String(serialized,cur,len,"utf-8"),cur,cur+len,Type.URL,Position.EXT_LINK);
301319 tokens.add(tt);
302320 cur += len;
303321 break; }
 322+ case 6: // original is title case
 323+ t.setOriginal(t.termText().substring(0,1).toUpperCase()+t.termText().substring(1));
 324+ break;
 325+ case 7: // original is upper case
 326+ t.setOriginal(t.termText().toUpperCase());
 327+ break;
304328 default:
305329 throw new RuntimeException("Unknown control sequence "+control);
306330 }
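Controls 6 and 7 avoid storing the original string when it is merely a case variant of the (lower-cased) term text. A standalone sketch of that decision rule, using the control codes from serialize() above:

public class CaseControlSketch {
    // same decision rule as the new branch in ExtToken.serialize()
    static int controlFor(String termText, String original) {
        if (original.equals(termText.substring(0, 1).toUpperCase() + termText.substring(1)))
            return 6; // original is title case: one control byte, no string payload
        if (original.equals(termText.toUpperCase()))
            return 7; // original is upper case: one control byte, no string payload
        return 1;     // anything else: control 1 plus the original string itself
    }
    public static void main(String[] args) {
        System.out.println(controlFor("paris", "Paris"));       // 6
        System.out.println(controlFor("nasa", "NASA"));         // 7
        System.out.println(controlFor("mcdonald", "McDonald")); // 1
    }
}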
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Alttitles.java
@@ -0,0 +1,126 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+
 8+import org.apache.lucene.analysis.Analyzer;
 9+import org.wikimedia.lsearch.beans.Article;
 10+import org.wikimedia.lsearch.config.IndexId;
 11+import org.wikimedia.lsearch.index.WikiIndexModifier;
 12+import org.wikimedia.lsearch.util.Buffer;
 13+
 14+/**
 15+ * Titles, redirects, and section headings: serialization/deserialization
 16+ * for highlighting, etc.
 17+ *
 18+ * @author rainman
 19+ *
 20+ */
 21+public class Alttitles {
 22+ protected Info title;
 23+ protected ArrayList<Info> redirects = new ArrayList<Info>();
 24+ protected ArrayList<Info> sections = new ArrayList<Info>();
 25+
 26+ public static class Info {
 27+ protected String title;
 28+ protected int rank;
 29+ protected ArrayList<ExtToken> tokens;
 30+ public Info(String title, int rank, ArrayList<ExtToken> tokens){
 31+ this.title = title;
 32+ this.rank = rank;
 33+ this.tokens = tokens;
 34+ }
 35+ public int getRank() {
 36+ return rank;
 37+ }
 38+ public void setRank(int rank) {
 39+ this.rank = rank;
 40+ }
 41+ public String getTitle() {
 42+ return title;
 43+ }
 44+ public void setTitle(String title) {
 45+ this.title = title;
 46+ }
 47+ public ArrayList<ExtToken> getTokens() {
 48+ return tokens;
 49+ }
 50+ public void setTokens(ArrayList<ExtToken> tokens) {
 51+ this.tokens = tokens;
 52+ }
 53+
 54+ }
 55+
 56+ public static byte[] serializeAltTitle(Article article, IndexId iid, Collection<String> sections, Analyzer analyzer, String field) throws IOException{
 57+ WikiIndexModifier.transformArticleForIndexing(article);
 58+ Buffer b = new Buffer();
 59+
 60+ // add title
 61+ String title = article.getTitle();
 62+ // type 0 : title
 63+ b.writeAlttitleInfo(title,new Aggregate(title,article.getRank(),iid,analyzer,field),0);
 64+ // add all redirects
 65+ ArrayList<String> redirects = article.getRedirectKeywords();
 66+ ArrayList<Integer> ranks = article.getRedirectKeywordRanks();
 67+ for(int i=0;i<redirects.size();i++){
 68+ // type 1: redirect
 69+ b.writeAlttitleInfo(redirects.get(i),new Aggregate(redirects.get(i),ranks.get(i),iid,analyzer,field),1);
 70+ }
 71+
 72+ // type 2: sections
 73+ for(String s : sections){
 74+ b.writeAlttitleInfo(s,new Aggregate(s,1,iid,analyzer,field),2);
 75+ }
 76+
 77+ return b.getBytes();
 78+ }
 79+
 80+ public static Alttitles deserializeAltTitle(byte[] serialized){
 81+ Buffer b = new Buffer(serialized);
 82+ Alttitles t = new Alttitles();
 83+ while(b.hasMore()){
 84+ Object[] ret = b.readAlttitleInfo();
 85+ int type = (Integer)ret[0];
 86+ Info info = (Info)ret[1];
 87+ if(type == 0)
 88+ t.title = info;
 89+ else if(type == 1)
 90+ t.redirects.add(info);
 91+ else if(type == 2)
 92+ t.sections.add(info);
 93+ else
 94+ throw new RuntimeException("Wrong type for serialized alttitle "+type);
 95+ }
 96+ return t;
 97+ }
 98+
 99+ public ArrayList<Info> getRedirects() {
 100+ return redirects;
 101+ }
 102+
 103+ public void setRedirects(ArrayList<Info> redirects) {
 104+ this.redirects = redirects;
 105+ }
 106+
 107+ public Info getTitle() {
 108+ return title;
 109+ }
 110+
 111+ public void setTitle(Info title) {
 112+ this.title = title;
 113+ }
 114+
 115+ public ArrayList<Info> getSections() {
 116+ return sections;
 117+ }
 118+
 119+ public void setSections(ArrayList<Info> sections) {
 120+ this.sections = sections;
 121+ }
 122+
 123+
 124+
 125+
 126+
 127+}
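A hedged sketch of reading the blob back out of a highlight document; the field name comes from makeHighlightDocument() above, and note the FIXME in Highlight.java indicating this deserialization is still broken in this revision:

import org.apache.lucene.document.Document;
import org.wikimedia.lsearch.analyzers.Alttitles;

public class AlttitlesSketch {
    // doc is a highlight-index document as built by makeHighlightDocument()
    static void dump(Document doc) {
        Alttitles alt = Alttitles.deserializeAltTitle(doc.getBinaryValue("alttitle"));
        System.out.println("title: " + alt.getTitle().getTitle());
        for (Alttitles.Info r : alt.getRedirects())
            System.out.println("redirect: " + r.getTitle() + " (rank " + r.getRank() + ")");
        for (Alttitles.Info s : alt.getSections())
            System.out.println("section: " + s.getTitle());
    }
}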
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -83,6 +83,7 @@
8484
8585 private TokenStream tokenStream;
8686 private ArrayList<Token> tokens; // tokens from analysis
 87+ protected ArrayList<String> words;
8788
8889 /** sometimes the fieldsubquery takes the bool modifier, to retrieve it, use this variable,
8990 * this will always point to the last unused bool modifier */
@@ -1731,7 +1732,7 @@
17321733 Object[] qtwords = makeTitleQuery(queryText);
17331734 // qt = title query, qp = title phrase query
17341735 Query qt = (Query) qtwords[0];
1735 - ArrayList<String> words = (ArrayList<String>) qtwords[1];
 1736+ words = (ArrayList<String>) qtwords[1];
17361737 if(qc == null || qt == null)
17371738 return new BooleanQuery();
17381739 if(qc.equals(qt))
@@ -1837,6 +1838,10 @@
18381839 }
18391840 public void setBuilder(FieldBuilder.BuilderSet builder) {
18401841 this.builder = builder;
1841 - }
 1842+ }
18421843
 1844+ public ArrayList<String> getWords(){
 1845+ return words;
 1846+ }
 1847+
18431848 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -52,7 +52,7 @@
5353 * @param language
5454 * @return
5555 */
56 - public static Analyzer getReusableHighlightAnalyzer(FilterFactory filters){
 56+ public static ReusableLanguageAnalyzer getReusableHighlightAnalyzer(FilterFactory filters){
5757 return new ReusableLanguageAnalyzer(filters,false,true);
5858 }
5959
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java
@@ -1,9 +1,12 @@
22 package org.wikimedia.lsearch.analyzers;
33
 4+import java.io.IOException;
45 import java.util.ArrayList;
56 import java.util.HashSet;
67
 8+import org.apache.lucene.analysis.Analyzer;
79 import org.apache.lucene.analysis.Token;
 10+import org.apache.lucene.analysis.TokenStream;
811 import org.wikimedia.lsearch.config.IndexId;
912
1013 /**
@@ -20,6 +23,7 @@
2124
2225 /** Construct from arbitrary text that will be tokenized */
2326 public Aggregate(String text, float boost, IndexId iid, boolean exactCase, HashSet<String> stopWords){
 27+ // FIXME: hey, we should use an analyzer here!!!
2428 TokenizerOptions options = new TokenizerOptions.NoRelocation(exactCase);
2529 tokens = new FastWikiTokenizerEngine(text,iid,options).parse();
2630 this.boost = boost;
@@ -33,14 +37,23 @@
3438 noStopWordsLength = tokens.size();
3539 }
3640
37 - /** Construct for highlight */
38 - public Aggregate(String text, float boost, IndexId iid){
39 - TokenizerOptions options = new TokenizerOptions.Highlight();
40 - tokens = new FastWikiTokenizerEngine(text,iid,options).parse();
 41+ /** Construct with specific analyzer
 42+ * @throws IOException */
 43+ public Aggregate(String text, float boost, IndexId iid, Analyzer analyzer, String field) throws IOException{
 44+ this.tokens = toTokenArray(analyzer.tokenStream(field,text));
4145 this.boost = boost;
4246 this.noStopWordsLength = tokens.size();
4347 }
4448
 49+ private ArrayList<Token> toTokenArray(TokenStream stream) throws IOException {
 50+ ArrayList<Token> tt = new ArrayList<Token>();
 51+ Token t = null;
 52+ while( (t = stream.next()) != null){
 53+ tt.add(t);
 54+ }
 55+ return tt;
 56+ }
 57+
4558 /** Number of tokens */
4659 public int length(){
4760 if(tokens != null)
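A hedged sketch of the new analyzer-backed constructor in use; iid is assumed to come from the search configuration, and getHighlightAnalyzer() is the factory used elsewhere on this branch:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.wikimedia.lsearch.analyzers.Aggregate;
import org.wikimedia.lsearch.analyzers.Analyzers;
import org.wikimedia.lsearch.config.IndexId;

public class AggregateSketch {
    static Aggregate heading(IndexId iid) throws IOException {
        // tokenize a section heading the same way the alttitle field is built
        Analyzer analyzer = Analyzers.getHighlightAnalyzer(iid);
        return new Aggregate("External links", 1.0f, iid, analyzer, "alttitle");
    }
}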
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Buffer.java
@@ -1,9 +1,12 @@
22 package org.wikimedia.lsearch.util;
33
 4+import java.io.EOFException;
45 import java.io.IOException;
56 import java.io.UnsupportedEncodingException;
 7+import java.util.ArrayList;
68
79 import org.wikimedia.lsearch.analyzers.Aggregate;
 10+import org.wikimedia.lsearch.analyzers.Alttitles;
811 import org.wikimedia.lsearch.analyzers.ExtToken;
912 import org.wikimedia.lsearch.analyzers.LanguageAnalyzer;
1013
@@ -11,6 +14,63 @@
1215 public byte[] buf = new byte[256];
1316 public int len=0;
1417
 18+ /** for writing */
 19+ public Buffer(){
 20+ }
 21+
 22+ /** for reading */
 23+ public Buffer(byte[] buf){
 24+ this.buf = buf;
 25+ }
 26+
 27+ public byte read(){
 28+ return buf[len++];
 29+ }
 30+
 31+ public int readInt(){
 32+ int ch1 = read() & 0xFF;
 33+ int ch2 = read() & 0xFF;
 34+ int ch3 = read() & 0xFF;
 35+ int ch4 = read() & 0xFF;
 36+ return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
 37+ }
 38+
 39+ public String readStringWithLength(){
 40+ int strlen = read();
 41+ String s;
 42+ try {
 43+ s = new String(buf,len,strlen,"utf-8");
 44+ len += strlen;
 45+ return s;
 46+ } catch (UnsupportedEncodingException e) {
 47+ e.printStackTrace();
 48+ return null;
 49+ }
 50+ }
 51+
 52+ public byte[] readBytesWithLength(){
 53+ int l = readInt();
 54+ byte[] b = new byte[l];
 55+ System.arraycopy(buf,len,b,0,l);
 56+ len += l;
 57+ return b;
 58+ }
 59+
 60+ /** @return Object[] of { Integer type, Alttitles.Info(title, rank, tokens) } */
 61+ public Object[] readAlttitleInfo(){
 62+ Integer type = (int)read();
 63+ Integer boost = readInt();
 64+ String title = readStringWithLength();
 65+ ArrayList<ExtToken> tokens = ExtToken.deserialize(readBytesWithLength());
 66+ return new Object[] { type, new Alttitles.Info(title,boost,tokens)};
 67+ }
 68+
 69+ public boolean hasMore(){
 70+ return len < buf.length;
 71+ }
 72+
 73+ ///////// WRITE ///////////
 74+
1575 public byte[] getBytes(){
1676 byte[] ret = new byte[len];
1777 System.arraycopy(buf,0,ret,0,len);
@@ -82,7 +142,7 @@
83143
84144 /** Format: type (1b), rank (4b), text (string), size of serialized (4b), serialized (bytes)
85145 * @throws IOException */
86 - public final void writeAggregate(String text, Aggregate a, int type) throws IOException{
 146+ public final void writeAlttitleInfo(String text, Aggregate a, int type) throws IOException{
87147 write(type);
88148 writeInt((int)a.boost());
89149 writeStringWithLength(text);
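A minimal round-trip sketch of the new read side, assuming writeInt() and writeStringWithLength() are the big-endian/UTF-8 counterparts that readInt() and readStringWithLength() imply:

import org.wikimedia.lsearch.util.Buffer;

public class BufferSketch {
    public static void main(String[] args) throws Exception {
        Buffer w = new Buffer();               // write mode
        w.writeInt(42);
        w.writeStringWithLength("External links");
        Buffer r = new Buffer(w.getBytes());   // read mode: len doubles as the read cursor
        System.out.println(r.readInt());              // 42
        System.out.println(r.readStringWithLength()); // External links
        System.out.println(r.hasMore());              // false
    }
}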
Index: branches/lucene-search-2.1/build.xml
@@ -61,6 +61,7 @@
6262 <zipfileset src="lib/snowball.jar" />
6363 <zipfileset src="lib/mwdumper.jar" />
6464 <zipfileset src="lib/mysql-connector-java-3.0.17-ga-bin.jar" />
 65+ <zipfileset src="lib/lucene-highlighter-2.2.0.jar" />
6566 </jar>
6667 </target>
6768
Index: branches/lucene-search-2.1/.classpath
@@ -12,8 +12,8 @@
1313 <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
1414 <classpathentry kind="lib" path="lib/snowball.jar"/>
1515 <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/>
16 - <classpathentry kind="lib" path="lib/highlighter.jar" sourcepath="/sandbox/highlighter"/>
1716 <classpathentry kind="lib" path="lib/mysql-connector-java-3.0.17-ga-bin.jar"/>
1817 <classpathentry kind="lib" path="lib/lucene-core-2.2.0.jar" sourcepath="/lucene-2.2/src"/>
 18+ <classpathentry kind="lib" path="lib/lucene-highlighter-2.2.0.jar" sourcepath="/lucene-2.2/contrib/highlighter/src/java"/>
1919 <classpathentry kind="output" path="bin"/>
2020 </classpath>
Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -27,6 +27,7 @@
2828 [Search-Group]
2929 oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links
3030 oblak : wikilucene.nspart1.sub1 wikilucene.nspart1.sub2
 31+oblak : wikilucene.nspart1.sub1.hl wikilucene.nspart1.sub2.hl
3132
3233 # Index nodes
3334 # host: db1.part db2.part
