r27370 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r27369 | r27370 | r27371 >
Date: 22:58, 10 November 2007
Author: rainman
Status: old
Tags:
Comment:
Highlighting, more work in progress:
* Basic highlighting algorithm in class Highlight; scores text fragments, redirects, and section names
* Untested incremental index updates for .hl indexes
* Added read operations to Buffer
Modified paths:
  • /branches/lucene-search-2.1/.classpath (modified) (history)
  • /branches/lucene-search-2.1/build.xml (modified) (history)
  • /branches/lucene-search-2.1/lsearch-global.conf (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Alttitles.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightResult.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Buffer.java (modified) (history)

Diff
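Before the per-file diffs, a hedged sketch of how the new entry point is meant to be called (signature from Highlight.java below). The inputs here are placeholder values, and in this work-in-progress revision the returned map is not yet filled in:

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.lucene.index.Term;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.highlight.Highlight;
import org.wikimedia.lsearch.highlight.HighlightResult;

public class HighlightSketch {
    // iid (the highlight index id) is assumed to come from the search config
    static HashMap<String, HighlightResult> run(IndexId iid) throws IOException {
        ArrayList<String> hits = new ArrayList<String>();   // "ns:title" keys of matched articles
        hits.add("0:Douglas Adams");
        Term[] terms = { new Term("contents", "guide") };   // terms to highlight
        int[] df = { 10 };                                  // their document frequencies
        ArrayList<String> words = new ArrayList<String>();  // in-order words of the main phrase
        words.add("guide");
        return Highlight.highlight(hits, iid, terms, df, 1000000, words,
                false, new HashSet<String>());
    }
}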

Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -41,6 +41,7 @@
4242 Links links;
4343 String langCode;
4444 RelatedStorage related;
 45+ boolean makeIndex, makeHighlight;
4546
4647 public DumpImporter(String dbname, int limit, Boolean optimize, Integer mergeFactor,
4748 Integer maxBufDocs, boolean newIndex, Links links, String langCode,
@@ -53,6 +54,8 @@
5455 highlightWriter = new SimpleIndexWriter(iid.getHighlight(), optimize, mergeFactor, maxBufDocs, newIndex);
5556 this.limit = limit;
5657 this.links = links;
 58+ this.makeIndex = makeIndex;
 59+ this.makeHighlight = makeHighlight;
5760 this.langCode = langCode;
5861 this.related = new RelatedStorage(iid);
5962 if(!related.canRead())
@@ -86,7 +89,7 @@
8790 redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef));
8891 }
8992 // related
90 - if(related != null)
 93+ if(makeIndex && related != null)
9194 rel = related.getRelated(key);
9295 // make article
9396 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,isRedirect,
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -46,7 +46,8 @@
4747 protected String langCode;
4848 protected Links links;
4949 protected Analyzer indexAnalyzer;
50 - protected ReusableLanguageAnalyzer highlightAnalyzer;
 50+ protected Analyzer highlightAnalyzer;
 51+ protected ReusableLanguageAnalyzer highlightContentAnalyzer;
5152 protected HashSet<String> stopWords;
5253
5354 public SimpleIndexWriter(IndexId iid, Boolean optimize, Integer mergeFactor, Integer maxBufDocs, boolean newIndex){
@@ -61,7 +62,8 @@
6263 builder = new FieldBuilder(iid,dCase);
6364 indexes = new HashMap<String,IndexWriter>();
6465 indexAnalyzer = Analyzers.getIndexerAnalyzer(builder);
65 - highlightAnalyzer = new ReusableLanguageAnalyzer(builder.getBuilder().getFilters(),false,true);
 66+ highlightAnalyzer = Analyzers.getHighlightAnalyzer(iid);
 67+ highlightContentAnalyzer = new ReusableLanguageAnalyzer(builder.getBuilder().getFilters(),false,true);
6668 stopWords = StopWords.getPredefinedSet(iid);
6769 // open all relevant indexes
6870 for(IndexId part : iid.getPhysicalIndexIds()){
@@ -151,17 +153,12 @@
152154 IndexWriter writer = indexes.get(target.toString());
153155 if(writer == null)
154156 return;
155 - String key = a.getTitleObject().getKey();
156157 try {
157 - // TODO: move to WikiIndexModifier?
158 - Document doc = new Document();
159 - doc.add(new Field("key",key,Store.NO,Index.UN_TOKENIZED));
160 - doc.add(new Field("text",ExtToken.serialize(highlightAnalyzer.tokenStream("contents",a.getContents())),Store.COMPRESS));
161 - doc.add(new Field("alttitle",WikiIndexModifier.serializeAltTitle(a,iid,highlightAnalyzer.getWikiTokenizer().getHeadingText()),Store.COMPRESS));
 158+ Document doc = WikiIndexModifier.makeHighlightDocument(a,highlightAnalyzer,highlightContentAnalyzer,target);
162159 addDocument(writer,doc,a,target);
163160 } catch (IOException e) {
164161 e.printStackTrace();
165 - log.error("Error adding document for key="+key+" : "+e.getMessage());
 162+ log.error("Error adding document for key="+a.getTitleObject().getKey()+" : "+e.getMessage());
166163 }
167164 }
168165
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Snippet.java
@@ -0,0 +1,107 @@
 2+package org.wikimedia.lsearch.highlight;
 3+
 4+import java.io.Serializable;
 5+import java.util.ArrayList;
 6+
 7+import org.wikimedia.lsearch.analyzers.Alttitles;
 8+
 9+/**
 10+ * Snippet of highlighted text.
 11+ *
 12+ * @author rainman
 13+ *
 14+ */
 15+public class Snippet implements Serializable {
 16+ public static class Range implements Serializable {
 17+ public int start;
 18+ public int end;
 19+
 20+ public Range(int start, int end){
 21+ this.start = start;
 22+ this.end = end;
 23+ }
 24+
 25+ @Override
 26+ public int hashCode() {
 27+ final int PRIME = 31;
 28+ int result = 1;
 29+ result = PRIME * result + end;
 30+ result = PRIME * result + start;
 31+ return result;
 32+ }
 33+
 34+ @Override
 35+ public boolean equals(Object obj) {
 36+ if (this == obj)
 37+ return true;
 38+ if (obj == null)
 39+ return false;
 40+ if (getClass() != obj.getClass())
 41+ return false;
 42+ final Range other = (Range) obj;
 43+ if (end != other.end)
 44+ return false;
 45+ if (start != other.start)
 46+ return false;
 47+ return true;
 48+ }
 49+
 50+
 51+ }
 52+ protected String text = null;
 53+ protected ArrayList<Range> highlighted = new ArrayList<Range>();
 54+
 55+ protected Alttitles.Info alttitle = null;
 56+
 57+ public Snippet(){
 58+
 59+ }
 60+ public Snippet(String text){
 61+ this.text = text;
 62+ }
 63+
 64+ public void addRange(Range r){
 65+ if(highlighted.size() != 0 && r.equals(highlighted.get(highlighted.size()-1))){
 66+ return; // don't allow duplicates!
 67+ }
 68+ highlighted.add(r);
 69+ }
 70+
 71+ public ArrayList<Range> getHighlighted() {
 72+ return highlighted;
 73+ }
 74+
 75+ public String getText() {
 76+ return text;
 77+ }
 78+
 79+ public void setText(String text){
 80+ this.text = text;
 81+ }
 82+
 83+ public String getFormatted(){
 84+ StringBuilder sb = new StringBuilder();
 85+ int last = 0;
 86+ for(Range r : highlighted){
 87+ sb.append(text.substring(last,r.start));
 88+ sb.append("<b>");
 89+ sb.append(text.substring(r.start,r.end));
 90+ sb.append("</b>");
 91+ last = r.end;
 92+ }
 93+ if(last != text.length())
 94+ sb.append(text.substring(last));
 95+ return sb.toString();
 96+ }
 97+ public Alttitles.Info getAlttitle() {
 98+ return alttitle;
 99+ }
 100+ public void setAlttitle(Alttitles.Info alttitle) {
 101+ this.alttitle = alttitle;
 102+ }
 103+ public void setHighlighted(ArrayList<Range> highlighted) {
 104+ this.highlighted = highlighted;
 105+ }
 106+
 107+
 108+}
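A minimal usage sketch of the range bookkeeping above, assuming only the Snippet class from this diff:

import org.wikimedia.lsearch.highlight.Snippet;

public class SnippetDemo {
    public static void main(String[] args) {
        Snippet s = new Snippet("douglas adams wrote the guide");
        s.addRange(new Snippet.Range(0, 7));   // "douglas"
        s.addRange(new Snippet.Range(8, 13));  // "adams"
        s.addRange(new Snippet.Range(8, 13));  // duplicate of the last range, silently dropped
        // getFormatted() wraps each range in <b>..</b> and keeps the rest verbatim:
        // <b>douglas</b> <b>adams</b> wrote the guide
        System.out.println(s.getFormatted());
    }
}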
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/HighlightResult.java
@@ -0,0 +1,53 @@
 2+package org.wikimedia.lsearch.highlight;
 3+
 4+import java.io.Serializable;
 5+
 6+/**
 7+ * Result of highlighting; contains
 8+ * snippets for title, redirect, sections, and text.
 9+ * @author rainman
 10+ *
 11+ */
 12+public class HighlightResult implements Serializable {
 13+ protected Snippet title = null;
 14+ protected Snippet redirect = null;
 15+ protected Snippet section = null;
 16+ protected Snippet text = null;
 17+
 18+ public HighlightResult(){
 19+ }
 20+
 21+ public Snippet getRedirect() {
 22+ return redirect;
 23+ }
 24+
 25+ public void setRedirect(Snippet redirect) {
 26+ this.redirect = redirect;
 27+ }
 28+
 29+ public Snippet getSection() {
 30+ return section;
 31+ }
 32+
 33+ public void setSection(Snippet section) {
 34+ this.section = section;
 35+ }
 36+
 37+ public Snippet getText() {
 38+ return text;
 39+ }
 40+
 41+ public void setText(Snippet text) {
 42+ this.text = text;
 43+ }
 44+
 45+ public Snippet getTitle() {
 46+ return title;
 47+ }
 48+
 49+ public void setTitle(Snippet title) {
 50+ this.title = title;
 51+ }
 52+
 53+
 54+}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java
@@ -0,0 +1,324 @@
 2+package org.wikimedia.lsearch.highlight;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Arrays;
 7+import java.util.Collections;
 8+import java.util.Comparator;
 9+import java.util.HashMap;
 10+import java.util.HashSet;
 11+import java.util.Set;
 12+
 13+import org.apache.log4j.Logger;
 14+import org.apache.lucene.document.Document;
 15+import org.apache.lucene.index.IndexReader;
 16+import org.apache.lucene.index.Term;
 17+import org.apache.lucene.index.TermDocs;
 18+import org.wikimedia.lsearch.analyzers.Alttitles;
 19+import org.wikimedia.lsearch.analyzers.ExtToken;
 20+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
 21+import org.wikimedia.lsearch.analyzers.WikiQueryParser;
 22+import org.wikimedia.lsearch.analyzers.ExtToken.Position;
 23+import org.wikimedia.lsearch.analyzers.ExtToken.Type;
 24+import org.wikimedia.lsearch.config.IndexId;
 25+import org.wikimedia.lsearch.search.SearcherCache;
 26+
 27+public class Highlight {
 28+ protected static SearcherCache cache = null;
 29+ static Logger log = Logger.getLogger(Highlight.class);
 30+
 31+ public static final int SLOP = WikiQueryParser.MAINPHRASE_SLOP;
 32+ /** maximal length of text that surrounds highlighted words */
 33+ public static final int MAX_CONTEXT = 75;
 34+
 35+ public static final double PHRASE_BOOST = 1;
 36+
 37+ /** boost (preference) factors for various parts of the text */
 38+ public static final HashMap<Position,Double> BOOST = new HashMap<Position,Double>();
 39+ static {
 40+ BOOST.put(Position.FIRST_SECTION,5.0);
 41+ BOOST.put(Position.HEADING,2.0);
 42+ BOOST.put(Position.NORMAL,1.0);
 43+ BOOST.put(Position.TEMPLATE,0.1);
 44+ BOOST.put(Position.IMAGE_CAT_IW,0.01);
 45+ BOOST.put(Position.EXT_LINK,0.5);
 46+ BOOST.put(Position.REFERENCE,0.5);
 47+ }
 48+ /**
 49+ *
 50+ * @param hits - keys of articles that need to be highlighted
 51+ * @param iid - highlight index
 52+ * @param terms - terms to highlight
 53+ * @param df - their document frequencies
 54+ * @param words - in order words (from main phrase)
 55+ * @param exactCase - if these are results from exactCase search
 56+ * @throws IOException
 57+ * @return map: key -> what to highlight
 58+ */
 59+ @SuppressWarnings("unchecked")
 60+ public static HashMap<String,HighlightResult> highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, HashSet<String> stopWords) throws IOException{
 61+ if(cache == null)
 62+ cache = SearcherCache.getInstance();
 63+
 64+ System.out.println("Highlighting: "+Arrays.toString(terms));
 65+
 66+ FieldNameFactory fields = new FieldNameFactory(exactCase);
 67+
 68+ if(stopWords == null)
 69+ stopWords = new HashSet<String>();
 70+
 71+ // terms weighted with idf
 72+ HashMap<String,Double> weightTerm = new HashMap<String,Double>();
 73+ for(int i=0;i<terms.length;i++){
 74+ Term t = terms[i];
 75+ if(t.field().equals(fields.contents())){
 76+ double idf = idf(df[i],maxDoc);
 77+ weightTerm.put(t.text(),idf);
 78+ }
 79+ }
 80+ // position within main phrase
 81+ HashMap<String,Integer> wordIndex = new HashMap<String,Integer>();
 82+ for(int i=0;i<words.size();i++)
 83+ wordIndex.put(words.get(i),i);
 84+
 85+ // process requested documents
 86+ IndexReader reader = cache.getLocalSearcher(iid.getHighlight()).getIndexReader();
 87+ HashMap<String,HighlightResult> res = new HashMap<String,HighlightResult>();
 88+ for(String key : hits){
 89+ Object[] ret = getTokens(reader,key);
 90+ if(ret == null)
 91+ continue;
 92+ ArrayList<ExtToken> tokens = (ArrayList<ExtToken>) ret[0];
 93+ Alttitles alttitles = (Alttitles) ret[1];
 94+ HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles);
 95+
 96+ getBestTextSnippets(tokens, weightTerm, wordIndex, 2);
 97+ getBestAltTitle(alttitles.getRedirects(),weightTerm,notInTitle,stopWords,1);
 98+ getBestAltTitle(alttitles.getSections(),weightTerm,notInTitle,stopWords,0);
 99+
 100+ }
 101+ return res;
 102+ }
 103+
 104+ /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
 105+ protected static double idf(int docFreq, int numDocs) {
 106+ return Math.log(numDocs/(double)(docFreq+1)) + 1.0;
 107+ }
 108+
 109+ @SuppressWarnings("unchecked")
 110+ protected static HashMap<String,Double> getTermsNotInTitle(HashMap<String,Double> weightTerm, Alttitles alttitles){
 111+ Alttitles.Info info = alttitles.getTitle();
 112+ ArrayList<ExtToken> tokens = info.getTokens();
 113+ HashMap<String,Double> ret = (HashMap<String, Double>) weightTerm.clone();
 114+ // delete all terms from title
 115+ for(ExtToken t : tokens){
 116+ if(ret.containsKey(t.termText()))
 117+ ret.remove(t.termText());
 118+ }
 119+ return ret;
 120+
 121+ }
 122+
 123+ /** Alttitle and section highlighting */
 124+
 125+ protected static class ScoredSnippet {
 126+ Snippet snippet = null;
 127+ double score = 0;
 128+ public ScoredSnippet(Snippet snippet, double score) {
 129+ this.snippet = snippet;
 130+ this.score = score;
 131+ }
 132+
 133+ }
 134+
 135+ protected static Snippet getBestAltTitle(ArrayList<Alttitles.Info> altInfos, HashMap<String,Double> weightTerm,
 136+ HashMap<String,Double> notInTitle, HashSet<String> stopWords, int minAdditional){
 137+ ArrayList<ScoredSnippet> res = new ArrayList<ScoredSnippet>();
 138+ for(Alttitles.Info ainf : altInfos){
 139+ double matched = 0, additional=0;
 140+ ArrayList<ExtToken> tokens = ainf.getTokens();
 141+ boolean completeMatch=true;
 142+ for(int i=0;i<tokens.size();i++){
 143+ ExtToken t = tokens.get(i);
 144+ if(t.getPositionIncrement() == 0)
 145+ continue; // skip aliases
 146+
 147+ if(weightTerm.containsKey(t.termText()))
 148+ matched += weightTerm.get(t.termText());
 149+ else if(!stopWords.contains(t.termText()))
 150+ completeMatch = false;
 151+
 152+ if(notInTitle.containsKey(t.termText()))
 153+ additional += notInTitle.get(t.termText());
 154+ }
 155+ if((completeMatch && additional >= minAdditional) || additional >= minAdditional+1 || additional == notInTitle.size()){
 156+ Snippet snippet = makeSnippet(tokens,0,tokens.size(),weightTerm.keySet());
 157+ snippet.setAlttitle(ainf);
 158+ res.add(new ScoredSnippet(snippet,matched+additional));
 159+ }
 160+ }
 161+ if(res.size() > 0){
 162+ if(res.size() == 1){
 163+ return res.get(0).snippet;
 164+ } else{
 165+ // get snippet with best score
 166+ Collections.sort(res, new Comparator<ScoredSnippet>() {
 167+ public int compare(ScoredSnippet o1, ScoredSnippet o2) {
 168+ double d = o2.score - o1.score;
 169+ if(d > 0)
 170+ return 1;
 171+ else if(d == 0)
 172+ return 0;
 173+ else return -1;
 174+ }});
 175+ return res.get(0).snippet;
 176+ }
 177+ }
 178+ return null;
 179+ }
 180+
 181+ /** Text highlighting */
 182+
 183+ protected static class FragmentScore {
 184+ int start = 0;
 185+ int end = 0;
 186+ double score = 0;
 187+ // best match in this fragment
 188+ int bestStart = -1;
 189+ int bestEnd = -1;
 190+ double bestScore = 0;
 191+
 192+ FragmentScore(int start){
 193+ this.start = start;
 194+ }
 195+
 196+ public String toString(){
 197+ return "start="+start+", end="+end+", score="+score+", bestStart="+bestStart+", bestEnd="+bestEnd;
 198+ }
 199+ }
 200+
 201+ /** Highlight text */
 202+ protected static ArrayList<Snippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms,
 203+ HashMap<String,Integer> wordIndex, int maxSnippets) {
 204+
 205+ // pieces of text to be highlighted
 206+ ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>();
 207+
 208+ //System.out.println("TOKENS: "+tokens);
 209+
 210+ FragmentScore fs = null;
 211+ ExtToken last = null;
 212+ // next three are for in-order matched phrases
 213+ Integer lastWord = null; // pointer to the last word found in sequence
 214+ int lastIndex = 0;
 215+ Double lastWeight = null;
 216+ // indicator for first sentence
 217+ boolean seenFirstSentence = false;
 218+ for(int i=0;i<=tokens.size();i++){
 219+ ExtToken t = null;
 220+ if(i < tokens.size())
 221+ t = tokens.get(i);
 222+ if(last == null){
 223+ fs = new FragmentScore(i);
 224+ } else if(t==null || t.getPosition() != last.getPosition() || t.getType() == Type.SENTENCE_BREAK){
 225+ Position pos = last.getPosition();
 226+ // finalize fragment
 227+
 228+ if(t == null || t.getType() != Type.SENTENCE_BREAK)
 229+ fs.end = i;
 230+ else
 231+ fs.end = i + 1;
 232+ fs.score *= BOOST.get(pos);
 233+ fragments.add(fs);
 234+ if(pos == Position.FIRST_SECTION && !seenFirstSentence){
 235+ // boost for first sentence
 236+ fs.score *= 4;
 237+ seenFirstSentence = true;
 238+ }
 239+ fs = new FragmentScore(fs.end);
 240+ }
 241+ if(t == null)
 242+ break;
 243+
 244+ Double weight = weightTerms.get(t.termText());
 245+ if(weight != null){
 246+ fs.score += weight;
 247+ Integer inx = wordIndex.get(t.termText());
 248+ if(lastWord != null && inx != null && lastWord == inx - 1 && i-lastIndex <= 2){
 249+ double phraseScore = (weight + lastWeight) * PHRASE_BOOST;
 250+ fs.score += phraseScore;
 251+ if(phraseScore > fs.bestScore){
 252+ fs.bestStart = lastIndex;
 253+ fs.bestEnd = i;
 254+ fs.bestScore = phraseScore;
 255+ }
 256+ }
 257+ lastWord = inx;
 258+ lastWeight = weight;
 259+ lastIndex = i;
 260+ }
 261+
 262+ last = t;
 263+ }
 264+
 265+ // find fragments with best score
 266+ Collections.sort(fragments, new Comparator<FragmentScore>() {
 267+ public int compare(FragmentScore o1, FragmentScore o2) {
 268+ double d = o2.score - o1.score;
 269+ if(d > 0)
 270+ return 1;
 271+ else if(d == 0)
 272+ return 0;
 273+ else return -1;
 274+ }});
 275+
 276+ ArrayList<Snippet> res = new ArrayList<Snippet>();
 277+ for(FragmentScore f : fragments){
 278+ if(f.score == 0)
 279+ continue;
 280+ Snippet s = makeSnippet(tokens,f,weightTerms.keySet());
 281+ res.add(s);
 282+ System.out.println(f+" : "+s.getFormatted());
 283+ if(res.size() >= maxSnippets)
 284+ break;
 285+ }
 286+ return res;
 287+ }
 288+
 289+ private static Snippet makeSnippet(ArrayList<ExtToken> tokens, FragmentScore f, Set<String> highlight) {
 290+ return makeSnippet(tokens,f.start,f.end,highlight);
 291+ }
 292+
 293+ private static Snippet makeSnippet(ArrayList<ExtToken> tokens, int fromIndex, int toIndex, Set<String> highlight) {
 294+ Snippet s = new Snippet();
 295+ StringBuilder sb = new StringBuilder();
 296+ int start=0, end=0;
 297+ for(int i=fromIndex;i<toIndex;i++){
 298+ ExtToken t = tokens.get(i);
 299+ if(t.getPositionIncrement() != 0){
 300+ start = sb.length();
 301+ sb.append(t.getText());
 302+ end = sb.length();
 303+ }
 304+ if(highlight.contains(t.termText())){
 305+ s.addRange(new Snippet.Range(start,end));
 306+ }
 307+ }
 308+ s.setText(sb.toString());
 309+ return s;
 310+ }
 311+
 312+ /** @return Object[] of { ArrayList<ExtToken> tokens, Alttitles alttitles } */
 313+ protected static Object[] getTokens(IndexReader reader, String key) throws IOException{
 314+ TermDocs td = reader.termDocs(new Term("key",key));
 315+ if(td.next()){
 316+ System.out.println("Found "+key);
 317+ Document doc = reader.document(td.doc());
 318+ ArrayList<ExtToken> tokens = ExtToken.deserialize(doc.getBinaryValue("text"));
 319+ // FIXME: wrong deserialization for alttitle !
 320+ Alttitles alttitles = Alttitles.deserializeAltTitle(doc.getBinaryValue("alttitle"));
 321+ return new Object[] {tokens, alttitles};
 322+ } else
 323+ return null;
 324+ }
 325+}
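Fragment weights come from the idf() helper above. A self-contained sketch of the same formula, with made-up document frequencies, shows why rare terms dominate a fragment's score:

public class IdfSketch {
    // same formula as Highlight.idf(): log(numDocs/(docFreq+1)) + 1
    static double idf(int docFreq, int numDocs) {
        return Math.log(numDocs / (double)(docFreq + 1)) + 1.0;
    }
    public static void main(String[] args) {
        int maxDoc = 1000000;
        System.out.println(idf(9, maxDoc));     // rare term:   ~12.5
        System.out.println(idf(99999, maxDoc)); // common term: ~3.3
    }
}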
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java
@@ -148,10 +148,17 @@
149149 }
150150
151151 /**
152 - * @return Returns the page key (via article)
 152+ * @return the page key -- page_id (via article)
153153 */
154154 public String getKey(){
155155 return article.getKey();
156156 }
157157
 158+ /**
 159+ * @return Highlight key -- ns:title
 160+ */
 161+ public String getHighlightKey(){
 162+ return article.getTitleObject().getKey();
 163+ }
 164+
158165 }
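The two getters matter because the main index is keyed by page_id while the highlight index is keyed by ns:title. A hedged sketch of the delete path, mirroring the WikiIndexModifier change below:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.wikimedia.lsearch.index.IndexUpdateRecord;

public class DeleteKeySketch {
    // returns the number of documents deleted, as IndexReader.deleteDocuments does
    static int delete(IndexReader reader, IndexUpdateRecord rec, boolean isHighlight)
            throws IOException {
        // the main index is keyed by page_id, the highlight index by ns:title
        String key = isHighlight ? rec.getHighlightKey() : rec.getKey();
        return reader.deleteDocuments(new Term("key", key));
    }
}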
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -22,6 +22,8 @@
2323 import org.apache.lucene.analysis.SimpleAnalyzer;
2424 import org.apache.lucene.document.Document;
2525 import org.apache.lucene.document.Field;
 26+import org.apache.lucene.document.Field.Index;
 27+import org.apache.lucene.document.Field.Store;
2628 import org.apache.lucene.index.IndexReader;
2729 import org.apache.lucene.index.IndexWriter;
2830 import org.apache.lucene.index.Term;
@@ -29,9 +31,11 @@
3032 import org.apache.lucene.store.FSDirectory;
3133 import org.wikimedia.lsearch.analyzers.Aggregate;
3234 import org.wikimedia.lsearch.analyzers.AggregateAnalyzer;
 35+import org.wikimedia.lsearch.analyzers.Alttitles;
3336 import org.wikimedia.lsearch.analyzers.Analyzers;
3437 import org.wikimedia.lsearch.analyzers.CategoryAnalyzer;
3538 import org.wikimedia.lsearch.analyzers.ContextAnalyzer;
 39+import org.wikimedia.lsearch.analyzers.ExtToken;
3640 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
3741 import org.wikimedia.lsearch.analyzers.FieldBuilder;
3842 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
@@ -39,6 +43,7 @@
4044 import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer;
4145 import org.wikimedia.lsearch.analyzers.LanguageAnalyzer;
4246 import org.wikimedia.lsearch.analyzers.RelatedAnalyzer;
 47+import org.wikimedia.lsearch.analyzers.ReusableLanguageAnalyzer;
4348 import org.wikimedia.lsearch.analyzers.StopWords;
4449 import org.wikimedia.lsearch.analyzers.TokenizerOptions;
4550 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
@@ -138,7 +143,11 @@
139144 }
140145 for(IndexUpdateRecord rec : records){
141146 if(rec.doDelete()){
142 - int count = reader.deleteDocuments(new Term("key", rec.getKey()));
 147+ int count = 0;
 148+ if(iid.isHighlight())
 149+ count = reader.deleteDocuments(new Term("key", rec.getHighlightKey()));
 150+ else // normal index
 151+ count = reader.deleteDocuments(new Term("key", rec.getKey()));
143152 if(count == 0)
144153 nonDeleteDocuments.add(rec);
145154 IndexReportCard card = getReportCard(rec);
@@ -184,7 +193,14 @@
185194 writer.setMaxFieldLength(MAX_FIELD_LENGTH);
186195 FieldBuilder.Case dCase = (exactCase)? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
187196 FieldBuilder builder = new FieldBuilder(iid,dCase);
188 - Analyzer analyzer = Analyzers.getIndexerAnalyzer(builder);
 197+ Analyzer analyzer = null;
 198+ ReusableLanguageAnalyzer highlightContentAnalyzer = null;
 199+ if(iid.isHighlight()){
 200+ highlightContentAnalyzer = Analyzers.getReusableHighlightAnalyzer(builder.getBuilder(dCase).getFilters());
 201+ analyzer = Analyzers.getHighlightAnalyzer(iid);
 202+ } else
 203+ analyzer = Analyzers.getIndexerAnalyzer(builder);
 204+
189205 HashSet<String> stopWords = StopWords.getPredefinedSet(iid);
190206 for(IndexUpdateRecord rec : records){
191207 if(rec.doAdd()){
@@ -193,8 +209,13 @@
194210 if(!checkPreconditions(rec))
195211 continue; // article shouldn't be added for some reason
196212 IndexReportCard card = getReportCard(rec);
197 - Document doc = makeDocument(rec.getArticle(),builder,iid,stopWords);
 213+ Document doc;
198214 try {
 215+ if(iid.isHighlight())
 216+ doc = makeHighlightDocument(rec.getArticle(),analyzer,highlightContentAnalyzer,iid);
 217+ else // normal index
 218+ doc = makeDocument(rec.getArticle(),builder,iid,stopWords);
 219+
199220 writer.addDocument(doc,analyzer);
200221 log.debug(iid+": Adding document "+rec.getKey()+" "+rec.getArticle());
201222 if(card != null)
@@ -279,7 +300,7 @@
280301 *
281302 * @param article
282303 */
283 - protected static void transformArticleForIndexing(Article ar) {
 304+ public static void transformArticleForIndexing(Article ar) {
284305 ArrayList<Redirect> redirects = ar.getRedirects();
285306 // sort redirect by their rank
286307 Collections.sort(redirects,new Comparator<Redirect>() {
@@ -365,6 +386,18 @@
366387 }
367388
368389 /**
 390+ * Update both the search and highlight index for iid.
 391+ *
 392+ * @param iid
 393+ * @param updateRecords
 394+ */
 395+ public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 396+ boolean index = updateDocumentsOn(iid,updateRecords);
 397+ boolean highlight = updateDocumentsOn(iid.getHighlight(),updateRecords);
 398+ return index && highlight;
 399+ }
 400+
 401+ /**
369402 * Update all documents in the collection. If needed the request
370403 * is forwarded to a remote object (i.e. if the part of the split
371404 * index is indexed by another host).
@@ -372,7 +405,7 @@
373406 * @param iid
374407 * @param updateRecords
375408 */
376 - public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 409+ protected boolean updateDocumentsOn(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
377410 long now = System.currentTimeMillis();
378411 log.info("Starting update of "+updateRecords.size()+" records on "+iid+", started at "+now);
379412 boolean succ = true;
@@ -521,6 +554,17 @@
522555 return doc;
523556 }
524557
 558+ /** Make the document that will be indexed as highlighting data */
 559+ public static Document makeHighlightDocument(Article article, Analyzer analyzer, ReusableLanguageAnalyzer contentAnalyzer, IndexId iid) throws IOException{
 560+ String key = article.getTitleObject().getKey();
 561+ Document doc = new Document();
 562+ doc.add(new Field("key",key,Store.NO,Index.UN_TOKENIZED));
 563+ doc.add(new Field("text",ExtToken.serialize(contentAnalyzer.tokenStream("contents",article.getContents())),Store.COMPRESS));
 564+ ArrayList<String> sections = contentAnalyzer.getWikiTokenizer().getHeadingText();
 565+ doc.add(new Field("alttitle",Alttitles.serializeAltTitle(article,iid,sections,analyzer,"alttitle"),Store.COMPRESS));
 566+ return doc;
 567+ }
 568+
525569 /** add related aggregate field */
526570 protected static void makeRelated(Document doc, String prefix, Article article, IndexId iid, HashSet<String> stopWords){
527571 ArrayList<Aggregate> items = new ArrayList<Aggregate>();
@@ -548,34 +592,9 @@
549593 addToItems(items, new Aggregate(title+" "+h,rankBoost*HEADINGS_BOOST,iid,exactCase,stopWords));
550594 }
551595 makeAggregate(doc,prefix,items);
552 - }
 596+ }
553597
554598
555 - public enum AlttitleTypes { TITLE, REDIRECT, HEADING };
556 -
557 - public static byte[] serializeAltTitle(Article article, IndexId iid, ArrayList<String> headingText) throws IOException{
558 - WikiIndexModifier.transformArticleForIndexing(article);
559 - Buffer b = new Buffer();
560 -
561 - // add title
562 - String title = article.getTitle();
563 - b.writeAggregate(title,new Aggregate(title,article.getRank(),iid),AlttitleTypes.TITLE.ordinal());
564 - // add all redirects
565 - ArrayList<String> redirects = article.getRedirectKeywords();
566 - ArrayList<Integer> ranks = article.getRedirectKeywordRanks();
567 - for(int i=0;i<redirects.size();i++){
568 - b.writeAggregate(redirects.get(i),new Aggregate(redirects.get(i),ranks.get(i),iid),AlttitleTypes.REDIRECT.ordinal());
569 - }
570 - // add section headings!
571 - for(String h : headingText){
572 - b.writeAggregate(h,new Aggregate(h,article.getRank()*HEADINGS_BOOST,iid),AlttitleTypes.HEADING.ordinal());
573 - }
574 -
575 - return b.getBytes();
576 - }
577 -
578 -
579 -
580599 private static void addToItems(ArrayList<Aggregate> items, Aggregate a){
581600 if(a.length() != 0)
582601 items.add(a);
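A hedged usage sketch of the new two-index update; the no-argument constructor is an assumption here, and iid.getHighlight() resolves the companion .hl index named in the lsearch-global.conf change below:

import java.util.Collection;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.index.IndexUpdateRecord;
import org.wikimedia.lsearch.index.WikiIndexModifier;

public class UpdateSketch {
    static boolean update(IndexId iid, Collection<IndexUpdateRecord> records) {
        // updateDocuments() now fans out to updateDocumentsOn() for the search
        // index and its .hl companion (resolved via iid.getHighlight())
        WikiIndexModifier modifier = new WikiIndexModifier(); // no-arg constructor assumed
        return modifier.updateDocuments(iid, records);
    }
}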
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java
@@ -100,6 +100,14 @@
101101 this.inCase = inCase;
102102 }
103103
 104+ /** Get the text: the original if available, otherwise the term text */
 105+ public String getText(){
 106+ if(original != null)
 107+ return original;
 108+ else
 109+ return termText();
 110+ }
 111+
104112 public String toString(){
105113 return "\""+termText()+"\",t="+type+",p="+pos+(original!=null? ",o={"+original+"}" : "")+",i="+getPositionIncrement();
106114 }
@@ -153,10 +161,20 @@
154162 b.writeString(t.termText());
155163 }
156164 }
157 - // control 1: original word
 165+
158166 if(t.getPositionIncrement() > 0 && t.original != null){
159 - b.writeControl(1);
160 - b.writeStringWithLength(t.original);
 167+ String w = t.termText();
 168+ if(t.original.equals(w.substring(0,1).toUpperCase()+w.substring(1))){
 169+ // control 6: original is title case
 170+ b.writeControl(6);
 171+ } else if(t.original.equals(w.toUpperCase())){
 172+ // control 7: original is upper case
 173+ b.writeControl(7);
 174+ } else{
 175+ // control 1: original word
 176+ b.writeControl(1);
 177+ b.writeStringWithLength(t.original);
 178+ }
161179 }
162180 // control 2: alias
163181 if(t.getPositionIncrement() == 0){
@@ -294,12 +312,18 @@
295313 throw new RuntimeException("Bad serialized data: trying to assign a sentence break to text");
296314 t.setType(Type.SENTENCE_BREAK);
297315 break;
298 - case 5:
 316+ case 5: // url
299317 { int len = serialized[cur++];
300318 ExtToken tt = new ExtToken(new String(serialized,cur,len,"utf-8"),cur,cur+len,Type.URL,Position.EXT_LINK);
301319 tokens.add(tt);
302320 cur += len;
303321 break; }
 322+ case 6: // original is title case
 323+ t.setOriginal(t.termText().substring(0,1).toUpperCase()+t.termText().substring(1));
 324+ break;
 325+ case 7: // original is upper case
 326+ t.setOriginal(t.termText().toUpperCase());
 327+ break;
304328 default:
305329 throw new RuntimeException("Unknown control sequence "+control);
306330 }
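Controls 6 and 7 avoid storing the original string when it is merely a case variant of the (lower-cased) term text. A standalone sketch of that decision rule, using the control codes from serialize() above:

public class CaseControlSketch {
    // same decision rule as the new branch in ExtToken.serialize()
    static int controlFor(String termText, String original) {
        if (original.equals(termText.substring(0, 1).toUpperCase() + termText.substring(1)))
            return 6; // original is title case: one control byte, no string payload
        if (original.equals(termText.toUpperCase()))
            return 7; // original is upper case: one control byte, no string payload
        return 1;     // anything else: control 1 plus the original string itself
    }
    public static void main(String[] args) {
        System.out.println(controlFor("paris", "Paris"));       // 6
        System.out.println(controlFor("nasa", "NASA"));         // 7
        System.out.println(controlFor("mcdonald", "McDonald")); // 1
    }
}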
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Alttitles.java
@@ -0,0 +1,126 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.io.IOException;
 5+import java.util.ArrayList;
 6+import java.util.Collection;
 7+
 8+import org.apache.lucene.analysis.Analyzer;
 9+import org.wikimedia.lsearch.beans.Article;
 10+import org.wikimedia.lsearch.config.IndexId;
 11+import org.wikimedia.lsearch.index.WikiIndexModifier;
 12+import org.wikimedia.lsearch.util.Buffer;
 13+
 14+/**
 15+ * Titles, redirects, and section headings: serialization/deserialization
 16+ * for highlighting, etc.
 17+ *
 18+ * @author rainman
 19+ *
 20+ */
 21+public class Alttitles {
 22+ protected Info title;
 23+ protected ArrayList<Info> redirects = new ArrayList<Info>();
 24+ protected ArrayList<Info> sections = new ArrayList<Info>();
 25+
 26+ public static class Info {
 27+ protected String title;
 28+ protected int rank;
 29+ protected ArrayList<ExtToken> tokens;
 30+ public Info(String title, int rank, ArrayList<ExtToken> tokens){
 31+ this.title = title;
 32+ this.rank = rank;
 33+ this.tokens = tokens;
 34+ }
 35+ public int getRank() {
 36+ return rank;
 37+ }
 38+ public void setRank(int rank) {
 39+ this.rank = rank;
 40+ }
 41+ public String getTitle() {
 42+ return title;
 43+ }
 44+ public void setTitle(String title) {
 45+ this.title = title;
 46+ }
 47+ public ArrayList<ExtToken> getTokens() {
 48+ return tokens;
 49+ }
 50+ public void setTokens(ArrayList<ExtToken> tokens) {
 51+ this.tokens = tokens;
 52+ }
 53+
 54+ }
 55+
 56+ public static byte[] serializeAltTitle(Article article, IndexId iid, Collection<String> sections, Analyzer analyzer, String field) throws IOException{
 57+ WikiIndexModifier.transformArticleForIndexing(article);
 58+ Buffer b = new Buffer();
 59+
 60+ // add title
 61+ String title = article.getTitle();
 62+ // type 0 : title
 63+ b.writeAlttitleInfo(title,new Aggregate(title,article.getRank(),iid,analyzer,field),0);
 64+ // add all redirects
 65+ ArrayList<String> redirects = article.getRedirectKeywords();
 66+ ArrayList<Integer> ranks = article.getRedirectKeywordRanks();
 67+ for(int i=0;i<redirects.size();i++){
 68+ // type 1: redirect
 69+ b.writeAlttitleInfo(redirects.get(i),new Aggregate(redirects.get(i),ranks.get(i),iid,analyzer,field),1);
 70+ }
 71+
 72+ // type 2: sections
 73+ for(String s : sections){
 74+ b.writeAlttitleInfo(s,new Aggregate(s,1,iid,analyzer,field),2);
 75+ }
 76+
 77+ return b.getBytes();
 78+ }
 79+
 80+ public static Alttitles deserializeAltTitle(byte[] serialized){
 81+ Buffer b = new Buffer(serialized);
 82+ Alttitles t = new Alttitles();
 83+ while(b.hasMore()){
 84+ Object[] ret = b.readAlttitleInfo();
 85+ int type = (Integer)ret[0];
 86+ Info info = (Info)ret[1];
 87+ if(type == 0)
 88+ t.title = info;
 89+ else if(type == 1)
 90+ t.redirects.add(info);
 91+ else if(type == 2)
 92+ t.sections.add(info);
 93+ else
 94+ throw new RuntimeException("Wrong type for serialized alttitle "+type);
 95+ }
 96+ return t;
 97+ }
 98+
 99+ public ArrayList<Info> getRedirects() {
 100+ return redirects;
 101+ }
 102+
 103+ public void setRedirects(ArrayList<Info> redirects) {
 104+ this.redirects = redirects;
 105+ }
 106+
 107+ public Info getTitle() {
 108+ return title;
 109+ }
 110+
 111+ public void setTitle(Info title) {
 112+ this.title = title;
 113+ }
 114+
 115+ public ArrayList<Info> getSections() {
 116+ return sections;
 117+ }
 118+
 119+ public void setSections(ArrayList<Info> sections) {
 120+ this.sections = sections;
 121+ }
 122+
 123+
 124+
 125+
 126+
 127+}
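A hedged sketch of reading the blob back out of a highlight document; the field name comes from makeHighlightDocument() above, and note the FIXME in Highlight.java indicating this deserialization is still broken in this revision:

import org.apache.lucene.document.Document;
import org.wikimedia.lsearch.analyzers.Alttitles;

public class AlttitlesSketch {
    // doc is a highlight-index document as built by makeHighlightDocument()
    static void dump(Document doc) {
        Alttitles alt = Alttitles.deserializeAltTitle(doc.getBinaryValue("alttitle"));
        System.out.println("title: " + alt.getTitle().getTitle());
        for (Alttitles.Info r : alt.getRedirects())
            System.out.println("redirect: " + r.getTitle() + " (rank " + r.getRank() + ")");
        for (Alttitles.Info s : alt.getSections())
            System.out.println("section: " + s.getTitle());
    }
}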
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -83,6 +83,7 @@
8484
8585 private TokenStream tokenStream;
8686 private ArrayList<Token> tokens; // tokens from analysis
 87+ protected ArrayList<String> words;
8788
8889 /** sometimes the fieldsubquery takes the bool modifier, to retrieve it, use this variable,
8990 * this will always point to the last unused bool modifier */
@@ -1731,7 +1732,7 @@
17321733 Object[] qtwords = makeTitleQuery(queryText);
17331734 // qt = title query, qp = title phrase query
17341735 Query qt = (Query) qtwords[0];
1735 - ArrayList<String> words = (ArrayList<String>) qtwords[1];
 1736+ words = (ArrayList<String>) qtwords[1];
17361737 if(qc == null || qt == null)
17371738 return new BooleanQuery();
17381739 if(qc.equals(qt))
@@ -1837,6 +1838,10 @@
18381839 }
18391840 public void setBuilder(FieldBuilder.BuilderSet builder) {
18401841 this.builder = builder;
1841 - }
 1842+ }
18421843
 1844+ public ArrayList<String> getWords(){
 1845+ return words;
 1846+ }
 1847+
18431848 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -52,7 +52,7 @@
5353 * @param language
5454 * @return
5555 */
56 - public static Analyzer getReusableHighlightAnalyzer(FilterFactory filters){
 56+ public static ReusableLanguageAnalyzer getReusableHighlightAnalyzer(FilterFactory filters){
5757 return new ReusableLanguageAnalyzer(filters,false,true);
5858 }
5959
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java
@@ -1,9 +1,12 @@
22 package org.wikimedia.lsearch.analyzers;
33
 4+import java.io.IOException;
45 import java.util.ArrayList;
56 import java.util.HashSet;
67
 8+import org.apache.lucene.analysis.Analyzer;
79 import org.apache.lucene.analysis.Token;
 10+import org.apache.lucene.analysis.TokenStream;
811 import org.wikimedia.lsearch.config.IndexId;
912
1013 /**
@@ -20,6 +23,7 @@
2124
2225 /** Construct from arbitrary text that will be tokenized */
2326 public Aggregate(String text, float boost, IndexId iid, boolean exactCase, HashSet<String> stopWords){
 27+ // FIXME: hey, we should use an analyzer here!!!
2428 TokenizerOptions options = new TokenizerOptions.NoRelocation(exactCase);
2529 tokens = new FastWikiTokenizerEngine(text,iid,options).parse();
2630 this.boost = boost;
@@ -33,14 +37,23 @@
3438 noStopWordsLength = tokens.size();
3539 }
3640
37 - /** Construct for highlight */
38 - public Aggregate(String text, float boost, IndexId iid){
39 - TokenizerOptions options = new TokenizerOptions.Highlight();
40 - tokens = new FastWikiTokenizerEngine(text,iid,options).parse();
 41+ /** Construct with specific analyzer
 42+ * @throws IOException */
 43+ public Aggregate(String text, float boost, IndexId iid, Analyzer analyzer, String field) throws IOException{
 44+ this.tokens = toTokenArray(analyzer.tokenStream(field,text));
4145 this.boost = boost;
4246 this.noStopWordsLength = tokens.size();
4347 }
4448
 49+ private ArrayList<Token> toTokenArray(TokenStream stream) throws IOException {
 50+ ArrayList<Token> tt = new ArrayList<Token>();
 51+ Token t = null;
 52+ while( (t = stream.next()) != null){
 53+ tt.add(t);
 54+ }
 55+ return tt;
 56+ }
 57+
4558 /** Number of tokens */
4659 public int length(){
4760 if(tokens != null)
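A hedged sketch of the new analyzer-backed constructor in use; iid is assumed to come from the search configuration, and getHighlightAnalyzer() is the factory used elsewhere on this branch:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.wikimedia.lsearch.analyzers.Aggregate;
import org.wikimedia.lsearch.analyzers.Analyzers;
import org.wikimedia.lsearch.config.IndexId;

public class AggregateSketch {
    static Aggregate heading(IndexId iid) throws IOException {
        // tokenize a section heading the same way the alttitle field is built
        Analyzer analyzer = Analyzers.getHighlightAnalyzer(iid);
        return new Aggregate("External links", 1.0f, iid, analyzer, "alttitle");
    }
}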
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Buffer.java
@@ -1,9 +1,12 @@
22 package org.wikimedia.lsearch.util;
33
 4+import java.io.EOFException;
45 import java.io.IOException;
56 import java.io.UnsupportedEncodingException;
 7+import java.util.ArrayList;
68
79 import org.wikimedia.lsearch.analyzers.Aggregate;
 10+import org.wikimedia.lsearch.analyzers.Alttitles;
811 import org.wikimedia.lsearch.analyzers.ExtToken;
912 import org.wikimedia.lsearch.analyzers.LanguageAnalyzer;
1013
@@ -11,6 +14,63 @@
1215 public byte[] buf = new byte[256];
1316 public int len=0;
1417
 18+ /** for writing */
 19+ public Buffer(){
 20+ }
 21+
 22+ /** for reading */
 23+ public Buffer(byte[] buf){
 24+ this.buf = buf;
 25+ }
 26+
 27+ public byte read(){
 28+ return buf[len++];
 29+ }
 30+
 31+ public int readInt(){
 32+ int ch1 = read() & 0xFF;
 33+ int ch2 = read() & 0xFF;
 34+ int ch3 = read() & 0xFF;
 35+ int ch4 = read() & 0xFF;
 36+ return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
 37+ }
 38+
 39+ public String readStringWithLength(){
 40+ int strlen = read();
 41+ String s;
 42+ try {
 43+ s = new String(buf,len,strlen,"utf-8");
 44+ len += strlen;
 45+ return s;
 46+ } catch (UnsupportedEncodingException e) {
 47+ e.printStackTrace();
 48+ return null;
 49+ }
 50+ }
 51+
 52+ public byte[] readBytesWithLength(){
 53+ int l = readInt();
 54+ byte[] b = new byte[l];
 55+ System.arraycopy(buf,len,b,0,l);
 56+ len += l;
 57+ return b;
 58+ }
 59+
 60+ /** @return Object[] of { Integer type, Alttitles.Info(title, rank, tokens) } */
 61+ public Object[] readAlttitleInfo(){
 62+ Integer type = (int)read();
 63+ Integer boost = readInt();
 64+ String title = readStringWithLength();
 65+ ArrayList<ExtToken> tokens = ExtToken.deserialize(readBytesWithLength());
 66+ return new Object[] { type, new Alttitles.Info(title,boost,tokens)};
 67+ }
 68+
 69+ public boolean hasMore(){
 70+ return len < buf.length;
 71+ }
 72+
 73+ ///////// WRITE ///////////
 74+
1575 public byte[] getBytes(){
1676 byte[] ret = new byte[len];
1777 System.arraycopy(buf,0,ret,0,len);
@@ -82,7 +142,7 @@
83143
84144 /** Format: type (1b), rank (4b), text (string), size of serialized (4b), serialized (bytes)
85145 * @throws IOException */
86 - public final void writeAggregate(String text, Aggregate a, int type) throws IOException{
 146+ public final void writeAlttitleInfo(String text, Aggregate a, int type) throws IOException{
87147 write(type);
88148 writeInt((int)a.boost());
89149 writeStringWithLength(text);
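A minimal round-trip sketch of the new read side, assuming writeInt() and writeStringWithLength() are the big-endian/UTF-8 counterparts that readInt() and readStringWithLength() imply:

import org.wikimedia.lsearch.util.Buffer;

public class BufferSketch {
    public static void main(String[] args) throws Exception {
        Buffer w = new Buffer();               // write mode
        w.writeInt(42);
        w.writeStringWithLength("External links");
        Buffer r = new Buffer(w.getBytes());   // read mode: len doubles as the read cursor
        System.out.println(r.readInt());              // 42
        System.out.println(r.readStringWithLength()); // External links
        System.out.println(r.hasMore());              // false
    }
}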
Index: branches/lucene-search-2.1/build.xml
@@ -61,6 +61,7 @@
6262 <zipfileset src="lib/snowball.jar" />
6363 <zipfileset src="lib/mwdumper.jar" />
6464 <zipfileset src="lib/mysql-connector-java-3.0.17-ga-bin.jar" />
 65+ <zipfileset src="lib/lucene-highlighter-2.2.0.jar" />
6566 </jar>
6667 </target>
6768
Index: branches/lucene-search-2.1/.classpath
@@ -12,8 +12,8 @@
1313 <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
1414 <classpathentry kind="lib" path="lib/snowball.jar"/>
1515 <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/>
16 - <classpathentry kind="lib" path="lib/highlighter.jar" sourcepath="/sandbox/highlighter"/>
1716 <classpathentry kind="lib" path="lib/mysql-connector-java-3.0.17-ga-bin.jar"/>
1817 <classpathentry kind="lib" path="lib/lucene-core-2.2.0.jar" sourcepath="/lucene-2.2/src"/>
 18+ <classpathentry kind="lib" path="lib/lucene-highlighter-2.2.0.jar" sourcepath="/lucene-2.2/contrib/highlighter/src/java"/>
1919 <classpathentry kind="output" path="bin"/>
2020 </classpath>
Index: branches/lucene-search-2.1/lsearch-global.conf
@@ -27,6 +27,7 @@
2828 [Search-Group]
2929 oblak : wikilucene wikidev wikilucene.prefix wikilucene.related wikilucene.links
3030 oblak : wikilucene.nspart1.sub1 wikilucene.nspart1.sub2
 31+oblak : wikilucene.nspart1.sub1.hl wikilucene.nspart1.sub2.hl
3132
3233 # Index nodes
3334 # host: db1.part db2.part
