r109155 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r109154 | r109155 | r109156 >
Date:	15:56, 17 January 2012
Author:	oren
Status:	deferred
Tags:
Comment:	lucene api has switched to streams
Modified paths:	/trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Aggregate.java (added) (history)

Diff [purge]

Index: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Aggregate.java
—	—	@@ -0,0 +1,135 @@
	2	+package org.wikimedia.lsearch.analyzers;
	3	+
	4	+import java.io.IOException;
	5	+import java.io.StringReader;
	6	+import java.util.ArrayList;
	7	+import java.util.HashSet;
	8	+
	9	+import org.apache.lucene.analysis.Analyzer;
	10	+import org.apache.lucene.analysis.Token;
	11	+import org.apache.lucene.analysis.TokenStream;
	12	+import org.wikimedia.lsearch.config.IndexId;
	13	+
	14	+/**
	15	+ * Aggregate bean that captures information about one
	16	+ * item going into the some index aggregate field.
	17	+ *
	18	+ * @author rainman
	19	+ *
	20	+ */
	21	+public class Aggregate {
	22	+ protected ArrayList<Token> tokens;
	23	+ protected float boost;
	24	+ protected int noStopWordsLength;
	25	+ protected Flags flags;
	26	+
	27	+ public enum Flags { NONE, ALTTITLE, ANCHOR, RELATED, SECTION };
	28	+
	29	+ /** Construct from arbitrary text that will be tokenized
	30	+ * @throws IOException */
	31	+ public Aggregate(String text, float boost, IndexId iid, Analyzer analyzer,
	32	+ String field, HashSet<String> stopWords, Flags flags) throws IOException{
	33	+ setTokens(toTokenArray(analyzer.tokenStream(field,new StringReader(text))),stopWords);
	34	+ this.boost = boost;
	35	+ this.flags = flags;
	36	+
	37	+ }
	38	+ /** Set new token array, calc length, etc.. */
	39	+ public void setTokens(ArrayList<Token> tokens, HashSet<String> stopWords){
	40	+ this.tokens = tokens;
	41	+ if(stopWords != null){
	42	+ noStopWordsLength = 0;
	43	+ for(Token t : tokens){
	44	+ if(!stopWords.contains(t.termText()) && t.getPositionIncrement()!=0)
	45	+ noStopWordsLength++;
	46	+ }
	47	+ } else{
	48	+ noStopWordsLength = noAliasLength();
	49	+ }
	50	+ }
	51	+ /** Number of tokens without aliases */
	52	+ public int noAliasLength(){
	53	+ int len = 0;
	54	+ for(Token t : tokens){
	55	+ if(t.getPositionIncrement() != 0)
	56	+ len++;
	57	+ }
	58	+ return len;
	59	+ }
	60	+
	61	+ /** Construct with specific analyzer
	62	+ * @throws IOException */
	63	+ public Aggregate(String text, float boost, IndexId iid, Analyzer analyzer,
	64	+ String field, Flags flags) throws IOException{
	65	+ this.tokens = toTokenArray(analyzer.tokenStream(field,new StringReader(text)));
	66	+ this.boost = boost;
	67	+ this.noStopWordsLength = noAliasLength();
	68	+ this.flags = flags;
	69	+ }
	70	+
	71	+ private ArrayList<Token> toTokenArray(TokenStream stream) throws IOException {
	72	+ ArrayList<Token> tt = new ArrayList<Token>();
	73	+ Token t = null;
	74	+ while( (t = stream.next()) != null && tt.size() < 0xff-1){
	75	+ tt.add(t);
	76	+ }
	77	+ return tt;
	78	+ }
	79	+
	80	+ /** Number of tokens */
	81	+ public int length(){
	82	+ if(tokens != null)
	83	+ return tokens.size();
	84	+ else
	85	+ return 0;
	86	+ }
	87	+
	88	+ /** Number of tokens when stop words are excluded */
	89	+ public int getNoStopWordsLength(){
	90	+ return noStopWordsLength;
	91	+ }
	92	+
	93	+ /** boost factor */
	94	+ public float boost(){
	95	+ return boost;
	96	+ }
	97	+
	98	+ public Token getToken(int index){
	99	+ return tokens.get(index);
	100	+ }
	101	+
	102	+ public ArrayList<Token> getTokens() {
	103	+ return tokens;
	104	+ }
	105	+
	106	+ public Flags getFlags() {
	107	+ return flags;
	108	+ }
	109	+ /**
	110	+ * Generate the meta field stored contents
	111	+ * format: [length] [length without stop words] [boost] [complete length] [flags] (1+1+4+1+1 bytes)
	112	+ */
	113	+ public static byte[] serializeAggregate(ArrayList<Aggregate> items){
	114	+ byte[] buf = new byte[items.size() * 8];
	115	+
	116	+ for(int i=0;i<items.size();i++){
	117	+ Aggregate ag = items.get(i);
	118	+ assert ag.length() < 0xff;
	119	+ assert ag.noAliasLength() < 0xff;
	120	+ assert ag.getNoStopWordsLength() < 0xff;
	121	+ buf[i*8] = (byte)(ag.noAliasLength() & 0xff);
	122	+ buf[i*8+1] = (byte)(ag.getNoStopWordsLength() & 0xff);
	123	+ int boost = Float.floatToIntBits(ag.boost());
	124	+ buf[i*8+2] = (byte)((boost >>> 24) & 0xff);
	125	+ buf[i*8+3] = (byte)((boost >>> 16) & 0xff);
	126	+ buf[i*8+4] = (byte)((boost >>> 8) & 0xff);
	127	+ buf[i*8+5] = (byte)((boost >>> 0) & 0xff);
	128	+ buf[i*8+6] = (byte)(ag.length() & 0xff);
	129	+ buf[i*8+7] = (byte)(ag.getFlags().ordinal() & 0xff);
	130	+ }
	131	+
	132	+ return buf;
	133	+ }
	134	+
	135	+
	136	+}
Property changes on: trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Aggregate.java
___________________________________________________________________
Added: svn:eol-style
1	137	+ native

Status & tagging log

01:16, 18 January 2012 Siebrand (talk | contribs) changed the status of r109155 [removed: new added: deferred]