r58379 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r58378‎ \| r58379 \| r58380 >
Date:	23:00, 30 October 2009
Author:	daniel
Status:	deferred
Tags:
Comment:	phrase extraction
Modified paths:	/trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java (modified) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java (added) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccurance.java (added) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccuranceSequence.java (added) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java (modified) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/Stopwords_en.properties (added) (history) /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
—	—	@@ -1,5 +1,6 @@
2	2	package de.brightbyte.wikiword.analyzer;
3	3
	4	+import java.io.IOException;
4	5	import java.net.URISyntaxException;
5	6	import java.util.Arrays;
6	7	import java.util.HashSet;
—	—	@@ -24,7 +25,7 @@
25	26	protected class TestPlainTextAnalyzer extends PlainTextAnalyzer {
26	27	//TODO: check coverage!
27	28
28		~~- public TestPlainTextAnalyzer(Corpus corpus) {~~
	29	+ public TestPlainTextAnalyzer(Corpus corpus) throws IOException {
29	30	super(corpus);
30	31	}
31	32
—	—	@@ -79,7 +80,7 @@
80	81	protected TestPlainTextAnalyzer testAnalyzer;
81	82
82	83	@Override
83		~~- public void setUp() throws URISyntaxException {~~
	84	+ public void setUp() throws URISyntaxException, IOException {
84	85	LanguageConfiguration config = new LanguageConfiguration();
85	86
86	87	corpus = new Corpus("TEST", "generic", "generic", "generic", "generic", "xx", "generic", null);
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccurance.java
—	—	@@ -0,0 +1,98 @@
	2	+package de.brightbyte.wikiword.analyzer;
	3	+
	4	+import java.io.Serializable;
	5	+
	6	+public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance> {
	7	+
	8	+ private static final long serialVersionUID = 241753475865301115L;
	9	+
	10	+ protected String phrase;
	11	+ protected int weight;
	12	+ protected int offset;
	13	+ protected int length;
	14	+
	15	+ public PhraseOccurance(String phrase, int weight, int offset, int length) {
	16	+ if (length <= 0) throw new IllegalArgumentException("bad length: "+length);
	17	+ if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string");
	18	+ //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string");
	19	+ if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string");
	20	+ if (length < phrase.length()) phrase = phrase.substring(offset, offset+length);
	21	+
	22	+ this.phrase = phrase;
	23	+ this.weight = weight;
	24	+ this.offset = offset;
	25	+ this.length = length;
	26	+ }
	27	+
	28	+ public int getLength() {
	29	+ return length;
	30	+ }
	31	+
	32	+ public int getOffset() {
	33	+ return offset;
	34	+ }
	35	+
	36	+ public int getEndOffset() {
	37	+ return getOffset() + getLength();
	38	+ }
	39	+
	40	+ public String getPhrase() {
	41	+ return phrase;
	42	+ }
	43	+
	44	+ public int getWeight() {
	45	+ return weight;
	46	+ }
	47	+
	48	+ public String toString() {
	49	+ return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";
	50	+ }
	51	+
	52	+ @Override
	53	+ public int hashCode() {
	54	+ final int PRIME = 31;
	55	+ int result = 1;
	56	+ result = PRIME * result + length;
	57	+ result = PRIME * result + offset;
	58	+ result = PRIME * result + ((phrase == null) ? 0 : phrase.hashCode());
	59	+ return result;
	60	+ }
	61	+
	62	+ @Override
	63	+ public boolean equals(Object obj) {
	64	+ if (this == obj)
	65	+ return true;
	66	+ if (obj == null)
	67	+ return false;
	68	+ if (getClass() != obj.getClass())
	69	+ return false;
	70	+ final PhraseOccurance other = (PhraseOccurance) obj;
	71	+ if (length != other.length)
	72	+ return false;
	73	+ if (offset != other.offset)
	74	+ return false;
	75	+ if (phrase == null) {
	76	+ if (other.phrase != null)
	77	+ return false;
	78	+ } else if (!phrase.equals(other.phrase))
	79	+ return false;
	80	+ return true;
	81	+ }
	82	+
	83	+ public boolean overlaps(PhraseOccurance other) {
	84	+ if (getEndOffset() <= other.getOffset()) return false;
	85	+ if (getOffset() >= other.getEndOffset()) return false;
	86	+
	87	+ return true;
	88	+ }
	89	+
	90	+ public int compareTo(PhraseOccurance other) {
	91	+ int o = getOffset() - other.getOffset();
	92	+ if (o!=0) return o; //by offset...
	93	+
	94	+ int e = getEndOffset() - other.getEndOffset();
	95	+ if (e!=0) return -e; //but longest first!
	96	+
	97	+ return 0;
	98	+ }
	99	+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
—	—	@@ -1,9 +1,14 @@
2	2	package de.brightbyte.wikiword.analyzer;
3	3
	4	+import java.io.IOException;
4	5	import java.util.ArrayList;
	6	+import java.util.Collection;
	7	+import java.util.HashSet;
5	8	import java.util.List;
	9	+import java.util.Set;
6	10	import java.util.regex.Pattern;
7	11
	12	+import de.brightbyte.data.Pair;
8	13	import de.brightbyte.wikiword.analyzer.mangler.Mangler;
9	14	import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler;
10	15
—	—	@@ -60,6 +65,21 @@
61	66	public Pattern wordPattern;
62	67
63	68	protected String languageName;
	69	+
	70	+ /**
	71	+ * List of stopwords, that is, words that are too frequent to be useful for searches.
	72	+ */
	73	+ public Set<String> stopwords;
	74	+
	75	+ /**
	76	+ * Symbols that break a phrase, like most punctuation would
	77	+ */
	78	+ public Pattern phraseBreakerPattern;
	79	+
	80	+ /**
	81	+ * pairs of matching parentheses
	82	+ */
	83	+ public Collection<Pair<String, String>> parentacies;
64	84
65	85	public LanguageConfiguration() {
66	86	this(null);
—	—	@@ -77,7 +97,7 @@
78	98	return languageName;
79	99	}
80	100
81		~~- public void defaults() {~~
	101	+ public void defaults() throws IOException {
82	102	if (this.wordPattern==null) this.wordPattern = Pattern.compile("\\p{L}+\|\\p{Nd}+");
83	103
84	104	this.sentenceManglers.add( new RegularExpressionMangler("\\s+\$.*?\$", "", 0) ); //strip parentacized blocks
—	—	@@ -85,6 +105,17 @@
86	106	this.sentencePattern = Pattern.compile("(\\r\\n\|\\n\|\\r)\|\\.[\\s\\r\\n]"); //TODO: check what happens if we allow single newlines in sentences! Breaking on single newlines causes truncated definitions.
87	107	this.sentenceTailGluePattern = Pattern.compile("(^\|\\s)([VIX]+\|\\d{1,2})$");
88	108	this.sentenceFollowGluePattern = Pattern.compile("^\\p{Ll}");
	109	+
	110	+ this.stopwords = new HashSet<String>();
	111	+ List<String> stop = AuxilliaryWikiProperties.loadList("Stopwords", languageName);
	112	+ if (stop!=null) this.stopwords.addAll(stop);
	113	+
	114	+ this.phraseBreakerPattern = Pattern.compile("[,;:]\\s\|\"");
	115	+ this.parentacies = new ArrayList<Pair<String, String>>();
	116	+ this.parentacies.add( new Pair<String, String>("(", ")") );
	117	+ this.parentacies.add( new Pair<String, String>("[", "]") );
	118	+ this.parentacies.add( new Pair<String, String>("{", "}") );
	119	+ this.parentacies.add( new Pair<String, String>("\"", "\"") );
89	120	}
90	121
91	122	public void merge(LanguageConfiguration with) {
—	—	@@ -92,8 +123,12 @@
93	124	if (with.sentenceTailGluePattern!=null) sentenceTailGluePattern = with.sentenceTailGluePattern;
94	125	if (with.sentenceFollowGluePattern!=null) sentenceFollowGluePattern = with.sentenceFollowGluePattern;
95	126
96		~~- sentenceManglers.addAll(with.sentenceManglers);~~
	127	+ if (with.sentenceManglers!=null) sentenceManglers.addAll(with.sentenceManglers);
97	128
98	129	if (with.wordPattern!=null) wordPattern = with.wordPattern;
	130	+ if (with.phraseBreakerPattern!=null) phraseBreakerPattern = with.phraseBreakerPattern;
	131	+
	132	+ if (with.stopwords!=null) stopwords.addAll(with.stopwords);
	133	+ if (with.parentacies!=null) parentacies.addAll(with.parentacies);
99	134	}
100	135	}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
—	—	@@ -1,11 +1,23 @@
2	2	package de.brightbyte.wikiword.analyzer;
3	3
	4	+import java.io.BufferedReader;
	5	+import java.io.File;
	6	+import java.io.IOException;
	7	+import java.io.InputStreamReader;
4	8	import java.lang.reflect.Constructor;
5	9	import java.lang.reflect.InvocationTargetException;
	10	+import java.text.ParsePosition;
6	11	import java.util.ArrayList;
7	12	import java.util.List;
8	13	import java.util.regex.Matcher;
9	14
	15	+import de.brightbyte.application.Arguments;
	16	+import de.brightbyte.audit.DebugUtil;
	17	+import de.brightbyte.data.DefaultLookup;
	18	+import de.brightbyte.data.Lookup;
	19	+import de.brightbyte.data.filter.Filter;
	20	+import de.brightbyte.data.filter.FixedSetFilter;
	21	+import de.brightbyte.io.ConsoleIO;
10	22	import de.brightbyte.wikiword.Corpus;
11	23	import de.brightbyte.wikiword.TweakSet;
12	24
—	—	@@ -17,9 +29,13 @@
18	30	private Matcher sentenceFollowGlueMatcher;
19	31	private Matcher wordMatcher;
20	32
	33	+ protected Filter<String> stopwordFilter;
	34	+ protected Matcher phraseBreakeMatcher;
	35	+ protected Lookup<String, String> bracketLookup;
	36	+
21	37	private Corpus corpus;
22	38
23		~~- public PlainTextAnalyzer(Corpus corpus) {~~
	39	+ public PlainTextAnalyzer(Corpus corpus) throws IOException {
24	40	this.corpus = corpus;
25	41
26	42	config = new LanguageConfiguration(corpus.getLanguage());
—	—	@@ -75,6 +91,10 @@
76	92	sentenceTailGlueMatcher = config.sentenceTailGluePattern.matcher("");
77	93	sentenceFollowGlueMatcher = config.sentenceFollowGluePattern.matcher("");
78	94	wordMatcher = config.wordPattern.matcher("");
	95	+
	96	+ phraseBreakeMatcher = config.phraseBreakerPattern.matcher("");
	97	+ stopwordFilter = new FixedSetFilter<String>(config.stopwords);
	98	+ bracketLookup = new DefaultLookup<String, String>(config.parentacies);
79	99	}
80	100
81	101
—	—	@@ -84,21 +104,35 @@
85	105	* @return
86	106	*/
87	107	public CharSequence extractFirstSentence(CharSequence text) {
	108	+ return extractNextSentence(text, null, true);
	109	+ }
	110	+
	111	+ public CharSequence extractNextSentence(CharSequence text, ParsePosition position, boolean mangle) {
88	112	if (text==null \|\| text.length()==0) return "";
89		-
90		~~- text = applyManglers(config.sentenceManglers, text);~~
	113	+
	114	+ if (mangle) text = applyManglers(config.sentenceManglers, text);
91	115	if (text.length()==0) return "";
92	116
93	117	sentenceMatcher.reset(text);
94	118	sentenceTailGlueMatcher.reset(text);
95	119	sentenceFollowGlueMatcher.reset(text);
96	120
	121	+ int ofs = 0;
	122	+ if (position!=null) {
	123	+ ofs = position.getIndex();
	124	+ if (ofs>=text.length()) return "";
	125	+
	126	+ sentenceMatcher.region(ofs, text.length());
	127	+ sentenceTailGlueMatcher.region(ofs, text.length());
	128	+ sentenceFollowGlueMatcher.region(ofs, text.length());
	129	+ }
	130	+
97	131	StringBuilder s = new StringBuilder();
98		~~- int pos = 0;~~
99	132	boolean add = false;
100	133	while (sentenceMatcher.find()) {
101		~~- int start = pos;~~
102		~~- pos = sentenceMatcher.end();~~
	134	+ int start = ofs;
	135	+ ofs = sentenceMatcher.end();
	136	+ if (position!=null) position.setIndex(ofs);
103	137
104	138	s.append(text, start, sentenceMatcher.end());
105	139
—	—	@@ -113,7 +147,7 @@
114	148	continue;
115	149	}
116	150
117		~~- sentenceFollowGlueMatcher.region(pos, text.length());~~
	151	+ sentenceFollowGlueMatcher.region(ofs, text.length());
118	152	if (sentenceFollowGlueMatcher.lookingAt()) {
119	153	add = true;
120	154	continue;
—	—	@@ -124,12 +158,16 @@
125	159	}
126	160
127	161	if (add) {
128		~~- s.append(text.subSequence(pos, text.length()));~~
129		~~- pos = text.length();~~
	162	+ s.append(text.subSequence(ofs, text.length()));
	163	+ ofs = text.length();
	164	+ if (position!=null) position.setIndex(ofs);
130	165	}
131	166
132		~~- if (pos!=0) text = AnalyzerUtils.trim(s);~~
133		~~- else AnalyzerUtils.trim(text);~~
	167	+ if (ofs!=0) text = AnalyzerUtils.trim(s);
	168	+ else {
	169	+ if (position!=null) position.setIndex(text.length());
	170	+ AnalyzerUtils.trim(text);
	171	+ }
134	172
135	173	return text;
136	174	}
—	—	@@ -150,4 +188,79 @@
151	189	return corpus;
152	190	}
153	191
	192	+ public PhraseOccuranceSequence extractPhrases(CharSequence text, int maxWeight) {
	193	+ ArrayList<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
	194	+
	195	+ text = applyManglers(config.sentenceManglers, text);
	196	+
	197	+ ParsePosition pos = new ParsePosition(0);
	198	+ while (pos.getIndex() < text.length()) {
	199	+ int ofs = pos.getIndex();
	200	+ CharSequence s = extractNextSentence(text, pos, false);
	201	+ if (s==null \|\| s.length()==0) break;
	202	+
	203	+ buildPhrases(s, ofs, phrases, maxWeight);
	204	+ }
	205	+
	206	+ return new PhraseOccuranceSequence(text.toString(), phrases);
	207	+ }
	208	+
	209	+ private PhraseAggregator buildPhrasesAggregator = null;
	210	+
	211	+ private void buildPhrases(CharSequence text, int offset, ArrayList<PhraseOccurance> into, int maxWeight) {
	212	+ if (buildPhrasesAggregator==null) buildPhrasesAggregator = new PhraseAggregator(phraseBreakeMatcher);
	213	+ buildPhrasesAggregator.reset(offset, maxWeight);
	214	+
	215	+ int i = 0;
	216	+ wordMatcher.reset(text);
	217	+ while (wordMatcher.find()) {
	218	+ if (i != wordMatcher.start()) {
	219	+ CharSequence space = text.subSequence(i, wordMatcher.start());
	220	+ buildPhrasesAggregator.update(i, space, -1, into);
	221	+ }
	222	+
	223	+ i = wordMatcher.end();
	224	+ String w;
	225	+ int weight = 1;
	226	+
	227	+ if (wordMatcher.groupCount()>0) w = wordMatcher.group(1);
	228	+ else w = wordMatcher.group(0);
	229	+
	230	+ if (stopwordFilter.matches(w)) weight = 0;
	231	+ buildPhrasesAggregator.update(wordMatcher.start(), w, weight, into);
	232	+ }
	233	+
	234	+ if (i < text.length()) {
	235	+ CharSequence space = text.subSequence(i, text.length());
	236	+ buildPhrasesAggregator.update(i, space, 0, into);
	237	+ }
	238	+ }
	239	+
	240	+ public static void main(String[] argv) throws IOException, InstantiationException {
	241	+ Arguments args = new Arguments();
	242	+ args.declare("tweaks", null, true, String.class, "tweak file");
	243	+
	244	+ args.parse(argv);
	245	+
	246	+ String lang = args.getParameter(0);
	247	+
	248	+ TweakSet tweaks = new TweakSet();
	249	+
	250	+ String tf = args.getStringOption("tweaks", null);
	251	+ if (tf!=null) tweaks.loadTweaks(new File(tf));
	252	+
	253	+ tweaks.setTweaks(System.getProperties(), "wikiword.tweak."); //XXX: doc
	254	+ tweaks.setTweaks(args, "tweak."); //XXX: doc
	255	+
	256	+ Corpus corpus = Corpus.forName("TEST", lang, tweaks);
	257	+ PlainTextAnalyzer analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(corpus, tweaks);
	258	+ analyzer.initialize();
	259	+
	260	+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
	261	+ String s ;
	262	+ while ( (s = in.readLine()) != null ) {
	263	+ PhraseOccuranceSequence phrases = analyzer.extractPhrases(s, 6);
	264	+ DebugUtil.dump("", phrases, ConsoleIO.output);
	265	+ }
	266	+ }
154	267	}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java
—	—	@@ -0,0 +1,111 @@
	2	+package de.brightbyte.wikiword.analyzer;
	3	+
	4	+import java.util.ArrayList;
	5	+import java.util.Collection;
	6	+import java.util.Iterator;
	7	+import java.util.regex.Matcher;
	8	+
	9	+public class PhraseAggregator {
	10	+ public class PhraseBuilder {
	11	+ protected StringBuilder phrase;
	12	+ protected int weight;
	13	+ protected int lastWeight;
	14	+ protected int offset;
	15	+
	16	+ public PhraseBuilder(int offset) {
	17	+ this.phrase = new StringBuilder();
	18	+ this.weight = 0;
	19	+ this.offset = offset;
	20	+ }
	21	+
	22	+ public int getLength() {
	23	+ return phrase.length();
	24	+ }
	25	+
	26	+ public int getOffset() {
	27	+ return offset;
	28	+ }
	29	+
	30	+ public int getEndOffset() {
	31	+ return getOffset() + getLength();
	32	+ }
	33	+
	34	+ public String getPhrase() {
	35	+ return phrase.toString();
	36	+ }
	37	+
	38	+ public int getWeight() {
	39	+ return weight;
	40	+ }
	41	+
	42	+ public int getLastWeight() {
	43	+ return lastWeight;
	44	+ }
	45	+
	46	+ public PhraseOccurance toPhraseOccurance() {
	47	+ return new PhraseOccurance(getPhrase(), getWeight(), getOffset(), getLength());
	48	+ }
	49	+
	50	+ public String toString() {
	51	+ return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";
	52	+ }
	53	+
	54	+ public void push(CharSequence w, int weight) {
	55	+ phrase.append(w);
	56	+ if (weight>0) this.weight+= weight;
	57	+ this.lastWeight = weight;
	58	+ }
	59	+ }
	60	+
	61	+ private int offset = 0;
	62	+ private int maxWeight = 0;
	63	+
	64	+ private Matcher phraseBreakeMatcher;
	65	+ private ArrayList<PhraseBuilder> phrases = new ArrayList<PhraseBuilder>();
	66	+
	67	+ public PhraseAggregator(Matcher phraseBreakeMatcher) {
	68	+ super();
	69	+ this.phraseBreakeMatcher = phraseBreakeMatcher;
	70	+ }
	71	+
	72	+ public void reset(int offset, int maxWeight) {
	73	+ this.offset = offset;
	74	+ this.maxWeight = maxWeight;
	75	+ clear();
	76	+ }
	77	+
	78	+ public void clear() {
	79	+ phrases.clear();
	80	+ }
	81	+
	82	+ public void update(int index, CharSequence word, int weight, Collection<PhraseOccurance> into) {
	83	+ if (weight<0) {
	84	+ phraseBreakeMatcher.reset(word);
	85	+ if (phraseBreakeMatcher.matches()) {
	86	+ this.clear();
	87	+ return;
	88	+ }
	89	+ }
	90	+
	91	+ this.push(index, word, weight);
	92	+ this.commit(into);
	93	+ }
	94	+
	95	+ public void push(int index, CharSequence word, int weight) {
	96	+ if (weight >= 0) phrases.add(new PhraseBuilder(index+offset));
	97	+
	98	+ Iterator<PhraseBuilder> it = phrases.iterator();
	99	+ while (it.hasNext()) {
	100	+ PhraseBuilder b = it.next();
	101	+ b.push(word, weight);
	102	+ if (b.getWeight() > maxWeight) it.remove();
	103	+ }
	104	+ }
	105	+
	106	+ public void commit(Collection<PhraseOccurance> into) {
	107	+ for (PhraseBuilder b: phrases) {
	108	+ if (b.getWeight() > 0 && b.getLastWeight() > 0) into.add(b.toPhraseOccurance());
	109	+ }
	110	+ }
	111	+
	112	+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccuranceSequence.java
—	—	@@ -0,0 +1,82 @@
	2	+package de.brightbyte.wikiword.analyzer;
	3	+
	4	+import java.util.AbstractList;
	5	+import java.util.ArrayList;
	6	+import java.util.Collections;
	7	+import java.util.List;
	8	+import java.util.RandomAccess;
	9	+
	10	+import de.brightbyte.data.filter.Filter;
	11	+
	12	+public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess {
	13	+
	14	+ protected List<PhraseOccurance> phrases;
	15	+ protected String text;
	16	+
	17	+ public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) {
	18	+ this.text = text;
	19	+
	20	+ this.phrases = phrases;
	21	+ Collections.sort(this.phrases); //essential!
	22	+ }
	23	+
	24	+ @Override
	25	+ public PhraseOccurance get(int index) {
	26	+ return phrases.get(index);
	27	+ }
	28	+
	29	+ @Override
	30	+ public int size() {
	31	+ return phrases.size();
	32	+ }
	33	+
	34	+ public String getText() {
	35	+ return text;
	36	+ }
	37	+
	38	+ public List<PhraseOccurance> getPhrasesAt(int offs) {
	39	+ int i = 0;
	40	+ while (i<size()) {
	41	+ PhraseOccurance p = get(i);
	42	+ if (p.getOffset() >= offs) {
	43	+ offs = p.getOffset();
	44	+ break;
	45	+ }
	46	+
	47	+ i++;
	48	+ }
	49	+
	50	+ if (i>=size()) return null;
	51	+
	52	+ int j = i;
	53	+ while (j<size()) {
	54	+ PhraseOccurance p = get(j);
	55	+ if (p.getOffset() > offs) break;
	56	+ j++;
	57	+ }
	58	+
	59	+ return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first.
	60	+ }
	61	+
	62	+ public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) {
	63	+ List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
	64	+
	65	+ int i = 0;
	66	+
	67	+ outer:
	68	+ while (i<size()) {
	69	+ List<PhraseOccurance> candidates = getPhrasesAt(i);
	70	+ if (candidates == null) break;
	71	+
	72	+ for (PhraseOccurance p: candidates) {
	73	+ i = p.getEndOffset();
	74	+ if (filter.matches(p.getPhrase())) {
	75	+ phrases.add(p);
	76	+ continue outer;
	77	+ }
	78	+ }
	79	+ }
	80	+
	81	+ return phrases;
	82	+ }
	83	+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/Stopwords_en.properties
—	—	@@ -0,0 +1,38 @@
	2	+I
	3	+a
	4	+about
	5	+an
	6	+and
	7	+are
	8	+as
	9	+at
	10	+be
	11	+by
	12	+for
	13	+from
	14	+has
	15	+have
	16	+his
	17	+her
	18	+how
	19	+in
	20	+is
	21	+it
	22	+of
	23	+on
	24	+or
	25	+out
	26	+that
	27	+the
	28	+this
	29	+to
	30	+was
	31	+what
	32	+when
	33	+where
	34	+which
	35	+who
	36	+will
	37	+with
	38	+you
	39	+your
\ No newline at end of file

Status & tagging log

00:49, 31 October 2009 😂 (talk | contribs) changed the status of r58379 [removed: new added: deferred]