Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java |
— | — | @@ -1,5 +1,6 @@ |
2 | 2 | package de.brightbyte.wikiword.analyzer; |
3 | 3 | |
| 4 | +import java.io.IOException; |
4 | 5 | import java.net.URISyntaxException; |
5 | 6 | import java.util.Arrays; |
6 | 7 | import java.util.HashSet; |
— | — | @@ -24,7 +25,7 @@ |
25 | 26 | protected class TestPlainTextAnalyzer extends PlainTextAnalyzer { |
26 | 27 | //TODO: check coverage! |
27 | 28 | |
28 | | - public TestPlainTextAnalyzer(Corpus corpus) { |
| 29 | + public TestPlainTextAnalyzer(Corpus corpus) throws IOException { |
29 | 30 | super(corpus); |
30 | 31 | } |
31 | 32 | |
— | — | @@ -79,7 +80,7 @@ |
80 | 81 | protected TestPlainTextAnalyzer testAnalyzer; |
81 | 82 | |
82 | 83 | @Override |
83 | | - public void setUp() throws URISyntaxException { |
| 84 | + public void setUp() throws URISyntaxException, IOException { |
84 | 85 | LanguageConfiguration config = new LanguageConfiguration(); |
85 | 86 | |
86 | 87 | corpus = new Corpus("TEST", "generic", "generic", "generic", "generic", "xx", "generic", null); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccurance.java |
— | — | @@ -0,0 +1,98 @@ |
| 2 | +package de.brightbyte.wikiword.analyzer; |
| 3 | + |
| 4 | +import java.io.Serializable; |
| 5 | + |
/**
 * A phrase together with the region of a text it was found in.
 *
 * <p>An occurrence is described by the phrase itself, a weight, the offset at
 * which it starts and its length in characters. Equality and hashing are based
 * on offset, length and phrase; the weight is not taken into account.</p>
 *
 * <p>The natural order sorts by offset first; among occurrences that start at
 * the same offset, the longest one comes first. Note that this order only
 * looks at the region, so {@code compareTo} may return 0 for two occurrences
 * that are not {@code equals} (same region, different phrase).</p>
 */
public class PhraseOccurance implements Serializable, Comparable<PhraseOccurance> {

	private static final long serialVersionUID = 241753475865301115L;

	protected String phrase;
	protected int weight;
	protected int offset;
	protected int length;

	/**
	 * Creates a phrase occurrence.
	 *
	 * <p>If {@code length} equals the length of {@code phrase}, the string is
	 * used as the phrase as it is and {@code offset} is merely recorded.
	 * If {@code length} is smaller, {@code phrase} is treated as the base string
	 * and the phrase is the region {@code [offset, offset+length)} cut out of it.</p>
	 *
	 * @param phrase the phrase, or the base string to cut the phrase from
	 * @param weight the weight of the phrase
	 * @param offset the offset at which the phrase starts
	 * @param length the length of the phrase in characters, must be positive
	 * @throws IllegalArgumentException if {@code length} is not positive, is larger
	 *         than the given string, or if the requested region does not lie
	 *         inside the base string
	 * @throws NullPointerException if {@code phrase} is null
	 */
	public PhraseOccurance(String phrase, int weight, int offset, int length) {
		if (length <= 0) throw new IllegalArgumentException("bad length: "+length);
		if (phrase == null) throw new NullPointerException("phrase must not be null");
		if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string");

		if (length < phrase.length()) {
			// The phrase is a proper region of the base string. The upper bound is
			// checked by subtraction so that offset+length can not overflow, and a
			// negative offset is rejected here instead of failing inside substring().
			if (offset < 0 || offset > phrase.length() - length) {
				throw new IllegalArgumentException("region outside than base string");
			}

			phrase = phrase.substring(offset, offset+length);
		}

		this.phrase = phrase;
		this.weight = weight;
		this.offset = offset;
		this.length = length;
	}

	/** @return the length of the phrase in characters */
	public int getLength() {
		return length;
	}

	/** @return the offset at which the phrase starts */
	public int getOffset() {
		return offset;
	}

	/** @return the offset of the first character after the phrase */
	public int getEndOffset() {
		return getOffset() + getLength();
	}

	/** @return the phrase text */
	public String getPhrase() {
		return phrase;
	}

	/** @return the weight assigned to this phrase */
	public int getWeight() {
		return weight;
	}

	@Override
	public String toString() {
		return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";
	}

	@Override
	public int hashCode() {
		final int PRIME = 31;
		int result = 1;
		result = PRIME * result + length;
		result = PRIME * result + offset;
		result = PRIME * result + ((phrase == null) ? 0 : phrase.hashCode());
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		final PhraseOccurance other = (PhraseOccurance) obj;
		if (length != other.length)
			return false;
		if (offset != other.offset)
			return false;
		if (phrase == null) {
			if (other.phrase != null)
				return false;
		} else if (!phrase.equals(other.phrase))
			return false;
		return true;
	}

	/**
	 * Checks whether this occurrence and the given one share at least one character position.
	 *
	 * @param other the occurrence to compare with
	 * @return true if the two regions intersect, false if they are disjoint or merely adjacent
	 */
	public boolean overlaps(PhraseOccurance other) {
		if (getEndOffset() <= other.getOffset()) return false;
		if (getOffset() >= other.getEndOffset()) return false;

		return true;
	}

	/**
	 * Orders occurrences by offset; for equal offsets, the longest occurrence comes first.
	 *
	 * @param other the occurrence to compare with
	 * @return a negative number, zero, or a positive number if this occurrence
	 *         sorts before, together with, or after the other one
	 */
	public int compareTo(PhraseOccurance other) {
		int byOffset = compareInts(getOffset(), other.getOffset());
		if (byOffset != 0) return byOffset; //by offset...

		// ...but longest first: the larger end offset sorts earlier
		return compareInts(other.getEndOffset(), getEndOffset());
	}

	/** Compares two ints without subtracting them, so the result can not be spoiled by overflow. */
	private static int compareInts(int a, int b) {
		return (a < b) ? -1 : ((a == b) ? 0 : 1);
	}
}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java |
— | — | @@ -1,9 +1,14 @@ |
2 | 2 | package de.brightbyte.wikiword.analyzer; |
3 | 3 | |
| 4 | +import java.io.IOException; |
4 | 5 | import java.util.ArrayList; |
| 6 | +import java.util.Collection; |
| 7 | +import java.util.HashSet; |
5 | 8 | import java.util.List; |
| 9 | +import java.util.Set; |
6 | 10 | import java.util.regex.Pattern; |
7 | 11 | |
| 12 | +import de.brightbyte.data.Pair; |
8 | 13 | import de.brightbyte.wikiword.analyzer.mangler.Mangler; |
9 | 14 | import de.brightbyte.wikiword.analyzer.mangler.RegularExpressionMangler; |
10 | 15 | |
— | — | @@ -60,6 +65,21 @@ |
61 | 66 | public Pattern wordPattern; |
62 | 67 | |
63 | 68 | protected String languageName; |
| 69 | + |
| 70 | + /** |
| 71 | + * List of stopwords, that is, words that are too frequent to be useful for searches. |
| 72 | + */ |
| 73 | + public Set<String> stopwords; |
| 74 | + |
| 75 | + /** |
| 76 | + * Pattern matching symbols that break a phrase, as most punctuation does. |
| 77 | + */ |
| 78 | + public Pattern phraseBreakerPattern; |
| 79 | + |
| 80 | + /** |
| 81 | + * Pairs of matching parentheses (opening and closing symbol). |
| 82 | + */ |
| 83 | + public Collection<Pair<String, String>> parentacies; |
64 | 84 | |
65 | 85 | public LanguageConfiguration() { |
66 | 86 | this(null); |
— | — | @@ -77,7 +97,7 @@ |
78 | 98 | return languageName; |
79 | 99 | } |
80 | 100 | |
81 | | - public void defaults() { |
| 101 | + public void defaults() throws IOException { |
82 | 102 | if (this.wordPattern==null) this.wordPattern = Pattern.compile("\\p{L}+|\\p{Nd}+"); |
83 | 103 | |
84 | 104 | this.sentenceManglers.add( new RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks |
— | — | @@ -85,6 +105,17 @@ |
86 | 106 | this.sentencePattern = Pattern.compile("(\\r\\n|\\n|\\r)|\\.[\\s\\r\\n]"); //TODO: check what happens if we allow single newlines in sentences! Breaking on single newlines causes truncated definitions. |
87 | 107 | this.sentenceTailGluePattern = Pattern.compile("(^|\\s)([VIX]+|\\d{1,2})$"); |
88 | 108 | this.sentenceFollowGluePattern = Pattern.compile("^\\p{Ll}"); |
| 109 | + |
| 110 | + this.stopwords = new HashSet<String>(); |
| 111 | + List<String> stop = AuxilliaryWikiProperties.loadList("Stopwords", languageName); |
| 112 | + if (stop!=null) this.stopwords.addAll(stop); |
| 113 | + |
| 114 | + this.phraseBreakerPattern = Pattern.compile("[,;:]\\s|\""); |
| 115 | + this.parentacies = new ArrayList<Pair<String, String>>(); |
| 116 | + this.parentacies.add( new Pair<String, String>("(", ")") ); |
| 117 | + this.parentacies.add( new Pair<String, String>("[", "]") ); |
| 118 | + this.parentacies.add( new Pair<String, String>("{", "}") ); |
| 119 | + this.parentacies.add( new Pair<String, String>("\"", "\"") ); |
89 | 120 | } |
90 | 121 | |
91 | 122 | public void merge(LanguageConfiguration with) { |
— | — | @@ -92,8 +123,12 @@ |
93 | 124 | if (with.sentenceTailGluePattern!=null) sentenceTailGluePattern = with.sentenceTailGluePattern; |
94 | 125 | if (with.sentenceFollowGluePattern!=null) sentenceFollowGluePattern = with.sentenceFollowGluePattern; |
95 | 126 | |
96 | | - sentenceManglers.addAll(with.sentenceManglers); |
| 127 | + if (with.sentenceManglers!=null) sentenceManglers.addAll(with.sentenceManglers); |
97 | 128 | |
98 | 129 | if (with.wordPattern!=null) wordPattern = with.wordPattern; |
| 130 | + if (with.phraseBreakerPattern!=null) phraseBreakerPattern = with.phraseBreakerPattern; |
| 131 | + |
| 132 | + if (with.stopwords!=null) stopwords.addAll(with.stopwords); |
| 133 | + if (with.parentacies!=null) parentacies.addAll(with.parentacies); |
99 | 134 | } |
100 | 135 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java |
— | — | @@ -1,11 +1,23 @@ |
2 | 2 | package de.brightbyte.wikiword.analyzer; |
3 | 3 | |
| 4 | +import java.io.BufferedReader; |
| 5 | +import java.io.File; |
| 6 | +import java.io.IOException; |
| 7 | +import java.io.InputStreamReader; |
4 | 8 | import java.lang.reflect.Constructor; |
5 | 9 | import java.lang.reflect.InvocationTargetException; |
| 10 | +import java.text.ParsePosition; |
6 | 11 | import java.util.ArrayList; |
7 | 12 | import java.util.List; |
8 | 13 | import java.util.regex.Matcher; |
9 | 14 | |
| 15 | +import de.brightbyte.application.Arguments; |
| 16 | +import de.brightbyte.audit.DebugUtil; |
| 17 | +import de.brightbyte.data.DefaultLookup; |
| 18 | +import de.brightbyte.data.Lookup; |
| 19 | +import de.brightbyte.data.filter.Filter; |
| 20 | +import de.brightbyte.data.filter.FixedSetFilter; |
| 21 | +import de.brightbyte.io.ConsoleIO; |
10 | 22 | import de.brightbyte.wikiword.Corpus; |
11 | 23 | import de.brightbyte.wikiword.TweakSet; |
12 | 24 | |
— | — | @@ -17,9 +29,13 @@ |
18 | 30 | private Matcher sentenceFollowGlueMatcher; |
19 | 31 | private Matcher wordMatcher; |
20 | 32 | |
| 33 | + protected Filter<String> stopwordFilter; |
| 34 | + protected Matcher phraseBreakeMatcher; |
| 35 | + protected Lookup<String, String> bracketLookup; |
| 36 | + |
21 | 37 | private Corpus corpus; |
22 | 38 | |
23 | | - public PlainTextAnalyzer(Corpus corpus) { |
| 39 | + public PlainTextAnalyzer(Corpus corpus) throws IOException { |
24 | 40 | this.corpus = corpus; |
25 | 41 | |
26 | 42 | config = new LanguageConfiguration(corpus.getLanguage()); |
— | — | @@ -75,6 +91,10 @@ |
76 | 92 | sentenceTailGlueMatcher = config.sentenceTailGluePattern.matcher(""); |
77 | 93 | sentenceFollowGlueMatcher = config.sentenceFollowGluePattern.matcher(""); |
78 | 94 | wordMatcher = config.wordPattern.matcher(""); |
| 95 | + |
| 96 | + phraseBreakeMatcher = config.phraseBreakerPattern.matcher(""); |
| 97 | + stopwordFilter = new FixedSetFilter<String>(config.stopwords); |
| 98 | + bracketLookup = new DefaultLookup<String, String>(config.parentacies); |
79 | 99 | } |
80 | 100 | |
81 | 101 | |
— | — | @@ -84,21 +104,35 @@ |
85 | 105 | * @return |
86 | 106 | */ |
87 | 107 | public CharSequence extractFirstSentence(CharSequence text) { |
| 108 | + return extractNextSentence(text, null, true); |
| 109 | + } |
| 110 | + |
| 111 | + public CharSequence extractNextSentence(CharSequence text, ParsePosition position, boolean mangle) { |
88 | 112 | if (text==null || text.length()==0) return ""; |
89 | | - |
90 | | - text = applyManglers(config.sentenceManglers, text); |
| 113 | + |
| 114 | + if (mangle) text = applyManglers(config.sentenceManglers, text); |
91 | 115 | if (text.length()==0) return ""; |
92 | 116 | |
93 | 117 | sentenceMatcher.reset(text); |
94 | 118 | sentenceTailGlueMatcher.reset(text); |
95 | 119 | sentenceFollowGlueMatcher.reset(text); |
96 | 120 | |
| 121 | + int ofs = 0; |
| 122 | + if (position!=null) { |
| 123 | + ofs = position.getIndex(); |
| 124 | + if (ofs>=text.length()) return ""; |
| 125 | + |
| 126 | + sentenceMatcher.region(ofs, text.length()); |
| 127 | + sentenceTailGlueMatcher.region(ofs, text.length()); |
| 128 | + sentenceFollowGlueMatcher.region(ofs, text.length()); |
| 129 | + } |
| 130 | + |
97 | 131 | StringBuilder s = new StringBuilder(); |
98 | | - int pos = 0; |
99 | 132 | boolean add = false; |
100 | 133 | while (sentenceMatcher.find()) { |
101 | | - int start = pos; |
102 | | - pos = sentenceMatcher.end(); |
| 134 | + int start = ofs; |
| 135 | + ofs = sentenceMatcher.end(); |
| 136 | + if (position!=null) position.setIndex(ofs); |
103 | 137 | |
104 | 138 | s.append(text, start, sentenceMatcher.end()); |
105 | 139 | |
— | — | @@ -113,7 +147,7 @@ |
114 | 148 | continue; |
115 | 149 | } |
116 | 150 | |
117 | | - sentenceFollowGlueMatcher.region(pos, text.length()); |
| 151 | + sentenceFollowGlueMatcher.region(ofs, text.length()); |
118 | 152 | if (sentenceFollowGlueMatcher.lookingAt()) { |
119 | 153 | add = true; |
120 | 154 | continue; |
— | — | @@ -124,12 +158,16 @@ |
125 | 159 | } |
126 | 160 | |
127 | 161 | if (add) { |
128 | | - s.append(text.subSequence(pos, text.length())); |
129 | | - pos = text.length(); |
| 162 | + s.append(text.subSequence(ofs, text.length())); |
| 163 | + ofs = text.length(); |
| 164 | + if (position!=null) position.setIndex(ofs); |
130 | 165 | } |
131 | 166 | |
132 | | - if (pos!=0) text = AnalyzerUtils.trim(s); |
133 | | - else AnalyzerUtils.trim(text); |
| 167 | + if (ofs!=0) text = AnalyzerUtils.trim(s); |
| 168 | + else { |
| 169 | + if (position!=null) position.setIndex(text.length()); |
| 170 | + AnalyzerUtils.trim(text); |
| 171 | + } |
134 | 172 | |
135 | 173 | return text; |
136 | 174 | } |
— | — | @@ -150,4 +188,79 @@ |
151 | 189 | return corpus; |
152 | 190 | } |
153 | 191 | |
| 192 | + public PhraseOccuranceSequence extractPhrases(CharSequence text, int maxWeight) { |
| 193 | + ArrayList<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 194 | + |
| 195 | + text = applyManglers(config.sentenceManglers, text); |
| 196 | + |
| 197 | + ParsePosition pos = new ParsePosition(0); |
| 198 | + while (pos.getIndex() < text.length()) { |
| 199 | + int ofs = pos.getIndex(); |
| 200 | + CharSequence s = extractNextSentence(text, pos, false); |
| 201 | + if (s==null || s.length()==0) break; |
| 202 | + |
| 203 | + buildPhrases(s, ofs, phrases, maxWeight); |
| 204 | + } |
| 205 | + |
| 206 | + return new PhraseOccuranceSequence(text.toString(), phrases); |
| 207 | + } |
| 208 | + |
| 209 | + private PhraseAggregator buildPhrasesAggregator = null; |
| 210 | + |
| 211 | + private void buildPhrases(CharSequence text, int offset, ArrayList<PhraseOccurance> into, int maxWeight) { |
| 212 | + if (buildPhrasesAggregator==null) buildPhrasesAggregator = new PhraseAggregator(phraseBreakeMatcher); |
| 213 | + buildPhrasesAggregator.reset(offset, maxWeight); |
| 214 | + |
| 215 | + int i = 0; |
| 216 | + wordMatcher.reset(text); |
| 217 | + while (wordMatcher.find()) { |
| 218 | + if (i != wordMatcher.start()) { |
| 219 | + CharSequence space = text.subSequence(i, wordMatcher.start()); |
| 220 | + buildPhrasesAggregator.update(i, space, -1, into); |
| 221 | + } |
| 222 | + |
| 223 | + i = wordMatcher.end(); |
| 224 | + String w; |
| 225 | + int weight = 1; |
| 226 | + |
| 227 | + if (wordMatcher.groupCount()>0) w = wordMatcher.group(1); |
| 228 | + else w = wordMatcher.group(0); |
| 229 | + |
| 230 | + if (stopwordFilter.matches(w)) weight = 0; |
| 231 | + buildPhrasesAggregator.update(wordMatcher.start(), w, weight, into); |
| 232 | + } |
| 233 | + |
| 234 | + if (i < text.length()) { |
| 235 | + CharSequence space = text.subSequence(i, text.length()); |
| 236 | + buildPhrasesAggregator.update(i, space, 0, into); |
| 237 | + } |
| 238 | + } |
| 239 | + |
| 240 | + public static void main(String[] argv) throws IOException, InstantiationException { |
| 241 | + Arguments args = new Arguments(); |
| 242 | + args.declare("tweaks", null, true, String.class, "tweak file"); |
| 243 | + |
| 244 | + args.parse(argv); |
| 245 | + |
| 246 | + String lang = args.getParameter(0); |
| 247 | + |
| 248 | + TweakSet tweaks = new TweakSet(); |
| 249 | + |
| 250 | + String tf = args.getStringOption("tweaks", null); |
| 251 | + if (tf!=null) tweaks.loadTweaks(new File(tf)); |
| 252 | + |
| 253 | + tweaks.setTweaks(System.getProperties(), "wikiword.tweak."); //XXX: doc |
| 254 | + tweaks.setTweaks(args, "tweak."); //XXX: doc |
| 255 | + |
| 256 | + Corpus corpus = Corpus.forName("TEST", lang, tweaks); |
| 257 | + PlainTextAnalyzer analyzer = PlainTextAnalyzer.getPlainTextAnalyzer(corpus, tweaks); |
| 258 | + analyzer.initialize(); |
| 259 | + |
| 260 | + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); |
| 261 | + String s ; |
| 262 | + while ( (s = in.readLine()) != null ) { |
| 263 | + PhraseOccuranceSequence phrases = analyzer.extractPhrases(s, 6); |
| 264 | + DebugUtil.dump("", phrases, ConsoleIO.output); |
| 265 | + } |
| 266 | + } |
154 | 267 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java |
— | — | @@ -0,0 +1,111 @@ |
| 2 | +package de.brightbyte.wikiword.analyzer; |
| 3 | + |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.Collection; |
| 6 | +import java.util.Iterator; |
| 7 | +import java.util.regex.Matcher; |
| 8 | + |
| 9 | +public class PhraseAggregator { |
| 10 | + public class PhraseBuilder { |
| 11 | + protected StringBuilder phrase; |
| 12 | + protected int weight; |
| 13 | + protected int lastWeight; |
| 14 | + protected int offset; |
| 15 | + |
| 16 | + public PhraseBuilder(int offset) { |
| 17 | + this.phrase = new StringBuilder(); |
| 18 | + this.weight = 0; |
| 19 | + this.offset = offset; |
| 20 | + } |
| 21 | + |
| 22 | + public int getLength() { |
| 23 | + return phrase.length(); |
| 24 | + } |
| 25 | + |
| 26 | + public int getOffset() { |
| 27 | + return offset; |
| 28 | + } |
| 29 | + |
| 30 | + public int getEndOffset() { |
| 31 | + return getOffset() + getLength(); |
| 32 | + } |
| 33 | + |
| 34 | + public String getPhrase() { |
| 35 | + return phrase.toString(); |
| 36 | + } |
| 37 | + |
| 38 | + public int getWeight() { |
| 39 | + return weight; |
| 40 | + } |
| 41 | + |
| 42 | + public int getLastWeight() { |
| 43 | + return lastWeight; |
| 44 | + } |
| 45 | + |
| 46 | + public PhraseOccurance toPhraseOccurance() { |
| 47 | + return new PhraseOccurance(getPhrase(), getWeight(), getOffset(), getLength()); |
| 48 | + } |
| 49 | + |
| 50 | + public String toString() { |
| 51 | + return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]"; |
| 52 | + } |
| 53 | + |
| 54 | + public void push(CharSequence w, int weight) { |
| 55 | + phrase.append(w); |
| 56 | + if (weight>0) this.weight+= weight; |
| 57 | + this.lastWeight = weight; |
| 58 | + } |
| 59 | + } |
| 60 | + |
| 61 | + private int offset = 0; |
| 62 | + private int maxWeight = 0; |
| 63 | + |
| 64 | + private Matcher phraseBreakeMatcher; |
| 65 | + private ArrayList<PhraseBuilder> phrases = new ArrayList<PhraseBuilder>(); |
| 66 | + |
| 67 | + public PhraseAggregator(Matcher phraseBreakeMatcher) { |
| 68 | + super(); |
| 69 | + this.phraseBreakeMatcher = phraseBreakeMatcher; |
| 70 | + } |
| 71 | + |
| 72 | + public void reset(int offset, int maxWeight) { |
| 73 | + this.offset = offset; |
| 74 | + this.maxWeight = maxWeight; |
| 75 | + clear(); |
| 76 | + } |
| 77 | + |
| 78 | + public void clear() { |
| 79 | + phrases.clear(); |
| 80 | + } |
| 81 | + |
| 82 | + public void update(int index, CharSequence word, int weight, Collection<PhraseOccurance> into) { |
| 83 | + if (weight<0) { |
| 84 | + phraseBreakeMatcher.reset(word); |
| 85 | + if (phraseBreakeMatcher.matches()) { |
| 86 | + this.clear(); |
| 87 | + return; |
| 88 | + } |
| 89 | + } |
| 90 | + |
| 91 | + this.push(index, word, weight); |
| 92 | + this.commit(into); |
| 93 | + } |
| 94 | + |
| 95 | + public void push(int index, CharSequence word, int weight) { |
| 96 | + if (weight >= 0) phrases.add(new PhraseBuilder(index+offset)); |
| 97 | + |
| 98 | + Iterator<PhraseBuilder> it = phrases.iterator(); |
| 99 | + while (it.hasNext()) { |
| 100 | + PhraseBuilder b = it.next(); |
| 101 | + b.push(word, weight); |
| 102 | + if (b.getWeight() > maxWeight) it.remove(); |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + public void commit(Collection<PhraseOccurance> into) { |
| 107 | + for (PhraseBuilder b: phrases) { |
| 108 | + if (b.getWeight() > 0 && b.getLastWeight() > 0) into.add(b.toPhraseOccurance()); |
| 109 | + } |
| 110 | + } |
| 111 | + |
| 112 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseOccuranceSequence.java |
— | — | @@ -0,0 +1,82 @@ |
| 2 | +package de.brightbyte.wikiword.analyzer; |
| 3 | + |
| 4 | +import java.util.AbstractList; |
| 5 | +import java.util.ArrayList; |
| 6 | +import java.util.Collections; |
| 7 | +import java.util.List; |
| 8 | +import java.util.RandomAccess; |
| 9 | + |
| 10 | +import de.brightbyte.data.filter.Filter; |
| 11 | + |
| 12 | +public class PhraseOccuranceSequence extends AbstractList<PhraseOccurance> implements RandomAccess { |
| 13 | + |
| 14 | + protected List<PhraseOccurance> phrases; |
| 15 | + protected String text; |
| 16 | + |
| 17 | + public PhraseOccuranceSequence(String text, List<PhraseOccurance> phrases) { |
| 18 | + this.text = text; |
| 19 | + |
| 20 | + this.phrases = phrases; |
| 21 | + Collections.sort(this.phrases); //essential! |
| 22 | + } |
| 23 | + |
| 24 | + @Override |
| 25 | + public PhraseOccurance get(int index) { |
| 26 | + return phrases.get(index); |
| 27 | + } |
| 28 | + |
| 29 | + @Override |
| 30 | + public int size() { |
| 31 | + return phrases.size(); |
| 32 | + } |
| 33 | + |
| 34 | + public String getText() { |
| 35 | + return text; |
| 36 | + } |
| 37 | + |
| 38 | + public List<PhraseOccurance> getPhrasesAt(int offs) { |
| 39 | + int i = 0; |
| 40 | + while (i<size()) { |
| 41 | + PhraseOccurance p = get(i); |
| 42 | + if (p.getOffset() >= offs) { |
| 43 | + offs = p.getOffset(); |
| 44 | + break; |
| 45 | + } |
| 46 | + |
| 47 | + i++; |
| 48 | + } |
| 49 | + |
| 50 | + if (i>=size()) return null; |
| 51 | + |
| 52 | + int j = i; |
| 53 | + while (j<size()) { |
| 54 | + PhraseOccurance p = get(j); |
| 55 | + if (p.getOffset() > offs) break; |
| 56 | + j++; |
| 57 | + } |
| 58 | + |
| 59 | + return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first. |
| 60 | + } |
| 61 | + |
| 62 | + public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) { |
| 63 | + List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>(); |
| 64 | + |
| 65 | + int i = 0; |
| 66 | + |
| 67 | + outer: |
| 68 | + while (i<size()) { |
| 69 | + List<PhraseOccurance> candidates = getPhrasesAt(i); |
| 70 | + if (candidates == null) break; |
| 71 | + |
| 72 | + for (PhraseOccurance p: candidates) { |
| 73 | + i = p.getEndOffset(); |
| 74 | + if (filter.matches(p.getPhrase())) { |
| 75 | + phrases.add(p); |
| 76 | + continue outer; |
| 77 | + } |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + return phrases; |
| 82 | + } |
| 83 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/Stopwords_en.properties |
— | — | @@ -0,0 +1,38 @@ |
| 2 | +I |
| 3 | +a |
| 4 | +about |
| 5 | +an |
| 6 | +and |
| 7 | +are |
| 8 | +as |
| 9 | +at |
| 10 | +be |
| 11 | +by |
| 12 | +for |
| 13 | +from |
| 14 | +has |
| 15 | +have |
| 16 | +his |
| 17 | +her |
| 18 | +how |
| 19 | +in |
| 20 | +is |
| 21 | +it |
| 22 | +of |
| 23 | +on |
| 24 | +or |
| 25 | +out |
| 26 | +that |
| 27 | +the |
| 28 | +this |
| 29 | +to |
| 30 | +was |
| 31 | +what |
| 32 | +when |
| 33 | +where |
| 34 | +which |
| 35 | +who |
| 36 | +will |
| 37 | +with |
| 38 | +you |
| 39 | +your |
\ No newline at end of file |