r68117 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r68116 | r68117 | r68118 >
Date:	12:12, 16 June 2010
Author:	daniel
Status:	deferred
Tags:
Comment:	improved phrase detection: allow for overlapping alternative word chunks
Modified paths:	/trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Interwiki.java (modified) (history) /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java (modified) (history) /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java (modified) (history) /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java (modified) (history) /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java (modified) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java (modified) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java (deleted) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java (modified) (history) /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en_int.java (added) (history) /trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
—	—	@@ -3,10 +3,8 @@
4	4	import java.util.ArrayList;
5	5	import java.util.Collection;
6	6	import java.util.Collections;
7		~~-import java.util.HashSet;~~
8	7	import java.util.List;
9	8	import java.util.Map;
10		~~-import java.util.Set;~~
11	9
12	10	import de.brightbyte.io.Output;
13	11	import de.brightbyte.util.PersistenceException;
—	—	@@ -17,46 +15,6 @@
18	16
19	17	public abstract class AbstractDisambiguator<T extends TermReference, C extends WikiWordConcept> implements Disambiguator<T, C> {
20	18
21		~~- public interface NodeListener<T extends TermReference> {~~
22		~~- public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal);~~
23		~~- }~~
24		-
25		~~- public static class SequenceSetBuilder <T extends TermReference> implements NodeListener<T> {~~
26		~~- protected List<List<T>> seqencees;~~
27		-
28		~~- public SequenceSetBuilder() {~~
29		~~- seqencees = new ArrayList<List<T>>();~~
30		~~- }~~
31		-
32		~~- public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) {~~
33		~~- if (terminal) {~~
34		~~- List<T> p = new ArrayList<T>(seqence); //clone~~
35		~~- seqencees.add(p);~~
36		~~- }~~
37		~~- }~~
38		-
39		~~- public List<List<T>> getSequences() {~~
40		~~- return seqencees;~~
41		~~- }~~
42		~~- }~~
43		-
44		~~- public static class TermSetBuilder <T extends TermReference> implements NodeListener<T> {~~
45		~~- protected Set<T> terms;~~
46		-
47		~~- public TermSetBuilder() {~~
48		~~- terms = new HashSet<T>();~~
49		~~- }~~
50		-
51		~~- public void onNode(PhraseNode<? extends T> node, List<? extends T> seqence, boolean terminal) {~~
52		~~- T t = node.getTermReference();~~
53		~~- if (t.getTerm().length()>0) terms.add(t);~~
54		~~- }~~
55		-
56		~~- public Collection<T> getTerms() {~~
57		~~- return terms;~~
58		~~- }~~
59		~~- }~~
60		-
61	19	private MeaningFetcher<C> meaningFetcher;
62	20
63	21	private Output trace;
—	—	@@ -78,18 +36,6 @@
79	37	this.meaningOverrides = overrideMap;
80	38	}
81	39
82		~~- protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) {~~
83		~~- TermSetBuilder<X> builder = new TermSetBuilder<X>();~~
84		~~- walk(root, null, builder, depth);~~
85		~~- return builder.getTerms();~~
86		~~- }~~
87		-
88		~~- protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root, int depth) {~~
89		~~- SequenceSetBuilder<X> builder = new SequenceSetBuilder<X>();~~
90		~~- walk(root, null, builder, depth);~~
91		~~- return builder.getSequences();~~
92		~~- }~~
93		-
94	40	protected <X extends T>PhraseNode<X> getLastNode(PhraseNode<X> root, List<X> sequence) {
95	41	PhraseNode<X> n = findLastNode(root, sequence);
96	42	if (n==null) throw new IllegalArgumentException("sequence does not match node structure: "+sequence);
—	—	@@ -126,31 +72,19 @@
127	73	return root;
128	74	}
129	75
130		~~- protected <X extends T>void walk(PhraseNode<X> root, List<X> seqence, NodeListener<? super X> nodeListener, int depth) {~~
131		~~- if (depth<1) return;~~
132		~~- if (seqence == null) seqence = new ArrayList<X>();~~
133		-
134		~~- X t = root.getTermReference();~~
135		~~- if (t.getTerm().length()>0) seqence.add(t); //push~~
136		~~- else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes.~~
137		-
138		~~- boolean terminal = (depth<=1);~~
139		-
140		~~- Collection<? extends PhraseNode<X>> successors = terminal ? null : root.getSuccessors();~~
141		~~- if (successors==null \|\| successors.isEmpty()) terminal = true;~~
142		-
143		~~- if (nodeListener!=null)~~
144		~~- nodeListener.onNode(root, seqence, terminal);~~
145		-
146		~~- if (!terminal) {~~
147		~~- for (PhraseNode<X> n: successors) {~~
148		~~- walk(n, seqence, nodeListener, depth-1);~~
149		~~- }~~
150		~~- }~~
151		-
152		~~- if (t.getTerm().length()>0) seqence.remove(t); //pop~~
	76	+ protected <X extends T>Collection<X> getTerms(PhraseNode<X> root, int depth) {
	77	+ PhraseNode.TermSetBuilder<X> builder = new PhraseNode.TermSetBuilder<X>();
	78	+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
	79	+ return builder.getTerms();
153	80	}
154	81
	82	+ protected <X extends T>Collection<List<X>> getSequences(PhraseNode<X> root, int depth) {
	83	+ PhraseNode.SequenceSetBuilder<X> builder = new PhraseNode.SequenceSetBuilder<X>();
	84	+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
	85	+ return builder.getSequences();
	86	+ }
	87	+
	88	+
155	89	protected <X extends T>Map<X, List<? extends C>> getMeanings(PhraseNode<X> root) throws PersistenceException {
156	90	Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
157	91	return getMeanings(terms);
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Interwiki.java
—	—	@@ -78,7 +78,9 @@
79	79
80	80	return interwikis;
81	81	} catch (IOException e) {
82		~~- throw new RuntimeException("failed to load interwiki map from "+n);~~
	82	+ throw new RuntimeException("failed to load interwiki map from "+n, e);
	83	+ } catch (IllegalArgumentException e) { //NOTE: malformed \\u-encoding triggers this. wtf? why not a real exception?...
	84	+ throw new RuntimeException("failed to load interwiki map from "+n, e);
83	85	}
84	86
85	87	}
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccurance.java
—	—	@@ -13,10 +13,6 @@
14	14
15	15	public PhraseOccurance(String phrase, int weight, int offset, int length) {
16	16	if (length < 0) throw new IllegalArgumentException("bad length: "+length);
17		~~- if (length > phrase.length()) throw new IllegalArgumentException("length larger than base string");~~
18		~~- //if (length == phrase.length() && offset > 0) throw new IllegalArgumentException("region outside than base string");~~
19		~~- if (length < phrase.length() && offset+length > phrase.length()) throw new IllegalArgumentException("region outside than base string");~~
20		~~- if (length < phrase.length()) phrase = phrase.substring(offset, offset+length);~~
21	17
22	18	this.phrase = phrase;
23	19	this.weight = weight;
—	—	@@ -49,7 +45,7 @@
50	46	}
51	47
52	48	public String toString() {
53		~~- return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";~~
	49	+ return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]#"+weight;
54	50	}
55	51
56	52	@Override
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
—	—	@@ -1,6 +1,5 @@
2	2	package de.brightbyte.wikiword.model;
3	3
4		~~-import java.util.AbstractList;~~
5	4	import java.util.ArrayList;
6	5	import java.util.Collection;
7	6	import java.util.Collections;
—	—	@@ -8,11 +7,49 @@
9	8	import java.util.Iterator;
10	9	import java.util.List;
11	10	import java.util.ListIterator;
12		~~-import java.util.RandomAccess;~~
13	11	import java.util.Set;
	12	+import java.util.regex.Matcher;
14	13
15		~~-public class PhraseOccuranceSet extends AbstractList<PhraseOccurance> implements RandomAccess {~~
	14	+public class PhraseOccuranceSet implements List<PhraseOccurance> {
16	15
	16	+ public static class AggregatePhraseBuilder extends PhraseNode.Walker<PhraseOccurance> {
	17	+ protected Collection<PhraseOccurance> aggregated;
	18	+ protected double minWeight;
	19	+ protected double maxWeight;
	20	+ protected Matcher phraseBreak;
	21	+
	22	+ public AggregatePhraseBuilder( double minWeight, double maxWeight, Matcher phraseBreak ) {
	23	+ aggregated = new HashSet<PhraseOccurance>();
	24	+ this.minWeight = minWeight;
	25	+ this.maxWeight = maxWeight ;
	26	+ this.phraseBreak = phraseBreak;
	27	+ }
	28	+
	29	+ public boolean onNode(PhraseNode<? extends PhraseOccurance> node, List<? extends PhraseOccurance> sequence, double weight, boolean terminal) {
	30	+ if (weight>=minWeight && !sequence.isEmpty()) {
	31	+ PhraseOccurance p = aggregatePhrase( sequence, minWeight, maxWeight );
	32	+ if (p!=null) aggregated.add(p);
	33	+
	34	+ PhraseOccurance last = sequence.get( sequence.size()-1);
	35	+
	36	+ if (phraseBreak!=null) {
	37	+ phraseBreak.reset(last.getTerm());
	38	+ if (phraseBreak.matches())
	39	+ return false; //phrase terminates here, don't dig deeper.
	40	+ }
	41	+
	42	+ if (p==null) return weight <= maxWeight; //XXX: something is wrong here
	43	+ else return p.getWeight() <= maxWeight; //XXX: can we do that?
	44	+ } else {
	45	+ return weight <= maxWeight; //XXX: not sure...
	46	+ }
	47	+ }
	48	+
	49	+ public Collection<PhraseOccurance> getAggregatedPhrases() {
	50	+ return aggregated;
	51	+ }
	52	+ }
	53	+
17	54	protected class Node implements PhraseNode<PhraseOccurance> {
18	55	protected PhraseOccurance phrase;
19	56
—	—	@@ -57,8 +94,6 @@
58	95	return false;
59	96	return true;
60	97	}
61		-
62		-
63	98	}
64	99
65	100	protected List<PhraseOccurance> phrases;
—	—	@@ -71,12 +106,51 @@
72	107	Collections.sort(this.phrases); //essential!
73	108	}
74	109
75		~~- @Override~~
	110	+ private static PhraseOccurance aggregatePhrase(List<? extends PhraseOccurance> sequence, double minWeight, double maxWeight) {
	111	+ if (sequence.isEmpty()) return null;
	112	+
	113	+ int i = 0;
	114	+ while ( i<sequence.size() && sequence.get(i).getWeight() < minWeight ) i++;
	115	+
	116	+ int j = sequence.size()-1;
	117	+ while ( j>i && sequence.get(j).getWeight() < minWeight ) j--;
	118	+
	119	+ if ( j<i ) return null;
	120	+
	121	+ double weight = 0;
	122	+ int ofs = -1;
	123	+ int start = -1;
	124	+ StringBuilder s = new StringBuilder();
	125	+
	126	+ for (int n=i; n<=j; n++) {
	127	+ PhraseOccurance p = sequence.get(n);
	128	+
	129	+ double w = p.getWeight();
	130	+ if (w<0) w = 0;
	131	+ if (weight+w > maxWeight) break;
	132	+
	133	+ if ( start < 0 ) {
	134	+ start = p.getOffset();
	135	+ ofs = p.getOffset();
	136	+ } else {
	137	+ if (p.getOffset()>ofs) s.append(" ");
	138	+ }
	139	+
	140	+ ofs = p.getEndOffset();
	141	+
	142	+ weight += w;
	143	+ s.append(p.getTerm());
	144	+ }
	145	+
	146	+ if (start < 0) return null;
	147	+
	148	+ return new PhraseOccurance(s.toString(), (int)weight, start, ofs - start);
	149	+ }
	150	+
76	151	public PhraseOccurance get(int index) {
77	152	return phrases.get(index);
78	153	}
79	154
80		~~- @Override~~
81	155	public int size() {
82	156	return phrases.size();
83	157	}
—	—	@@ -179,6 +253,15 @@
180	254	return subList(i, j); //NOTE: Phraseoccurrance.compareTo assures that longest phrases come first.
181	255	}
182	256
	257	+ public boolean hasPhrasesAt(int at) {
	258	+ for ( PhraseOccurance p: phrases ) {
	259	+ if ( p.getOffset() == at) return true;
	260	+ else if ( p.getOffset() > at) return false;
	261	+ }
	262	+
	263	+ return false;
	264	+ }
	265	+
183	266	public List<PhraseOccurance> getPhrasesFrom(int offs) {
184	267	int i = 0;
185	268	while (i<size()) {
—	—	@@ -262,7 +345,96 @@
263	346	public <T> T[] toArray(T[] a) {
264	347	return phrases.toArray(a);
265	348	}
	349	+
	350	+ public void prune( double minWeight ) {
	351	+ Iterator<PhraseOccurance> it = phrases.iterator();
	352	+ while (it.hasNext()) {
	353	+ PhraseOccurance t = it.next();
	354	+ if ( t.getWeight() < minWeight ) it.remove();
	355	+ }
	356	+ }
266	357
	358	+ public void buildAggregatePhrases( int start, double minWeight, double maxWeight, Matcher phraseBreak ) {
	359	+ AggregatePhraseBuilder builder = new AggregatePhraseBuilder( minWeight, maxWeight, phraseBreak );
	360	+
	361	+ if (isEmpty()) return;
	362	+ PhraseOccurance last = phrases.get(phrases.size()-1);
	363	+ int end = last.getEndOffset();
	364	+
	365	+ for (int i=start; i<end; i++) {
	366	+ if (hasPhrasesAt(i)) {
	367	+ builder.walk(getRootNodeAt(i), 0, null, Integer.MAX_VALUE, maxWeight);
	368	+ }
	369	+ }
	370	+
	371	+ Collection<PhraseOccurance> phrases = builder.getAggregatedPhrases();
	372	+ addAll( phrases );
	373	+ }
	374	+
	375	+ public String toString() {
	376	+ return phrases.toString();
	377	+ }
	378	+
	379	+ public Collection<PhraseOccurance> getTerms(PhraseNode<PhraseOccurance> root, int depth) {
	380	+ PhraseNode.TermSetBuilder<PhraseOccurance> builder = new PhraseNode.TermSetBuilder<PhraseOccurance>();
	381	+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
	382	+ return builder.getTerms();
	383	+ }
	384	+
	385	+ public Collection<List<PhraseOccurance>> getSequences(PhraseNode<PhraseOccurance> root, int depth) {
	386	+ PhraseNode.SequenceSetBuilder<PhraseOccurance> builder = new PhraseNode.SequenceSetBuilder<PhraseOccurance>();
	387	+ builder.walk(root, 0, null, depth, Double.MAX_VALUE);
	388	+ return builder.getSequences();
	389	+ }
	390	+
	391	+ public void add(int index, PhraseOccurance element) {
	392	+ add(element);
	393	+ }
	394	+
	395	+ public boolean add(PhraseOccurance e) {
	396	+ int i = Collections.binarySearch(phrases, e);
	397	+
	398	+ if (i<0) i = -i-1;
	399	+ else {
	400	+ PhraseOccurance old = get(i);
	401	+ if (old.equals(e)) return false;
	402	+ }
	403	+
	404	+ phrases.add(i, e);
	405	+
	406	+ return true;
	407	+ }
	408	+
	409	+ public boolean addAll(Collection<? extends PhraseOccurance> c) {
	410	+ int count = 0;
	411	+ for (PhraseOccurance p: c) {
	412	+ if ( add(p) ) count++;
	413	+ }
	414	+
	415	+ return count>0;
	416	+ }
	417	+
	418	+ public boolean addAll(int index, Collection<? extends PhraseOccurance> c) {
	419	+ return addAll(c);
	420	+ }
	421	+
	422	+ public int hashCode() {
	423	+ return phrases.hashCode();
	424	+ }
	425	+
	426	+ public int lastIndexOf(Object o) {
	427	+ return phrases.lastIndexOf(o);
	428	+ }
	429	+
	430	+ public PhraseOccurance set(int index, PhraseOccurance element) {
	431	+ throw new UnsupportedOperationException();
	432	+ }
	433	+
	434	+ public PhraseOccuranceSet subList(int fromIndex, int toIndex) {
	435	+ return new PhraseOccuranceSet(text, phrases.subList(fromIndex, toIndex));
	436	+ }
	437	+
	438	+
267	439	/*
268	440	public List<PhraseOccurance> getDisjointPhraseSequence(Filter<String> filter) {
269	441	List<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseNode.java
—	—	@@ -1,9 +1,90 @@
2	2	package de.brightbyte.wikiword.model;
3	3
	4	+import java.util.ArrayList;
4	5	import java.util.Collection;
	6	+import java.util.HashSet;
	7	+import java.util.List;
	8	+import java.util.Set;
5	9
6	10	public interface PhraseNode<T extends TermReference> {
	11	+ public abstract class Walker<T extends TermReference> {
	12	+ public abstract boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal);
	13	+
	14	+ public void walk(PhraseNode<T> root, int depth) {
	15	+ walk(root, 0, null, depth, Double.MAX_VALUE);
	16	+ }
7	17
	18	+ public void walk(PhraseNode<T> root, double baseWeight, List<T> intoSeqence, int depth, double maxWeight) {
	19	+ if (depth<1) return;
	20	+ if (intoSeqence == null) intoSeqence = new ArrayList<T>();
	21	+
	22	+ T t = root.getTermReference();
	23	+ if (t.getTerm().length()>0) intoSeqence.add(t); //push
	24	+ else if (depth<Integer.MAX_VALUE) depth += 1; //XXX: ugly hack for blank root nodes.
	25	+
	26	+ boolean terminal = (depth<=1) \|\| (baseWeight>=maxWeight);
	27	+
	28	+ Collection<? extends PhraseNode<T>> successors = terminal ? null : root.getSuccessors();
	29	+ if (successors==null \|\| successors.isEmpty()) terminal = true;
	30	+
	31	+ double w = root.getTermReference().getWeight();
	32	+ if (w<0) w = 0;
	33	+ if ( !onNode(root, intoSeqence, w, terminal) ) terminal = true;
	34	+
	35	+ //System.out.println( " - walk: "+intoSeqence+" " );
	36	+
	37	+ if (!terminal) {
	38	+ for (PhraseNode<T> n: successors) {
	39	+ w = n.getTermReference().getWeight();
	40	+ if (w<0) w = 0;
	41	+ walk(n, baseWeight + w, intoSeqence, depth-1, maxWeight);
	42	+ }
	43	+ }
	44	+
	45	+ if (t.getTerm().length()>0) intoSeqence.remove(t); //pop
	46	+ }
	47	+ }
	48	+
	49	+ public static class SequenceSetBuilder <T extends TermReference> extends Walker<T> {
	50	+ protected List<List<T>> sequences;
	51	+
	52	+ public SequenceSetBuilder() {
	53	+ sequences = new ArrayList<List<T>>();
	54	+ }
	55	+
	56	+ public boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal) {
	57	+ if (terminal) {
	58	+ List<T> p = new ArrayList<T>(sequence); //clone
	59	+ sequences.add(p);
	60	+ }
	61	+
	62	+ return !terminal;
	63	+ }
	64	+
	65	+ public List<List<T>> getSequences() {
	66	+ return sequences;
	67	+ }
	68	+ }
	69	+
	70	+ public static class TermSetBuilder <T extends TermReference> extends Walker<T> {
	71	+ protected Set<T> terms;
	72	+
	73	+ public TermSetBuilder() {
	74	+ terms = new HashSet<T>();
	75	+ }
	76	+
	77	+ public boolean onNode(PhraseNode<? extends T> node, List<? extends T> sequence, double weight, boolean terminal) {
	78	+ T t = node.getTermReference();
	79	+ if (t.getTerm().length()>0) terms.add(t);
	80	+
	81	+ return !terminal;
	82	+ }
	83	+
	84	+ public Collection<T> getTerms() {
	85	+ return terms;
	86	+ }
	87	+ }
	88	+
8	89	public T getTermReference();
9	90
10	91	public Collection<? extends PhraseNode<T>> getSuccessors();
Index: trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
—	—	@@ -2,14 +2,16 @@
3	3
4	4	import java.io.IOException;
5	5	import java.net.URISyntaxException;
	6	+import java.util.ArrayList;
6	7	import java.util.Arrays;
	8	+import java.util.Collections;
7	9	import java.util.HashSet;
8	10	import java.util.List;
9	11	import java.util.Set;
10	12
11	13	import de.brightbyte.wikiword.Corpus;
12		~~-import de.brightbyte.wikiword.analyzer.LanguageConfiguration;~~
13		~~-import de.brightbyte.wikiword.analyzer.PlainTextAnalyzer;~~
	14	+import de.brightbyte.wikiword.model.PhraseOccurance;
	15	+import de.brightbyte.wikiword.model.PhraseOccuranceSet;
14	16
15	17	/**
16	18	* Unit tests for PlainTextAnalyzer
—	—	@@ -60,8 +62,8 @@
61	63	words = extractWords("foo-bar");
62	64	assertEquals(theList( "foo-bar" ), words);
63	65
64		~~- words = extractWords("harald's 'schaaand");~~
65		~~- assertEquals(theList( "harald's", "'schaaand" ), words);~~
	66	+ words = extractWords("harald's 'schlaaand");
	67	+ assertEquals(theList( "harald's", "'schlaaand" ), words);
66	68
67	69	words = extractWords("23-42");
68	70	assertEquals(theList( "23-42" ), words);
—	—	@@ -69,6 +71,92 @@
70	72	words = extractWords("23foo42");
71	73	assertEquals(theList( "23", "foo", "42" ), words);
72	74	}
	75	+
	76	+ public void testExtractPhrases() {
	77	+ PhraseOccuranceSet phrases = extractPhrases("", 3);
	78	+ assertEquals(0, phrases.size());
	79	+ assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));
	80	+
	81	+ phrases = extractPhrases("foo", 3);
	82	+ assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(0)));
	83	+
	84	+ phrases = extractPhrases(" foo ", 3);
	85	+ assertEquals(theList(), getWordList(phrases.getPhrasesAt(0)));
	86	+ assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesAt(1)));
	87	+ assertEquals(theList( "foo" ), getWordList(phrases.getPhrasesFrom(0)));
	88	+ }
	89	+
	90	+ public void testExtractPhrases2() {
	91	+ PhraseOccuranceSet phrases = extractPhrases("red green blue yellow black", 3);
	92	+ assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
	93	+ assertEquals(theList( "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
	94	+
	95	+ phrases = extractPhrases("red green blue yellow black", 5);
	96	+ assertEquals(theList( "red green blue yellow black", "red green blue yellow", "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
	97	+ assertEquals(theList( "green blue yellow black", "green blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
	98	+
	99	+ phrases = extractPhrases("and red and green and blue and yellow", 3);
	100	+ assertEquals(theList( "and red and green and blue",
	101	+ "and red and green and",
	102	+ "and red and green",
	103	+ "and red and",
	104	+ "and red"
	105	+ ),
	106	+ getWordList(phrases.getPhrasesAt(0)));
	107	+ assertEquals(theList( "red and green and blue",
	108	+ "red and green and",
	109	+ "red and green",
	110	+ "red and",
	111	+ "red"
	112	+ ),
	113	+ getWordList(phrases.getPhrasesAt(4)));
	114	+
	115	+ phrases = extractPhrases("red green blue. yellow black", 5);
	116	+ assertEquals(theList( "red green blue", "red green", "red" ), getWordList(phrases.getPhrasesAt(0)));
	117	+ assertEquals(theList( "blue" ), getWordList(phrases.getPhrasesAt(10)));
	118	+ assertEquals(theList( "yellow black", "yellow" ), getWordList(phrases.getPhrasesAt(16)));
	119	+ }
	120	+
	121	+ public void testExtractPhrases3() {
	122	+ PhraseOccuranceSet phrases = extractPhrases("Krababbel: l'Foo-Bar", 3);
	123	+ assertEquals(theList( "Krababbel"), getWordList(phrases.getPhrasesAt(0)));
	124	+
	125	+ assertEquals(theList( "l'Foo-Bar",
	126	+ "l'Foo"
	127	+ ),
	128	+ getWordList(phrases.getPhrasesAt(11)));
	129	+
	130	+ assertEquals(theList( "Foo-Bar",
	131	+ "Foo"
	132	+ ),
	133	+ getWordList(phrases.getPhrasesAt(13)));
	134	+
	135	+ assertEquals(theList( "Bar"),
	136	+ getWordList(phrases.getPhrasesAt(17)));
	137	+
	138	+ phrases = extractPhrases("harald's 'schlaaand", 3);
	139	+ assertEquals(theList( "harald's 'schlaaand",
	140	+ "harald's",
	141	+ "harald"
	142	+ ),
	143	+ getWordList(phrases.getPhrasesAt(0)));
	144	+
	145	+ assertEquals(theList( "'schlaaand"), getWordList(phrases.getPhrasesAt(9)));
	146	+ assertEquals(theList("schlaaand"), getWordList(phrases.getPhrasesAt(10)));
	147	+ }
	148	+
	149	+ private List<String> getWordList(List<PhraseOccurance> phrases) {
	150	+ if (phrases==null) return Collections.emptyList();
	151	+
	152	+ List<String> words = new ArrayList<String>(phrases.size());
	153	+
	154	+ for (PhraseOccurance phrase: phrases) {
	155	+ String w = phrase.getTerm();
	156	+ words.add(w);
	157	+ }
	158	+
	159	+ return words;
	160	+ }
73	161
74	162	}
75	163
—	—	@@ -86,7 +174,7 @@
87	175	public void setUp() throws URISyntaxException, IOException {
88	176	LanguageConfiguration config = new LanguageConfiguration();
89	177
90		~~- corpus = new Corpus("TEST", "generic", "generic", "generic", "generic", "xx", "generic", null);~~
	178	+ corpus = new Corpus("TEST", "en", "en", "en", "en", "en", "en", null);
91	179	testAnalyzer = new TestPlainTextAnalyzer(corpus);
92	180	testAnalyzer.configure(config, tweaks);
93	181	testAnalyzer.initialize();
—	—	@@ -102,6 +190,18 @@
103	191	testAnalyzer.testExtractWords();
104	192	}
105	193
	194	+ public void testExtractPhrases() {
	195	+ testAnalyzer.testExtractPhrases();
	196	+ }
	197	+
	198	+ public void testExtractPhrases2() {
	199	+ testAnalyzer.testExtractPhrases2();
	200	+ }
	201	+
	202	+ public void testExtractPhrases3() {
	203	+ testAnalyzer.testExtractPhrases3();
	204	+ }
	205	+
106	206	public static void main(String[] args) {
107	207	run(PlainTextAnalyzerTest.class, args);
108	208	}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PhraseAggregator.java
—	—	@@ -1,113 +0,0 @@
2		~~-package de.brightbyte.wikiword.analyzer;~~
3		-
4		~~-import java.util.ArrayList;~~
5		~~-import java.util.Collection;~~
6		~~-import java.util.Iterator;~~
7		~~-import java.util.regex.Matcher;~~
8		-
9		~~-import de.brightbyte.wikiword.model.PhraseOccurance;~~
10		-
11		~~-public class PhraseAggregator {~~
12		~~- public class PhraseBuilder {~~
13		~~- protected StringBuilder phrase;~~
14		~~- protected int weight;~~
15		~~- protected int lastWeight;~~
16		~~- protected int offset;~~
17		-
18		~~- public PhraseBuilder(int offset) {~~
19		~~- this.phrase = new StringBuilder();~~
20		~~- this.weight = 0;~~
21		~~- this.offset = offset;~~
22		~~- }~~
23		-
24		~~- public int getLength() {~~
25		~~- return phrase.length();~~
26		~~- }~~
27		-
28		~~- public int getOffset() {~~
29		~~- return offset;~~
30		~~- }~~
31		-
32		~~- public int getEndOffset() {~~
33		~~- return getOffset() + getLength();~~
34		~~- }~~
35		-
36		~~- public String getPhrase() {~~
37		~~- return phrase.toString();~~
38		~~- }~~
39		-
40		~~- public int getWeight() {~~
41		~~- return weight;~~
42		~~- }~~
43		-
44		~~- public int getLastWeight() {~~
45		~~- return lastWeight;~~
46		~~- }~~
47		-
48		~~- public PhraseOccurance toPhraseOccurance() {~~
49		~~- return new PhraseOccurance(getPhrase(), getWeight(), getOffset(), getLength());~~
50		~~- }~~
51		-
52		~~- public String toString() {~~
53		~~- return "\"" + getPhrase() + "\" @[" + getOffset() + ":" + getEndOffset() + "]";~~
54		~~- }~~
55		-
56		~~- public void push(CharSequence w, int weight) {~~
57		~~- phrase.append(w);~~
58		~~- if (weight>0) this.weight+= weight;~~
59		~~- this.lastWeight = weight;~~
60		~~- }~~
61		~~- }~~
62		-
63		~~- private int offset = 0;~~
64		~~- private int maxWeight = 0;~~
65		-
66		~~- private Matcher phraseBreakeMatcher;~~
67		~~- private ArrayList<PhraseBuilder> phrases = new ArrayList<PhraseBuilder>();~~
68		-
69		~~- public PhraseAggregator(Matcher phraseBreakeMatcher) {~~
70		~~- super();~~
71		~~- this.phraseBreakeMatcher = phraseBreakeMatcher;~~
72		~~- }~~
73		-
74		~~- public void reset(int offset, int maxWeight) {~~
75		~~- this.offset = offset;~~
76		~~- this.maxWeight = maxWeight;~~
77		~~- clear();~~
78		~~- }~~
79		-
80		~~- public void clear() {~~
81		~~- phrases.clear();~~
82		~~- }~~
83		-
84		~~- public void update(int index, CharSequence word, int weight, Collection<PhraseOccurance> into) {~~
85		~~- if (weight<0) {~~
86		~~- phraseBreakeMatcher.reset(word);~~
87		~~- if (phraseBreakeMatcher.matches()) {~~
88		~~- this.clear();~~
89		~~- return;~~
90		~~- }~~
91		~~- }~~
92		-
93		~~- this.push(index, word, weight);~~
94		~~- this.commit(into);~~
95		~~- }~~
96		-
97		~~- public void push(int index, CharSequence word, int weight) {~~
98		~~- if (weight >= 0) phrases.add(new PhraseBuilder(index+offset));~~
99		-
100		~~- Iterator<PhraseBuilder> it = phrases.iterator();~~
101		~~- while (it.hasNext()) {~~
102		~~- PhraseBuilder b = it.next();~~
103		~~- b.push(word, weight);~~
104		~~- if (b.getWeight() > maxWeight) it.remove();~~
105		~~- }~~
106		~~- }~~
107		-
108		~~- public void commit(Collection<PhraseOccurance> into) {~~
109		~~- for (PhraseBuilder b: phrases) {~~
110		~~- if (b.getWeight() > 0 && b.getLastWeight() > 0) into.add(b.toPhraseOccurance());~~
111		~~- }~~
112		~~- }~~
113		-
114		-}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
—	—	@@ -60,10 +60,18 @@
61	61
62	62	/**
63	63	* A pattern matching individual words, for splitting a string into words. This is usually
64		~~- * set to match any sequence of letters but not numbers, whitespace or punctuation.~~
	64	+ * set to match any sequence of letters but not numbers, whitespace or punctuation, except
	65	+ * those that may occur as part of a word, such as an apostrophe or hyphen.
65	66	*/
66	67	public Pattern wordPattern;
67	68
	69	+ /**
	70	+ * A pattern matching individual word parts, for splitting a word into components. This is usually
	71	+ * set to match any sequence of letters but not numbers, whitespace or punctuation, nor
	72	+ * those that may occur as part of a word, such as an apostrophe or hyphen.
	73	+ */
	74	+ public Pattern wordPartPattern;
	75	+
68	76	protected String languageName;
69	77
70	78	/**
—	—	@@ -98,7 +106,8 @@
99	107	}
100	108
101	109	public void defaults() throws IOException {
102		~~- if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}'']+(?:[\\p{Pc}\\p{Pd}][\\p{L}'']+)*\|\\p{Nd}+(?:.\\p{Nd}+)?");~~
	110	+ if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}']+(?:[\\p{Pc}\\p{Pd}][\\p{L}']+)*\|\\p{Nd}+(?:.\\p{Nd}+)?");
	111	+ if (this.wordPartPattern==null) this.wordPartPattern = Pattern.compile("[\\p{L}]+\|\\p{Nd}+");
103	112
104	113	this.sentenceManglers.add( new RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks
105	114	this.sentenceManglers.add( new RegularExpressionMangler("^([^\\p{L}](\\r\\n\|\\r\|\\n))+[^\\p{L}0-9]\\s*", "", 0) ); //strip leading cruft (lines without any characters)
—	—	@@ -110,7 +119,7 @@
111	120	List<String> stop = AuxilliaryWikiProperties.loadList("Stopwords", languageName);
112	121	if (stop!=null) this.stopwords.addAll(stop);
113	122
114		~~- this.phraseBreakerPattern = Pattern.compile("[,;:]\\s\|\"");~~
	123	+ this.phraseBreakerPattern = Pattern.compile("[,;:\".!?]\\s*");
115	124	this.parentacies = new ArrayList<Pair<String, String>>();
116	125	this.parentacies.add( new Pair<String, String>("(", ")") );
117	126	this.parentacies.add( new Pair<String, String>("[", "]") );
—	—	@@ -126,6 +135,7 @@
127	136	if (with.sentenceManglers!=null) sentenceManglers.addAll(with.sentenceManglers);
128	137
129	138	if (with.wordPattern!=null) wordPattern = with.wordPattern;
	139	+ if (with.wordPartPattern!=null) wordPartPattern = with.wordPartPattern;
130	140	if (with.phraseBreakerPattern!=null) phraseBreakerPattern = with.phraseBreakerPattern;
131	141
132	142	if (with.stopwords!=null) stopwords.addAll(with.stopwords);
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
—	—	@@ -13,8 +13,8 @@
14	14
15	15	import de.brightbyte.application.Arguments;
16	16	import de.brightbyte.audit.DebugUtil;
	17	+import de.brightbyte.data.Lookup;
17	18	import de.brightbyte.data.MapLookup;
18		~~-import de.brightbyte.data.Lookup;~~
19	19	import de.brightbyte.data.filter.Filter;
20	20	import de.brightbyte.data.filter.FixedSetFilter;
21	21	import de.brightbyte.io.ConsoleIO;
—	—	@@ -31,6 +31,7 @@
32	32	private Matcher sentenceTailGlueMatcher;
33	33	private Matcher sentenceFollowGlueMatcher;
34	34	private Matcher wordMatcher;
	35	+ private Matcher wordPartMatcher;
35	36
36	37	protected Filter<String> stopwordFilter;
37	38	protected Matcher phraseBreakeMatcher;
—	—	@@ -94,6 +95,7 @@
95	96	sentenceTailGlueMatcher = config.sentenceTailGluePattern.matcher("");
96	97	sentenceFollowGlueMatcher = config.sentenceFollowGluePattern.matcher("");
97	98	wordMatcher = config.wordPattern.matcher("");
	99	+ wordPartMatcher = config.wordPartPattern.matcher("");
98	100
99	101	phraseBreakeMatcher = config.phraseBreakerPattern.matcher("");
100	102	stopwordFilter = new FixedSetFilter<String>(config.stopwords);
—	—	@@ -191,6 +193,7 @@
192	194	return corpus;
193	195	}
194	196
	197	+ /*
195	198	public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) {
196	199	ArrayList<PhraseOccurance> phrases = new ArrayList<PhraseOccurance>();
197	200
—	—	@@ -231,13 +234,121 @@
232	235
233	236	if (stopwordFilter.matches(w)) weight = 0;
234	237	buildPhrasesAggregator.update(wordMatcher.start(), w, weight, into);
	238	+
	239	+ //after adding the word, now register word parts
	240	+ int j = 0;
	241	+ wordPartMatcher.reset(w);
	242	+ while (wordPartMatcher.find()) {
	243	+ if (wordPartMatcher.start() == 0 && wordPartMatcher.end() == w.length()) {
	244	+ break; //full word matched as a single part. no need to register parts.
	245	+ }
	246	+
	247	+ if (j != wordPartMatcher.start()) {
	248	+ CharSequence glue = w.subSequence(j, wordPartMatcher.start());
	249	+ buildPhrasesAggregator.update(i, glue, -1, into);
	250	+ }
	251	+
	252	+ j = wordPartMatcher.end();
	253	+ weight = 1;
	254	+ String p;
	255	+
	256	+ if (wordPartMatcher.groupCount()>0) p = wordPartMatcher.group(1);
	257	+ else p = wordPartMatcher.group(0);
	258	+
	259	+ if (stopwordFilter.matches(p)) weight = 0;
	260	+ buildPhrasesAggregator.update(i+wordPartMatcher.start(), p, weight, into);
	261	+ }
	262	+
	263	+ if (j>0 && j < w.length()) {
	264	+ CharSequence glue = text.subSequence(j, w.length());
	265	+ buildPhrasesAggregator.update(j, glue, -1, into);
	266	+ }
235	267	}
236	268
237	269	if (i < text.length()) {
238	270	CharSequence space = text.subSequence(i, text.length());
239		~~- buildPhrasesAggregator.update(i, space, 0, into);~~
	271	+ buildPhrasesAggregator.update(i, space, -1, into);
240	272	}
	273	+ } */
	274	+
	275	+ public PhraseOccuranceSet extractPhrases(CharSequence text, int maxWeight) {
	276	+ PhraseOccuranceSet phrases = new PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
	277	+
	278	+ text = applyManglers(config.sentenceManglers, text);
	279	+
	280	+ ParsePosition pos = new ParsePosition(0);
	281	+ while (pos.getIndex() < text.length()) {
	282	+ int ofs = pos.getIndex();
	283	+ CharSequence s = extractNextSentence(text, pos, false);
	284	+ if (s==null \|\| s.length()==0) break;
	285	+
	286	+ buildPhrases(s, ofs, phrases, maxWeight);
	287	+ if (phrases.isEmpty()) continue;
	288	+
	289	+ phrases.buildAggregatePhrases(ofs, 0, maxWeight, phraseBreakeMatcher);
	290	+ }
	291	+
	292	+ if (phrases.isEmpty()) return phrases;
	293	+
	294	+ phrases.prune(1);
	295	+ return phrases;
241	296	}
	297	+
	298	+ private void buildPhrases(CharSequence text, int offset, PhraseOccuranceSet into, int maxWeight) {
	299	+ int i = 0;
	300	+ wordMatcher.reset(text);
	301	+ while (wordMatcher.find()) {
	302	+ if (i != wordMatcher.start()) {
	303	+ CharSequence space = text.subSequence(i, wordMatcher.start());
	304	+ into.add( new PhraseOccurance(space.toString(), -1, offset+i, space.length()) );
	305	+ }
	306	+
	307	+ i = wordMatcher.end();
	308	+ String w;
	309	+ int weight = 1;
	310	+
	311	+ if (wordMatcher.groupCount()>0) w = wordMatcher.group(1);
	312	+ else w = wordMatcher.group(0);
	313	+
	314	+ if (stopwordFilter.matches(w)) weight = 0;
	315	+ into.add( new PhraseOccurance(w, weight, offset+wordMatcher.start(), w.length()) );
	316	+
	317	+ //after adding the word, now register word parts
	318	+ int j = 0;
	319	+ int b = wordMatcher.start();
	320	+ wordPartMatcher.reset(w);
	321	+ while (wordPartMatcher.find()) {
	322	+ if (wordPartMatcher.start() == 0 && wordPartMatcher.end() == w.length()) {
	323	+ break; //full word matched as a single part. no need to register parts.
	324	+ }
	325	+
	326	+ if (j != wordPartMatcher.start()) {
	327	+ CharSequence glue = w.subSequence(j, wordPartMatcher.start());
	328	+ into.add( new PhraseOccurance(glue.toString(), -1, offset+b+j, glue.length()) );
	329	+ }
	330	+
	331	+ j = wordPartMatcher.end();
	332	+ weight = 1;
	333	+ String p;
	334	+
	335	+ if (wordPartMatcher.groupCount()>0) p = wordPartMatcher.group(1);
	336	+ else p = wordPartMatcher.group(0);
	337	+
	338	+ if (stopwordFilter.matches(p)) weight = 0;
	339	+ into.add( new PhraseOccurance(p, weight, offset+b+wordPartMatcher.start(), p.length()) );
	340	+ }
	341	+
	342	+ if (j>0 && j < w.length()) {
	343	+ CharSequence glue = text.subSequence(j, w.length());
	344	+ into.add( new PhraseOccurance(glue.toString(), -1, offset+b+j, glue.length()) );
	345	+ }
	346	+ }
	347	+
	348	+ if (i < text.length()) {
	349	+ CharSequence space = text.subSequence(i, text.length());
	350	+ into.add( new PhraseOccurance(space.toString(), -1, offset+i, space.length()) );
	351	+ }
	352	+ }
242	353
243	354	public static void main(String[] argv) throws IOException, InstantiationException {
244	355	Arguments args = new Arguments();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en_int.java
—	—	@@ -0,0 +1,10 @@
	2	+package de.brightbyte.wikiword.wikis;
	3	+
	4	+public class LanguageConfiguration_en_int extends LanguageConfiguration_en {
	5	+
	6	+ //TODO: list of abbreviations
	7	+ public LanguageConfiguration_en_int() {
	8	+ super();
	9	+ }
	10	+
	11	+}

Status & tagging log

13:53, 16 June 2010 MaxSem (talk | contribs) changed the status of r68117 [removed: new added: deferred]