r51263 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r51262‎ | r51263 | r51264 >
Date:21:38, 31 May 2009
Author:daniel
Status:deferred
Tags:
Comment:
ForeignEntityStoreDescriptor, more cursors
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/TweakSet.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/debug-biography-tweaks.properties (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/debug-lifescience-tweaks.properties (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/InputFileHelper.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/XmlDumpDriver.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/ForeignEntityStoreDescriptor.java (added) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/LoadForeignProperties.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/AssemblingFeatureSetCursor.java (added) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/CollapsingFeatureSetCursor.java (added) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/FeatureSetValueSplitter.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/ResultSetFeatureSetCursor.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/processor/ConceptMappingProcessor.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/AssemblingFeatureSetCursorTest.java (added) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/AssociationTest.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/CollapsingAssociationCursorTest.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/CollapsingFeatureSetCursorTest.java (added) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/CollapsingMatchesCursorTest.java (modified) (history)
  • /trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/FeatureSetsTest.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/TweakSet.java
@@ -21,7 +21,16 @@
2222 */
2323 public class TweakSet {
2424 protected Map<String, Object> parameters = new HashMap<String, Object>();
 25+ protected TweakSet parent;
2526
 27+ public TweakSet() {
 28+ this(null);
 29+ }
 30+
 31+ public TweakSet(TweakSet parent) {
 32+ this.parent = parent;
 33+ }
 34+
2635 public void loadTweaks(File f) throws IOException {
2736 setTweaks( SystemUtils.loadProperties(f, null), null );
2837 }
@@ -86,7 +95,11 @@
8796
8897 @SuppressWarnings("unchecked")
8998 public <T>T getTweak(String key, T def) {
90 - if (!parameters.containsKey(key)) return def;
91 - return (T)parameters.get(key);
 99+ if (!parameters.containsKey(key)) {
 100+ if (parent==null) return def;
 101+ else return parent.getTweak(key, def);
 102+ } else {
 103+ return (T)parameters.get(key);
 104+ }
92105 }
93106 }
Index: trunk/WikiWord/WikiWordBuilder/debug-biography-tweaks.properties
@@ -28,10 +28,10 @@
2929 # unzip will be appended to the command given here. Spaces
3030 # before the last / are taken to be part of the path, spaces
3131 # after the last / separate parameters.
32 -dumpdriver.externalBunzip = null
33 -dumpdriver.externalGunzip = null
34 -#dumpdriver.externalBunzip = "/bin/bunzip2 -c"
35 -#dumpdriver.externalGunzip = "/bin/gunzip -c"
 32+input.externalBunzip = null
 33+input.externalGunzip = null
 34+#input.externalBunzip = "/bin/bunzip2 -c"
 35+#input.externalGunzip = "/bin/gunzip -c"
3636
3737 ### Importer Output and Persistence ############
3838 importer.progressInterval = 1000
Index: trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties
@@ -28,10 +28,10 @@
2929 # unzip will be appended to the command given here. Spaces
3030 # before the last / are taken to be part of the path, spaces
3131 # after the last / separate parameters.
32 -dumpdriver.externalBunzip = null
33 -dumpdriver.externalGunzip = null
34 -#dumpdriver.externalBunzip = "/bin/bunzip2 -c"
35 -#dumpdriver.externalGunzip = "/bin/gunzip -c"
 32+input.externalBunzip = null
 33+input.externalGunzip = null
 34+#input.externalBunzip = "/bin/bunzip2 -c"
 35+#input.externalGunzip = "/bin/gunzip -c"
3636
3737 ### Importer Output and Persistence ############
3838 importer.progressInterval = 1000
Index: trunk/WikiWord/WikiWordBuilder/debug-lifescience-tweaks.properties
@@ -28,10 +28,10 @@
2929 # unzip will be appended to the command given here. Spaces
3030 # before the last / are taken to be part of the path, spaces
3131 # after the last / separate parameters.
32 -dumpdriver.externalBunzip = null
33 -dumpdriver.externalGunzip = null
34 -#dumpdriver.externalBunzip = "/bin/bunzip2 -c"
35 -#dumpdriver.externalGunzip = "/bin/gunzip -c"
 32+input.externalBunzip = null
 33+input.externalGunzip = null
 34+#input.externalBunzip = "/bin/bunzip2 -c"
 35+#input.externalGunzip = "/bin/gunzip -c"
3636
3737 ### Importer Output and Persistence ############
3838 importer.progressInterval = 1000
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/XmlDumpDriver.java
@@ -1,35 +1,27 @@
22 package de.brightbyte.wikiword.processor;
33
4 -import java.io.BufferedInputStream;
5 -import java.io.File;
6 -import java.io.FileInputStream;
74 import java.io.IOException;
85 import java.io.InputStream;
96 import java.io.InterruptedIOException;
107 import java.net.URL;
11 -import java.net.URLConnection;
128 import java.sql.SQLException;
139 import java.util.Iterator;
1410 import java.util.Map;
1511 import java.util.concurrent.TimeUnit;
16 -import java.util.regex.Matcher;
17 -import java.util.regex.Pattern;
18 -import java.util.zip.GZIPInputStream;
1912
20 -import org.apache.commons.compress.bzip2.CBZip2InputStream;
2113 import org.mediawiki.importer.DumpWriter;
2214 import org.mediawiki.importer.Page;
2315 import org.mediawiki.importer.Revision;
2416 import org.mediawiki.importer.Siteinfo;
2517 import org.mediawiki.importer.XmlDumpReader;
2618
27 -import de.brightbyte.io.IOUtil;
2819 import de.brightbyte.io.LeveledOutput;
2920 import de.brightbyte.job.BlockingJobQueue;
3021 import de.brightbyte.util.PersistenceException;
3122 import de.brightbyte.wikiword.Namespace;
3223 import de.brightbyte.wikiword.NamespaceSet;
3324 import de.brightbyte.wikiword.TweakSet;
 25+import de.brightbyte.wikiword.builder.InputFileHelper;
3426
3527 /**
3628 * DumpImportDriver implements ImportDriver for reading content from
@@ -220,8 +212,7 @@
221213 }
222214
223215 private int importQueueCapacity = 0;
224 - private String externalBunzip = null;
225 - private String externalGunzip = null;
 216+ private InputFileHelper inputHelper;
226217
227218 private void init(LeveledOutput log, TweakSet tweaks) {
228219 if (log==null) throw new NullPointerException();
@@ -231,15 +222,17 @@
232223 this.log = log;
233224
234225 importQueueCapacity = tweaks.getTweak("dumpdriver.pageImportQueue", 8);
235 - externalBunzip = tweaks.getTweak("dumpdriver.externalBunzip", null);
236 - externalGunzip = tweaks.getTweak("dumpdriver.externalGunzip", null);
 226+
 227+ inputHelper = new InputFileHelper(
 228+ tweaks.getTweak("dumpdriver.externalGunzip", tweaks.getTweak("input.externalGunzip", (String)null)),
 229+ tweaks.getTweak("dumpdriver.externalBunzip", tweaks.getTweak("input.externalBunzip", (String)null)));
237230 }
238231
239232 public void run(WikiWordPageProcessor importer) throws IOException, SQLException, InterruptedException, PersistenceException {
240233 DumpWriter sink = new Sink(importer, importQueueCapacity);
241234
242235 try {
243 - if (in==null) in = openURL(dump);
 236+ if (in==null) in = inputHelper.openURL(dump);
244237 XmlDumpReader reader = new XmlDumpReader(in, sink);
245238
246239 reader.readDump();
@@ -252,106 +245,4 @@
253246 sink.close(); //NOTE: make sure the executor queue is terminated
254247 }
255248 }
256 -
257 - protected InputStream openURL(URL u) throws IOException {
258 - String p = u.getProtocol();
259 -
260 - if (p.equals("file")) {
261 - File f = new File(u.getPath());
262 - return openFile(f);
263 - }
264 - else {
265 - URLConnection con = u.openConnection();
266 - String mime = con.getContentType();
267 - mime = mime.replaceAll(";.*$", "");
268 - InputStream in = con.getInputStream();
269 -
270 - if (mime.equals("application/x-gzip")) {
271 - return new GZIPInputStream(in); //FIXME: somehow, this doesn't seem to work. or was the external gunzipper the problem? check this!
272 - }
273 - else if (mime.equals("application/x-bzip2")) {
274 - validateBZ2(in);
275 - return new CBZip2InputStream(in);
276 - }
277 - else if (mime.equals("application/xml")) {
278 - return in;
279 - }
280 -
281 - in.close();
282 - throw new IOException("MIME type not suitable for a wiki dump: "+mime);
283 - }
284 - }
285 -
286 - protected InputStream openFile(File file) throws IOException {
287 - String f = file.getAbsolutePath();
288 -
289 - if (f.equals("-"))
290 - return new BufferedInputStream(System.in);
291 -
292 - InputStream in = new BufferedInputStream(new FileInputStream(file));
293 - if (f.endsWith(".gz")) {
294 - if (externalGunzip!=null) return openProc(externalGunzip, file);
295 - else return new GZIPInputStream(in);
296 - }
297 - else if (f.endsWith(".bz2")) {
298 - if (externalBunzip!=null) {
299 - return openProc(externalBunzip, file);
300 - }
301 - else {
302 - validateBZ2(in);
303 - return new CBZip2InputStream(in);
304 - }
305 - }
306 - else
307 - return in;
308 - }
309 -
310 - protected static void validateBZ2(InputStream in) throws IOException {
311 - int first = in.read();
312 - int second = in.read();
313 - if (first != 'B' || second != 'Z')
314 - throw new IOException("Didn't find BZ file signature");
315 - }
316 -
317 - protected static final Pattern commandParamPattern = Pattern.compile("^(.*) +([^/\\\\]+)$");
318 -
319 - public static InputStream openProc(String command, File f) throws IOException {
320 - String[] cmd;
321 -
322 - Matcher m = commandParamPattern.matcher(command);
323 - if (m.matches()) {
324 - String[] p = m.group(2).trim().split("\\s+");
325 -
326 - cmd = new String[p.length+2];
327 - cmd[0] = m.group(1).trim();
328 - System.arraycopy(p, 0, cmd, 1, p.length);
329 -
330 - cmd[cmd.length-1] = f.getAbsolutePath();
331 - }
332 - else {
333 - cmd = new String[] {
334 - command,
335 - f.getAbsolutePath()
336 - };
337 - }
338 -
339 - Process proc = Runtime.getRuntime().exec(cmd);
340 - final InputStream err = proc.getErrorStream();
341 -
342 - //HACK!
343 - Thread slurper = new Thread("stderr slurper for "+proc) {
344 - @Override
345 - public void run() {
346 - try {
347 - IOUtil.pump(err, System.err);
348 - } catch (IOException e) {
349 - e.printStackTrace(System.err);
350 - }
351 - }
352 - };
353 -
354 - slurper.start();
355 -
356 - return new BufferedInputStream(proc.getInputStream());
357 - }
358249 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/InputFileHelper.java
@@ -0,0 +1,160 @@
 2+package de.brightbyte.wikiword.builder;
 3+
 4+import java.io.BufferedInputStream;
 5+import java.io.File;
 6+import java.io.FileInputStream;
 7+import java.io.IOException;
 8+import java.io.InputStream;
 9+import java.net.MalformedURLException;
 10+import java.net.URL;
 11+import java.net.URLConnection;
 12+import java.util.regex.Matcher;
 13+import java.util.regex.Pattern;
 14+import java.util.zip.GZIPInputStream;
 15+
 16+import org.apache.commons.compress.bzip2.CBZip2InputStream;
 17+
 18+import de.brightbyte.io.IOUtil;
 19+import de.brightbyte.wikiword.TweakSet;
 20+
 21+public class InputFileHelper {
 22+
 23+ private String externalBunzip = null;
 24+ private String externalGunzip = null;
 25+
 26+ public InputFileHelper(TweakSet tweaks) {
 27+ this( tweaks.getTweak("input.externalGunzip", (String)null),
 28+ tweaks.getTweak("input.externalBunzip", (String)null) );
 29+ }
 30+
 31+ public InputFileHelper(String gz, String bz2) {
 32+ externalGunzip = gz;
 33+ externalBunzip = bz2;
 34+ }
 35+
 36+ protected static final Pattern extensionPattern = Pattern.compile("\\.([^./\\]+)(\\.gz|\\.bz2)$", Pattern.CASE_INSENSITIVE);
 37+
 38+ public String getFormat(String n) {
 39+ Matcher m = extensionPattern.matcher(n);
 40+
 41+ if (!m.find()) return null;
 42+ else return m.group(1).toLowerCase();
 43+ }
 44+
 45+ public InputStream open(String n) throws IOException {
 46+ if (n.equals("-")) return new BufferedInputStream(System.in);
 47+
 48+ try {
 49+ URL u = new URL(n);
 50+ return openURL(u);
 51+ } catch (MalformedURLException e) {
 52+ //ignore and continue
 53+ }
 54+
 55+ File f = new File(n);
 56+ return openFile(f);
 57+ }
 58+
 59+ public InputStream openURL(URL u) throws IOException {
 60+ String p = u.getProtocol();
 61+
 62+ if (p.equals("file")) {
 63+ File f = new File(u.getPath());
 64+ return openFile(f);
 65+ }
 66+ else {
 67+ URLConnection con = u.openConnection();
 68+ String mime = con.getContentType();
 69+ mime = mime.replaceAll(";.*$", "");
 70+ InputStream in = con.getInputStream();
 71+
 72+ if (mime.equals("application/x-gzip")) {
 73+ return new GZIPInputStream(in); //FIXME: somehow, this doesn't seem to work. or was the external gunzipper the problem? check this!
 74+ }
 75+ else if (mime.equals("application/x-bzip2")) {
 76+ validateBZ2(in);
 77+ return new CBZip2InputStream(in);
 78+ }
 79+ else if (mime.equals("application/xml")) {
 80+ return in;
 81+ }
 82+
 83+ in.close();
 84+ throw new IOException("MIME type not suitable for a wiki dump: "+mime);
 85+ }
 86+ }
 87+
 88+ public InputStream openFile(File file) throws IOException {
 89+ String f = file.getAbsolutePath();
 90+
 91+ if (f.equals("-"))
 92+ return new BufferedInputStream(System.in);
 93+
 94+ InputStream in = new BufferedInputStream(new FileInputStream(file));
 95+ if (f.endsWith(".gz")) {
 96+ if (externalGunzip!=null) return openProc(externalGunzip, file);
 97+ else return new GZIPInputStream(in);
 98+ }
 99+ else if (f.endsWith(".bz2")) {
 100+ if (externalBunzip!=null) {
 101+ return openProc(externalBunzip, file);
 102+ }
 103+ else {
 104+ validateBZ2(in);
 105+ return new CBZip2InputStream(in);
 106+ }
 107+ }
 108+ else
 109+ return in;
 110+ }
 111+
 112+ protected static void validateBZ2(InputStream in) throws IOException {
 113+ int first = in.read();
 114+ int second = in.read();
 115+ if (first != 'B' || second != 'Z')
 116+ throw new IOException("Didn't find BZ file signature");
 117+ }
 118+
 119+ protected static final Pattern commandParamPattern = Pattern.compile("^(.*) +([^/\\\\]+)$");
 120+
 121+ public static InputStream openProc(String command, File f) throws IOException {
 122+ String[] cmd;
 123+
 124+ Matcher m = commandParamPattern.matcher(command);
 125+ if (m.matches()) {
 126+ String[] p = m.group(2).trim().split("\\s+");
 127+
 128+ cmd = new String[p.length+2];
 129+ cmd[0] = m.group(1).trim();
 130+ System.arraycopy(p, 0, cmd, 1, p.length);
 131+
 132+ cmd[cmd.length-1] = f.getAbsolutePath();
 133+ }
 134+ else {
 135+ cmd = new String[] {
 136+ command,
 137+ f.getAbsolutePath()
 138+ };
 139+ }
 140+
 141+ Process proc = Runtime.getRuntime().exec(cmd);
 142+ final InputStream err = proc.getErrorStream();
 143+
 144+ //HACK!
 145+ Thread slurper = new Thread("stderr slurper for "+proc) {
 146+ @Override
 147+ public void run() {
 148+ try {
 149+ IOUtil.pump(err, System.err);
 150+ } catch (IOException e) {
 151+ e.printStackTrace(System.err);
 152+ }
 153+ }
 154+ };
 155+
 156+ slurper.start();
 157+
 158+ return new BufferedInputStream(proc.getInputStream());
 159+ }
 160+
 161+}
Index: trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/AssemblingFeatureSetCursorTest.java
@@ -0,0 +1,65 @@
 2+package de.brightbyte.wikiword.integrator.data;
 3+
 4+import java.util.ArrayList;
 5+import java.util.Arrays;
 6+import java.util.Collection;
 7+import java.util.List;
 8+
 9+import junit.framework.TestCase;
 10+import de.brightbyte.data.cursor.DataCursor;
 11+import de.brightbyte.data.cursor.IteratorCursor;
 12+import de.brightbyte.util.PersistenceException;
 13+
 14+public class AssemblingFeatureSetCursorTest extends TestCase {
 15+
 16+ private static <T> Collection<T> slurp(DataCursor<T> cursor) throws PersistenceException {
 17+ ArrayList<T> list = new ArrayList<T>();
 18+ T obj;
 19+ while ((obj = cursor.next()) != null) list.add(obj);
 20+ return list;
 21+ }
 22+
 23+ public void testNext() throws PersistenceException {
 24+ FeatureSet a = new DefaultFeatureSet("name");
 25+ a.put("id", 1);
 26+ a.put("property", "name");
 27+ a.put("value", "A");
 28+ a.put("value", "a");
 29+ a.put("xyzzy", "bla");
 30+
 31+ FeatureSet b = new DefaultFeatureSet("name");
 32+ b.put("id", 1);
 33+ b.put("property", "foo");
 34+ b.put("value", "X");
 35+ b.put("value", "Y");
 36+
 37+ FeatureSet x = new DefaultFeatureSet("name");
 38+ x.put("id", 2);
 39+ x.put("property", "name");
 40+ x.put("property", "alias");
 41+ x.put("value", "Foo");
 42+
 43+ //--------------------------------------
 44+
 45+ FeatureSet one = new DefaultFeatureSet();
 46+ one.put("id", 1);
 47+ one.put("name", "A");
 48+ one.put("name", "a");
 49+ one.put("foo", "X");
 50+ one.put("foo", "Y");
 51+
 52+ FeatureSet two = new DefaultFeatureSet();
 53+ two.put("id", 2);
 54+ two.put("name", "Foo");
 55+ two.put("alias", "Foo");
 56+
 57+ List<FeatureSet> exp= Arrays.asList(new FeatureSet[] {one, two});
 58+ List<FeatureSet> source= Arrays.asList(new FeatureSet[] {a, b, x});
 59+
 60+ DataCursor<FeatureSet> sourceCursor = new IteratorCursor<FeatureSet>(source.iterator());
 61+ DataCursor<FeatureSet> cursor = new AssemblingFeatureSetCursor(sourceCursor, "id", "property", "value");
 62+
 63+ assertEquals(exp, slurp(cursor));
 64+ }
 65+
 66+}
Property changes on: trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/AssemblingFeatureSetCursorTest.java
___________________________________________________________________
Added: svn:mergeinfo
Index: trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/FeatureSetsTest.java
@@ -2,14 +2,10 @@
33
44 import java.util.ArrayList;
55
 6+import junit.framework.TestCase;
67 import de.brightbyte.data.LabeledVector;
78 import de.brightbyte.data.MapLabeledVector;
8 -import de.brightbyte.wikiword.integrator.data.DefaultFeatureSet;
9 -import de.brightbyte.wikiword.integrator.data.FeatureSet;
10 -import de.brightbyte.wikiword.integrator.data.FeatureSets;
119
12 -import junit.framework.TestCase;
13 -
1410 public class FeatureSetsTest extends TestCase {
1511
1612 public void testMerge() {
Index: trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/AssociationTest.java
@@ -1,8 +1,5 @@
22 package de.brightbyte.wikiword.integrator.data;
33
4 -import de.brightbyte.wikiword.integrator.data.Association;
5 -import de.brightbyte.wikiword.integrator.data.DefaultFeatureSet;
6 -import de.brightbyte.wikiword.integrator.data.FeatureSet;
74 import junit.framework.TestCase;
85
96 public class AssociationTest extends TestCase {
Index: trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/CollapsingMatchesCursorTest.java
@@ -3,15 +3,10 @@
44 import java.util.ArrayList;
55 import java.util.Collection;
66
 7+import junit.framework.TestCase;
78 import de.brightbyte.data.cursor.DataCursor;
89 import de.brightbyte.data.cursor.IteratorCursor;
910 import de.brightbyte.util.PersistenceException;
10 -import de.brightbyte.wikiword.integrator.data.Association;
11 -import de.brightbyte.wikiword.integrator.data.CollapsingMatchesCursor;
12 -import de.brightbyte.wikiword.integrator.data.DefaultFeatureSet;
13 -import de.brightbyte.wikiword.integrator.data.FeatureSet;
14 -import de.brightbyte.wikiword.integrator.data.FeatureSets;
15 -import junit.framework.TestCase;
1611
1712 public class CollapsingMatchesCursorTest extends TestCase {
1813
Index: trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/CollapsingFeatureSetCursorTest.java
@@ -0,0 +1,84 @@
 2+package de.brightbyte.wikiword.integrator.data;
 3+
 4+import java.util.ArrayList;
 5+import java.util.Collection;
 6+
 7+import junit.framework.TestCase;
 8+import de.brightbyte.data.cursor.DataCursor;
 9+import de.brightbyte.data.cursor.IteratorCursor;
 10+import de.brightbyte.util.PersistenceException;
 11+
 12+public class CollapsingFeatureSetCursorTest extends TestCase {
 13+
 14+ private static <T> Collection<T> slurp(DataCursor<T> cursor) throws PersistenceException {
 15+ ArrayList<T> list = new ArrayList<T>();
 16+ T obj;
 17+ while ((obj = cursor.next()) != null) list.add(obj);
 18+ return list;
 19+ }
 20+
 21+ public void testNext() throws PersistenceException {
 22+ FeatureSet a = new DefaultFeatureSet("id");
 23+ a.put("id", 1);
 24+ a.put("foo", "A");
 25+
 26+ FeatureSet b = new DefaultFeatureSet("id");
 27+ b.put("id", 1);
 28+ b.put("foo", "B");
 29+
 30+ FeatureSet x = new DefaultFeatureSet("id");
 31+ x.put("id", 2);
 32+ x.put("foo", "X");
 33+
 34+ FeatureSet y = new DefaultFeatureSet("id");
 35+ y.put("id", 2);
 36+ y.put("foo", "Y");
 37+
 38+ FeatureSet p = new DefaultFeatureSet("id");
 39+ p.put("id", 3);
 40+ p.put("foo", "P");
 41+
 42+ FeatureSet q = new DefaultFeatureSet("id");
 43+ q.put("id", 3);
 44+ q.put("foo", "Q");
 45+
 46+ //--------------------------------------
 47+ FeatureSet ab = new DefaultFeatureSet("id");
 48+ ab.put("id", 1);
 49+ ab.put("id", 1);
 50+ ab.put("foo", "A");
 51+ ab.put("foo", "B");
 52+
 53+ FeatureSet xy = new DefaultFeatureSet("id");
 54+ xy.put("id", 2);
 55+ xy.put("id", 2);
 56+ xy.put("foo", "X");
 57+ xy.put("foo", "Y");
 58+
 59+ FeatureSet pq = new DefaultFeatureSet("id");
 60+ pq.put("id", 3);
 61+ pq.put("id", 3);
 62+ pq.put("foo", "P");
 63+ pq.put("foo", "Q");
 64+
 65+ //--------------------------------------
 66+ ArrayList<FeatureSet> source = new ArrayList<FeatureSet>();
 67+ source.add(a);
 68+ source.add(b);
 69+ source.add(x);
 70+ source.add(y);
 71+ source.add(p);
 72+ source.add(q);
 73+
 74+ ArrayList<FeatureSet> exp = new ArrayList<FeatureSet>();
 75+ exp.add(ab);
 76+ exp.add(xy);
 77+ exp.add(pq);
 78+
 79+ DataCursor<FeatureSet> sourceCursor = new IteratorCursor<FeatureSet>(source.iterator());
 80+ DataCursor<FeatureSet> cursor = new CollapsingFeatureSetCursor(sourceCursor, "id");
 81+
 82+ assertEquals(exp, slurp(cursor));
 83+ }
 84+
 85+}
Index: trunk/WikiWord/WikiWordIntegrator/src/test/java/de/brightbyte/wikiword/integrator/data/CollapsingAssociationCursorTest.java
@@ -3,16 +3,11 @@
44 import java.util.ArrayList;
55 import java.util.Collection;
66
 7+import junit.framework.TestCase;
78 import de.brightbyte.data.cursor.DataCursor;
89 import de.brightbyte.data.cursor.IteratorCursor;
910 import de.brightbyte.util.PersistenceException;
10 -import de.brightbyte.wikiword.integrator.data.Association;
11 -import de.brightbyte.wikiword.integrator.data.CollapsingAssociationCursor;
12 -import de.brightbyte.wikiword.integrator.data.DefaultFeatureSet;
13 -import de.brightbyte.wikiword.integrator.data.FeatureSet;
1411
15 -import junit.framework.TestCase;
16 -
1712 public class CollapsingAssociationCursorTest extends TestCase {
1813
1914 private static <T> Collection<T> slurp(DataCursor<T> cursor) throws PersistenceException {
Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/processor/ConceptMappingProcessor.java
@@ -3,7 +3,6 @@
44 import de.brightbyte.data.cursor.DataCursor;
55 import de.brightbyte.util.PersistenceException;
66 import de.brightbyte.wikiword.integrator.data.MappingCandidates;
7 -import de.brightbyte.wikiword.integrator.store.MappingFeatureStoreBuilder;
87
98 public interface ConceptMappingProcessor {
109 public void processMappings(DataCursor<MappingCandidates> cursor) throws PersistenceException;
Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/ForeignEntityStoreDescriptor.java
@@ -0,0 +1,62 @@
 2+package de.brightbyte.wikiword.integrator;
 3+
 4+import java.util.List;
 5+import java.util.Map;
 6+
 7+import de.brightbyte.wikiword.TweakSet;
 8+
 9+public class ForeignEntityStoreDescriptor extends TweakSet {
 10+
 11+ public ForeignEntityStoreDescriptor() {
 12+ super();
 13+ }
 14+
 15+ public ForeignEntityStoreDescriptor(TweakSet parent) {
 16+ super(parent);
 17+ }
 18+
 19+ public String getDataEncoding() {
 20+ return getTweak("foreign.encoding", "UTF-8");
 21+ }
 22+
 23+ public String getSqlQuery() {
 24+ return getTweak("foreign.query", null);
 25+ }
 26+
 27+ public String getSourceFileName() {
 28+ return getTweak("foreign.file", null);
 29+ }
 30+
 31+ public String[] getDataFields() {
 32+ List<String> v = getTweak("foreign.field", (List<String>)null);
 33+ if (v==null) return null;
 34+ return (String[]) v.toArray(new String[v.size()]);
 35+ }
 36+
 37+ public Map<String, String> getSplitExpressions() {
 38+ return getTweak("split", (Map<String, String>)null);
 39+ }
 40+
 41+ public String getPropertyValueField() {
 42+ return getTweak("foreign.property-value-field", null);
 43+ }
 44+
 45+ public String getPropertyNameField() {
 46+ return getTweak("foreign.property-name-field", "value");
 47+ }
 48+
 49+ public String getConceptIdField() {
 50+ return getTweak("foreign.concept-id-field", "id");
 51+ }
 52+
 53+ public String getConceptNameField() {
 54+ return getTweak("foreign.concept-name-field", "name");
 55+ }
 56+
 57+ public String getAuthorityName() {
 58+ String name = getTweak("foreign.authority-name", null);
 59+ if (name==null) throw new RuntimeException("authority name not specified!");
 60+ return name;
 61+ }
 62+
 63+}
Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/LoadForeignProperties.java
@@ -1,20 +1,34 @@
22 package de.brightbyte.wikiword.integrator;
33
44 import java.io.IOException;
 5+import java.io.InputStream;
 6+import java.sql.Connection;
 7+import java.sql.ResultSet;
 8+import java.sql.SQLException;
 9+import java.util.Arrays;
 10+import java.util.Collection;
 11+import java.util.Map;
 12+import java.util.regex.Pattern;
513
 14+import de.brightbyte.data.Functor;
615 import de.brightbyte.data.cursor.DataCursor;
 16+import de.brightbyte.db.SqlScriptRunner;
 17+import de.brightbyte.io.IOUtil;
718 import de.brightbyte.util.PersistenceException;
819 import de.brightbyte.wikiword.StoreBackedApp;
9 -import de.brightbyte.wikiword.builder.ImportApp;
 20+import de.brightbyte.wikiword.builder.InputFileHelper;
 21+import de.brightbyte.wikiword.integrator.data.AssemblingFeatureSetCursor;
 22+import de.brightbyte.wikiword.integrator.data.FeatureSet;
 23+import de.brightbyte.wikiword.integrator.data.FeatureSetValueSplitter;
1024 import de.brightbyte.wikiword.integrator.data.ForeignEntity;
 25+import de.brightbyte.wikiword.integrator.data.ForeignEntityCursor;
 26+import de.brightbyte.wikiword.integrator.data.MangelingFeatureSetCursor;
 27+import de.brightbyte.wikiword.integrator.data.ResultSetFeatureSetCursor;
 28+import de.brightbyte.wikiword.integrator.data.TsvFeatureSetCursor;
1129 import de.brightbyte.wikiword.integrator.processor.ForeignPropertyProcessor;
1230 import de.brightbyte.wikiword.integrator.store.DatabaseForeignPropertyStoreBuilder;
1331 import de.brightbyte.wikiword.integrator.store.ForeignPropertyStoreBuilder;
14 -import de.brightbyte.wikiword.model.WikiWordConcept;
1532 import de.brightbyte.wikiword.store.WikiWordStoreFactory;
16 -import de.brightbyte.wikiword.store.builder.ConceptInfoStoreBuilder;
17 -import de.brightbyte.wikiword.store.builder.DatabaseConceptStoreBuilders;
18 -import de.brightbyte.wikiword.store.builder.WikiWordConceptStoreBuilder;
1933
2034 /**
2135 * This is the primary entry point to the first phase of a WikiWord analysis.
@@ -25,6 +39,7 @@
2640
2741 protected ForeignPropertyStoreBuilder propertyStore;
2842 protected ForeignPropertyProcessor propertyProcessor;
 43+ protected InputFileHelper inputHelper;
2944
3045 public LoadForeignProperties() {
3146 super(true, true);
@@ -35,10 +50,14 @@
3651 return new DatabaseForeignPropertyStoreBuilder.Factory(getTargetTableName(), getConfiguredDataset(), getConfiguredDataSource(), tweaks);
3752 }
3853
39 - private String getTargetTableName() {
40 - return args.getParameterCount() > 3 ? args.getParameter(3) : "foreign_property";
 54+ protected String getTargetTableName() {
 55+ return args.getParameterCount() > 2 ? args.getParameter(2) : "foreign_property";
4156 }
4257
 58+ protected String getSourceDescriptionFileName() {
 59+ return args.getParameter(1);
 60+ }
 61+
4362 @Override
4463 protected void declareOptions() {
4564 super.declareOptions();
@@ -59,11 +78,76 @@
6079 cursor.close();
6180 }
6281
63 - protected DataCursor<ForeignEntity> openPropertySource() {
64 - // TODO Auto-generated method stub
65 - return null;
 82+ protected DataCursor<ForeignEntity> openPropertySource() throws IOException, SQLException, PersistenceException {
 83+ ForeignEntityStoreDescriptor sourceDescriptor = loadSourceDescriptor();
 84+
 85+ String enc = sourceDescriptor.getDataEncoding();
 86+ String sql = sourceDescriptor.getSqlQuery();
 87+ InputStream in = null;
 88+
 89+ if (sql==null) {
 90+ String n = sourceDescriptor.getSourceFileName();
 91+ String format = inputHelper.getFormat(n);
 92+ in = inputHelper.open(n);
 93+
 94+ if (format!=null && format.equals("sql")) {
 95+ sql = IOUtil.slurp(in, enc);
 96+
 97+ in.close();
 98+ in = null;
 99+ }
 100+ }
 101+
 102+ DataCursor<FeatureSet> fsc;
 103+ String[] fields = sourceDescriptor.getDataFields();
 104+
 105+ if (sql!=null) {
 106+ Collection<Functor<String, String>> manglers = Arrays.asList(getSqlScriptManglers());
 107+ Connection con = getConfiguredDataSource().getConnection();
 108+ ResultSet rs = SqlScriptRunner.runQuery(con, sql, manglers);
 109+
 110+ fsc = new ResultSetFeatureSetCursor(rs, fields);
 111+ } else {
 112+ fsc = new TsvFeatureSetCursor(in, enc);
 113+
 114+ if (fields!=null) ((TsvFeatureSetCursor)fsc).setFields(fields);
 115+ else ((TsvFeatureSetCursor)fsc).readFields();
 116+ }
 117+
 118+ String propField = sourceDescriptor.getPropertyNameField();
 119+ if (propField!=null) {
 120+ String valueField = sourceDescriptor.getPropertyValueField();
 121+ String idField = sourceDescriptor.getConceptIdField();
 122+ fsc = new AssemblingFeatureSetCursor(fsc, idField, propField, valueField);
 123+ }
 124+
 125+ Map<String, String> splitExp = sourceDescriptor.getSplitExpressions();
 126+ if (splitExp!=null) {
 127+ fsc = new MangelingFeatureSetCursor(fsc, FeatureSetValueSplitter.multiFromStringMap(splitExp, 0));
 128+ }
 129+
 130+ return new ForeignEntityCursor(fsc, sourceDescriptor.getAuthorityName(), sourceDescriptor.getConceptIdField(), sourceDescriptor.getConceptNameField());
66131 }
67132
 133+ protected ForeignEntityStoreDescriptor loadSourceDescriptor() throws IOException {
 134+ ForeignEntityStoreDescriptor descriptor = new ForeignEntityStoreDescriptor();
 135+
 136+ String n = getSourceDescriptionFileName();
 137+ InputStream in = inputHelper.open(n);
 138+ descriptor.loadTweaks(in);
 139+ in.close();
 140+
 141+ return descriptor;
 142+ }
 143+
 144+ @SuppressWarnings("unchecked")
 145+ protected Functor<String, String>[] getSqlScriptManglers() {
 146+ return new Functor[] {
 147+ new SqlScriptRunner.RegularExpressionMangler(Pattern.compile("/\\* *wikiword_prefix* \\*/"), getConfiguredDataset().getDbPrefix()),
 148+ new SqlScriptRunner.RegularExpressionMangler(Pattern.compile("/\\* *wikiword_db* \\*/"), getConfiguredDatasetName()),
 149+ };
 150+ }
 151+
68152 public static void main(String[] argv) throws Exception {
69153 LoadForeignProperties app = new LoadForeignProperties();
70154 app.launch(argv);
Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/FeatureSetValueSplitter.java
@@ -2,12 +2,47 @@
33
44 import java.util.ArrayList;
55 import java.util.List;
 6+import java.util.Map;
67 import java.util.regex.Matcher;
78 import java.util.regex.Pattern;
89
910
1011 public class FeatureSetValueSplitter implements FeatureSetMangler {
 12+
 13+ public static FeatureSetMultiMangler multi(FeatureSetValueSplitter... splitters) {
 14+ return new FeatureSetMultiMangler((FeatureSetMangler[])splitters);
 15+ }
1116
 17+ public static FeatureSetMultiMangler multiFromSplitters(Iterable<FeatureSetValueSplitter> splitters) {
 18+ FeatureSetMultiMangler m = new FeatureSetMultiMangler();
 19+
 20+ for (FeatureSetValueSplitter s: splitters) {
 21+ m.addMangler(s);
 22+ }
 23+
 24+ return m;
 25+ }
 26+
 27+ public static FeatureSetMultiMangler multiFromPatternMap(Map<String, Pattern> splitters) {
 28+ FeatureSetMultiMangler m = new FeatureSetMultiMangler();
 29+
 30+ for (Map.Entry<String, Pattern>e: splitters.entrySet()) {
 31+ m.addMangler(new FeatureSetValueSplitter(e.getKey(), e.getValue()));
 32+ }
 33+
 34+ return m;
 35+ }
 36+
 37+ public static FeatureSetMultiMangler multiFromStringMap(Map<String, String> splitters, int flags) {
 38+ FeatureSetMultiMangler m = new FeatureSetMultiMangler();
 39+
 40+ for (Map.Entry<String, String>e: splitters.entrySet()) {
 41+ m.addMangler(new FeatureSetValueSplitter(e.getKey(), e.getValue(), flags));
 42+ }
 43+
 44+ return m;
 45+ }
 46+
1247 protected String field;
1348 protected Matcher splitter;
1449
Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/AssemblingFeatureSetCursor.java
@@ -0,0 +1,55 @@
 2+package de.brightbyte.wikiword.integrator.data;
 3+
 4+import java.util.List;
 5+
 6+import de.brightbyte.data.cursor.DataCursor;
 7+import de.brightbyte.util.PersistenceException;
 8+
 9+public class AssemblingFeatureSetCursor implements DataCursor<FeatureSet> {
 10+
 11+ protected DataCursor<FeatureSet> cursor;
 12+ protected FeatureSet prev;
 13+
 14+ protected String recordIdField;
 15+ protected String propertyNameField;
 16+ protected String propertyValueField;
 17+
 18+ public AssemblingFeatureSetCursor(DataCursor<FeatureSet> cursor, String recordIdField, String propertyNameField, String propertyValueField) {
 19+ if (cursor==null) throw new NullPointerException();
 20+ if (recordIdField==null) throw new NullPointerException();
 21+ if (propertyNameField==null) throw new NullPointerException();
 22+ if (propertyValueField==null) throw new NullPointerException();
 23+
 24+ this.cursor = cursor;
 25+ this.recordIdField = recordIdField;
 26+ this.propertyNameField = propertyNameField;
 27+ this.propertyValueField = propertyValueField;
 28+ }
 29+
 30+ public void close() {
 31+ cursor.close();
 32+ }
 33+
 34+ public FeatureSet next() throws PersistenceException {
 35+ if (prev==null) prev = cursor.next();
 36+ if (prev==null) return null;
 37+
 38+ FeatureSet a = new DefaultFeatureSet();;
 39+ a.putAll(recordIdField, prev.get(recordIdField));
 40+
 41+ while (prev!=null) {
 42+ List<Object> keys = prev.get(propertyNameField);
 43+ List<Object> values = prev.get(propertyValueField);
 44+
 45+ for (Object k: keys) {
 46+ a.putAll(k.toString(), values);
 47+ }
 48+
 49+ prev = cursor.next();
 50+ if (prev==null || !prev.overlaps(a, recordIdField)) break;
 51+ }
 52+
 53+ return a;
 54+ }
 55+
 56+}
Property changes on: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/AssemblingFeatureSetCursor.java
___________________________________________________________________
Added: svn:mergeinfo
Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/ResultSetFeatureSetCursor.java
@@ -1,13 +1,19 @@
22 package de.brightbyte.wikiword.integrator.data;
33
44 import java.sql.ResultSet;
 5+import java.sql.SQLException;
56
67 import de.brightbyte.db.DatabaseDataSet;
 8+import de.brightbyte.db.DatabaseUtil;
79
810 public class ResultSetFeatureSetCursor extends DatabaseDataSet.Cursor<FeatureSet> {
911
10 - public ResultSetFeatureSetCursor(ResultSet resultSet, String[] fields) {
11 - super(resultSet, new ResultSetFeatureSetFactory(fields));
 12+ public ResultSetFeatureSetCursor(ResultSet resultSet) throws SQLException {
 13+ this(resultSet, null);
1214 }
 15+
 16+ public ResultSetFeatureSetCursor(ResultSet resultSet, String[] fields) throws SQLException {
 17+ super(resultSet, new ResultSetFeatureSetFactory(fields == null ? DatabaseUtil.getFieldNames(resultSet): fields));
 18+ }
1319
1420 }
Index: trunk/WikiWord/WikiWordIntegrator/src/main/java/de/brightbyte/wikiword/integrator/data/CollapsingFeatureSetCursor.java
@@ -0,0 +1,42 @@
 2+package de.brightbyte.wikiword.integrator.data;
 3+
 4+import de.brightbyte.data.cursor.DataCursor;
 5+import de.brightbyte.util.PersistenceException;
 6+
 7+public class CollapsingFeatureSetCursor implements DataCursor<FeatureSet> {
 8+
 9+ protected DataCursor<FeatureSet> cursor;
 10+ protected FeatureSet prev;
 11+
 12+ protected String recordIdField;
 13+
 14+ public CollapsingFeatureSetCursor(DataCursor<FeatureSet> cursor, String sourceKeyField) {
 15+ if (cursor==null) throw new NullPointerException();
 16+ if (sourceKeyField==null) throw new NullPointerException();
 17+
 18+ this.cursor = cursor;
 19+ this.recordIdField = sourceKeyField;
 20+ }
 21+
 22+ public void close() {
 23+ cursor.close();
 24+ }
 25+
 26+ public FeatureSet next() throws PersistenceException {
 27+ if (prev==null) prev = cursor.next();
 28+ if (prev==null) return null;
 29+
 30+ FeatureSet a = prev;
 31+
 32+ while (true) {
 33+ prev = cursor.next();
 34+ if (prev==null) break;
 35+
 36+ if (!prev.overlaps(a, recordIdField)) break;
 37+ a = FeatureSets.merge(a, prev);
 38+ }
 39+
 40+ return a;
 41+ }
 42+
 43+}

Status & tagging log