Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/WordSenseIndexer.java |
— | — | @@ -1,6 +1,8 @@ |
2 | 2 | package de.brightbyte.wikiword.extract; |
3 | 3 | |
| 4 | +import java.io.FileNotFoundException; |
4 | 5 | import java.io.IOException; |
| 6 | +import java.io.UnsupportedEncodingException; |
5 | 7 | import java.text.ParseException; |
6 | 8 | import java.util.Collections; |
7 | 9 | import java.util.List; |
— | — | @@ -11,6 +13,7 @@ |
12 | 14 | import de.brightbyte.data.measure.Measure; |
13 | 15 | import de.brightbyte.io.ConsoleIO; |
14 | 16 | import de.brightbyte.io.LineCursor; |
| 17 | +import de.brightbyte.io.LineSink; |
15 | 18 | import de.brightbyte.io.OutputSink; |
16 | 19 | import de.brightbyte.text.Chunker; |
17 | 20 | import de.brightbyte.text.RegularExpressionChunker; |
— | — | @@ -43,13 +46,21 @@ |
44 | 47 | } |
45 | 48 | |
46 | 49 | @Override |
47 | | - protected DataSink<String> openSink() { |
48 | | - return new OutputSink(ConsoleIO.output); //FIXME: open stream as required |
| 50 | + protected DataSink<String> openSink() throws PersistenceException { |
| 51 | + try { |
| 52 | + return new LineSink(getOutputWriter()); |
| 53 | + } catch (IOException e) { |
| 54 | + throw new PersistenceException(e); |
| 55 | + } |
49 | 56 | } |
50 | 57 | |
51 | 58 | @Override |
52 | | - protected DataCursor<String> openCursor() { |
53 | | - return new LineCursor(ConsoleIO.newReader()); //FIXME: open stream as required |
| 59 | + protected DataCursor<String> openCursor() throws PersistenceException { |
| 60 | + try { |
| 61 | + return new LineCursor(getInputReader()); |
| 62 | + } catch (IOException e) { |
| 63 | + throw new PersistenceException(e); |
| 64 | + } |
54 | 65 | } |
55 | 66 | |
56 | 67 | @Override |
— | — | @@ -98,7 +109,7 @@ |
99 | 110 | } |
100 | 111 | |
101 | 112 | @Override |
102 | | - protected String process(String line) throws PersistenceException, ParseException { |
| 113 | + protected void process(String line) throws PersistenceException, ParseException { |
103 | 114 | //TODO: logic for handling overlapping phrases in a PhraseOccuranceSequence |
104 | 115 | /* |
105 | 116 | PhraseOccuranceSequence sequence = analyzer.extractPhrases(line, phraseLength); //TODO: alternative tokenizer/splitter //TODO: split by sentence first. |
— | — | @@ -113,7 +124,8 @@ |
114 | 125 | Disambiguator.Result<Term, LocalConcept> result = disambiguator.disambiguate(terms, null); |
115 | 126 | if (flip) Collections.reverse(terms); |
116 | 127 | |
117 | | - return assembleMeanings(terms, result); |
| 128 | + String s = assembleMeanings(terms, result); //TODO: use proper TSV or something |
| 129 | + commit(s); |
118 | 130 | } |
119 | 131 | |
120 | 132 | private String assembleMeanings(List<Term> terms, Result<Term, LocalConcept> result) { |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/StreamProcessorApp.java |
— | — | @@ -1,20 +1,103 @@ |
2 | 2 | package de.brightbyte.wikiword.extract; |
3 | 3 | |
| 4 | +import java.io.BufferedOutputStream; |
| 5 | +import java.io.File; |
| 6 | +import java.io.FileNotFoundException; |
| 7 | +import java.io.FileOutputStream; |
| 8 | +import java.io.IOException; |
| 9 | +import java.io.InputStream; |
| 10 | +import java.io.InputStreamReader; |
| 11 | +import java.io.OutputStream; |
| 12 | +import java.io.OutputStreamWriter; |
| 13 | +import java.io.Reader; |
| 14 | +import java.io.UnsupportedEncodingException; |
| 15 | +import java.io.Writer; |
| 16 | + |
4 | 17 | import de.brightbyte.data.cursor.DataCursor; |
5 | 18 | import de.brightbyte.data.cursor.DataSink; |
| 19 | +import de.brightbyte.io.ConsoleIO; |
6 | 20 | import de.brightbyte.util.PersistenceException; |
7 | 21 | import de.brightbyte.wikiword.StoreBackedApp; |
| 22 | +import de.brightbyte.wikiword.builder.InputFileHelper; |
8 | 23 | import de.brightbyte.wikiword.store.WikiWordConceptStoreBase; |
9 | 24 | |
10 | 25 | public abstract class StreamProcessorApp<I, O, S extends WikiWordConceptStoreBase> extends StoreBackedApp<S> { |
11 | 26 | |
12 | | - protected DataCursor<I> cursor; |
13 | | - protected DataSink<O> sink; |
| 27 | + protected DataCursor<? extends I> cursor; |
| 28 | + protected DataSink<? super O> sink; |
14 | 29 | |
| 30 | + protected InputFileHelper inputHelper; |
| 31 | + |
/**
 * Creates the application; both flags are handed to the StoreBackedApp
 * constructor unchanged.
 *
 * @param allowGlobal passed through to the superclass
 * @param allowLocal passed through to the superclass
 */
public StreamProcessorApp(boolean allowGlobal, boolean allowLocal) {
	super(allowGlobal, allowLocal);
}
18 | 35 | |
| 36 | + |
| 37 | + protected File getOutputFile() { |
| 38 | + if (outputFile==null) { |
| 39 | + if (args.getParameterCount()>2) { |
| 40 | + outputFile = new File(args.getParameter(2)); |
| 41 | + } |
| 42 | + } |
| 43 | + return outputFile; |
| 44 | + } |
| 45 | + |
| 46 | + protected String getOutputFileEncoding() { |
| 47 | + return args.getStringOption("outputencoding", "UTF-8"); |
| 48 | + } |
| 49 | + |
| 50 | + protected File outputFile; |
| 51 | + protected Writer outputWriter; |
| 52 | + protected OutputStream outputStream; |
| 53 | + private InputStream inputStream; |
| 54 | + private Reader inputReader; |
| 55 | + |
| 56 | + protected Writer getOutputWriter() throws FileNotFoundException, UnsupportedEncodingException { |
| 57 | + if (outputWriter==null) { |
| 58 | + File f = getOutputFile(); |
| 59 | + if (f==null) outputWriter = ConsoleIO.writer; |
| 60 | + else outputWriter = new OutputStreamWriter(getOutputStream(), getOutputFileEncoding()); |
| 61 | + } |
| 62 | + |
| 63 | + return outputWriter; |
| 64 | + } |
| 65 | + |
| 66 | + protected OutputStream getOutputStream() throws FileNotFoundException { |
| 67 | + if (outputStream==null) { |
| 68 | + File f = getOutputFile(); |
| 69 | + if (f==null) outputStream = System.out; |
| 70 | + else { |
| 71 | + outputStream = new BufferedOutputStream(new FileOutputStream(f, args.isSet("append"))); |
| 72 | + info("Writing output to "+f); |
| 73 | + } |
| 74 | + } |
| 75 | + |
| 76 | + return outputStream; |
| 77 | + } |
| 78 | + |
| 79 | + protected Reader getInputReader() throws IOException { |
| 80 | + if (inputReader==null) { |
| 81 | + File f = getOutputFile(); |
| 82 | + if (f==null) inputReader = ConsoleIO.newReader(); |
| 83 | + else inputReader = new InputStreamReader(getInputStream(), getOutputFileEncoding()); |
| 84 | + } |
| 85 | + |
| 86 | + return inputReader; |
| 87 | + } |
| 88 | + |
| 89 | + protected InputStream getInputStream() throws IOException { |
| 90 | + if (inputStream==null) { |
| 91 | + File f = getOutputFile(); |
| 92 | + if (f==null) inputStream = System.in; |
| 93 | + else { |
| 94 | + inputStream = inputHelper.openFile(f); |
| 95 | + info("Reading input from "+f); |
| 96 | + } |
| 97 | + } |
| 98 | + |
| 99 | + return inputStream; |
| 100 | + } |
| 101 | + |
19 | 102 | @Override |
20 | 103 | public void run() throws Exception { |
21 | 104 | init(); |
— | — | @@ -25,13 +108,13 @@ |
26 | 109 | close(); |
27 | 110 | } |
28 | 111 | |
29 | | - protected void open() { |
| 112 | + protected void open() throws PersistenceException { |
30 | 113 | cursor = openCursor(); |
31 | 114 | sink = openSink(); |
32 | 115 | } |
33 | 116 | |
34 | | - protected abstract DataCursor<I> openCursor(); |
35 | | - protected abstract DataSink<O> openSink(); |
| 117 | + protected abstract DataCursor<? extends I> openCursor() throws PersistenceException; |
| 118 | + protected abstract DataSink<? super O> openSink() throws PersistenceException; |
36 | 119 | |
37 | 120 | protected void init() throws Exception { |
38 | 121 | // noop |
— | — | @@ -40,15 +123,18 @@ |
41 | 124 | sink.close(); |
42 | 125 | } |
43 | 126 | |
44 | | - public void runTransfer(DataCursor<I> cursor) throws Exception { |
| 127 | + public void runTransfer(DataCursor<? extends I> cursor) throws Exception { |
45 | 128 | I rec; |
46 | 129 | while ((rec = cursor.next()) != null) { |
47 | 130 | //TODO: progress tracker |
48 | | - O res = process(rec); |
49 | | - if (res!=null) sink.commit(res); |
| 131 | + process(rec); |
50 | 132 | } |
51 | 133 | } |
52 | 134 | |
53 | | - protected abstract O process(I rec) throws Exception; |
| 135 | + protected void commit(O rec) throws PersistenceException { |
| 136 | + sink.commit(rec); |
| 137 | + } |
54 | 138 | |
| 139 | + protected abstract void process(I rec) throws Exception; |
| 140 | + |
55 | 141 | } |