Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/AbstractProcessor.java |
— | — | @@ -295,8 +295,8 @@ |
296 | 296 | } |
297 | 297 | |
298 | 298 | public static void declareOptions(Arguments args) { |
299 | | - args.declare("from", "f", true, String.class, "ignores all pages in the dump before (but excluding) the one with the given title"); |
300 | | - args.declare("after", "a", true, String.class, "ignores all pages in the dump until (and including) the one with the given title"); |
| 299 | + args.declare("from", "f", true, String.class, "ignores all pages in the input before (but excluding) the one with the given title"); |
| 300 | + args.declare("after", "a", true, String.class, "ignores all pages in the input until (and including) the one with the given title"); |
301 | 301 | args.declare("limit", "l", true, String.class, "maximum number of pages to process"); |
302 | 302 | args.declare("skip", "k", true, String.class, "number number of pages to skip before starting to process. " + |
303 | 303 | "if --from or --after are given, this number is counted from the position the given title occurrs at. " + |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/PageTitleFilter.java |
— | — | @@ -14,7 +14,7 @@ |
15 | 15 | } |
16 | 16 | |
17 | 17 | public boolean matches(WikiPage page) { |
18 | | - CharSequence t = page.getTitle(); |
| 18 | + CharSequence t = page.getResourceName(); |
19 | 19 | return filter.matches(t); |
20 | 20 | } |
21 | 21 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/AbstractExtractor.java |
— | — | @@ -6,11 +6,11 @@ |
7 | 7 | import de.brightbyte.wikiword.output.DataOutput; |
8 | 8 | import de.brightbyte.wikiword.processor.AbstractProcessor; |
9 | 9 | |
10 | | -public abstract class AbstractExtractor extends AbstractProcessor implements WikiWordExtractor { |
| 10 | +public abstract class AbstractExtractor<S extends DataOutput> extends AbstractProcessor implements WikiWordExtractor { |
11 | 11 | |
12 | | - protected DataOutput output; |
| 12 | + protected S output; |
13 | 13 | |
14 | | - public AbstractExtractor(WikiTextAnalyzer analyzer, DataOutput output, TweakSet tweaks) { |
| 14 | + public AbstractExtractor(WikiTextAnalyzer analyzer, S output, TweakSet tweaks) { |
15 | 15 | super(analyzer, tweaks); |
16 | 16 | |
17 | 17 | if (output==null) throw new NullPointerException(); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractText.java |
— | — | @@ -1,7 +1,11 @@ |
2 | 2 | package de.brightbyte.wikiword.extract; |
3 | 3 | |
| 4 | +import java.io.IOException; |
| 5 | + |
| 6 | +import de.brightbyte.util.PersistenceException; |
4 | 7 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
5 | 8 | import de.brightbyte.wikiword.builder.ConceptImporter; |
| 9 | +import de.brightbyte.wikiword.output.PlainTextOutput; |
6 | 10 | import de.brightbyte.wikiword.output.TextOutput; |
7 | 11 | import de.brightbyte.wikiword.output.TsvTextOutput; |
8 | 12 | |
— | — | @@ -10,36 +14,34 @@ |
11 | 15 | public ExtractText() { |
12 | 16 | super(); |
13 | 17 | } |
14 | | - |
15 | | - |
16 | | - @Override |
17 | | - protected TextExtractor newProcessor(WikiTextAnalyzer analyzer) { |
18 | | - return new TextExtractor(analyzer, output, tweaks); |
19 | | - } |
20 | 18 | |
21 | | - |
22 | 19 | @Override |
23 | 20 | protected void declareOptions() { |
24 | 21 | super.declareOptions(); |
25 | 22 | |
26 | 23 | ConceptImporter.declareOptions(args); |
27 | 24 | |
28 | | - args.declareHelp("<dump-file>", "the dump file to process"); |
29 | | - args.declare("wiki", null, true, String.class, "sets the wiki name (overrides the name given by, or " + |
30 | | - "guessed from, the <dump-file> parameter)"); |
31 | | - |
32 | | - args.declare("plain", null, false, Boolean.class, "output plain text"); |
33 | 25 | args.declare("tsv", null, false, Boolean.class, "output TSV table"); |
34 | 26 | } |
| 27 | + |
| 28 | + @Override |
| 29 | + protected TextOutput createOutput() throws PersistenceException { |
| 30 | + try { |
| 31 | + if (args.isSet("tsv")) return new TsvTextOutput(getCorpus(), getOutputWriter()); |
| 32 | + else return new PlainTextOutput(getCorpus(), getOutputStream(), getOutputFileEncoding()); |
| 33 | + } catch (IOException e) { |
| 34 | + throw new PersistenceException(e); |
| 35 | + } |
| 36 | + } |
| 37 | + |
| 38 | + @Override |
| 39 | + protected TextExtractor newProcessor(WikiTextAnalyzer analyzer) { |
| 40 | + return new TextExtractor(analyzer, output, tweaks); |
| 41 | + } |
35 | 42 | |
36 | 43 | public static void main(String[] argv) throws Exception { |
37 | 44 | ExtractText app = new ExtractText(); |
38 | 45 | app.launch(argv); |
39 | 46 | } |
40 | | - |
41 | | - @Override |
42 | | - protected TextOutput createOutput() { |
43 | | - return new TsvTextOutput(getCorpus(), getOutputWriter()); |
44 | | - } |
45 | 47 | |
46 | 48 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/TextExtractor.java |
— | — | @@ -12,9 +12,8 @@ |
13 | 13 | import de.brightbyte.wikiword.builder.AbstractImporter; |
14 | 14 | import de.brightbyte.wikiword.output.TextOutput; |
15 | 15 | |
16 | | -public class TextExtractor extends AbstractExtractor { |
| 16 | +public class TextExtractor extends AbstractExtractor<TextOutput> { |
17 | 17 | |
18 | | - private TextOutput textOutput; |
19 | 18 | //private LocalConceptStore localConceptStore; |
20 | 19 | |
21 | 20 | //private boolean storeDefinitions = true; |
— | — | @@ -25,8 +24,6 @@ |
26 | 25 | |
27 | 26 | public TextExtractor(WikiTextAnalyzer analyzer, TextOutput output, TweakSet tweaks) { |
28 | 27 | super(analyzer, output, tweaks); |
29 | | - |
30 | | - this.output = output; |
31 | 28 | } |
32 | 29 | |
33 | 30 | /* |
— | — | @@ -78,13 +75,13 @@ |
79 | 76 | textId ++; |
80 | 77 | |
81 | 78 | if (storeWikiText) { //TODO: separate access path... |
82 | | - textOutput.storeRawText(textId, name, ptype, text); |
| 79 | + output.storeRawText(textId, name, ptype, text); |
83 | 80 | } |
84 | 81 | |
85 | 82 | if (storePlainText) { //TODO: separate access path... |
86 | 83 | String plain = analyzerPage.getPlainText(false).toString(); |
87 | 84 | checkSmellsLikeWiki(0, plain, "plain text: "+name+" (id={0})", textId); |
88 | | - textOutput.storePlainText(textId, name, ptype, plain); |
| 85 | + output.storePlainText(textId, name, ptype, plain); |
89 | 86 | } |
90 | 87 | |
91 | 88 | /* |
— | — | @@ -103,8 +100,8 @@ |
104 | 101 | public static void declareOptions(Arguments args) { |
105 | 102 | AbstractImporter.declareOptions(args); |
106 | 103 | |
107 | | - args.declare("wiki", null, true, String.class, "store raw wiki text"); |
108 | | - args.declare("plain", null, true, String.class, "store stripped plain text"); |
| 104 | + args.declare("wikitext", null, true, String.class, "store raw wiki text"); |
| 105 | + args.declare("flattext", null, true, String.class, "store stripped plain text"); |
109 | 106 | //args.declare("defs", null, true, String.class, "extract and store definitions"); |
110 | 107 | } |
111 | 108 | |
— | — | @@ -113,8 +110,13 @@ |
114 | 111 | super.configure(args); |
115 | 112 | |
116 | 113 | //this.storeDefinitions = !args.isSet("defs"); |
117 | | - this.storeWikiText = !args.isSet("wiki"); |
118 | | - this.storePlainText = !args.isSet("plain"); |
| 114 | + storeWikiText = args.isSet("wikitext"); |
| 115 | + storePlainText = args.isSet("flattext"); |
| 116 | + |
| 117 | + if (!storeWikiText && !storePlainText) { |
| 118 | + storeWikiText = true; |
| 119 | + storePlainText = true; |
| 120 | + } |
119 | 121 | } |
120 | 122 | |
121 | 123 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java |
— | — | @@ -13,8 +13,6 @@ |
14 | 14 | |
15 | 15 | public abstract class ExtractFromDump<S extends DataOutput> extends ExtractorApp<S> { |
16 | 16 | |
17 | | - protected S output; |
18 | | - |
19 | 17 | public ExtractFromDump() { |
20 | 18 | super(); |
21 | 19 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractorApp.java |
— | — | @@ -1,9 +1,16 @@ |
2 | 2 | package de.brightbyte.wikiword.extract; |
3 | 3 | |
| 4 | +import java.io.BufferedOutputStream; |
| 5 | +import java.io.File; |
| 6 | +import java.io.FileNotFoundException; |
| 7 | +import java.io.FileOutputStream; |
4 | 8 | import java.io.OutputStream; |
| 9 | +import java.io.OutputStreamWriter; |
| 10 | +import java.io.UnsupportedEncodingException; |
5 | 11 | import java.io.Writer; |
6 | 12 | |
7 | 13 | import de.brightbyte.io.ConsoleIO; |
| 14 | +import de.brightbyte.util.PersistenceException; |
8 | 15 | import de.brightbyte.wikiword.CliApp; |
9 | 16 | import de.brightbyte.wikiword.output.DataOutput; |
10 | 17 | |
— | — | @@ -26,19 +33,53 @@ |
27 | 34 | //FIXME: output file |
28 | 35 | |
29 | 36 | args.declareHelp("<wiki>", "the wiki's domain or short name"); |
| 37 | + args.declareHelp("<outfile>", "the file to write to"); |
30 | 38 | args.declare("wiki", null, true, String.class, "sets the wiki name"); |
| 39 | + args.declare("append", null, false, Boolean.class, "append to output file"); |
| 40 | + args.declare("outputencoding", null, true, String.class, "sets the output encoding (defaults to UTF-8)"); |
31 | 41 | } |
32 | 42 | |
33 | | - protected abstract S createOutput(); |
| 43 | + protected abstract S createOutput() throws PersistenceException; |
| 44 | + |
| 45 | + protected File getOutputFile() { |
| 46 | + if (outputFile==null) { |
| 47 | + if (args.getParameterCount()>2) { |
| 48 | + outputFile = new File(args.getParameter(2)); |
| 49 | + } |
| 50 | + } |
| 51 | + return outputFile; |
| 52 | + } |
34 | 53 | |
35 | | - protected Writer getOutputWriter() { |
36 | | - //FIXME: encoding! |
37 | | - return ConsoleIO.writer; //TODO: get from command line! |
| 54 | + protected String getOutputFileEncoding() { |
| 55 | + return args.getStringOption("outputencoding", "UTF-8"); |
38 | 56 | } |
| 57 | + |
| 58 | + protected File outputFile; |
| 59 | + protected Writer outputWriter; |
| 60 | + protected OutputStream outputStream; |
39 | 61 | |
40 | | - protected OutputStream getOutputStream() { |
41 | | - return System.out; //TODO: get from command line! |
| 62 | + protected Writer getOutputWriter() throws FileNotFoundException, UnsupportedEncodingException { |
| 63 | + if (outputWriter==null) { |
| 64 | + File f = getOutputFile(); |
| 65 | + if (f==null) outputWriter = ConsoleIO.writer; |
| 66 | + else outputWriter = new OutputStreamWriter(getOutputStream(), getOutputFileEncoding()); |
| 67 | + } |
| 68 | + |
| 69 | + return outputWriter; |
42 | 70 | } |
| 71 | + |
| 72 | + protected OutputStream getOutputStream() throws FileNotFoundException { |
| 73 | + if (outputStream==null) { |
| 74 | + File f = getOutputFile(); |
| 75 | + if (f==null) outputStream = System.out; |
| 76 | + else { |
| 77 | + outputStream = new BufferedOutputStream(new FileOutputStream(f, args.isSet("append"))); |
| 78 | + info("Writing output to "+f); |
| 79 | + } |
| 80 | + } |
| 81 | + |
| 82 | + return outputStream; |
| 83 | + } |
43 | 84 | |
44 | 85 | @Override |
45 | 86 | protected void prepareApp() throws Exception { |