r50055 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r50054‎ | r50055 | r50056 >
Date:21:58, 29 April 2009
Author:daniel
Status:deferred
Tags:
Comment:
extract text to file
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/AbstractExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractText.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractorApp.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/TextExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/AbstractProcessor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/PageTitleFilter.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/AbstractProcessor.java
@@ -295,8 +295,8 @@
296296 }
297297
298298 public static void declareOptions(Arguments args) {
299 - args.declare("from", "f", true, String.class, "ignores all pages in the dump before (but excluding) the one with the given title");
300 - args.declare("after", "a", true, String.class, "ignores all pages in the dump until (and including) the one with the given title");
 299+ args.declare("from", "f", true, String.class, "ignores all pages in the input before (but excluding) the one with the given title");
 300+ args.declare("after", "a", true, String.class, "ignores all pages in the input until (and including) the one with the given title");
301301 args.declare("limit", "l", true, String.class, "maximum number of pages to process");
302302 args.declare("skip", "k", true, String.class, "number number of pages to skip before starting to process. " +
303303 "if --from or --after are given, this number is counted from the position the given title occurrs at. " +
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/PageTitleFilter.java
@@ -14,7 +14,7 @@
1515 }
1616
1717 public boolean matches(WikiPage page) {
18 - CharSequence t = page.getTitle();
 18+ CharSequence t = page.getResourceName();
1919 return filter.matches(t);
2020 }
2121
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/AbstractExtractor.java
@@ -6,11 +6,11 @@
77 import de.brightbyte.wikiword.output.DataOutput;
88 import de.brightbyte.wikiword.processor.AbstractProcessor;
99
10 -public abstract class AbstractExtractor extends AbstractProcessor implements WikiWordExtractor {
 10+public abstract class AbstractExtractor<S extends DataOutput> extends AbstractProcessor implements WikiWordExtractor {
1111
12 - protected DataOutput output;
 12+ protected S output;
1313
14 - public AbstractExtractor(WikiTextAnalyzer analyzer, DataOutput output, TweakSet tweaks) {
 14+ public AbstractExtractor(WikiTextAnalyzer analyzer, S output, TweakSet tweaks) {
1515 super(analyzer, tweaks);
1616
1717 if (output==null) throw new NullPointerException();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractText.java
@@ -1,7 +1,11 @@
22 package de.brightbyte.wikiword.extract;
33
 4+import java.io.IOException;
 5+
 6+import de.brightbyte.util.PersistenceException;
47 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
58 import de.brightbyte.wikiword.builder.ConceptImporter;
 9+import de.brightbyte.wikiword.output.PlainTextOutput;
610 import de.brightbyte.wikiword.output.TextOutput;
711 import de.brightbyte.wikiword.output.TsvTextOutput;
812
@@ -10,36 +14,34 @@
1115 public ExtractText() {
1216 super();
1317 }
14 -
15 -
16 - @Override
17 - protected TextExtractor newProcessor(WikiTextAnalyzer analyzer) {
18 - return new TextExtractor(analyzer, output, tweaks);
19 - }
2018
21 -
2219 @Override
2320 protected void declareOptions() {
2421 super.declareOptions();
2522
2623 ConceptImporter.declareOptions(args);
2724
28 - args.declareHelp("<dump-file>", "the dump file to process");
29 - args.declare("wiki", null, true, String.class, "sets the wiki name (overrides the name given by, or " +
30 - "guessed from, the <dump-file> parameter)");
31 -
32 - args.declare("plain", null, false, Boolean.class, "output plain text");
3325 args.declare("tsv", null, false, Boolean.class, "output TSV table");
3426 }
 27+
 28+ @Override
 29+ protected TextOutput createOutput() throws PersistenceException {
 30+ try {
 31+ if (args.isSet("tsv")) return new TsvTextOutput(getCorpus(), getOutputWriter());
 32+ else return new PlainTextOutput(getCorpus(), getOutputStream(), getOutputFileEncoding());
 33+ } catch (IOException e) {
 34+ throw new PersistenceException(e);
 35+ }
 36+ }
 37+
 38+ @Override
 39+ protected TextExtractor newProcessor(WikiTextAnalyzer analyzer) {
 40+ return new TextExtractor(analyzer, output, tweaks);
 41+ }
3542
3643 public static void main(String[] argv) throws Exception {
3744 ExtractText app = new ExtractText();
3845 app.launch(argv);
3946 }
40 -
41 - @Override
42 - protected TextOutput createOutput() {
43 - return new TsvTextOutput(getCorpus(), getOutputWriter());
44 - }
4547
4648 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/TextExtractor.java
@@ -12,9 +12,8 @@
1313 import de.brightbyte.wikiword.builder.AbstractImporter;
1414 import de.brightbyte.wikiword.output.TextOutput;
1515
16 -public class TextExtractor extends AbstractExtractor {
 16+public class TextExtractor extends AbstractExtractor<TextOutput> {
1717
18 - private TextOutput textOutput;
1918 //private LocalConceptStore localConceptStore;
2019
2120 //private boolean storeDefinitions = true;
@@ -25,8 +24,6 @@
2625
2726 public TextExtractor(WikiTextAnalyzer analyzer, TextOutput output, TweakSet tweaks) {
2827 super(analyzer, output, tweaks);
29 -
30 - this.output = output;
3128 }
3229
3330 /*
@@ -78,13 +75,13 @@
7976 textId ++;
8077
8178 if (storeWikiText) { //TODO: separate access path...
82 - textOutput.storeRawText(textId, name, ptype, text);
 79+ output.storeRawText(textId, name, ptype, text);
8380 }
8481
8582 if (storePlainText) { //TODO: separate access path...
8683 String plain = analyzerPage.getPlainText(false).toString();
8784 checkSmellsLikeWiki(0, plain, "plain text: "+name+" (id={0})", textId);
88 - textOutput.storePlainText(textId, name, ptype, plain);
 85+ output.storePlainText(textId, name, ptype, plain);
8986 }
9087
9188 /*
@@ -103,8 +100,8 @@
104101 public static void declareOptions(Arguments args) {
105102 AbstractImporter.declareOptions(args);
106103
107 - args.declare("wiki", null, true, String.class, "store raw wiki text");
108 - args.declare("plain", null, true, String.class, "store stripped plain text");
 104+ args.declare("wikitext", null, true, String.class, "store raw wiki text");
 105+ args.declare("flattext", null, true, String.class, "store stripped plain text");
109106 //args.declare("defs", null, true, String.class, "extract and store definitions");
110107 }
111108
@@ -113,8 +110,13 @@
114111 super.configure(args);
115112
116113 //this.storeDefinitions = !args.isSet("defs");
117 - this.storeWikiText = !args.isSet("wiki");
118 - this.storePlainText = !args.isSet("plain");
 114+ storeWikiText = args.isSet("wikitext");
 115+ storePlainText = args.isSet("flattext");
 116+
 117+ if (!storeWikiText && !storePlainText) {
 118+ storeWikiText = true;
 119+ storePlainText = true;
 120+ }
119121 }
120122
121123 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java
@@ -13,8 +13,6 @@
1414
1515 public abstract class ExtractFromDump<S extends DataOutput> extends ExtractorApp<S> {
1616
17 - protected S output;
18 -
1917 public ExtractFromDump() {
2018 super();
2119 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractorApp.java
@@ -1,9 +1,16 @@
22 package de.brightbyte.wikiword.extract;
33
 4+import java.io.BufferedOutputStream;
 5+import java.io.File;
 6+import java.io.FileNotFoundException;
 7+import java.io.FileOutputStream;
48 import java.io.OutputStream;
 9+import java.io.OutputStreamWriter;
 10+import java.io.UnsupportedEncodingException;
511 import java.io.Writer;
612
713 import de.brightbyte.io.ConsoleIO;
 14+import de.brightbyte.util.PersistenceException;
815 import de.brightbyte.wikiword.CliApp;
916 import de.brightbyte.wikiword.output.DataOutput;
1017
@@ -26,19 +33,53 @@
2734 //FIXME: output file
2835
2936 args.declareHelp("<wiki>", "the wiki's domain or short name");
 37+ args.declareHelp("<outfile>", "the file to write to");
3038 args.declare("wiki", null, true, String.class, "sets the wiki name");
 39+ args.declare("append", null, false, Boolean.class, "append to output file");
 40+ args.declare("outputencoding", null, true, String.class, "sets the output encoding (defaults to UTF-8)");
3141 }
3242
33 - protected abstract S createOutput();
 43+ protected abstract S createOutput() throws PersistenceException;
 44+
 45+ protected File getOutputFile() {
 46+ if (outputFile==null) {
 47+ if (args.getParameterCount()>2) {
 48+ outputFile = new File(args.getParameter(2));
 49+ }
 50+ }
 51+ return outputFile;
 52+ }
3453
35 - protected Writer getOutputWriter() {
36 - //FIXME: encoding!
37 - return ConsoleIO.writer; //TODO: get from command line!
 54+ protected String getOutputFileEncoding() {
 55+ return args.getStringOption("outputencoding", "UTF-8");
3856 }
 57+
 58+ protected File outputFile;
 59+ protected Writer outputWriter;
 60+ protected OutputStream outputStream;
3961
40 - protected OutputStream getOutputStream() {
41 - return System.out; //TODO: get from command line!
 62+ protected Writer getOutputWriter() throws FileNotFoundException, UnsupportedEncodingException {
 63+ if (outputWriter==null) {
 64+ File f = getOutputFile();
 65+ if (f==null) outputWriter = ConsoleIO.writer;
 66+ else outputWriter = new OutputStreamWriter(getOutputStream(), getOutputFileEncoding());
 67+ }
 68+
 69+ return outputWriter;
4270 }
 71+
 72+ protected OutputStream getOutputStream() throws FileNotFoundException {
 73+ if (outputStream==null) {
 74+ File f = getOutputFile();
 75+ if (f==null) outputStream = System.out;
 76+ else {
 77+ outputStream = new BufferedOutputStream(new FileOutputStream(f, args.isSet("append")));
 78+ info("Writing output to "+f);
 79+ }
 80+ }
 81+
 82+ return outputStream;
 83+ }
4384
4485 @Override
4586 protected void prepareApp() throws Exception {

Status & tagging log