r59622 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r59621 | r59622 | r59623 >
Date: 11:06, 1 December 2009
Author: daniel
Status: deferred
Tags:
Comment:
standalone flat text extraction
Modified paths:
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/CliApp.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Namespace.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/NamespaceSet.java (modified) (history)
  • /trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Namespaces.properties (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractText.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/TextExtractor.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/PlainTextOutput.java (deleted) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextOutput.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextStreamOutput.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TsvTextOutput.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/CliApp.java
@@ -175,8 +175,12 @@
176176 return (Corpus)dataset;
177177 }
178178
 179+ protected String getDatasetArgument() {
 180+ return args.getParameter(0);
 181+ }
 182+
179183 protected String getConfiguredCollectionName() {
180 - String s = args.getParameter(0);
 184+ String s = getDatasetArgument();
181185 int idx = s.indexOf(':');
182186
183187 if (idx<=0) {
@@ -187,7 +191,7 @@
188192 }
189193
190194 public String getConfiguredDatasetName() {
191 - String s = args.getParameter(0);
 195+ String s = getDatasetArgument();
192196 int idx = s.indexOf(':');
193197
194198 if (idx<0) {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/NamespaceSet.java
@@ -29,6 +29,8 @@
3030 import java.util.Map;
3131 import java.util.HashMap;
3232 import java.util.Iterator;
 33+import java.util.regex.Matcher;
 34+import java.util.regex.Pattern;
3335
3436 public class NamespaceSet implements Iterable<Namespace> {
3537 protected Map<Integer, Namespace> byCode = new HashMap<Integer, Namespace>();
@@ -74,8 +76,18 @@
7577 return byCode.get(number);
7678 }
7779
 80+ private static final Pattern numericNamespacePattern = Pattern.compile("^(ns:)?(\\d+)$", Pattern.CASE_INSENSITIVE);
 81+
7882 public Namespace getNamespace(String name) {
79 - return byName.get(normalizeName(name, true));
 83+ if (name.equals("") || name.equals("*")) return getNamespace(Namespace.MAIN);
 84+
 85+ Matcher m = numericNamespacePattern.matcher(name);
 86+ if (m.matches()) {
 87+ int n = Integer.parseInt(m.group(2));
 88+ return getNamespace(n);
 89+ } else {
 90+ return byName.get(normalizeName(name, true));
 91+ }
8092 }
8193
8294 public String getCanonicalName(int number) {
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Namespaces.properties
@@ -1,3 +1,4 @@
 2+MAIN=|*
23 MEDIA=Media
34 SPECIAL=Special
45 TALK=Talk
@@ -5,8 +6,8 @@
67 USER_TALK=User_talk
78 PROJECT=Project
89 PROJECT_TALK=Project_talk
9 -IMAGE=Image
10 -IMAGE_TALK=Image_talk
 10+FILE=File|Image
 11+FILE_TALK=File_talk|Image_talk
1112 MEDIAWIKI=MediaWiki
1213 MEDIAWIKI_TALK=MediaWiki_talk
1314 TEMPLATE=Template
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Namespace.java
@@ -174,6 +174,8 @@
175175 if (k.equals("MEDIA")) return MEDIA;
176176 if (k.equals("SPECIAL")) return SPECIAL;
177177 if (k.equals("MAIN")) return MAIN;
 178+ if (k.equals("*")) return MAIN;
 179+ if (k.equals("")) return MAIN;
178180 if (k.equals("TALK")) return TALK;
179181 if (k.equals("USER")) return USER;
180182 if (k.equals("USER_TALK")) return USER_TALK;
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/PlainTextOutput.java
@@ -1,63 +0,0 @@
2 -package de.brightbyte.wikiword.output;
3 -
4 -import java.io.IOException;
5 -import java.io.OutputStream;
6 -
7 -import de.brightbyte.util.PersistenceException;
8 -import de.brightbyte.wikiword.DatasetIdentifier;
9 -import de.brightbyte.wikiword.ResourceType;
10 -
11 -public class PlainTextOutput extends AbstractStreamOutput implements TextOutput {
12 -
13 - protected String encoding;
14 -
15 - public PlainTextOutput(DatasetIdentifier dataset, OutputStream out, String enc) {
16 - super(dataset, out);
17 -
18 - if (enc==null) throw new NullPointerException();
19 -
20 - this.encoding = enc;
21 - }
22 -
23 - public int storeDefinition(int rcId, String name, int conceptId, ResourceType ptype, String text) throws PersistenceException {
24 - writeBlock(name, "definition", "text/plain", ptype, text);
25 - return 0;
26 - }
27 -
28 - public void storePlainText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException {
29 - writeBlock(name, "plain", "text/plain", ptype, text);
30 - }
31 -
32 - public void storeRawText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException {
33 - writeBlock(name, "raw", "text/x-wiki", ptype, text);
34 - }
35 -
36 - protected void writeBlock(String name, String aspect, String format, ResourceType ptype, String text) throws PersistenceException {
37 - String sep = "\r\n";
38 -
39 - format += "; charset="+encoding.toLowerCase();
40 -
41 - try {
42 - StringBuilder s = new StringBuilder();
43 - text = text.trim()+"\r\n";
44 - byte[] data = text.getBytes(encoding);
45 -
46 - s.append("Page: "); s.append(name); s.append(sep);
47 - s.append("Aspect:"); s.append(aspect); s.append(sep);
48 - s.append("Page-Type:"); s.append(ptype.name()); s.append(sep);
49 - s.append("Content-Type: "); s.append(format); s.append(sep);
50 - s.append("Content-Length: "); s.append(data.length); s.append(sep);
51 - s.append("; chars="); s.append(text.length());
52 - s.append("; codepoints="); s.append(Character.codePointCount(text, 0, text.length()));
53 - s.append(sep);
54 - s.append(sep);
55 -
56 - byte[] b = s.toString().getBytes(encoding);
57 -
58 - write(b);
59 - write(data);
60 - } catch (IOException e) {
61 - throw new PersistenceException(e);
62 - }
63 - }
64 -}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextStreamOutput.java
@@ -0,0 +1,68 @@
 2+package de.brightbyte.wikiword.output;
 3+
 4+import java.io.IOException;
 5+import java.io.OutputStream;
 6+
 7+import de.brightbyte.util.PersistenceException;
 8+import de.brightbyte.wikiword.DatasetIdentifier;
 9+import de.brightbyte.wikiword.ResourceType;
 10+
 11+public class TextStreamOutput extends AbstractStreamOutput implements TextOutput {
 12+
 13+ private static final String MARKER = "[\u0001\u0002\u0003\u0004] (binary marker)";
 14+ protected String encoding;
 15+
 16+ public TextStreamOutput(DatasetIdentifier dataset, OutputStream out, String enc) {
 17+ super(dataset, out);
 18+
 19+ if (enc==null) throw new NullPointerException();
 20+
 21+ this.encoding = enc;
 22+ }
 23+
 24+ public void storeDefinitionText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException {
 25+ writeBlock(name, "definition", "text/plain", ptype, text);
 26+ }
 27+
 28+ public void storeSynopsisText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException {
 29+ writeBlock(name, "synopsis", "text/plain", ptype, text);
 30+ }
 31+
 32+ public void storePlainText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException {
 33+ writeBlock(name, "plain", "text/plain", ptype, text);
 34+ }
 35+
 36+ public void storeRawText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException {
 37+ writeBlock(name, "raw", "text/x-wiki", ptype, text);
 38+ }
 39+
 40+ protected void writeBlock(String name, String aspect, String format, ResourceType ptype, String text) throws PersistenceException {
 41+ String sep = "\r\n";
 42+
 43+ format += "; charset="+encoding.toLowerCase();
 44+
 45+ try {
 46+ StringBuilder s = new StringBuilder();
 47+ text = text.trim()+"\r\n";
 48+ byte[] data = text.getBytes(encoding);
 49+
 50+ s.append("Marker: "); s.append(MARKER); s.append(sep);
 51+ s.append("Page: "); s.append(name); s.append(sep);
 52+ s.append("Aspect:"); s.append(aspect); s.append(sep);
 53+ s.append("Page-Type:"); s.append(ptype.name()); s.append(sep);
 54+ s.append("Content-Type: "); s.append(format); s.append(sep);
 55+ s.append("Content-Length: "); s.append(data.length); s.append(sep);
 56+ s.append("; chars="); s.append(text.length());
 57+ s.append("; codepoints="); s.append(Character.codePointCount(text, 0, text.length()));
 58+ s.append(sep);
 59+ s.append(sep);
 60+
 61+ byte[] b = s.toString().getBytes(encoding);
 62+
 63+ write(b);
 64+ write(data);
 65+ } catch (IOException e) {
 66+ throw new PersistenceException(e);
 67+ }
 68+ }
 69+}
Property changes on: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextStreamOutput.java
___________________________________________________________________
Name: svn:mergeinfo
170 +
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TsvTextOutput.java
@@ -18,7 +18,15 @@
1919 public TsvTextOutput(Corpus corpus, Writer out) {
2020 super(corpus, out);
2121 }
 22+
 23+ public void storeDefinitionText(int textId, String name, ResourceType ptype, String text) throws PersistenceException {
 24+ writeRow("definition", name, text);
 25+ }
2226
 27+ public void storeSynopsisText(int textId, String name, ResourceType ptype, String text) throws PersistenceException {
 28+ writeRow("synopsis", name, text);
 29+ }
 30+
2331 /* (non-Javadoc)
2432 * @see de.brightbyte.wikiword.output.TextOutput#storePlainText(int, java.lang.String, de.brightbyte.wikiword.ResourceType, java.lang.String)
2533 */
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextOutput.java
@@ -11,4 +11,9 @@
1212 public void storeRawText(int textId, String name, ResourceType ptype,
1313 String text) throws PersistenceException;
1414
 15+ public void storeSynopsisText(int textId, String name, ResourceType ptype,
 16+ String text) throws PersistenceException;
 17+
 18+ public void storeDefinitionText(int textId, String name, ResourceType ptype,
 19+ String text) throws PersistenceException;
1520 }
\ No newline at end of file
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractText.java
@@ -4,9 +4,9 @@
55
66 import de.brightbyte.util.PersistenceException;
77 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
8 -import de.brightbyte.wikiword.builder.ConceptImporter;
9 -import de.brightbyte.wikiword.output.PlainTextOutput;
 8+import de.brightbyte.wikiword.output.TextFileOutput;
109 import de.brightbyte.wikiword.output.TextOutput;
 10+import de.brightbyte.wikiword.output.TextStreamOutput;
1111 import de.brightbyte.wikiword.output.TsvTextOutput;
1212
1313 public class ExtractText extends ExtractFromDump<TextOutput> {
@@ -19,16 +19,19 @@
2020 protected void declareOptions() {
2121 super.declareOptions();
2222
23 - ConceptImporter.declareOptions(args);
 23+ TextExtractor.declareOptions(args);
2424
25 - args.declare("tsv", null, false, Boolean.class, "output TSV table");
 25+ args.declare("tsv", null, false, Boolean.class, "output TSV table. Default is a http-like stream.");
 26+ args.declare("files", null, false, Boolean.class, "write output into separate files instead of a single stream. target dir must be given as second parameter");
 27+ args.declare("hashdirs", null, false, Boolean.class, "with --files, create hash-based subdirectories. Avoids large flat directories.");
2628 }
2729
2830 @Override
2931 protected TextOutput createOutput() throws PersistenceException {
3032 try {
3133 if (args.isSet("tsv")) return new TsvTextOutput(getCorpus(), getOutputWriter());
32 - else return new PlainTextOutput(getCorpus(), getOutputStream(), getOutputFileEncoding());
 34+ else if (args.isSet("files")) return new TextFileOutput(getCorpus(), getOutputFile(), getOutputFileEncoding(), args.isSet("hashdirs"));
 35+ else return new TextStreamOutput(getCorpus(), getOutputStream(), getOutputFileEncoding());
3336 } catch (IOException e) {
3437 throw new PersistenceException(e);
3538 }
@@ -36,7 +39,8 @@
3740
3841 @Override
3942 protected TextExtractor newProcessor(WikiTextAnalyzer analyzer) {
40 - return new TextExtractor(analyzer, output, tweaks);
 43+ TextExtractor extractor = new TextExtractor(analyzer, output, tweaks);
 44+ return extractor;
4145 }
4246
4347 public static void main(String[] argv) throws Exception {
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/TextExtractor.java
@@ -1,10 +1,13 @@
22 package de.brightbyte.wikiword.extract;
33
44 import java.util.Date;
 5+import java.util.HashSet;
 6+import java.util.Set;
57
68 import de.brightbyte.application.Arguments;
79 import de.brightbyte.util.PersistenceException;
810 import de.brightbyte.wikiword.Namespace;
 11+import de.brightbyte.wikiword.NamespaceSet;
912 import de.brightbyte.wikiword.ResourceType;
1013 import de.brightbyte.wikiword.TweakSet;
1114 import de.brightbyte.wikiword.analyzer.WikiPage;
@@ -16,11 +19,13 @@
1720
1821 //private LocalConceptStore localConceptStore;
1922
20 - //private boolean storeDefinitions = true;
 23+ private boolean storeDefinitions;
 24+ private boolean storeSynopsis;
2125 private boolean storeWikiText;
2226 private boolean storePlainText;
2327
2428 private int textId = 0;
 29+ private Set<Namespace> allowdNamespaces;
2530
2631 public TextExtractor(WikiTextAnalyzer analyzer, TextOutput output, TweakSet tweaks) {
2732 super(analyzer, output, tweaks);
@@ -46,17 +51,18 @@
4752 */
4853
4954 protected boolean isRelevant(WikiPage analyzerPage) {
50 - int namespace = analyzerPage.getNamespace();
51 - //CharSequence title = analyzerPage.getTitle();
52 - ResourceType type = analyzerPage.getResourceType();
 55+ Namespace namespace = analyzer.getCorpus().getNamespaces().getNamespace( analyzerPage.getNamespace() );
5356
54 - if (namespace!=Namespace.MAIN) {
55 - out.trace("bad namespace "+namespace);
 57+ if (allowdNamespaces != null && !allowdNamespaces.contains(namespace)) {
 58+ out.trace("skipping page from namespace "+namespace);
5659 return false;
5760 }
5861
 62+ //CharSequence title = analyzerPage.getTitle();
 63+ ResourceType type = analyzerPage.getResourceType();
 64+
5965 if (!storeWikiText && type!=ResourceType.ARTICLE) {
60 - out.trace("bad type "+type);
 66+ out.trace("skipping non-article page with type "+type);
6167 return false;
6268 }
6369
@@ -65,7 +71,6 @@
6672
6773 @Override
6874 public int importPage(WikiPage analyzerPage, Date timestamp) throws PersistenceException {
69 - String text = analyzerPage.getText().toString();
7075
7176 ResourceType ptype = analyzerPage.getResourceType();
7277 String name = analyzerPage.getName().toString();
@@ -73,17 +78,42 @@
7479 //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update.
7580
7681 textId ++;
77 -
 82+
7883 if (storeWikiText) { //TODO: separate access path...
 84+ String text = analyzerPage.getText().toString().trim();
7985 output.storeRawText(textId, name, ptype, text);
8086 }
8187
82 - if (storePlainText) { //TODO: separate access path...
83 - String plain = analyzerPage.getPlainText(false).toString();
84 - checkSmellsLikeWiki(0, plain, "plain text: "+name+" (id={0})", textId);
85 - output.storePlainText(textId, name, ptype, plain);
 88+ //CharSequence title = analyzerPage.getTitle();
 89+ ResourceType type = analyzerPage.getResourceType();
 90+
 91+ if (storePlainText && type==ResourceType.ARTICLE) { //TODO: separate access path...
 92+ String plain = analyzerPage.getPlainText(false).toString().trim();
 93+
 94+ if (plain!=null && plain.length()>0) {
 95+ checkSmellsLikeWiki(0, plain, "plain text: "+name+" (id={0})", textId);
 96+ output.storePlainText(textId, name, ptype, plain);
 97+ }
8698 }
8799
 100+ if (storeSynopsis && type==ResourceType.ARTICLE) { //TODO: separate access path...
 101+ String syn = analyzerPage.getFirstParagraph().toString().trim();
 102+
 103+ if (syn!=null && syn.length()>0) {
 104+ checkSmellsLikeWiki(0, syn, "definition text: "+name+" (id={0})", textId);
 105+ output.storeSynopsisText(textId, name, ptype, syn);
 106+ }
 107+ }
 108+
 109+ if (storeDefinitions && type==ResourceType.ARTICLE) { //TODO: separate access path...
 110+ String def = analyzerPage.getFirstSentence().toString().trim();
 111+
 112+ if (def!=null && def.length()>0) {
 113+ checkSmellsLikeWiki(0, def, "definition text: "+name+" (id={0})", textId);
 114+ output.storeDefinitionText(textId, name, ptype, def);
 115+ }
 116+ }
 117+
88118 /*
89119 if (ptype == ResourceType.ARTICLE && storeDefinitions) {
90120 String definition = analyzerPage.getFirstSentence();
@@ -100,23 +130,54 @@
101131 public static void declareOptions(Arguments args) {
102132 AbstractImporter.declareOptions(args);
103133
104 - args.declare("wikitext", null, true, String.class, "store raw wiki text");
105 - args.declare("flattext", null, true, String.class, "store stripped plain text");
106 - //args.declare("defs", null, true, String.class, "extract and store definitions");
 134+ args.declare("namespaces", null, true, String.class, "list of namespaces to process. if omitted, all are processed.");
 135+ args.declare("extract", null, true, String.class, "What to extract. One or more of raw, plain, or definition. Default is raw,plain");
107136 }
108137
109138 @Override
110139 public void configure(Arguments args) throws Exception {
111140 super.configure(args);
112141
113 - //this.storeDefinitions = !args.isSet("defs");
114 - storeWikiText = args.isSet("wikitext");
115 - storePlainText = args.isSet("flattext");
 142+ setNamespaceFilter( getNamespaces(args) );
116143
117 - if (!storeWikiText && !storePlainText) {
118 - storeWikiText = true;
119 - storePlainText = true;
 144+ String ext = args.getOption("extract", "raw,plain").toLowerCase();
 145+ String[]ee = ext.split("[,;/|:+]");
 146+
 147+ for (String e: ee) {
 148+ e = e.toLowerCase();
 149+ if (e.equals("def") || e.equals("definition")) storeDefinitions = true;
 150+ else if (e.equals("synopsis") || e.equals("intro")) storeSynopsis = true;
 151+ else if (e.equals("raw") || e.equals("wiki") || e.equals("wikitext")) storeWikiText= true;
 152+ else if (e.equals("plain") || e.equals("flat")) storePlainText = true;
 153+ else throw new IllegalArgumentException("unknown extraction aspect: "+e);
120154 }
 155+
121156 }
122157
 158+
 159+ protected Set<Namespace> getNamespaces(Arguments args) {
 160+ if (!args.isSet("namespaces")) return null;
 161+
 162+ String s = args.getOption("namespaces", "");
 163+ String[] nn = s.split("[\\s,;:/|+]+");
 164+ if (nn.length==0) return null;
 165+
 166+ NamespaceSet namespaces = analyzer.getCorpus().getNamespaces();
 167+ Set<Namespace> result = new HashSet<Namespace>();
 168+
 169+ for (String n: nn) {
 170+ Namespace ns;
 171+ if (n.equals("") || n.equals("*") || n.equalsIgnoreCase("main")) ns = namespaces.getNamespace(Namespace.MAIN);
 172+ else ns = namespaces.getNamespace(n);
 173+
 174+ result.add(ns);
 175+ }
 176+
 177+ return result;
 178+ }
 179+
 180+ public void setNamespaceFilter(Set<Namespace> namespaces) {
 181+ this.allowdNamespaces = namespaces;
 182+ }
 183+
123184 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java
@@ -18,7 +18,11 @@
1919 }
2020
2121 protected URL dumpFile;
22 -
 22+/*
 23+ protected String getDatasetArgument() {
 24+ return args.getOption("corpus", ":");
 25+ }
 26+*/
2327 @Override
2428 protected boolean applyArguments() {
2529 String d = getTargetFileName();
@@ -50,8 +54,10 @@
5155 args.declare("wiki", null, true, String.class, "sets the wiki name (overrides the name given by, or " +
5256 "guessed from, the <wiki> parameter)");
5357 args.declare("url", null, false, Boolean.class, "read the <dump-file> parameter as a full URL");
 58+
 59+ args.declare("namespaces", null, true, String.class, "Only process pages in the given namespace(s).");
5460 }
55 -
 61+
5662 @Override
5763 protected void run() throws Exception {
5864

Status & tagging log