Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/CliApp.java |
— | — | @@ -175,8 +175,12 @@ |
176 | 176 | return (Corpus)dataset; |
177 | 177 | } |
178 | 178 | |
| 179 | + protected String getDatasetArgument() { |
| 180 | + return args.getParameter(0); |
| 181 | + } |
| 182 | + |
179 | 183 | protected String getConfiguredCollectionName() { |
180 | | - String s = args.getParameter(0); |
| 184 | + String s = getDatasetArgument(); |
181 | 185 | int idx = s.indexOf(':'); |
182 | 186 | |
183 | 187 | if (idx<=0) { |
— | — | @@ -187,7 +191,7 @@ |
188 | 192 | } |
189 | 193 | |
190 | 194 | public String getConfiguredDatasetName() { |
191 | | - String s = args.getParameter(0); |
| 195 | + String s = getDatasetArgument(); |
192 | 196 | int idx = s.indexOf(':'); |
193 | 197 | |
194 | 198 | if (idx<0) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/NamespaceSet.java |
— | — | @@ -29,6 +29,8 @@ |
30 | 30 | import java.util.Map; |
31 | 31 | import java.util.HashMap; |
32 | 32 | import java.util.Iterator; |
| 33 | +import java.util.regex.Matcher; |
| 34 | +import java.util.regex.Pattern; |
33 | 35 | |
34 | 36 | public class NamespaceSet implements Iterable<Namespace> { |
35 | 37 | protected Map<Integer, Namespace> byCode = new HashMap<Integer, Namespace>(); |
— | — | @@ -74,8 +76,18 @@ |
75 | 77 | return byCode.get(number); |
76 | 78 | } |
77 | 79 | |
| 80 | + private static final Pattern numericNamespacePattern = Pattern.compile("^(ns:)?(\\d+)$", Pattern.CASE_INSENSITIVE); |
| 81 | + |
78 | 82 | public Namespace getNamespace(String name) { |
79 | | - return byName.get(normalizeName(name, true)); |
| 83 | + if (name.equals("") || name.equals("*")) return getNamespace(Namespace.MAIN); |
| 84 | + |
| 85 | + Matcher m = numericNamespacePattern.matcher(name); |
| 86 | + if (m.matches()) { |
| 87 | + int n = Integer.parseInt(m.group(2)); |
| 88 | + return getNamespace(n); |
| 89 | + } else { |
| 90 | + return byName.get(normalizeName(name, true)); |
| 91 | + } |
80 | 92 | } |
81 | 93 | |
82 | 94 | public String getCanonicalName(int number) { |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Namespaces.properties |
— | — | @@ -1,3 +1,4 @@ |
| 2 | +MAIN=|* |
2 | 3 | MEDIA=Media |
3 | 4 | SPECIAL=Special |
4 | 5 | TALK=Talk |
— | — | @@ -5,8 +6,8 @@ |
6 | 7 | USER_TALK=User_talk |
7 | 8 | PROJECT=Project |
8 | 9 | PROJECT_TALK=Project_talk |
9 | | -IMAGE=Image |
10 | | -IMAGE_TALK=Image_talk |
| 10 | +FILE=File|Image |
| 11 | +FILE_TALK=File_talk|Image_talk |
11 | 12 | MEDIAWIKI=MediaWiki |
12 | 13 | MEDIAWIKI_TALK=MediaWiki_talk |
13 | 14 | TEMPLATE=Template |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/Namespace.java |
— | — | @@ -174,6 +174,8 @@ |
175 | 175 | if (k.equals("MEDIA")) return MEDIA; |
176 | 176 | if (k.equals("SPECIAL")) return SPECIAL; |
177 | 177 | if (k.equals("MAIN")) return MAIN; |
| 178 | + if (k.equals("*")) return MAIN; |
| 179 | + if (k.equals("")) return MAIN; |
178 | 180 | if (k.equals("TALK")) return TALK; |
179 | 181 | if (k.equals("USER")) return USER; |
180 | 182 | if (k.equals("USER_TALK")) return USER_TALK; |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/PlainTextOutput.java |
— | — | @@ -1,63 +0,0 @@ |
2 | | -package de.brightbyte.wikiword.output; |
3 | | - |
4 | | -import java.io.IOException; |
5 | | -import java.io.OutputStream; |
6 | | - |
7 | | -import de.brightbyte.util.PersistenceException; |
8 | | -import de.brightbyte.wikiword.DatasetIdentifier; |
9 | | -import de.brightbyte.wikiword.ResourceType; |
10 | | - |
11 | | -public class PlainTextOutput extends AbstractStreamOutput implements TextOutput { |
12 | | - |
13 | | - protected String encoding; |
14 | | - |
15 | | - public PlainTextOutput(DatasetIdentifier dataset, OutputStream out, String enc) { |
16 | | - super(dataset, out); |
17 | | - |
18 | | - if (enc==null) throw new NullPointerException(); |
19 | | - |
20 | | - this.encoding = enc; |
21 | | - } |
22 | | - |
23 | | - public int storeDefinition(int rcId, String name, int conceptId, ResourceType ptype, String text) throws PersistenceException { |
24 | | - writeBlock(name, "definition", "text/plain", ptype, text); |
25 | | - return 0; |
26 | | - } |
27 | | - |
28 | | - public void storePlainText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException { |
29 | | - writeBlock(name, "plain", "text/plain", ptype, text); |
30 | | - } |
31 | | - |
32 | | - public void storeRawText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException { |
33 | | - writeBlock(name, "raw", "text/x-wiki", ptype, text); |
34 | | - } |
35 | | - |
36 | | - protected void writeBlock(String name, String aspect, String format, ResourceType ptype, String text) throws PersistenceException { |
37 | | - String sep = "\r\n"; |
38 | | - |
39 | | - format += "; charset="+encoding.toLowerCase(); |
40 | | - |
41 | | - try { |
42 | | - StringBuilder s = new StringBuilder(); |
43 | | - text = text.trim()+"\r\n"; |
44 | | - byte[] data = text.getBytes(encoding); |
45 | | - |
46 | | - s.append("Page: "); s.append(name); s.append(sep); |
47 | | - s.append("Aspect:"); s.append(aspect); s.append(sep); |
48 | | - s.append("Page-Type:"); s.append(ptype.name()); s.append(sep); |
49 | | - s.append("Content-Type: "); s.append(format); s.append(sep); |
50 | | - s.append("Content-Length: "); s.append(data.length); s.append(sep); |
51 | | - s.append("; chars="); s.append(text.length()); |
52 | | - s.append("; codepoints="); s.append(Character.codePointCount(text, 0, text.length())); |
53 | | - s.append(sep); |
54 | | - s.append(sep); |
55 | | - |
56 | | - byte[] b = s.toString().getBytes(encoding); |
57 | | - |
58 | | - write(b); |
59 | | - write(data); |
60 | | - } catch (IOException e) { |
61 | | - throw new PersistenceException(e); |
62 | | - } |
63 | | - } |
64 | | -} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextStreamOutput.java |
— | — | @@ -0,0 +1,68 @@ |
| 2 | +package de.brightbyte.wikiword.output; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.OutputStream; |
| 6 | + |
| 7 | +import de.brightbyte.util.PersistenceException; |
| 8 | +import de.brightbyte.wikiword.DatasetIdentifier; |
| 9 | +import de.brightbyte.wikiword.ResourceType; |
| 10 | + |
| 11 | +public class TextStreamOutput extends AbstractStreamOutput implements TextOutput { |
| 12 | + |
| 13 | + private static final String MARKER = "[\u0001\u0002\u0003\u0004] (binary marker)"; |
| 14 | + protected String encoding; |
| 15 | + |
| 16 | + public TextStreamOutput(DatasetIdentifier dataset, OutputStream out, String enc) { |
| 17 | + super(dataset, out); |
| 18 | + |
| 19 | + if (enc==null) throw new NullPointerException(); |
| 20 | + |
| 21 | + this.encoding = enc; |
| 22 | + } |
| 23 | + |
| 24 | + public void storeDefinitionText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException { |
| 25 | + writeBlock(name, "definition", "text/plain", ptype, text); |
| 26 | + } |
| 27 | + |
| 28 | + public void storeSynopsisText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException { |
| 29 | + writeBlock(name, "synopsis", "text/plain", ptype, text); |
| 30 | + } |
| 31 | + |
| 32 | + public void storePlainText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException { |
| 33 | + writeBlock(name, "plain", "text/plain", ptype, text); |
| 34 | + } |
| 35 | + |
| 36 | + public void storeRawText(int rcId, String name, ResourceType ptype, String text) throws PersistenceException { |
| 37 | + writeBlock(name, "raw", "text/x-wiki", ptype, text); |
| 38 | + } |
| 39 | + |
| 40 | + protected void writeBlock(String name, String aspect, String format, ResourceType ptype, String text) throws PersistenceException { |
| 41 | + String sep = "\r\n"; |
| 42 | + |
| 43 | + format += "; charset="+encoding.toLowerCase(); |
| 44 | + |
| 45 | + try { |
| 46 | + StringBuilder s = new StringBuilder(); |
| 47 | + text = text.trim()+"\r\n"; |
| 48 | + byte[] data = text.getBytes(encoding); |
| 49 | + |
| 50 | + s.append("Marker: "); s.append(MARKER); s.append(sep); |
| 51 | + s.append("Page: "); s.append(name); s.append(sep); |
| 52 | + s.append("Aspect:"); s.append(aspect); s.append(sep); |
| 53 | + s.append("Page-Type:"); s.append(ptype.name()); s.append(sep); |
| 54 | + s.append("Content-Type: "); s.append(format); s.append(sep); |
| 55 | + s.append("Content-Length: "); s.append(data.length); s.append(sep); |
| 56 | + s.append("; chars="); s.append(text.length()); |
| 57 | + s.append("; codepoints="); s.append(Character.codePointCount(text, 0, text.length())); |
| 58 | + s.append(sep); |
| 59 | + s.append(sep); |
| 60 | + |
| 61 | + byte[] b = s.toString().getBytes(encoding); |
| 62 | + |
| 63 | + write(b); |
| 64 | + write(data); |
| 65 | + } catch (IOException e) { |
| 66 | + throw new PersistenceException(e); |
| 67 | + } |
| 68 | + } |
| 69 | +} |
Property changes on: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextStreamOutput.java |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 70 | + |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TsvTextOutput.java |
— | — | @@ -18,7 +18,15 @@ |
19 | 19 | public TsvTextOutput(Corpus corpus, Writer out) { |
20 | 20 | super(corpus, out); |
21 | 21 | } |
| 22 | + |
| 23 | + public void storeDefinitionText(int textId, String name, ResourceType ptype, String text) throws PersistenceException { |
| 24 | + writeRow("definition", name, text); |
| 25 | + } |
22 | 26 | |
| 27 | + public void storeSynopsisText(int textId, String name, ResourceType ptype, String text) throws PersistenceException { |
| 28 | + writeRow("synopsis", name, text); |
| 29 | + } |
| 30 | + |
23 | 31 | /* (non-Javadoc) |
24 | 32 | * @see de.brightbyte.wikiword.output.TextOutput#storePlainText(int, java.lang.String, de.brightbyte.wikiword.ResourceType, java.lang.String) |
25 | 33 | */ |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/output/TextOutput.java |
— | — | @@ -11,4 +11,9 @@ |
12 | 12 | public void storeRawText(int textId, String name, ResourceType ptype, |
13 | 13 | String text) throws PersistenceException; |
14 | 14 | |
| 15 | + public void storeSynopsisText(int textId, String name, ResourceType ptype, |
| 16 | + String text) throws PersistenceException; |
| 17 | + |
| 18 | + public void storeDefinitionText(int textId, String name, ResourceType ptype, |
| 19 | + String text) throws PersistenceException; |
15 | 20 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractText.java |
— | — | @@ -4,9 +4,9 @@ |
5 | 5 | |
6 | 6 | import de.brightbyte.util.PersistenceException; |
7 | 7 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
8 | | -import de.brightbyte.wikiword.builder.ConceptImporter; |
9 | | -import de.brightbyte.wikiword.output.PlainTextOutput; |
| 8 | +import de.brightbyte.wikiword.output.TextFileOutput; |
10 | 9 | import de.brightbyte.wikiword.output.TextOutput; |
| 10 | +import de.brightbyte.wikiword.output.TextStreamOutput; |
11 | 11 | import de.brightbyte.wikiword.output.TsvTextOutput; |
12 | 12 | |
13 | 13 | public class ExtractText extends ExtractFromDump<TextOutput> { |
— | — | @@ -19,16 +19,19 @@ |
20 | 20 | protected void declareOptions() { |
21 | 21 | super.declareOptions(); |
22 | 22 | |
23 | | - ConceptImporter.declareOptions(args); |
| 23 | + TextExtractor.declareOptions(args); |
24 | 24 | |
25 | | - args.declare("tsv", null, false, Boolean.class, "output TSV table"); |
| 25 | + args.declare("tsv", null, false, Boolean.class, "output TSV table. Default is a http-like stream."); |
| 26 | + args.declare("files", null, false, Boolean.class, "write output into separate files instead of a single stream. target dir must be given as second parameter"); |
| 27 | + args.declare("hashdirs", null, false, Boolean.class, "with --files, create hash-based subdirectories. Avoids large flat directories."); |
26 | 28 | } |
27 | 29 | |
28 | 30 | @Override |
29 | 31 | protected TextOutput createOutput() throws PersistenceException { |
30 | 32 | try { |
31 | 33 | if (args.isSet("tsv")) return new TsvTextOutput(getCorpus(), getOutputWriter()); |
32 | | - else return new PlainTextOutput(getCorpus(), getOutputStream(), getOutputFileEncoding()); |
| 34 | + else if (args.isSet("files")) return new TextFileOutput(getCorpus(), getOutputFile(), getOutputFileEncoding(), args.isSet("hashdirs")); |
| 35 | + else return new TextStreamOutput(getCorpus(), getOutputStream(), getOutputFileEncoding()); |
33 | 36 | } catch (IOException e) { |
34 | 37 | throw new PersistenceException(e); |
35 | 38 | } |
— | — | @@ -36,7 +39,8 @@ |
37 | 40 | |
38 | 41 | @Override |
39 | 42 | protected TextExtractor newProcessor(WikiTextAnalyzer analyzer) { |
40 | | - return new TextExtractor(analyzer, output, tweaks); |
| 43 | + TextExtractor extractor = new TextExtractor(analyzer, output, tweaks); |
| 44 | + return extractor; |
41 | 45 | } |
42 | 46 | |
43 | 47 | public static void main(String[] argv) throws Exception { |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/TextExtractor.java |
— | — | @@ -1,10 +1,13 @@ |
2 | 2 | package de.brightbyte.wikiword.extract; |
3 | 3 | |
4 | 4 | import java.util.Date; |
| 5 | +import java.util.HashSet; |
| 6 | +import java.util.Set; |
5 | 7 | |
6 | 8 | import de.brightbyte.application.Arguments; |
7 | 9 | import de.brightbyte.util.PersistenceException; |
8 | 10 | import de.brightbyte.wikiword.Namespace; |
| 11 | +import de.brightbyte.wikiword.NamespaceSet; |
9 | 12 | import de.brightbyte.wikiword.ResourceType; |
10 | 13 | import de.brightbyte.wikiword.TweakSet; |
11 | 14 | import de.brightbyte.wikiword.analyzer.WikiPage; |
— | — | @@ -16,11 +19,13 @@ |
17 | 20 | |
18 | 21 | //private LocalConceptStore localConceptStore; |
19 | 22 | |
20 | | - //private boolean storeDefinitions = true; |
| 23 | + private boolean storeDefinitions; |
| 24 | + private boolean storeSynopsis; |
21 | 25 | private boolean storeWikiText; |
22 | 26 | private boolean storePlainText; |
23 | 27 | |
24 | 28 | private int textId = 0; |
| 29 | + private Set<Namespace> allowdNamespaces; |
25 | 30 | |
26 | 31 | public TextExtractor(WikiTextAnalyzer analyzer, TextOutput output, TweakSet tweaks) { |
27 | 32 | super(analyzer, output, tweaks); |
— | — | @@ -46,17 +51,18 @@ |
47 | 52 | */ |
48 | 53 | |
49 | 54 | protected boolean isRelevant(WikiPage analyzerPage) { |
50 | | - int namespace = analyzerPage.getNamespace(); |
51 | | - //CharSequence title = analyzerPage.getTitle(); |
52 | | - ResourceType type = analyzerPage.getResourceType(); |
| 55 | + Namespace namespace = analyzer.getCorpus().getNamespaces().getNamespace( analyzerPage.getNamespace() ); |
53 | 56 | |
54 | | - if (namespace!=Namespace.MAIN) { |
55 | | - out.trace("bad namespace "+namespace); |
| 57 | + if (allowdNamespaces != null && !allowdNamespaces.contains(namespace)) { |
| 58 | + out.trace("skipping page from namespace "+namespace); |
56 | 59 | return false; |
57 | 60 | } |
58 | 61 | |
| 62 | + //CharSequence title = analyzerPage.getTitle(); |
| 63 | + ResourceType type = analyzerPage.getResourceType(); |
| 64 | + |
59 | 65 | if (!storeWikiText && type!=ResourceType.ARTICLE) { |
60 | | - out.trace("bad type "+type); |
| 66 | + out.trace("skipping non-article page with type "+type); |
61 | 67 | return false; |
62 | 68 | } |
63 | 69 | |
— | — | @@ -65,7 +71,6 @@ |
66 | 72 | |
67 | 73 | @Override |
68 | 74 | public int importPage(WikiPage analyzerPage, Date timestamp) throws PersistenceException { |
69 | | - String text = analyzerPage.getText().toString(); |
70 | 75 | |
71 | 76 | ResourceType ptype = analyzerPage.getResourceType(); |
72 | 77 | String name = analyzerPage.getName().toString(); |
— | — | @@ -73,17 +78,42 @@ |
74 | 79 | //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update. |
75 | 80 | |
76 | 81 | textId ++; |
77 | | - |
| 82 | + |
78 | 83 | if (storeWikiText) { //TODO: separate access path... |
| 84 | + String text = analyzerPage.getText().toString().trim(); |
79 | 85 | output.storeRawText(textId, name, ptype, text); |
80 | 86 | } |
81 | 87 | |
82 | | - if (storePlainText) { //TODO: separate access path... |
83 | | - String plain = analyzerPage.getPlainText(false).toString(); |
84 | | - checkSmellsLikeWiki(0, plain, "plain text: "+name+" (id={0})", textId); |
85 | | - output.storePlainText(textId, name, ptype, plain); |
| 88 | + //CharSequence title = analyzerPage.getTitle(); |
| 89 | + ResourceType type = analyzerPage.getResourceType(); |
| 90 | + |
| 91 | + if (storePlainText && type==ResourceType.ARTICLE) { //TODO: separate access path... |
| 92 | + String plain = analyzerPage.getPlainText(false).toString().trim(); |
| 93 | + |
| 94 | + if (plain!=null && plain.length()>0) { |
| 95 | + checkSmellsLikeWiki(0, plain, "plain text: "+name+" (id={0})", textId); |
| 96 | + output.storePlainText(textId, name, ptype, plain); |
| 97 | + } |
86 | 98 | } |
87 | 99 | |
| 100 | + if (storeSynopsis && type==ResourceType.ARTICLE) { //TODO: separate access path... |
| 101 | + String syn = analyzerPage.getFirstParagraph().toString().trim(); |
| 102 | + |
| 103 | + if (syn!=null && syn.length()>0) { |
| 104 | + checkSmellsLikeWiki(0, syn, "definition text: "+name+" (id={0})", textId); |
| 105 | + output.storeSynopsisText(textId, name, ptype, syn); |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + if (storeDefinitions && type==ResourceType.ARTICLE) { //TODO: separate access path... |
| 110 | + String def = analyzerPage.getFirstSentence().toString().trim(); |
| 111 | + |
| 112 | + if (def!=null && def.length()>0) { |
| 113 | + checkSmellsLikeWiki(0, def, "definition text: "+name+" (id={0})", textId); |
| 114 | + output.storeDefinitionText(textId, name, ptype, def); |
| 115 | + } |
| 116 | + } |
| 117 | + |
88 | 118 | /* |
89 | 119 | if (ptype == ResourceType.ARTICLE && storeDefinitions) { |
90 | 120 | String definition = analyzerPage.getFirstSentence(); |
— | — | @@ -100,23 +130,54 @@ |
101 | 131 | public static void declareOptions(Arguments args) { |
102 | 132 | AbstractImporter.declareOptions(args); |
103 | 133 | |
104 | | - args.declare("wikitext", null, true, String.class, "store raw wiki text"); |
105 | | - args.declare("flattext", null, true, String.class, "store stripped plain text"); |
106 | | - //args.declare("defs", null, true, String.class, "extract and store definitions"); |
| 134 | + args.declare("namespaces", null, true, String.class, "list of namespaces to process. if omitted, all are processed."); |
| 135 | + args.declare("extract", null, true, String.class, "What to extract. One or more of raw, plain, or definition. Default is raw,plain"); |
107 | 136 | } |
108 | 137 | |
109 | 138 | @Override |
110 | 139 | public void configure(Arguments args) throws Exception { |
111 | 140 | super.configure(args); |
112 | 141 | |
113 | | - //this.storeDefinitions = !args.isSet("defs"); |
114 | | - storeWikiText = args.isSet("wikitext"); |
115 | | - storePlainText = args.isSet("flattext"); |
| 142 | + setNamespaceFilter( getNamespaces(args) ); |
116 | 143 | |
117 | | - if (!storeWikiText && !storePlainText) { |
118 | | - storeWikiText = true; |
119 | | - storePlainText = true; |
| 144 | + String ext = args.getOption("extract", "raw,plain").toLowerCase(); |
| 145 | + String[]ee = ext.split("[,;/|:+]"); |
| 146 | + |
| 147 | + for (String e: ee) { |
| 148 | + e = e.toLowerCase(); |
| 149 | + if (e.equals("def") || e.equals("definition")) storeDefinitions = true; |
| 150 | + else if (e.equals("synopsis") || e.equals("intro")) storeSynopsis = true; |
| 151 | + else if (e.equals("raw") || e.equals("wiki") || e.equals("wikitext")) storeWikiText= true; |
| 152 | + else if (e.equals("plain") || e.equals("flat")) storePlainText = true; |
| 153 | + else throw new IllegalArgumentException("unknown extraction aspect: "+e); |
120 | 154 | } |
| 155 | + |
121 | 156 | } |
122 | 157 | |
| 158 | + |
| 159 | + protected Set<Namespace> getNamespaces(Arguments args) { |
| 160 | + if (!args.isSet("namespaces")) return null; |
| 161 | + |
| 162 | + String s = args.getOption("namespaces", ""); |
| 163 | + String[] nn = s.split("[\\s,;:/|+]+"); |
| 164 | + if (nn.length==0) return null; |
| 165 | + |
| 166 | + NamespaceSet namespaces = analyzer.getCorpus().getNamespaces(); |
| 167 | + Set<Namespace> result = new HashSet<Namespace>(); |
| 168 | + |
| 169 | + for (String n: nn) { |
| 170 | + Namespace ns; |
| 171 | + if (n.equals("") || n.equals("*") || n.equalsIgnoreCase("main")) ns = namespaces.getNamespace(Namespace.MAIN); |
| 172 | + else ns = namespaces.getNamespace(n); |
| 173 | + |
| 174 | + result.add(ns); |
| 175 | + } |
| 176 | + |
| 177 | + return result; |
| 178 | + } |
| 179 | + |
| 180 | + public void setNamespaceFilter(Set<Namespace> namespaces) { |
| 181 | + this.allowdNamespaces = namespaces; |
| 182 | + } |
| 183 | + |
123 | 184 | } |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java |
— | — | @@ -18,7 +18,11 @@ |
19 | 19 | } |
20 | 20 | |
21 | 21 | protected URL dumpFile; |
22 | | - |
| 22 | +/* |
| 23 | + protected String getDatasetArgument() { |
| 24 | + return args.getOption("corpus", ":"); |
| 25 | + } |
| 26 | +*/ |
23 | 27 | @Override |
24 | 28 | protected boolean applyArguments() { |
25 | 29 | String d = getTargetFileName(); |
— | — | @@ -50,8 +54,10 @@ |
51 | 55 | args.declare("wiki", null, true, String.class, "sets the wiki name (overrides the name given by, or " + |
52 | 56 | "guessed from, the <wiki> parameter)"); |
53 | 57 | args.declare("url", null, false, Boolean.class, "read the <dump-file> parameter as a full URL"); |
| 58 | + |
| 59 | + args.declare("namespaces", null, true, String.class, "Only process pages in the given namespace(s)."); |
54 | 60 | } |
55 | | - |
| 61 | + |
56 | 62 | @Override |
57 | 63 | protected void run() throws Exception { |
58 | 64 | |