Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/TitleSetFilter.java |
— | — | @@ -0,0 +1,65 @@ |
| 2 | +package de.brightbyte.wikiword.builder; |
| 3 | + |
| 4 | +import java.io.File; |
| 5 | +import java.io.FileInputStream; |
| 6 | +import java.io.IOException; |
| 7 | +import java.io.InputStream; |
| 8 | +import java.util.Set; |
| 9 | + |
| 10 | +import org.ardverk.collection.PatriciaTrie; |
| 11 | +import org.ardverk.collection.StringKeyAnalyzer; |
| 12 | + |
| 13 | +import de.brightbyte.data.cursor.DataCursor; |
| 14 | +import de.brightbyte.data.filter.StaticSetFilter; |
| 15 | +import de.brightbyte.io.LineCursor; |
| 16 | +import de.brightbyte.util.PersistenceException; |
| 17 | + |
| 18 | +public class TitleSetFilter extends PageTitleFilter { |
| 19 | + |
| 20 | + protected static Set<String> slurpCursor(DataCursor<String> titleCursor) throws PersistenceException { |
| 21 | + PatriciaTrie<String, Integer> trie = new PatriciaTrie<String, Integer>(new StringKeyAnalyzer()); |
| 22 | + |
| 23 | + final Integer ONE = new Integer(1); |
| 24 | + |
| 25 | + String s; |
| 26 | + while ((s = titleCursor.next()) != null) { |
| 27 | + trie.put(s, ONE); |
| 28 | + } |
| 29 | + |
| 30 | + return trie.keySet(); |
| 31 | + } |
| 32 | + |
| 33 | + protected static Set<String> slurpLines(File f, String enc) throws PersistenceException { |
| 34 | + try { |
| 35 | + InputStream in = new FileInputStream(f); |
| 36 | + LineCursor cursor = new LineCursor(in, enc); |
| 37 | + |
| 38 | + Set<String> r = slurpCursor( cursor ); |
| 39 | + |
| 40 | + cursor.close(); |
| 41 | + in.close(); |
| 42 | + |
| 43 | + return r; |
| 44 | + } catch (IOException e) { |
| 45 | + throw new PersistenceException(e); |
| 46 | + } |
| 47 | + } |
| 48 | + |
| 49 | + @SuppressWarnings("unchecked") |
| 50 | + public TitleSetFilter(String name, Set titles) { |
| 51 | + super(name, new StaticSetFilter<CharSequence>(titles)); |
| 52 | + } |
| 53 | + |
| 54 | + public TitleSetFilter(File titleFile, String enc) throws PersistenceException { |
| 55 | + this(titleFile.getName(), titleFile, enc); |
| 56 | + } |
| 57 | + |
| 58 | + public TitleSetFilter(String name, File titleFile, String enc) throws PersistenceException { |
| 59 | + this(name, slurpLines(titleFile, enc)); |
| 60 | + } |
| 61 | + |
| 62 | + public TitleSetFilter(String name, DataCursor<String> titleCursor) throws PersistenceException { |
| 63 | + this(name, slurpCursor(titleCursor)); |
| 64 | + } |
| 65 | + |
| 66 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/WikiPageFilter.java |
— | — | @@ -0,0 +1,10 @@ |
| 2 | +package de.brightbyte.wikiword.builder; |
| 3 | + |
| 4 | +import de.brightbyte.data.filter.Filter; |
| 5 | +import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
| 6 | + |
| 7 | +public interface WikiPageFilter extends Filter<WikiTextAnalyzer.WikiPage> { |
| 8 | + |
| 9 | + public String getName(); |
| 10 | + |
| 11 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/PropertyImporter.java |
— | — | @@ -12,9 +12,7 @@ |
13 | 13 | import de.brightbyte.wikiword.TweakSet; |
14 | 14 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
15 | 15 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiPage; |
16 | | -import de.brightbyte.wikiword.schema.AliasScope; |
17 | 16 | import de.brightbyte.wikiword.store.builder.LocalConceptStoreBuilder; |
18 | | -import de.brightbyte.wikiword.store.builder.PropertyStoreBuilder; |
19 | 17 | |
20 | 18 | public class PropertyImporter extends ConceptImporter { |
21 | 19 | |
— | — | @@ -42,14 +40,7 @@ |
43 | 41 | */ |
44 | 42 | |
45 | 43 | @Override |
46 | | - public int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException { |
47 | | - WikiTextAnalyzer.WikiPage analyzerPage = analyzer.makePage(namespace, title, text, forceTitleCase); |
48 | | - |
49 | | - if (!isRelevant(analyzerPage)) { |
50 | | - out.trace("ignored page "+title+" in namespace "+namespace); //XXX: trace only! |
51 | | - return -1; |
52 | | - } |
53 | | - |
| 44 | + public int importPage(WikiTextAnalyzer.WikiPage analyzerPage, Date timestamp) throws PersistenceException { |
54 | 45 | String name = analyzerPage.getConceptName(); |
55 | 46 | String rcName = analyzerPage.getResourceName(); |
56 | 47 | |
— | — | @@ -74,7 +65,8 @@ |
75 | 66 | return cid; |
76 | 67 | } |
77 | 68 | |
78 | | - private boolean isRelevant(WikiPage analyzerPage) { |
| 69 | + @Override |
| 70 | + protected boolean isRelevant(WikiPage analyzerPage) { |
79 | 71 | ResourceType t = analyzerPage.getResourceType(); |
80 | 72 | |
81 | 73 | if (t!=ResourceType.ARTICLE |
— | — | @@ -101,7 +93,7 @@ |
102 | 94 | } |
103 | 95 | |
104 | 96 | @Override |
105 | | - public void configure(Arguments args) { |
| 97 | + public void configure(Arguments args) throws Exception { |
106 | 98 | super.configure(args); |
107 | 99 | } |
108 | 100 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/TextImporter.java |
— | — | @@ -8,6 +8,7 @@ |
9 | 9 | import de.brightbyte.wikiword.ResourceType; |
10 | 10 | import de.brightbyte.wikiword.TweakSet; |
11 | 11 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
| 12 | +import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiPage; |
12 | 13 | import de.brightbyte.wikiword.store.builder.TextStoreBuilder; |
13 | 14 | |
14 | 15 | public class TextImporter extends AbstractImporter { |
— | — | @@ -46,14 +47,22 @@ |
47 | 48 | } |
48 | 49 | */ |
49 | 50 | |
| 51 | + protected boolean isRelevant(WikiPage analyzerPage) { |
| 52 | + int namespace = analyzerPage.getNamespace(); |
| 53 | + CharSequence title = analyzerPage.getTitle(); |
| 54 | + |
| 55 | + if (namespace!=Namespace.MAIN) { |
| 56 | + out.trace("ignored page "+title+" in namespace "+namespace); //XXX: trace only! |
| 57 | + return false; |
| 58 | + } |
| 59 | + |
| 60 | + return super.isRelevant(analyzerPage); |
| 61 | + } |
| 62 | + |
50 | 63 | @Override |
51 | | - public int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException { |
52 | | - if (namespace!=Namespace.MAIN) { |
53 | | - out.trace("ignored page "+title+" in namespace "+namespace); //XXX: trace only! |
54 | | - return -1; |
55 | | - } |
| 64 | + public int importPage(WikiPage analyzerPage, Date timestamp) throws PersistenceException { |
| 65 | + String text = analyzerPage.getText().toString(); |
56 | 66 | |
57 | | - WikiTextAnalyzer.WikiPage analyzerPage = analyzer.makePage(namespace, title, text, forceTitleCase); |
58 | 67 | ResourceType ptype = analyzerPage.getResourceType(); |
59 | 68 | String name = analyzerPage.getName().toString(); |
60 | 69 | |
— | — | @@ -93,7 +102,7 @@ |
94 | 103 | } |
95 | 104 | |
96 | 105 | @Override |
97 | | - public void configure(Arguments args) { |
| 106 | + public void configure(Arguments args) throws Exception { |
98 | 107 | super.configure(args); |
99 | 108 | |
100 | 109 | //this.storeDefinitions = !args.isSet("defs"); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java |
— | — | @@ -242,22 +242,14 @@ |
243 | 243 | } |
244 | 244 | |
245 | 245 | @Override |
246 | | - public int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException { |
247 | | - if (text.length()==0) { |
248 | | - out.warn("WARNING: ignored blank page "+title); |
249 | | - return -1; |
250 | | - } |
251 | | - |
252 | | - WikiTextAnalyzer.WikiPage analyzerPage = analyzer.makePage(namespace, title, text, forceTitleCase); |
| 246 | + public int importPage(WikiTextAnalyzer.WikiPage analyzerPage, Date timestamp) throws PersistenceException { |
253 | 247 | ResourceType ptype = analyzerPage.getResourceType(); |
254 | 248 | String name = analyzerPage.getConceptName(); |
255 | 249 | String rcName = analyzerPage.getResourceName(); |
| 250 | + String text = analyzerPage.getText().toString(); |
| 251 | + //int namespace = analyzerPage.getNamespace(); |
| 252 | + //String title = analyzerPage.getTitle().toString(); |
256 | 253 | |
257 | | - if (ptype==ResourceType.OTHER || ptype==ResourceType.UNKNOWN) { |
258 | | - out.trace("ignored page "+title+" in namespace "+namespace+" with type "+ptype); |
259 | | - return -1; |
260 | | - } |
261 | | - |
262 | 254 | //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update. |
263 | 255 | int rcId = storeResource(rcName, ptype, timestamp); |
264 | 256 | |
— | — | @@ -476,7 +468,7 @@ |
477 | 469 | } |
478 | 470 | |
479 | 471 | @Override |
480 | | - public void configure(Arguments args) { |
| 472 | + public void configure(Arguments args) throws Exception { |
481 | 473 | super.configure(args); |
482 | 474 | |
483 | 475 | this.storeDefinitions = !args.isSet("nodef"); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ExtractText.java |
— | — | @@ -38,39 +38,8 @@ |
39 | 39 | |
40 | 40 | args.declareHelp("<dump-file>", "the dump file to process"); |
41 | 41 | args.declare("wiki", null, true, String.class, "sets the wiki name (overrides the name given by, or " + |
42 | | - "guessed from, the <wiki-or-dump> parameter)"); |
| 42 | + "guessed from, the <dump-file> parameter)"); |
43 | 43 | } |
44 | | - |
45 | | - /*@Override |
46 | | - protected TextStoreBuilder createStore() throws IOException, PersistenceException {... |
47 | | - if (args.isSet("stream")) { |
48 | | - String n = args.getOption("stream", null); |
49 | | - OutputStream out; |
50 | | - String enc = args.getStringOption("encoding", "utf-8"); |
51 | | - |
52 | | - if (n.equals("-")) { |
53 | | - out = System.out; |
54 | | - } |
55 | | - else { |
56 | | - File f = new File(n); |
57 | | - out = new BufferedOutputStream(new FileOutputStream(f, args.isSet("append"))); |
58 | | - } |
59 | | - |
60 | | - return new PlainTextOutput(out, enc); |
61 | | - } |
62 | | - else { |
63 | | - return super.createStore(); |
64 | | - } |
65 | | - } |
66 | | - |
67 | | - @Override |
68 | | - protected TextStoreBuilder createStore(DataSource db) throws PersistenceException { |
69 | | - try { |
70 | | - return new DatabaseTextStoreBuilder(getCorpus(), db, tweaks); |
71 | | - } catch (SQLException e) { |
72 | | - throw new PersistenceException(e); |
73 | | - } |
74 | | - }*/ |
75 | 44 | |
76 | 45 | public static void main(String[] argv) throws Exception { |
77 | 46 | ExtractText app = new ExtractText(); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/WikiWordImporter.java |
— | — | @@ -80,8 +80,9 @@ |
81 | 81 | /** |
82 | 82 | * called once after the WikiWordImporter has been created, should initialize permanent options |
83 | 83 | * from command line switches. |
| 84 | + * @throws Exception |
84 | 85 | */ |
85 | | - public void configure(Arguments args); |
| 86 | + public void configure(Arguments args) throws Exception; |
86 | 87 | |
87 | 88 | /** |
88 | 89 | * Tells the WikiWordImporter to skip all pages up to the given title. |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/PageTitleFilter.java |
— | — | @@ -0,0 +1,29 @@ |
| 2 | +package de.brightbyte.wikiword.builder; |
| 3 | + |
| 4 | +import de.brightbyte.data.filter.Filter; |
| 5 | +import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
| 6 | + |
| 7 | +public class PageTitleFilter implements WikiPageFilter { |
| 8 | + protected Filter<CharSequence> filter; |
| 9 | + private String name; |
| 10 | + |
| 11 | + public PageTitleFilter(String name, Filter<CharSequence> filter) { |
| 12 | + if (filter==null) throw new NullPointerException(); |
| 13 | + this.filter = filter; |
| 14 | + this.name = name; |
| 15 | + } |
| 16 | + |
| 17 | + public boolean matches(WikiTextAnalyzer.WikiPage page) { |
| 18 | + CharSequence t = page.getTitle(); |
| 19 | + return filter.matches(t); |
| 20 | + } |
| 21 | + |
| 22 | + public String getName() { |
| 23 | + return name; |
| 24 | + } |
| 25 | + |
| 26 | + public String toString() { |
| 27 | + return getName(); |
| 28 | + } |
| 29 | + |
| 30 | +} |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/AbstractImporter.java |
— | — | @@ -1,7 +1,10 @@ |
2 | 2 | package de.brightbyte.wikiword.builder; |
3 | 3 | |
| 4 | +import java.io.File; |
4 | 5 | import java.io.IOException; |
5 | 6 | import java.text.MessageFormat; |
| 7 | +import java.util.ArrayList; |
| 8 | +import java.util.Collection; |
6 | 9 | import java.util.Date; |
7 | 10 | import java.util.Map; |
8 | 11 | import java.util.Random; |
— | — | @@ -15,10 +18,13 @@ |
16 | 19 | import de.brightbyte.job.Progress; |
17 | 20 | import de.brightbyte.job.ProgressRateTracker; |
18 | 21 | import de.brightbyte.util.PersistenceException; |
| 22 | +import de.brightbyte.util.SystemUtils; |
19 | 23 | import de.brightbyte.wikiword.NamespaceSet; |
| 24 | +import de.brightbyte.wikiword.ResourceType; |
20 | 25 | import de.brightbyte.wikiword.TweakSet; |
21 | 26 | import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer; |
22 | 27 | import de.brightbyte.wikiword.analyzer.WikiTextSniffer; |
| 28 | +import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiPage; |
23 | 29 | import de.brightbyte.wikiword.store.builder.IncrementalStoreBuilder; |
24 | 30 | import de.brightbyte.wikiword.store.builder.WikiWordStoreBuilder; |
25 | 31 | |
— | — | @@ -110,6 +116,8 @@ |
111 | 117 | protected LeveledOutput out; |
112 | 118 | |
113 | 119 | protected boolean forceTitleCase = false; //NOTE: per default, trust title case supplied by import driver! |
| 120 | + private Collection<WikiPageFilter> filters; |
| 121 | + protected String fileecoding = SystemUtils.getPropertySafely("file.encoding", "utf-8"); |
114 | 122 | |
115 | 123 | public AbstractImporter(WikiTextAnalyzer analyzer, WikiWordStoreBuilder store, TweakSet tweaks) { |
116 | 124 | if (analyzer==null) throw new NullPointerException(); |
— | — | @@ -127,6 +135,20 @@ |
128 | 136 | out = new LogOutput(); |
129 | 137 | } |
130 | 138 | |
| 139 | + public void loadTitleList(File f, String enc) throws PersistenceException { |
| 140 | + if (enc==null) enc = fileecoding; |
| 141 | + |
| 142 | + out.info("loading page title from "+f); |
| 143 | + TitleSetFilter filter = new TitleSetFilter(f, enc); |
| 144 | + addFilter(filter); |
| 145 | + } |
| 146 | + |
| 147 | + public void addFilter(WikiPageFilter filter) { |
| 148 | + if (filter==null) return; |
| 149 | + if (filters==null) filters = new ArrayList<WikiPageFilter>(); |
| 150 | + filters.add(filter); |
| 151 | + } |
| 152 | + |
131 | 153 | public void setLogLevel(int level) { |
132 | 154 | if (out == null) return; |
133 | 155 | if (!(out instanceof LogOutput)) return; |
— | — | @@ -260,8 +282,50 @@ |
261 | 283 | } |
262 | 284 | } |
263 | 285 | |
264 | | - protected abstract int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException; |
| 286 | + |
| 287 | + protected boolean isRelevant(WikiPage analyzerPage) { |
| 288 | + ResourceType ptype = analyzerPage.getResourceType(); |
| 289 | + CharSequence text = analyzerPage.getText(); |
| 290 | + int namespace = analyzerPage.getNamespace(); |
| 291 | + CharSequence title = analyzerPage.getTitle(); |
265 | 292 | |
| 293 | + if (text.length()==0) { |
| 294 | + out.warn("blank page "+title); |
| 295 | + return false; |
| 296 | + } |
| 297 | + |
| 298 | + if (ptype==ResourceType.OTHER || ptype==ResourceType.UNKNOWN) { |
| 299 | + out.trace("bad page "+title+" in namespace "+namespace+" with type "+ptype); |
| 300 | + return false; |
| 301 | + } |
| 302 | + |
| 303 | + if (filters!=null) { |
| 304 | + for (WikiPageFilter filter: filters) { |
| 305 | + if (!filter.matches(analyzerPage)) { |
| 306 | + out.trace("page "+title+" matches filter "+filter.getName()); |
| 307 | + return false; |
| 308 | + } |
| 309 | + } |
| 310 | + } |
| 311 | + |
| 312 | + return true; |
| 313 | + } |
| 314 | + |
| 315 | + protected final int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException { |
| 316 | + WikiTextAnalyzer.WikiPage page = analyzer.makePage(namespace, title, text, forceTitleCase); |
| 317 | + |
| 318 | + //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update. |
| 319 | + |
| 320 | + if (!isRelevant(page)) { |
| 321 | + out.trace("ignored page "+title+" in namespace "+namespace); |
| 322 | + return -1; |
| 323 | + } |
| 324 | + |
| 325 | + return importPage(page, timestamp); |
| 326 | + } |
| 327 | + |
| 328 | + protected abstract int importPage(WikiTextAnalyzer.WikiPage page, Date timestamp) throws PersistenceException; |
| 329 | + |
266 | 330 | public int getProgressInterval() { |
267 | 331 | return progressInterval; |
268 | 332 | } |
— | — | @@ -292,7 +356,10 @@ |
293 | 357 | "can be used to reproduce random sets of pages."); |
294 | 358 | |
295 | 359 | args.declare("catchdupes", null, false, Boolean.class, "catch and ignore duplicates (uses more memory)"); |
296 | | - |
| 360 | + |
| 361 | + args.declare("titlelist", null, true, String.class, "file containing a list if page titles to filter by, one per line"); |
| 362 | + args.declare("fileencoding", null, true, String.class, "encoding to use when reading files"); |
| 363 | + |
297 | 364 | //args.declare("nodef", null, true, String.class, "do not extract and store definitions (improves speed)"); |
298 | 365 | //args.declare("nolinks", null, true, String.class, "do not store links between pages (improves speed)"); |
299 | 366 | //args.declare("noterms", null, true, String.class, "do not store term usage"); |
— | — | @@ -300,7 +367,7 @@ |
301 | 368 | //args.declare("plaintext", null, true, String.class, "store full stripped text"); |
302 | 369 | } |
303 | 370 | |
304 | | - public void configure(Arguments args) { |
| 371 | + public void configure(Arguments args) throws Exception { |
305 | 372 | if (args.isSet("from")) { |
306 | 373 | this.skipTo = args.getStringOption("from", null); |
307 | 374 | } |
— | — | @@ -325,6 +392,13 @@ |
326 | 393 | if (args.isSet("catchdupes")) { |
327 | 394 | if (this.stoplist==null) this.stoplist = new ChunkyBitSet(); |
328 | 395 | } |
| 396 | + |
| 397 | + fileecoding = args.getStringOption("fileencoding", "utf-8"); |
| 398 | + |
| 399 | + if (args.isSet("titlelist")) { |
| 400 | + String f = args.getStringOption("titlelist", null); |
| 401 | + loadTitleList(new File(f), fileecoding); |
| 402 | + } |
329 | 403 | } |
330 | 404 | |
331 | 405 | public void initialize(NamespaceSet namespaces, boolean titleCase) { |