r49815 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r49814‎ | r49815 | r49816 >
Date:13:55, 24 April 2009
Author:daniel
Status:deferred
Tags:
Comment:
filter infrastructure
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/AbstractImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ExtractText.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/PageTitleFilter.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/PropertyImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/TextImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/TitleSetFilter.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/WikiPageFilter.java (added) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/WikiWordImporter.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/TitleSetFilter.java
@@ -0,0 +1,65 @@
 2+package de.brightbyte.wikiword.builder;
 3+
 4+import java.io.File;
 5+import java.io.FileInputStream;
 6+import java.io.IOException;
 7+import java.io.InputStream;
 8+import java.util.Set;
 9+
 10+import org.ardverk.collection.PatriciaTrie;
 11+import org.ardverk.collection.StringKeyAnalyzer;
 12+
 13+import de.brightbyte.data.cursor.DataCursor;
 14+import de.brightbyte.data.filter.StaticSetFilter;
 15+import de.brightbyte.io.LineCursor;
 16+import de.brightbyte.util.PersistenceException;
 17+
 18+public class TitleSetFilter extends PageTitleFilter {
 19+
 20+ protected static Set<String> slurpCursor(DataCursor<String> titleCursor) throws PersistenceException {
 21+ PatriciaTrie<String, Integer> trie = new PatriciaTrie<String, Integer>(new StringKeyAnalyzer());
 22+
 23+ final Integer ONE = new Integer(1);
 24+
 25+ String s;
 26+ while ((s = titleCursor.next()) != null) {
 27+ trie.put(s, ONE);
 28+ }
 29+
 30+ return trie.keySet();
 31+ }
 32+
 33+ protected static Set<String> slurpLines(File f, String enc) throws PersistenceException {
 34+ try {
 35+ InputStream in = new FileInputStream(f);
 36+ LineCursor cursor = new LineCursor(in, enc);
 37+
 38+ Set<String> r = slurpCursor( cursor );
 39+
 40+ cursor.close();
 41+ in.close();
 42+
 43+ return r;
 44+ } catch (IOException e) {
 45+ throw new PersistenceException(e);
 46+ }
 47+ }
 48+
 49+ @SuppressWarnings("unchecked")
 50+ public TitleSetFilter(String name, Set titles) {
 51+ super(name, new StaticSetFilter<CharSequence>(titles));
 52+ }
 53+
 54+ public TitleSetFilter(File titleFile, String enc) throws PersistenceException {
 55+ this(titleFile.getName(), titleFile, enc);
 56+ }
 57+
 58+ public TitleSetFilter(String name, File titleFile, String enc) throws PersistenceException {
 59+ this(name, slurpLines(titleFile, enc));
 60+ }
 61+
 62+ public TitleSetFilter(String name, DataCursor<String> titleCursor) throws PersistenceException {
 63+ this(name, slurpCursor(titleCursor));
 64+ }
 65+
 66+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/WikiPageFilter.java
@@ -0,0 +1,10 @@
 2+package de.brightbyte.wikiword.builder;
 3+
 4+import de.brightbyte.data.filter.Filter;
 5+import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
 6+
 7+public interface WikiPageFilter extends Filter<WikiTextAnalyzer.WikiPage> {
 8+
 9+ public String getName();
 10+
 11+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/PropertyImporter.java
@@ -12,9 +12,7 @@
1313 import de.brightbyte.wikiword.TweakSet;
1414 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
1515 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiPage;
16 -import de.brightbyte.wikiword.schema.AliasScope;
1716 import de.brightbyte.wikiword.store.builder.LocalConceptStoreBuilder;
18 -import de.brightbyte.wikiword.store.builder.PropertyStoreBuilder;
1917
2018 public class PropertyImporter extends ConceptImporter {
2119
@@ -42,14 +40,7 @@
4341 */
4442
4543 @Override
46 - public int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException {
47 - WikiTextAnalyzer.WikiPage analyzerPage = analyzer.makePage(namespace, title, text, forceTitleCase);
48 -
49 - if (!isRelevant(analyzerPage)) {
50 - out.trace("ignored page "+title+" in namespace "+namespace); //XXX: trace only!
51 - return -1;
52 - }
53 -
 44+ public int importPage(WikiTextAnalyzer.WikiPage analyzerPage, Date timestamp) throws PersistenceException {
5445 String name = analyzerPage.getConceptName();
5546 String rcName = analyzerPage.getResourceName();
5647
@@ -74,7 +65,8 @@
7566 return cid;
7667 }
7768
78 - private boolean isRelevant(WikiPage analyzerPage) {
 69+ @Override
 70+ protected boolean isRelevant(WikiPage analyzerPage) {
7971 ResourceType t = analyzerPage.getResourceType();
8072
8173 if (t!=ResourceType.ARTICLE
@@ -101,7 +93,7 @@
10294 }
10395
10496 @Override
105 - public void configure(Arguments args) {
 97+ public void configure(Arguments args) throws Exception {
10698 super.configure(args);
10799 }
108100
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/TextImporter.java
@@ -8,6 +8,7 @@
99 import de.brightbyte.wikiword.ResourceType;
1010 import de.brightbyte.wikiword.TweakSet;
1111 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
 12+import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiPage;
1213 import de.brightbyte.wikiword.store.builder.TextStoreBuilder;
1314
1415 public class TextImporter extends AbstractImporter {
@@ -46,14 +47,22 @@
4748 }
4849 */
4950
 51+ protected boolean isRelevant(WikiPage analyzerPage) {
 52+ int namespace = analyzerPage.getNamespace();
 53+ CharSequence title = analyzerPage.getTitle();
 54+
 55+ if (namespace!=Namespace.MAIN) {
 56+ out.trace("ignored page "+title+" in namespace "+namespace); //XXX: trace only!
 57+ return false;
 58+ }
 59+
 60+ return super.isRelevant(analyzerPage);
 61+ }
 62+
5063 @Override
51 - public int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException {
52 - if (namespace!=Namespace.MAIN) {
53 - out.trace("ignored page "+title+" in namespace "+namespace); //XXX: trace only!
54 - return -1;
55 - }
 64+ public int importPage(WikiPage analyzerPage, Date timestamp) throws PersistenceException {
 65+ String text = analyzerPage.getText().toString();
5666
57 - WikiTextAnalyzer.WikiPage analyzerPage = analyzer.makePage(namespace, title, text, forceTitleCase);
5867 ResourceType ptype = analyzerPage.getResourceType();
5968 String name = analyzerPage.getName().toString();
6069
@@ -93,7 +102,7 @@
94103 }
95104
96105 @Override
97 - public void configure(Arguments args) {
 106+ public void configure(Arguments args) throws Exception {
98107 super.configure(args);
99108
100109 //this.storeDefinitions = !args.isSet("defs");
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java
@@ -242,22 +242,14 @@
243243 }
244244
245245 @Override
246 - public int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException {
247 - if (text.length()==0) {
248 - out.warn("WARNING: ignored blank page "+title);
249 - return -1;
250 - }
251 -
252 - WikiTextAnalyzer.WikiPage analyzerPage = analyzer.makePage(namespace, title, text, forceTitleCase);
 246+ public int importPage(WikiTextAnalyzer.WikiPage analyzerPage, Date timestamp) throws PersistenceException {
253247 ResourceType ptype = analyzerPage.getResourceType();
254248 String name = analyzerPage.getConceptName();
255249 String rcName = analyzerPage.getResourceName();
 250+ String text = analyzerPage.getText().toString();
 251+ //int namespace = analyzerPage.getNamespace();
 252+ //String title = analyzerPage.getTitle().toString();
256253
257 - if (ptype==ResourceType.OTHER || ptype==ResourceType.UNKNOWN) {
258 - out.trace("ignored page "+title+" in namespace "+namespace+" with type "+ptype);
259 - return -1;
260 - }
261 -
262254 //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update.
263255 int rcId = storeResource(rcName, ptype, timestamp);
264256
@@ -476,7 +468,7 @@
477469 }
478470
479471 @Override
480 - public void configure(Arguments args) {
 472+ public void configure(Arguments args) throws Exception {
481473 super.configure(args);
482474
483475 this.storeDefinitions = !args.isSet("nodef");
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ExtractText.java
@@ -38,39 +38,8 @@
3939
4040 args.declareHelp("<dump-file>", "the dump file to process");
4141 args.declare("wiki", null, true, String.class, "sets the wiki name (overrides the name given by, or " +
42 - "guessed from, the <wiki-or-dump> parameter)");
 42+ "guessed from, the <dump-file> parameter)");
4343 }
44 -
45 - /*@Override
46 - protected TextStoreBuilder createStore() throws IOException, PersistenceException {...
47 - if (args.isSet("stream")) {
48 - String n = args.getOption("stream", null);
49 - OutputStream out;
50 - String enc = args.getStringOption("encoding", "utf-8");
51 -
52 - if (n.equals("-")) {
53 - out = System.out;
54 - }
55 - else {
56 - File f = new File(n);
57 - out = new BufferedOutputStream(new FileOutputStream(f, args.isSet("append")));
58 - }
59 -
60 - return new PlainTextOutput(out, enc);
61 - }
62 - else {
63 - return super.createStore();
64 - }
65 - }
66 -
67 - @Override
68 - protected TextStoreBuilder createStore(DataSource db) throws PersistenceException {
69 - try {
70 - return new DatabaseTextStoreBuilder(getCorpus(), db, tweaks);
71 - } catch (SQLException e) {
72 - throw new PersistenceException(e);
73 - }
74 - }*/
7544
7645 public static void main(String[] argv) throws Exception {
7746 ExtractText app = new ExtractText();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/WikiWordImporter.java
@@ -80,8 +80,9 @@
8181 /**
8282 * called once after the WikiWordImporter has been created, should initialize permanent options
8383 * from command line switches.
 84+ * @throws Exception
8485 */
85 - public void configure(Arguments args);
 86+ public void configure(Arguments args) throws Exception;
8687
8788 /**
8889 * Tells the WikiWordImporter to skip all pages up to the given title.
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/PageTitleFilter.java
@@ -0,0 +1,29 @@
 2+package de.brightbyte.wikiword.builder;
 3+
 4+import de.brightbyte.data.filter.Filter;
 5+import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
 6+
 7+public class PageTitleFilter implements WikiPageFilter {
 8+ protected Filter<CharSequence> filter;
 9+ private String name;
 10+
 11+ public PageTitleFilter(String name, Filter<CharSequence> filter) {
 12+ if (filter==null) throw new NullPointerException();
 13+ this.filter = filter;
 14+ this.name = name;
 15+ }
 16+
 17+ public boolean matches(WikiTextAnalyzer.WikiPage page) {
 18+ CharSequence t = page.getTitle();
 19+ return filter.matches(t);
 20+ }
 21+
 22+ public String getName() {
 23+ return name;
 24+ }
 25+
 26+ public String toString() {
 27+ return getName();
 28+ }
 29+
 30+}
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/AbstractImporter.java
@@ -1,7 +1,10 @@
22 package de.brightbyte.wikiword.builder;
33
 4+import java.io.File;
45 import java.io.IOException;
56 import java.text.MessageFormat;
 7+import java.util.ArrayList;
 8+import java.util.Collection;
69 import java.util.Date;
710 import java.util.Map;
811 import java.util.Random;
@@ -15,10 +18,13 @@
1619 import de.brightbyte.job.Progress;
1720 import de.brightbyte.job.ProgressRateTracker;
1821 import de.brightbyte.util.PersistenceException;
 22+import de.brightbyte.util.SystemUtils;
1923 import de.brightbyte.wikiword.NamespaceSet;
 24+import de.brightbyte.wikiword.ResourceType;
2025 import de.brightbyte.wikiword.TweakSet;
2126 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
2227 import de.brightbyte.wikiword.analyzer.WikiTextSniffer;
 28+import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer.WikiPage;
2329 import de.brightbyte.wikiword.store.builder.IncrementalStoreBuilder;
2430 import de.brightbyte.wikiword.store.builder.WikiWordStoreBuilder;
2531
@@ -110,6 +116,8 @@
111117 protected LeveledOutput out;
112118
113119 protected boolean forceTitleCase = false; //NOTE: per default, trust title case supplied by import driver!
 120+ private Collection<WikiPageFilter> filters;
 121+ protected String fileecoding = SystemUtils.getPropertySafely("file.encoding", "utf-8");
114122
115123 public AbstractImporter(WikiTextAnalyzer analyzer, WikiWordStoreBuilder store, TweakSet tweaks) {
116124 if (analyzer==null) throw new NullPointerException();
@@ -127,6 +135,20 @@
128136 out = new LogOutput();
129137 }
130138
 139+ public void loadTitleList(File f, String enc) throws PersistenceException {
 140+ if (enc==null) enc = fileecoding;
 141+
 142+ out.info("loading page title from "+f);
 143+ TitleSetFilter filter = new TitleSetFilter(f, enc);
 144+ addFilter(filter);
 145+ }
 146+
 147+ public void addFilter(WikiPageFilter filter) {
 148+ if (filter==null) return;
 149+ if (filters==null) filters = new ArrayList<WikiPageFilter>();
 150+ filters.add(filter);
 151+ }
 152+
131153 public void setLogLevel(int level) {
132154 if (out == null) return;
133155 if (!(out instanceof LogOutput)) return;
@@ -260,8 +282,50 @@
261283 }
262284 }
263285
264 - protected abstract int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException;
 286+
 287+ protected boolean isRelevant(WikiPage analyzerPage) {
 288+ ResourceType ptype = analyzerPage.getResourceType();
 289+ CharSequence text = analyzerPage.getText();
 290+ int namespace = analyzerPage.getNamespace();
 291+ CharSequence title = analyzerPage.getTitle();
265292
 293+ if (text.length()==0) {
 294+ out.warn("blank page "+title);
 295+ return false;
 296+ }
 297+
 298+ if (ptype==ResourceType.OTHER || ptype==ResourceType.UNKNOWN) {
 299+ out.trace("bad page "+title+" in namespace "+namespace+" with type "+ptype);
 300+ return false;
 301+ }
 302+
 303+ if (filters!=null) {
 304+ for (WikiPageFilter filter: filters) {
 305+ if (!filter.matches(analyzerPage)) {
 306+ out.trace("page "+title+" matches filter "+filter.getName());
 307+ return false;
 308+ }
 309+ }
 310+ }
 311+
 312+ return true;
 313+ }
 314+
 315+ protected final int importPage(int namespace, String title, String text, Date timestamp) throws PersistenceException {
 316+ WikiTextAnalyzer.WikiPage page = analyzer.makePage(namespace, title, text, forceTitleCase);
 317+
 318+ //TODO: check if page is stored. if up to date, skip. if older, update. if missing, create. optionally force update.
 319+
 320+ if (!isRelevant(page)) {
 321+ out.trace("ignored page "+title+" in namespace "+namespace);
 322+ return -1;
 323+ }
 324+
 325+ return importPage(page, timestamp);
 326+ }
 327+
 328+ protected abstract int importPage(WikiTextAnalyzer.WikiPage page, Date timestamp) throws PersistenceException;
 329+
266330 public int getProgressInterval() {
267331 return progressInterval;
268332 }
@@ -292,7 +356,10 @@
293357 "can be used to reproduce random sets of pages.");
294358
295359 args.declare("catchdupes", null, false, Boolean.class, "catch and ignore duplicates (uses more memory)");
296 -
 360+
 361+ args.declare("titlelist", null, true, String.class, "file containing a list if page titles to filter by, one per line");
 362+ args.declare("fileencoding", null, true, String.class, "encoding to use when reading files");
 363+
297364 //args.declare("nodef", null, true, String.class, "do not extract and store definitions (improves speed)");
298365 //args.declare("nolinks", null, true, String.class, "do not store links between pages (improves speed)");
299366 //args.declare("noterms", null, true, String.class, "do not store term usage");
@@ -300,7 +367,7 @@
301368 //args.declare("plaintext", null, true, String.class, "store full stripped text");
302369 }
303370
304 - public void configure(Arguments args) {
 371+ public void configure(Arguments args) throws Exception {
305372 if (args.isSet("from")) {
306373 this.skipTo = args.getStringOption("from", null);
307374 }
@@ -325,6 +392,13 @@
326393 if (args.isSet("catchdupes")) {
327394 if (this.stoplist==null) this.stoplist = new ChunkyBitSet();
328395 }
 396+
 397+ fileecoding = args.getStringOption("fileencoding", "utf-8");
 398+
 399+ if (args.isSet("titlelist")) {
 400+ String f = args.getStringOption("titlelist", null);
 401+ loadTitleList(new File(f), fileecoding);
 402+ }
329403 }
330404
331405 public void initialize(NamespaceSet namespaces, boolean titleCase) {

Status & tagging log