r53053 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r53052 | r53053 | r53054 >
Date:14:50, 10 July 2009
Author:daniel
Status:deferred
Tags:
Comment:
debug interface for ImportConcepts
Modified paths:
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ImportApp.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ImportConcepts.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ImportDump.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/InputFileHelper.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractorApp.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/XmlDumpDriver.java (modified) (history)
  • /trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DebugLocalConceptStoreBuilder.java (modified) (history)

Diff [purge]

Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java
@@ -484,7 +484,7 @@
485485 }
486486 }
487487 } else {
488 - out.info("skipped inter-namespace redirect "+rcName+" -> "+link);
 488+ warn(rcId, "bad redirect (inter-namespace)", rcName+" -> "+link, null);
489489 }
490490 }
491491 else if (name.equals(link.getPage().toString())) {
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/InputFileHelper.java
@@ -59,6 +59,27 @@
6060 }
6161 }
6262
 63+
 64+ public URL getInputURL(String d) {
 65+ return getInputURL(null, d);
 66+ }
 67+
 68+ public URL getInputURL(URL base, String n) {
 69+ try {
 70+ URL u = base == null || base.getProtocol().equals("file") ? new URL(n) : new URL(base, n);
 71+ return u;
 72+ } catch (MalformedURLException e) {
 73+ //ignore and continue
 74+ }
 75+
 76+ try {
 77+ File f = new File(n);
 78+ return f.toURI().toURL();
 79+ } catch (MalformedURLException e) {
 80+ throw new RuntimeException("failed to create file URL", e);
 81+ }
 82+ }
 83+
6384 public InputStream open(String n) throws IOException {
6485 return open(null, n);
6586 }
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ImportDump.java
@@ -1,6 +1,5 @@
22 package de.brightbyte.wikiword.builder;
33
4 -import java.io.File;
54 import java.net.MalformedURLException;
65 import java.net.URL;
76
@@ -18,31 +17,35 @@
1918 super(agendaTask, false, true);
2019 }
2120
22 - protected URL dumpFile;
 21+ private URL dumpFile;
2322
2423 @Override
2524 protected boolean applyArguments() {
2625 String d = getTargetFileName();
2726 if (d==null) return false;
2827
29 - if (args.isSet("url")) {
30 - try {
31 - dumpFile = new URL(d);
32 - } catch (MalformedURLException e) {
33 - throw new IllegalArgumentException("bad url: "+d, e);
34 - }
35 - }
36 - else {
37 - try {
38 - dumpFile = new File(d).toURI().toURL();
39 - } catch (MalformedURLException e) {
40 - throw new RuntimeException("failed to generate local file url for `"+d+"`");
41 - }
42 - }
43 -
4428 return true;
4529 }
4630
 31+ protected URL getDumpFileURL() {
 32+ String d = getTargetFileName();
 33+
 34+ if (dumpFile==null) {
 35+ if (args.isSet("url")) {
 36+ try {
 37+ dumpFile = new URL(d);
 38+ } catch (MalformedURLException e) {
 39+ throw new IllegalArgumentException("bad url: "+d, e);
 40+ }
 41+ }
 42+ else {
 43+ dumpFile = inputHelper.getInputURL(d);
 44+ }
 45+ }
 46+
 47+ return dumpFile;
 48+ }
 49+
4750 @Override
4851 protected void declareOptions() {
4952 super.declareOptions();
@@ -104,7 +107,7 @@
105108
106109 ///////////////////////// main import run ////////////////////////////////////
107110 if (agenda.beginTask("ImportDump.run", "analysis")) {
108 - DataSourceDriver driver = new XmlDumpDriver(dumpFile, getLogOutput(), tweaks);
 111+ DataSourceDriver driver = new XmlDumpDriver(getDumpFileURL(), inputHelper, getLogOutput(), tweaks);
109112
110113 importer.reset();
111114 importer.prepare();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ImportConcepts.java
@@ -2,9 +2,12 @@
33
44 import java.io.IOException;
55
 6+import de.brightbyte.io.ConsoleIO;
67 import de.brightbyte.util.PersistenceException;
 8+import de.brightbyte.wikiword.Corpus;
79 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
810 import de.brightbyte.wikiword.store.WikiWordStoreFactory;
 11+import de.brightbyte.wikiword.store.builder.DebugLocalConceptStoreBuilder;
912 import de.brightbyte.wikiword.store.builder.LocalConceptStoreBuilder;
1013 import de.brightbyte.wikiword.store.builder.PropertyStoreBuilder;
1114 import de.brightbyte.wikiword.store.builder.TextStoreBuilder;
@@ -22,7 +25,12 @@
2326 public ImportConcepts() {
2427 super("ImportConcepts");
2528 }
26 -
 29+
 30+ protected WikiWordStoreFactory<? extends LocalConceptStoreBuilder> createConceptStoreFactory() throws IOException, PersistenceException {
 31+ if (args.isSet("debug")) return new DebugLocalConceptStoreBuilder.Factory((Corpus)getConfiguredDataset(), ConsoleIO.output);
 32+ else return super.createConceptStoreFactory();
 33+ }
 34+
2735 @Override
2836 protected void createStores(WikiWordStoreFactory<? extends LocalConceptStoreBuilder> factory) throws IOException, PersistenceException {
2937 super.createStores(factory);
@@ -53,6 +61,8 @@
5462 protected void declareOptions() {
5563 super.declareOptions();
5664
 65+ args.declare("debug", null, false, Boolean.class, "debug mode, don't write to store.");
 66+
5767 ConceptImporter.declareOptions(args);
5868 }
5969
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ImportApp.java
@@ -35,7 +35,8 @@
3636
3737 protected Operation operation = null;
3838 private Monitor agendaMonitor;
39 - protected String[] baseTasks = new String[] {};
 39+ protected String[] baseTasks = new String[] {};
 40+ protected InputFileHelper inputHelper;
4041
4142 public ImportApp(String agendaTask, boolean allowGlobal, boolean allowLocal) { //TODO: agenda-params!
4243 super(allowGlobal, allowLocal);
@@ -45,7 +46,7 @@
4647
4748 @SuppressWarnings("unchecked")
4849 @Override
49 - protected WikiWordStoreFactory<S> createConceptStoreFactory() throws IOException, PersistenceException {
 50+ protected WikiWordStoreFactory<? extends S> createConceptStoreFactory() throws IOException, PersistenceException {
5051 return new DatabaseConceptStoreBuilders.Factory(getConfiguredDataSource(), getConfiguredDataset(), tweaks, null, true, true);
5152 }
5253
@@ -58,7 +59,6 @@
5960 args.declare("dbstats", null, false, Boolean.class, "calculate and dumps database table statistics");
6061 args.declare("noimport", null, false, Boolean.class, "do not import pages");
6162 args.declare("wiki", null, true, String.class, "sets the wiki name");
62 - args.declare("dummy", null, false, Boolean.class, "use a dummy store (benchmarking mode). In this case, <db-info-file> is ignored");
6363 //args.declare("buildstats", null, false, Boolean.class, "generate corpus statistics");
6464 //args.declare("noimport", null, false, Boolean.class, "do not import anything");
6565 args.declare("optimize", null, false, Boolean.class, "optimizes tables for later queries - this may take very long");
@@ -276,6 +276,13 @@
277277 }
278278
279279 @Override
 280+ protected void prepareApp() {
 281+ inputHelper = new InputFileHelper(
 282+ tweaks.getTweak("dumpdriver.externalGunzip", tweaks.getTweak("input.externalGunzip", (String)null)),
 283+ tweaks.getTweak("dumpdriver.externalBunzip", tweaks.getTweak("input.externalBunzip", (String)null)));
 284+ }
 285+
 286+ @Override
280287 protected void execute() throws Exception {
281288 boolean noimport = args.isSet("noimport");
282289 boolean dbcheck = args.isSet("dbcheck");
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractFromDump.java
@@ -6,6 +6,7 @@
77
88 import de.brightbyte.util.PersistenceException;
99 import de.brightbyte.wikiword.analyzer.WikiTextAnalyzer;
 10+import de.brightbyte.wikiword.builder.InputFileHelper;
1011 import de.brightbyte.wikiword.output.DataOutput;
1112 import de.brightbyte.wikiword.processor.DataSourceDriver;
1213 import de.brightbyte.wikiword.processor.WikiWordPageProcessor;
@@ -60,7 +61,7 @@
6162 processor.setLogOutput(getLogOutput());
6263 processor.configure(args);
6364
64 - DataSourceDriver driver = new XmlDumpDriver(dumpFile, getLogOutput(), tweaks);
 65+ DataSourceDriver driver = new XmlDumpDriver(dumpFile, inputHelper, getLogOutput(), tweaks);
6566
6667 processor.reset();
6768 processor.prepare();
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/extract/ExtractorApp.java
@@ -12,6 +12,7 @@
1313 import de.brightbyte.io.ConsoleIO;
1414 import de.brightbyte.util.PersistenceException;
1515 import de.brightbyte.wikiword.CliApp;
 16+import de.brightbyte.wikiword.builder.InputFileHelper;
1617 import de.brightbyte.wikiword.output.DataOutput;
1718
1819 /**
@@ -20,11 +21,12 @@
2122 public abstract class ExtractorApp<S extends DataOutput> extends CliApp {
2223
2324 protected S output;
 25+ protected InputFileHelper inputHelper;
2426
2527 public ExtractorApp() {
2628 super();
2729 }
28 -
 30+
2931 @Override
3032 protected void declareOptions() {
3133 super.declareOptions();
@@ -85,6 +87,10 @@
8688 protected void prepareApp() throws Exception {
8789 super.prepareApp();
8890
 91+ inputHelper = new InputFileHelper(
 92+ tweaks.getTweak("dumpdriver.externalGunzip", tweaks.getTweak("input.externalGunzip", (String)null)),
 93+ tweaks.getTweak("dumpdriver.externalBunzip", tweaks.getTweak("input.externalBunzip", (String)null)));
 94+
8995 output = createOutput();
9096 }
9197
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DebugLocalConceptStoreBuilder.java
@@ -9,8 +9,8 @@
1010 import de.brightbyte.application.Agenda;
1111 import de.brightbyte.application.Agenda.Record;
1212 import de.brightbyte.application.Agenda.State;
 13+import de.brightbyte.data.cursor.CursorProcessor;
1314 import de.brightbyte.data.cursor.DataSet;
14 -import de.brightbyte.data.cursor.CursorProcessor;
1515 import de.brightbyte.io.Output;
1616 import de.brightbyte.util.PersistenceException;
1717 import de.brightbyte.wikiword.ConceptType;
@@ -24,12 +24,27 @@
2525 import de.brightbyte.wikiword.schema.AliasScope;
2626 import de.brightbyte.wikiword.store.GroupNameTranslator;
2727 import de.brightbyte.wikiword.store.WikiWordConceptStore;
 28+import de.brightbyte.wikiword.store.WikiWordStoreFactory;
2829
2930 /**
3031 * Dummy implementation of WikiStoreBuilder for testing and debugging
3132 */
3233 public class DebugLocalConceptStoreBuilder implements LocalConceptStoreBuilder {
3334
 35+ public static class Factory implements WikiWordStoreFactory<DebugLocalConceptStoreBuilder> {
 36+ protected Output out;
 37+ protected Corpus corpus;
 38+
 39+ public Factory(Corpus corpus, Output out) {
 40+ this.out = out;
 41+ this.corpus = corpus;
 42+ }
 43+
 44+ public DebugLocalConceptStoreBuilder newStore() throws PersistenceException {
 45+ return new DebugLocalConceptStoreBuilder(corpus, out);
 46+ }
 47+ }
 48+
3449 public class DebugTextStoreBuilder implements TextStoreBuilder {
3550
3651 public void storePlainText(int rcId, String name, String text) throws PersistenceException {
@@ -408,14 +423,14 @@
409424 @Override
410425 public Record logStart(int level, String context, String task, Map<String, Object> parameters, boolean complex) {
411426 Record rec = super.logStart(level, context, task, parameters, complex);
412 - trace("+ logStart: level = "+level+", task = "+task+", parameters = "+parameters+", complex = "+complex);
 427+ log("+ logStart: level = "+level+", task = "+task+", parameters = "+parameters+", complex = "+complex);
413428 return rec;
414429 }
415430
416431 @Override
417432 public void logTerminated(int start, int end, long duration, State state, String result) {
418433 super.logTerminated(start, end, duration, state, result);
419 - trace("+ logStart: start = "+start+", end = "+end+", duration = "+duration+", state = "+state+", result = "+result);
 434+ log("+ logStart: start = "+start+", end = "+end+", duration = "+duration+", state = "+state+", result = "+result);
420435 }
421436
422437 }
@@ -436,11 +451,12 @@
437452 protected int sectionCounter = 0;
438453
439454 private Agenda agenda;
440 - private DatasetIdentifier dataset = DatasetIdentifier.forName("DEBUG", "dummy");
 455+ private DatasetIdentifier dataset;
441456
442 - public DebugLocalConceptStoreBuilder(Output out) {
 457+ public DebugLocalConceptStoreBuilder(Corpus corpus, Output out) {
443458 super();
444459 this.out = out;
 460+ this.dataset = corpus;
445461
446462 try {
447463 this.agenda = new Agenda( new DebugAgendaPersistor() );
@@ -497,13 +513,13 @@
498514
499515 public int storeConcept(int rcId, String name, ConceptType ctype) {
500516 conceptCounter++;
501 - trace("+ storeConcept: rc = "+rcId+", name = "+name+", type = "+ctype);
 517+ log("+ storeConcept: rc = "+rcId+", name = "+name+", type = "+ctype);
502518 return conceptCounter;
503519 }
504520
505521 public int storeResource(String name, ResourceType ptype, Date time) {
506522 resourceCounter++;
507 - trace("+ resourceCounter: id = "+resourceCounter+", name = "+name+", type = "+ptype+", timestamp = "+time);
 523+ log("+ storeResource: id = "+resourceCounter+", name = "+name+", type = "+ptype+", timestamp = "+time);
508524 return resourceCounter;
509525 }
510526
@@ -516,84 +532,84 @@
517533
518534 public void storeDefinition(int rcId, int conceptId, String definition) {
519535 definitionCounter++;
520 - trace("+ storeDefinition: conceptId = "+conceptId+": "+definition);
 536+ log("+ storeDefinition: conceptId = "+conceptId+": "+definition);
521537 }
522538
523539 public int storePlainText(int rcId, String text) {
524540 plainTextCounter++;
525 - trace("+ storePlainText: resource = "+rcId+": ");
526 - trace("---------------------------------");
527 - trace(text);
528 - trace("\n---------------------------------");
 541+ log("+ storePlainText: resource = "+rcId+": ");
 542+ log("---------------------------------");
 543+ log(text);
 544+ log("\n---------------------------------");
529545 return plainTextCounter;
530546 }
531547
532548 public int storeRawText(int rcId, String text) {
533549 rawTextCounter++;
534 - trace("+ storeRawText: resource = "+rcId+": ");
535 - trace("---------------------------------");
536 - trace(text);
537 - trace("\n---------------------------------");
 550+ log("+ storeRawText: resource = "+rcId+": ");
 551+ log("---------------------------------");
 552+ log(text);
 553+ log("\n---------------------------------");
538554 return rawTextCounter;
539555 }
540556
541557
542558 public void storeConceptBroader(int rcId, int narrowId, String narrowName, String broadName, ExtractionRule rule) {
543559 conceptBroaderCounter++;
544 - trace("+ storeConceptBroader: rc = "+rcId+", narrow ("+narrowId+") = "+narrowName+", broad = "+broadName+", rule = "+rule);
 560+ log("+ storeConceptBroader: rc = "+rcId+", narrow ("+narrowId+") = "+narrowName+", broad = "+broadName+", rule = "+rule);
545561 }
546562
547563 public void storeConceptBroader(int rcId, String narrowName, String broadName, ExtractionRule rule) {
548564 conceptBroaderCounter++;
549 - trace("+ storeConceptBroader: rc = "+rcId+", narrow = "+narrowName+", broad = "+broadName+", rule = "+rule);
 565+ log("+ storeConceptBroader: rc = "+rcId+", narrow = "+narrowName+", broad = "+broadName+", rule = "+rule);
550566 }
551567
552568 public void storeConceptAlias(int rcId, int left, String leftName, int right, String rightName, AliasScope scope) {
553569 conceptEquivalentCounter++;
554 - trace("+ storeConceptEquivalent: rc = "+rcId+", left ("+left+") = "+leftName+", right ("+right+") = "+rightName+", scope = "+scope);
 570+ log("+ storeConceptEquivalent: rc = "+rcId+", left ("+left+") = "+leftName+", right ("+right+") = "+rightName+", scope = "+scope);
555571 }
556572
557573 public void storeConceptReference(int rcId, int source, String sourceName, String target) {
558574 conceptReferenceCounter++;
559 - trace("+ storeConceptReference: rc = "+rcId+", source ("+source+") = "+sourceName+", target = "+target+"");
 575+ log("+ storeConceptReference: rc = "+rcId+", source ("+source+") = "+sourceName+", target = "+target+"");
560576 }
561577
562578 public void storeLanguageLink(int rcId, int concept, String conceptName, String lang, String target) {
563579 languageLinkCounter++;
564 - trace("+ storeLanguageLink: rc = "+rcId+", concept ("+concept+") = "+conceptName+", language = "+lang+", target = "+target+"");
 580+ log("+ storeLanguageLink: rc = "+rcId+", concept ("+concept+") = "+conceptName+", language = "+lang+", target = "+target+"");
565581 }
566582
567583 public void storeLink(int rcId, int anchorId, String anchorName,
568584 String term, String targetName, ExtractionRule rule) {
569585 linkCounter++;
570 - trace("+ storeTermUse: rc = "+rcId+", anchor ("+anchorId+") = "+anchorName+", term = "+term+", target = "+targetName+", rule = "+rule+"");
 586+ log("+ storeTermUse: rc = "+rcId+", anchor ("+anchorId+") = "+anchorName+", term = "+term+", target = "+targetName+", rule = "+rule+"");
571587 }
572588
573589 public void storeReference(int rcId, String term, int targetId, String targetName,
574590 ExtractionRule rule) {
575591 linkCounter++;
576 - trace("+ storeTermUse: rc = "+rcId+", target ("+targetId+") = "+targetName+", term = "+term+", rule = "+rule+"");
 592+ log("+ storeTermUse: rc = "+rcId+", target ("+targetId+") = "+targetName+", term = "+term+", rule = "+rule+"");
577593 }
578594
579595 public void storeSection(int rcId, String name, String page) {
580596 sectionCounter++;
581 - trace("+ section: rc = "+rcId+", name ("+name+") = "+page);
 597+ log("+ section: rc = "+rcId+", name ("+name+") = "+page);
582598 }
583599
584600 public void checkConsistency() {
585 - trace("* checkConsistency *");
 601+ log("* checkConsistency *");
586602 }
587603
588604 public void flush() {
589 - trace("* flush *");
 605+ log("* flush *");
590606 }
591607
592608 public void deleteDataFrom(int rcId) {
593 - trace("- delete data from resource "+rcId);
 609+ log("- delete data from resource "+rcId);
594610 }
595611
596612 public void deleteDataAfter(int rcId, boolean inclusive) {
597 - trace("- delete data after resource "+rcId);
 613+ log("- delete data after resource "+rcId);
598614 }
599615
600616 public Agenda getAgenda() {
@@ -605,7 +621,7 @@
606622 }
607623
608624 public void optimize() {
609 - trace("- optimize");
 625+ log("- optimize");
610626 }
611627
612628 public void dumpTableStats(PrintStream out, String table) {
@@ -621,10 +637,10 @@
622638 }
623639
624640 public void buildStatistics() {
625 - trace("- build stats");
 641+ log("- build stats");
626642 }
627643 public void clearStatistics() {
628 - trace("- clear stats");
 644+ log("- clear stats");
629645 }
630646
631647 public int getNumberOfWarnings() {
@@ -713,12 +729,12 @@
714730 }
715731
716732 public int storeAbout(int resource, String rcName, String conceptName) {
717 - trace("+ storeAbout: resource = "+resource+", resourceName = "+rcName+", conceptName = "+conceptName);
 733+ log("+ storeAbout: resource = "+resource+", resourceName = "+rcName+", conceptName = "+conceptName);
718734 return -1;
719735 }
720736
721737 public int storeAbout(int resource, String rcName, int concept, String conceptName) {
722 - trace("+ storeAbout: resource = "+resource+", resourceName = "+rcName+", concept = "+concept+", conceptName = "+conceptName);
 738+ log("+ storeAbout: resource = "+resource+", resourceName = "+rcName+", concept = "+concept+", conceptName = "+conceptName);
723739 return -1;
724740 }
725741
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/processor/XmlDumpDriver.java
@@ -199,33 +199,31 @@
200200
201201 }
202202
203 - public XmlDumpDriver(URL dump, LeveledOutput log, TweakSet tweaks) {
 203+ public XmlDumpDriver(URL dump, InputFileHelper inputHelper, LeveledOutput log, TweakSet tweaks) {
204204 if (dump==null) throw new NullPointerException();
205205 this.dump= dump;
206 - init(log, tweaks);
 206+ init(inputHelper, log, tweaks);
207207 }
208208
209209 public XmlDumpDriver(InputStream in, LeveledOutput log, TweakSet tweaks) {
210210 if (in==null) throw new NullPointerException();
211211 this.in= in;
212 - init(log, tweaks);
 212+ init(null, log, tweaks);
213213 }
214214
215215 private int importQueueCapacity = 0;
216216 private InputFileHelper inputHelper;
217217
218 - private void init(LeveledOutput log, TweakSet tweaks) {
 218+ private void init(InputFileHelper inputHelper, LeveledOutput log, TweakSet tweaks) {
219219 if (log==null) throw new NullPointerException();
220220 if (tweaks==null) throw new NullPointerException();
 221+ if (inputHelper==null && in==null) throw new NullPointerException();
221222
222223 this.tweaks = tweaks;
223224 this.log = log;
 225+ this.inputHelper = inputHelper;
224226
225227 importQueueCapacity = tweaks.getTweak("dumpdriver.pageImportQueue", 8);
226 -
227 - inputHelper = new InputFileHelper(
228 - tweaks.getTweak("dumpdriver.externalGunzip", tweaks.getTweak("input.externalGunzip", (String)null)),
229 - tweaks.getTweak("dumpdriver.externalBunzip", tweaks.getTweak("input.externalBunzip", (String)null)));
230228 }
231229
232230 public void run(WikiWordPageProcessor importer) throws IOException, SQLException, InterruptedException, PersistenceException {

Status & tagging log