Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/LocalConceptStoreSchema.java |
— | — | @@ -106,7 +106,7 @@ |
107 | 107 | aboutTable.addField( new ReferenceField(this, "concept", "INT", null, false, KeyType.INDEX, "concept", "id", null ) ); |
108 | 108 | aboutTable.addField( new ReferenceField(this, "concept_name", getTextType(255), null, true, KeyType.INDEX, "concept", "name", null ) ); |
109 | 109 | aboutTable.addKey( new DatabaseKey(this, KeyType.PRIMARY, "about", new String[] {"resource", "concept"}) ); |
110 | | - addTable(aliasTable); |
| 110 | + addTable(aboutTable); |
111 | 111 | |
112 | 112 | meaningTable = new RelationTable(this, "meaning", defaultTableAttributes); |
113 | 113 | //meaningTable.addField( new DatabaseField(this, "id", "INT", "AUTO_INCREMENT", false, KeyType.PRIMARY) ); |
Index: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/schema/WikiWordConceptStoreSchema.java |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | langlinkTable.addField( new DatabaseField(this, "language", getTextType(16), null, true, null ) ); |
59 | 59 | langlinkTable.addField( new DatabaseField(this, "target", getTextType(255), null, true, null ) ); |
60 | 60 | langlinkTable.addKey( new DatabaseKey(this, KeyType.INDEX, "language_target", new String[] {"language", "target"}) ); |
61 | | - langlinkTable.addKey( new DatabaseKey(this, KeyType.UNIQUE, "concept_language_target", new String[] {"concept", "language", "target"}) ); |
| 61 | + langlinkTable.addKey( new DatabaseKey(this, KeyType.INDEX, "concept_language_target", new String[] {"concept", "language", "target"}) ); |
62 | 62 | addTable(langlinkTable); |
63 | 63 | |
64 | 64 | groupStats.add( new GroupStatsSpec("concept", "type", conceptTypeCodeTranslator)); |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/ConceptImporter.java |
— | — | @@ -260,6 +260,11 @@ |
261 | 261 | List<WikiTextAnalyzer.WikiLink> links = analyzerPage.getLinks(); |
262 | 262 | linkTracker.step(links.size()); |
263 | 263 | |
| 264 | + //XXX: after resolving all aliases, change type to OTHER! |
| 265 | + //int conceptId = storeConcept(rcId, name, ConceptType.UNKNOWN); |
| 266 | + |
| 267 | + int conceptId = store.storeAbout(rcId, name); |
| 268 | + |
264 | 269 | for (WikiTextAnalyzer.WikiLink link : links) { |
265 | 270 | WikiTextAnalyzer.LinkMagic m = link.getMagic(); |
266 | 271 | |
— | — | @@ -267,6 +272,11 @@ |
268 | 273 | //FIXME: store this also as a reference to the category's concept under its original title! |
269 | 274 | storeConceptBroader(rcId, name, link.getPage().toString(), ExtractionRule.BROADER_FROM_CAT); |
270 | 275 | } |
| 276 | + |
| 277 | + if (m==WikiTextAnalyzer.LinkMagic.LANGUAGE) { |
| 278 | + //FIXME: language links point to *resource* names. resolve accordingly. |
| 279 | + storeLanguageLink(rcId, conceptId, name, link.getInterwiki().toString(), link.getTarget().toString()); |
| 280 | + } |
271 | 281 | } |
272 | 282 | |
273 | 283 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DebugLocalConceptStoreBuilder.java |
— | — | @@ -628,12 +628,14 @@ |
629 | 629 | return 0; |
630 | 630 | } |
631 | 631 | |
632 | | - public void storeAbout(int resource, String conceptName) { |
| 632 | + public int storeAbout(int resource, String conceptName) { |
633 | 633 | trace("+ storeAbout: resource = "+resource+", conceptName = "+conceptName); |
| 634 | + return -1; |
634 | 635 | } |
635 | 636 | |
636 | | - public void storeAbout(int resource, int concept, String conceptName) { |
| 637 | + public int storeAbout(int resource, int concept, String conceptName) { |
637 | 638 | trace("+ storeAbout: resource = "+resource+", concept = "+concept+", conceptName = "+conceptName); |
| 639 | + return -1; |
638 | 640 | } |
639 | 641 | |
640 | 642 | } |
\ No newline at end of file |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/DatabaseLocalConceptStoreBuilder.java |
— | — | @@ -194,8 +194,7 @@ |
195 | 195 | |
196 | 196 | |
197 | 197 | protected void deleteDataFrom(int rcId, String op) throws PersistenceException { |
198 | | - deleteDataFrom(rcId, op, definitionTable, "concept", conceptTable, "resource"); |
199 | | - //deleteDataFrom(rcId, op, conceptDescriptionTable, "concept", conceptTable, "resource"); |
| 198 | + deleteDataFrom(rcId, op, definitionTable, "concept", aboutTable, "resource"); |
200 | 199 | |
201 | 200 | deleteDataFrom(rcId, op, linkTable, "resource"); |
202 | 201 | deleteDataFrom(rcId, op, langlinkTable, "resource"); |
— | — | @@ -436,14 +435,14 @@ |
437 | 436 | /** |
438 | 437 | * @see de.brightbyte.wikiword.store.builder.LocalConceptStoreBuilder#storeAbout(int, String) |
439 | 438 | */ |
440 | | - public void storeAbout(int rcId, String conceptName) throws PersistenceException { |
441 | | - storeAbout(rcId, -1, conceptName); |
| 439 | + public int storeAbout(int rcId, String conceptName) throws PersistenceException { |
| 440 | + return storeAbout(rcId, -1, conceptName); |
442 | 441 | } |
443 | 442 | |
444 | 443 | /** |
445 | 444 | * @see de.brightbyte.wikiword.store.builder.LocalConceptStoreBuilder#storeAbout(int, int, String) |
446 | 445 | */ |
447 | | - public void storeAbout(int rcId, int concept, String conceptName) throws PersistenceException { |
| 446 | + public int storeAbout(int rcId, int concept, String conceptName) throws PersistenceException { |
448 | 447 | try { |
449 | 448 | if (rcId<0) throw new IllegalArgumentException("bad resource id "+rcId); |
450 | 449 | conceptName = checkName(rcId, conceptName, "concept name (resource #{0})", rcId); |
— | — | @@ -452,9 +451,13 @@ |
453 | 452 | aboutInserter.updateString("concept_name", conceptName); |
454 | 453 | |
455 | 454 | if (concept>0) aboutInserter.updateInt("concept", concept); |
456 | | - else if (idManager!=null) aboutInserter.updateInt("concept", idManager.aquireId(conceptName)); |
| 455 | + else if (idManager!=null) { |
| 456 | + concept = idManager.aquireId(conceptName); |
| 457 | + aboutInserter.updateInt("concept", concept); |
| 458 | + } |
457 | 459 | |
458 | 460 | aliasInserter.updateRow(); |
| 461 | + return concept; |
459 | 462 | } catch (SQLException e) { |
460 | 463 | throw new PersistenceException(e); |
461 | 464 | } |
— | — | @@ -1047,20 +1050,20 @@ |
1048 | 1051 | |
1049 | 1052 | protected int buildSectionConcepts() throws PersistenceException { |
1050 | 1053 | //NOTE: we shouldn't need the "ignore" bit. Let's keep it for robustness |
1051 | | - String sql = "INSERT ignore INTO "+conceptTable.getSQLName()+" ( id, name, type, resource, random ) " |
1052 | | - +"SELECT S.section_concept, S.section_name, "+ConceptType.UNKNOWN.getCode()+", C.resource, RAND() " |
1053 | | - +"FROM "+sectionTable.getSQLName()+" AS S " |
1054 | | - +"LEFT JOIN "+conceptTable.getSQLName()+" AS C "; |
| 1054 | + String sql = "INSERT ignore INTO "+conceptTable.getSQLName()+" ( id, name, type, random ) " |
| 1055 | + +"SELECT S.section_concept, S.section_name, "+ConceptType.UNKNOWN.getCode()+", RAND() " |
| 1056 | + +"FROM "+sectionTable.getSQLName()+" AS S "; |
1055 | 1057 | |
1056 | | - if (idManager!=null) sql += "ON S.concept = C.id "; |
1057 | | - else sql += "ON S.concept_name = C.name "; |
| 1058 | + //XXX: no more need for joining... |
| 1059 | + //if (idManager!=null) sql += "ON S.concept = C.concept "; |
| 1060 | + //else sql += "ON S.concept_name = C.concept_name "; |
1058 | 1061 | |
1059 | | - String where = "WHERE C.type IS NULL " |
1060 | | - //+"OR C.type != "+ConceptType.NONE.getCode()+" " //XXX: really allow type ALIAS? //XXX: check for NONE/BAD?! |
1061 | | - //+"GROUP BY S.section_name" |
1062 | | - ; |
| 1062 | + //String where = "WHERE C.type IS NULL " ; //WTF?! why do we need this?! |
| 1063 | + String where = ""; |
1063 | 1064 | |
1064 | 1065 | return executeUpdate("buildSectionConcepts", sql+where); //TODO: chunk if ids available?! |
| 1066 | + |
| 1067 | + //TODO: inject about records, so section concepts are linked to resources? |
1065 | 1068 | } |
1066 | 1069 | |
1067 | 1070 | protected int buildSectionBroader() throws PersistenceException { |
— | — | @@ -1078,8 +1081,8 @@ |
1079 | 1082 | |
1080 | 1083 | protected int buildMissingConcepts(DatabaseTable table, String conceptIdField, String conceptNameField) throws PersistenceException { |
1081 | 1084 | //NOTE: we shouldn't need the "ignore" bit. Let's keep it for robustness |
1082 | | - String sql = "INSERT ignore INTO "+conceptTable.getSQLName()+" ( id, name, type, resource, random ) " |
1083 | | - +"SELECT T."+conceptIdField+", T."+conceptNameField+", "+ConceptType.UNKNOWN.getCode()+", NULL, RAND() " |
| 1085 | + String sql = "INSERT ignore INTO "+conceptTable.getSQLName()+" ( id, name, type, random ) " |
| 1086 | + +"SELECT T."+conceptIdField+", T."+conceptNameField+", "+ConceptType.UNKNOWN.getCode()+", RAND() " |
1084 | 1087 | +"FROM "+table.getSQLName()+" as T " |
1085 | 1088 | +"LEFT JOIN "+conceptTable.getSQLName()+" as C "; |
1086 | 1089 | |
Index: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/store/builder/LocalConceptStoreBuilder.java |
— | — | @@ -54,10 +54,12 @@ |
55 | 55 | String sourceName, int target, String targetName, AliasScope scope) |
56 | 56 | throws PersistenceException; |
57 | 57 | |
58 | | - public abstract void storeAbout(int resource, String conceptName) |
59 | | - throws PersistenceException; |
| 58 | + /* returns concept ID, if known; -1 otherwise */ |
| 59 | + public abstract int storeAbout(int resource, String conceptName) |
| 60 | + throws PersistenceException; |
60 | 61 | |
61 | | - public abstract void storeAbout(int resource, int concept, String conceptName) |
| 62 | + /* returns concept ID, if known; -1 otherwise */ |
| 63 | + public abstract int storeAbout(int resource, int concept, String conceptName) |
62 | 64 | throws PersistenceException; |
63 | 65 | |
64 | 66 | //public abstract void storeConceptReference(int rcId, int source, |
Index: trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties |
— | — | @@ -0,0 +1,100 @@ |
| 2 | +## System config |
| 3 | +console.encoding = "UTF-8" |
| 4 | + |
| 5 | +## language handling |
| 6 | + |
| 7 | +# treat "commons" as a language code |
| 8 | +languages.commonsAsLanguage = false |
| 9 | + |
| 10 | +# treat "simple" as a language code |
| 11 | +languages.simpleAsLanguage = true |
| 12 | + |
| 13 | +## RDF Export Config |
| 14 | +# dataset URI qualifier - should uniquely identify the entity creating |
| 15 | +# the datasets (that is, YOU). This ensures unique dataset URIs for |
| 16 | +# datasets created by different people. |
| 17 | +# The default is "*" which means "unknown, don't use this publicly". |
| 18 | +rdf.dataset.qualifier = "*" |
| 19 | + |
| 20 | +## Import Driver |
| 21 | +# Run import in a thread separate from the one reading and parsing the dump. |
| 22 | +# Disabling this by setting the queue size to 0 will slightly reduce overhead on single-core systems; |
| 23 | +# Using a queue will not have as big an impact if unzippers (bunzip/gunzip) are used |
| 24 | +#dumpdriver.pageImportQueue = 8 |
| 25 | +dumpdriver.pageImportQueue = 0 |
| 26 | + |
| 27 | +# external unzippers - may boost performance, especially |
| 28 | +# on multi-core systems. The name of the file to |
| 29 | +# unzip will be appended to the command given here. Spaces |
| 30 | +# before the last / are taken to be part of the path, spaces |
| 31 | +# after the last / separate parameters. |
| 32 | +dumpdriver.externalBunzip = null |
| 33 | +dumpdriver.externalGunzip = null |
| 34 | +#dumpdriver.externalBunzip = "/bin/bunzip2 -c" |
| 35 | +#dumpdriver.externalGunzip = "/bin/gunzip -c" |
| 36 | + |
| 37 | +### Importer Output and Persistence ############ |
| 38 | +importer.progressInterval = 1000 |
| 39 | +importer.safepointInterval = 30000 |
| 40 | +#importer.safepointInterval = 1000 |
| 41 | + |
| 42 | +### Database Performance ####################### |
| 43 | +#dbstore.backgroundFlushQueue = 4 |
| 44 | +dbstore.backgroundFlushQueue = 0 |
| 45 | +dbstore.useEntityBuffer = false |
| 46 | +dbstore.useRelationBuffer = false |
| 47 | +#dbstore.useEntityBuffer = true |
| 48 | +#dbstore.useRelationBuffer = true |
| 49 | +#dbstore.insertionBufferFactor = 16 |
| 50 | +dbstore.insertionBufferFactor = 64 |
| 51 | +dbstore.engine = "MyISAM" |
| 52 | +#dbstore.engine = "InnoDB" |
| 53 | + |
| 54 | +#sql mode - see http://dev.mysql.com/doc/refman/5.1/en/server-sql-mode.html |
| 55 | +dbstore.sqlMode = "STRICT_ALL_TABLES" |
| 56 | +#dbstore.sqlMode = "STRICT_TRANS_TABLES" |
| 57 | + |
| 58 | +#NOTE: MySQL does not support 4-byte utf-8 codes. So turn everything into binary... |
| 59 | +dbstore.useBinaryText = true; |
| 60 | + |
| 61 | +#maximum size of an SQL statement, in bytes! MySQL's default: a bit below 16M bytes (!) |
| 62 | +#if not specified, mysql is asked for the current value. |
| 63 | +#dbstore.maxStatementSize = 16776192; |
| 64 | + |
| 65 | +#chunk size to use when chunking large updates by id |
| 66 | +#default is 100000, set to 0 to disable all chunking |
| 67 | +dbstore.queryChunkSize = 100000 |
| 68 | +dbstore.updateChunkSize = 100000 |
| 69 | + |
| 70 | +### Property Cache Fields ########################### |
| 71 | +#dbstore.cacheReferenceSeparator = '\u001E' |
| 72 | +#dbstore.cacheReferenceFieldSeparator = '\u001F' |
| 73 | +dbstore.listBlobSize = 65025 |
| 74 | + |
| 75 | +### ID manager ###################################### |
| 76 | +# NOTE: when using this, allow for 116 bytes plus the average size of names per ID entry. |
| 77 | +# So if you have an average name length of 12 and expect 1 million entries, |
| 78 | +# allow for about 1.3 gigabyte RAM to be used for ID caching. |
| 79 | +dbstore.idManager=false |
| 80 | +#dbstore.auxFileDir defaults to system temp dir |
| 81 | +#dbstore.auxFileDir="/tmp" |
| 82 | +dbstore.idManager.bufferSize=16384 |
| 83 | + |
| 84 | +### CycleFinder ##################################### |
| 85 | +dbstore.CycleFinder.levelWarningThreshold=32 |
| 86 | +dbstore.CycleFinder.degreeWarningThreshold=1024 |
| 87 | +dbstore.CycleFinder.maxDepth=1024 |
| 88 | + |
| 89 | +### Database Debug Output ###################### |
| 90 | +#see java.util.logging.Level for codes to use with dbstore.logLevel |
| 91 | +dbstore.logLevel = 720 |
| 92 | +dbstore.explainSQLThreashold = 0 |
| 93 | +#dbstore.explainSQLThreashold = 1000000 |
| 94 | +dbstore.slowSQLThreashold = 0 |
| 95 | +#dbstore.slowSQLThreashold = 10 |
| 96 | +#dbstore.slowSQLThreashold = 60 |
| 97 | +dbstore.traceSQL = false |
| 98 | + |
| 99 | +### Custom special purpose packages ################# |
| 100 | +#wikiword.ConfigPackages=["de.brightbyte.wikiword.wikipro","de.brightbyte.wikiword.wikipro.wikis"] |
| 101 | + |