Index: trunk/extensions/Wikidata/perl-tools/Import WiktionaryZ.pl |
— | — | @@ -1,23 +1,23 @@ |
2 | | -use WiktionaryZ; |
| 2 | +use OmegaWiki; |
3 | 3 | use POSIX qw(strftime); |
4 | 4 | |
5 | 5 | my $startTime = time; |
6 | 6 | |
7 | | -# Example usage to import UMLS completely into an existing WiktionaryZ database: |
8 | | -# my $importer=new WiktionaryZ('wikidatadb','root','MyPass'); |
| 7 | +# Example usage to import UMLS completely into an existing OmegaWiki database: |
| 8 | +# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
9 | 9 | # $importer->setSourceDB('umls'); |
10 | 10 | # $importer->initialize; |
11 | 11 | # $importer->importCompleteUMLS(); |
12 | 12 | |
13 | | -# Example usage to import a part of UMLS into an existing WiktionaryZ database: |
14 | | -# my $importer=new WiktionaryZ('wikidatadb','root','MyPass'); |
| 13 | +# Example usage to import a part of UMLS into an existing OmegaWiki database: |
| 14 | +# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
15 | 15 | # $importer->setSourceDB('umls'); |
16 | 16 | # $importer->initialize; |
17 | 17 | # my %sourceAbbreviations = $importer->loadSourceAbbreviations(); |
18 | 18 | # delete($sourceAbbreviations{"MSH"}); |
19 | 19 | # $importer->importUMLS(\%sourceAbbreviations); |
20 | 20 | |
21 | | -my $importer=new WiktionaryZ('wikidata_icpc','root',''); |
| 21 | +my $importer=new OmegaWiki('wikidata_icpc','root',''); |
22 | 22 | $importer->setSourceDB('umls'); |
23 | 23 | #$importer->setSourceDB('swissprot'); |
24 | 24 | $importer->initialize; |
Index: trunk/extensions/Wikidata/perl-tools/WiktionaryZ.pm |
— | — | @@ -1,6 +1,6 @@ |
2 | | -# Example usage to import UMLS into an existing WiktionaryZ database: |
3 | | -# use WiktionaryZ; |
4 | | -# my $importer=new WiktionaryZ('wikidatadb','root','MyPass'); |
| 2 | +# Example usage to import UMLS into an existing OmegaWiki database: |
| 3 | +# use OmegaWiki; |
| 4 | +# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
5 | 5 | # $importer->setSourceDB('umls'); |
6 | 6 | # $importer->initialize; |
7 | 7 | # $importer->importCompleteUMLS(); |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | # Fully deal with alternative definitions referring to the same concept |
19 | 19 | # Deal with preferred lexical expressions, primary concepts (general weighting mechanism?) |
20 | 20 | |
21 | | -package WiktionaryZ; |
| 21 | +package OmegaWiki; |
22 | 22 | use DBI; |
23 | 23 | use Encode; |
24 | 24 | use POSIX qw(strftime); |
Index: trunk/extensions/Wikidata/perl-tools/tbx-export.pl |
— | — | @@ -0,0 +1,201 @@ |
| 2 | +#!/usr/bin/perl
|
| 3 | +use strict;
|
| 4 | +use warnings;
|
| 5 | +use DBI;
|
| 6 | +use XML::Writer;
|
| 7 | +use IO::File;
|
| 8 | +use utf8;
|
| 9 | +use Encode;
|
| 10 | +
|
#Edit here to setup your database connection information
my $dbUser = "root";
my $dbPass = "";
my $dbName = "wiki";
my $dbHost = "127.0.0.1";

#remember when the export started (epoch seconds) and echo it, so the
#elapsed time can be reported at the end of the run
my $start = time();
print $start . "\n";

#create and start the xml document
#IO::File->new returns undef when the file cannot be opened; stop here
#rather than handing an undefined handle to XML::Writer
my $output = IO::File->new("wiktionaryz.xml", "w")
    or die("Could not open wiktionaryz.xml for writing: " . $!);
my $xmlDoc = XML::Writer->new(OUTPUT      => $output,
                              DATA_MODE   => 1,
                              DATA_INDENT => 4);
$xmlDoc->xmlDecl("UTF-8");
$xmlDoc->doctype("martif", "ISO 12200:1999A//DTD MARTIF core (DXFcdV04)//EN", "TBXcdv04.dtd");

#build the tbx header information
#the martif, text and body elements opened here are closed at the very end
#of the script, after all term entries have been written
$xmlDoc->startTag("martif", "type"=>"TBX", "xml:lang"=>"en");
$xmlDoc->startTag("martifHeader");
$xmlDoc->startTag("fileDesc");
$xmlDoc->startTag("titleStmt");
$xmlDoc->dataElement("title", "Terminological Data");
$xmlDoc->endTag("titleStmt");
$xmlDoc->startTag("sourceDesc");
#gmtime() in scalar context yields a human readable UTC timestamp
$xmlDoc->dataElement("p", "from the OmegaWiki Database " . scalar(gmtime()));
$xmlDoc->endTag("sourceDesc");
$xmlDoc->endTag("fileDesc");
$xmlDoc->startTag("encodingDesc");
$xmlDoc->dataElement("p", "SYSTEM TBXDCSv05b.xml", "type"=>"DCSName");
$xmlDoc->endTag("encodingDesc");
$xmlDoc->endTag("martifHeader");
$xmlDoc->startTag("text");
$xmlDoc->startTag("body");

#connect to the MySQL database configured above (port 3306)
my $dbh = DBI->connect('dbi:mysql:database=' . $dbName . ';hostname=' . $dbHost . ';port=3306',
                       $dbUser,
                       $dbPass);
if (!$dbh) {
    die("Could not connect to database: " . $DBI::errstr);
}
|
| 51 | +
|
| 52 | +#uw_expression_ns - contains term and language
|
| 53 | +#uw_meaning_relations - meaning relations meaning1_mid and meaning2_mid and relation_mid are ids in uw_expression_ns
|
| 54 | +#uw_syntrans - collections
|
| 55 | +#The definitions are in the TEXT table. You will find the keys to the
|
| 56 | +#table using the TRANSLATED_CONTENT table. Each set of languages is
|
| 57 | +#identified by a SET_ID, which we also refer to as a "Translated Content
|
| 58 | +#ID" (TCID) from the UW_DEFINED_MEANING table. Using a TCID, you can find
|
| 59 | +#the different TEXT_IDs for each LANGUAGE_ID, and then get the actual
|
| 60 | +#texts from the TEXT table.
|
| 61 | +
|
#get all the distinct concepts (sets)
#@arrCollections ends up holding one defined_meaning_id per concept found
#in uw_syntrans
my @arrCollections;
my $sth = $dbh->prepare('SELECT DISTINCT(defined_meaning_id) FROM uw_syntrans');
$sth->execute();
while (my $row = $sth->fetchrow_arrayref()) {
    #each row has a single column: the defined meaning id
    push @arrCollections, $row->[0];
}
$sth->finish();
|
| 70 | +
|
#get the expression_id that pertains to each set
#
#For every id in @arrCollections this looks up the expressions of the set and
#writes one termEntry. Inside it, a langSet is opened whenever the language
#key changes, and each expression contributes its definitions, its relations
#and an ntig holding the term itself.
{
    #The fixed queries are prepared once and re-executed with bound values,
    #instead of building and preparing a new SQL string for every row.

    #NOTE(review): the ids in @arrCollections are read from the
    #defined_meaning_id column of uw_syntrans but are matched against set_id
    #here - confirm which column name the schema actually has.
    my $sth_coll = $dbh->prepare('SELECT expression_id FROM uw_syntrans WHERE set_id=?');

    #definition texts of an expression in a given language
    my $def_sth = $dbh->prepare(
        "SELECT old_text FROM uw_defined_meaning " .
        "LEFT JOIN translated_content ON meaning_text_tcid=set_id " .
        "LEFT JOIN `text` ON text_id=old_id " .
        "WHERE expression_id=? AND language_id=?");

    #relations that start at an expression, with the spelling of the target
    my $rel_sth = $dbh->prepare(
        "SELECT meaning2_mid, relationtype_mid, spelling " .
        "FROM uw_meaning_relations " .
        "LEFT JOIN uw_expression_ns ON meaning2_mid=expression_id " .
        "WHERE meaning1_mid=?");

    #relationtype_mid => [ descrip type, whether a target attribute is written ]
    #relation types that are not listed here are not exported
    my %descripFor = (
        2 => [ "subjectField",           0 ],
        3 => [ "subjectField",           0 ],
        4 => [ "relatedConceptBroader",  1 ],
        5 => [ "relatedConceptBroader",  1 ],
        6 => [ "relatedConceptNarrower", 1 ],
        7 => [ "relatedConceptNarrower", 1 ],
        8 => [ "relatedConcept",         1 ],
        9 => [ "relatedConcept",         1 ],
    );

    foreach my $conceptID (@arrCollections) {
        my @arrExpressions;
        $sth_coll->execute($conceptID);
        while (my ($exprID) = $sth_coll->fetchrow_array()) {
            push(@arrExpressions, $exprID);
        }
        $sth_coll->finish();

        #check to see if there are expressions in the set
        #(the previous test, $#arrExpressions > 0, also skipped sets that
        #hold exactly one expression; an empty set must still be skipped
        #because "IN ()" is not valid SQL)
        next unless @arrExpressions;

        #get the information about all of the expressions in this set
        #NOTE(review): GROUP BY keeps a single row per language, so further
        #expressions in the same language are not exported - confirm that
        #this is intended.
        my $placeholders = join(", ", ("?") x @arrExpressions);
        my $expr_sth = $dbh->prepare(
            "SELECT * FROM uw_expression_ns " .
            "LEFT JOIN language ON uw_expression_ns.language_id=language.language_id " .
            "WHERE expression_id IN (" . $placeholders . ") " .
            "GROUP BY uw_expression_ns.language_id");
        $expr_sth->execute(@arrExpressions);

        my $sCurrLang    = "";
        my $bLangSetOpen = 0;    #true once the termEntry and a langSet are open
        while (my $result = $expr_sth->fetchrow_hashref()) {
            my $sLangID = $result->{language_id};
            my $sID     = $result->{expression_id};
            my $sTerm   = $result->{spelling};
            #the LEFT JOIN yields NULL when the language row is missing
            my $sLang   = defined $result->{wikimedia_key} ? $result->{wikimedia_key} : "";

            if (!$bLangSetOpen) {
                #first row: start the concept and its first langSet; doing it
                #here means no empty termEntry is written when the query
                #returns nothing
                $xmlDoc->startTag("termEntry", "id"=>"c".$conceptID);
                $xmlDoc->startTag("langSet", "xml:lang" => $sLang);
                $sCurrLang    = $sLang;
                $bLangSetOpen = 1;
            }
            elsif ($sCurrLang ne $sLang) {
                #new language: close the last langSet and open a new one
                $xmlDoc->endTag("langSet");
                $xmlDoc->startTag("langSet", "xml:lang" => $sLang);
                $sCurrLang = $sLang;
            }

            #print out language level information
            #get the definition
            $def_sth->execute($sID, $sLangID);
            while (my ($definition) = $def_sth->fetchrow_array()) {
                #old_text is NULL when the LEFT JOINs find no text row
                next unless defined $definition;
                $definition =~ s/\r\n/ /g;
                $xmlDoc->dataElement("descrip", $definition, "type"=>"definition");
            }

            #get the other relationships
            $rel_sth->execute($sID);
            while (my $rel_result = $rel_sth->fetchrow_hashref()) {
                my $relType = $rel_result->{relationtype_mid};
                next unless defined $relType && exists $descripFor{$relType};

                my ($descripType, $bHasTarget) = @{ $descripFor{$relType} };
                my @attributes = ("type" => $descripType);
                if ($bHasTarget) {
                    push(@attributes, "target" => "t" . $rel_result->{meaning2_mid});
                }
                #spelling is NULL when the related id has no expression row
                my $relTerm = defined $rel_result->{spelling} ? $rel_result->{spelling} : "";
                $xmlDoc->dataElement("descrip", $relTerm, @attributes);
            }

            #print out the ntig and termGrp
            $xmlDoc->startTag("ntig", "id"=>"t".$sID);
            $xmlDoc->startTag("termGrp");
            $xmlDoc->dataElement("term", $sTerm);
            $xmlDoc->endTag("termGrp");
            $xmlDoc->endTag("ntig");
        }
        $expr_sth->finish();

        #Close the last langSet that was left open and end the concept, but
        #only when they were opened: XML::Writer dies on an end tag that has
        #no matching start tag
        if ($bLangSetOpen) {
            $xmlDoc->endTag("langSet");
            $xmlDoc->endTag("termEntry");
        }
    }
}
|
| 189 | +
|
#close the database connection
$dbh->disconnect();

#write any end tags required to close the document
$xmlDoc->endTag("body");
$xmlDoc->endTag("text");
$xmlDoc->endTag("martif");
$xmlDoc->end();

#close the output file explicitly: errors from buffered writes are only
#reported when the handle is closed, and the export must not look successful
#when the file is incomplete
$output->close()
    or die("Could not close the output file: " . $!);

#report the end time (epoch seconds) and the duration of the export
my $end = time();
print $end . "\n";
my $dif = $end - $start;
print $dif . " seconds\n";
exit(0);
\ No newline at end of file |