r19775 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r19774‎ | r19775 | r19776 >
Date:00:39, 5 February 2007
Author:erik
Status:old
Tags:
Comment:
1) add (outdated) tbx export tool
2) more renaming
Modified paths:
  • /trunk/extensions/Wikidata/perl-tools/Import WiktionaryZ.pl (modified) (history)
  • /trunk/extensions/Wikidata/perl-tools/WiktionaryZ.pm (modified) (history)
  • /trunk/extensions/Wikidata/perl-tools/tbx-export.pl (added) (history)

Diff [purge]

Index: trunk/extensions/Wikidata/perl-tools/Import WiktionaryZ.pl
@@ -1,23 +1,23 @@
2 -use WiktionaryZ;
 2+use OmegaWiki;
33 use POSIX qw(strftime);
44
55 my $startTime = time;
66
7 -# Example usage to import UMLS completely into an existing WiktionaryZ database:
8 -# my $importer=new WiktionaryZ('wikidatadb','root','MyPass');
 7+# Example usage to import UMLS completely into an existing OmegaWiki database:
 8+# my $importer=new OmegaWiki('wikidatadb','root','MyPass');
99 # $importer->setSourceDB('umls');
1010 # $importer->initialize;
1111 # $importer->importCompleteUMLS();
1212
13 -# Example usage to import a part of UMLS into an existing WiktionaryZ database:
14 -# my $importer=new WiktionaryZ('wikidatadb','root','MyPass');
 13+# Example usage to import a part of UMLS into an existing OmegaWiki database:
 14+# my $importer=new OmegaWiki('wikidatadb','root','MyPass');
1515 # $importer->setSourceDB('umls');
1616 # $importer->initialize;
1717 # my %sourceAbbreviations = $importer->loadSourceAbbreviations();
1818 # delete($sourceAbbreviations{"MSH"});
1919 # $importer->importUMLS(\%sourceAbbreviations);
2020
21 -my $importer=new WiktionaryZ('wikidata_icpc','root','');
 21+my $importer=new OmegaWiki('wikidata_icpc','root','');
2222 $importer->setSourceDB('umls');
2323 #$importer->setSourceDB('swissprot');
2424 $importer->initialize;
Index: trunk/extensions/Wikidata/perl-tools/WiktionaryZ.pm
@@ -1,6 +1,6 @@
2 -# Example usage to import UMLS into an existing WiktionaryZ database:
3 -# use WiktionaryZ;
4 -# my $importer=new WiktionaryZ('wikidatadb','root','MyPass');
 2+# Example usage to import UMLS into an existing OmegaWiki database:
 3+# use OmegaWiki;
 4+# my $importer=new OmegaWiki('wikidatadb','root','MyPass');
55 # $importer->setSourceDB('umls');
66 # $importer->initialize;
77 # $importer->importCompleteUMLS();
@@ -17,7 +17,7 @@
1818 # Fully deal with alternative definitions referring to the same concept
1919 # Deal with preferred lexical expressions, primary concepts (general weighting mechanism?)
2020
21 -package WiktionaryZ;
 21+package OmegaWiki;
2222 use DBI;
2323 use Encode;
2424 use POSIX qw(strftime);
Index: trunk/extensions/Wikidata/perl-tools/tbx-export.pl
@@ -0,0 +1,201 @@
#!/usr/bin/perl
#
# tbx-export.pl
#
# Exports the terminology stored in a Wikidata/OmegaWiki MySQL database to a
# TBX (MARTIF) document named "wiktionaryz.xml" in the current directory.
#
# For every concept id found in uw_syntrans the script writes one <termEntry>
# that contains a <langSet> per language; each langSet carries the
# definition(s), the related-concept descriptors and the term itself.
#
# Progress output on STDOUT: the start time, the end time and the elapsed
# number of seconds.  Any database or file error aborts the run with die().
use strict;
use warnings;
use DBI;
use XML::Writer;
use IO::File;
use utf8;
use Encode;

#Edit here to setup your database connection information
my $dbUser = "root";
my $dbPass = "";
my $dbName = "wiki";
my $dbHost = "127.0.0.1";

# Name of the TBX file that is created (and truncated if it already exists).
my $outFile = "wiktionaryz.xml";

# Maps uw_meaning_relations.relationtype_mid to the TBX <descrip type="...">
# that is written for it.  Relation types not listed here are not exported.
my %descripTypeFor = (
    2 => "subjectField",
    3 => "subjectField",
    4 => "relatedConceptBroader",
    5 => "relatedConceptBroader",
    6 => "relatedConceptNarrower",
    7 => "relatedConceptNarrower",
    8 => "relatedConcept",
    9 => "relatedConcept",
);

my $start = time();
print $start . "\n";

# Connect first, so that an unreachable database does not leave a truncated
# export file behind.
my $dbh = DBI->connect('dbi:mysql:database=' . $dbName . ';hostname=' . $dbHost . ';port=3306',
                       $dbUser,
                       $dbPass);
if (!$dbh) {
    die("Could not connect to database: " . $DBI::errstr);
}
# From here on every failing prepare/execute/fetch aborts the export instead
# of silently producing an incomplete document.
$dbh->{RaiseError} = 1;
$dbh->{PrintError} = 0;

#create and start the xml document
my $output = IO::File->new($outFile, "w")
    or die("Could not open $outFile for writing: $!");
my $xmlDoc = XML::Writer->new(OUTPUT      => $output,
                              DATA_MODE   => 1,
                              DATA_INDENT => 4);
$xmlDoc->xmlDecl("UTF-8");
$xmlDoc->doctype("martif", "ISO 12200:1999A//DTD MARTIF core (DXFcdV04)//EN", "TBXcdv04.dtd");

#build the tbx header information
$xmlDoc->startTag("martif", "type" => "TBX", "xml:lang" => "en");
$xmlDoc->startTag("martifHeader");
$xmlDoc->startTag("fileDesc");
$xmlDoc->startTag("titleStmt");
$xmlDoc->dataElement("title", "Terminological Data");
$xmlDoc->endTag("titleStmt");
$xmlDoc->startTag("sourceDesc");
# The string concatenation puts gmtime() in scalar context, which yields a
# human readable UTC timestamp.
$xmlDoc->dataElement("p", "from the OmegaWiki Database " . gmtime());
$xmlDoc->endTag("sourceDesc");
$xmlDoc->endTag("fileDesc");
$xmlDoc->startTag("encodingDesc");
$xmlDoc->dataElement("p", "SYSTEM TBXDCSv05b.xml", "type" => "DCSName");
$xmlDoc->endTag("encodingDesc");
$xmlDoc->endTag("martifHeader");
$xmlDoc->startTag("text");
$xmlDoc->startTag("body");

#uw_expression_ns - contains term and language
#uw_meaning_relations - meaning relations meaning1_mid and meaning2_mid and relation_mid are ids in uw_expression_ns
#uw_syntrans - collections
#The definitions are in the TEXT table. You will find the keys to the
#table using the TRANSLATED_CONTENT table. Each set of languages is
#identified by a SET_ID, which we also refer to as a "Translated Content
#ID" (TCID) from the UW_DEFINED_MEANING table. Using a TCID, you can find
#the different TEXT_IDs for each LANGUAGE_ID, and then get the actual
#texts from the TEXT table.

#get all the distinct concepts (sets)
my $conceptIds = $dbh->selectcol_arrayref(
    'SELECT DISTINCT(defined_meaning_id) FROM uw_syntrans');

# The per-concept and per-expression statements are prepared once and are
# re-executed with bound values; only the IN (...) query below has to be
# rebuilt because its number of placeholders varies.
#
# NOTE(review): the concept ids above come from defined_meaning_id but are
# matched against set_id here - confirm against the current uw_syntrans
# schema that this is still the intended column.
my $sth_coll = $dbh->prepare(
    'SELECT expression_id FROM uw_syntrans WHERE set_id=?');

my $def_sth = $dbh->prepare(
    "SELECT old_text FROM uw_defined_meaning " .
    "LEFT JOIN translated_content ON meaning_text_tcid=set_id " .
    "LEFT JOIN `text` ON text_id=old_id " .
    "WHERE expression_id=? AND language_id=?");

my $rel_sth = $dbh->prepare(
    "SELECT meaning2_mid, relationtype_mid, spelling " .
    "FROM uw_meaning_relations " .
    "LEFT JOIN uw_expression_ns ON meaning2_mid=expression_id " .
    "WHERE meaning1_mid=?");

#get the expression_id that pertains to each set
foreach my $conceptId (@$conceptIds) {
    next unless defined $conceptId;

    my @arrExpressions = grep { defined }
        @{ $dbh->selectcol_arrayref($sth_coll, undef, $conceptId) };

    #check to see if there are expressions in the set; a set with a single
    #expression is exported as well
    next unless @arrExpressions;

    #start the concept
    $xmlDoc->startTag("termEntry", "id" => "c" . $conceptId);

    #get the information about all of the expressions in this set
    # NOTE(review): GROUP BY language_id returns a single row per language,
    # so presumably only one term per language reaches the export - verify
    # that this is intended.
    my $placeholders = join(", ", ("?") x @arrExpressions);
    my $sth = $dbh->prepare(
        "SELECT * FROM uw_expression_ns " .
        "LEFT JOIN language ON uw_expression_ns.language_id=language.language_id " .
        "WHERE expression_id IN (" . $placeholders . ") " .
        "GROUP BY uw_expression_ns.language_id");
    $sth->execute(@arrExpressions);

    my $sCurrLang   = "";
    my $langSetOpen = 0;
    while (my $row = $sth->fetchrow_hashref()) {
        my $sLangID = $row->{language_id};
        my $sID     = $row->{expression_id};
        # The LEFT JOIN yields NULL for a language without a matching row.
        my $sLang   = defined $row->{wikimedia_key} ? $row->{wikimedia_key} : "";
        my $sTerm   = defined $row->{spelling}      ? $row->{spelling}      : "";

        #open a langSet for the first row and whenever the language changes,
        #closing the previous one first
        if (!$langSetOpen || $sLang ne $sCurrLang) {
            $xmlDoc->endTag("langSet") if $langSetOpen;
            $xmlDoc->startTag("langSet", "xml:lang" => $sLang);
            $sCurrLang   = $sLang;
            $langSetOpen = 1;
        }

        #print out language level information
        #get the definition
        $def_sth->execute($sID, $sLangID);
        while (my ($definition) = $def_sth->fetchrow_array()) {
            # The LEFT JOINs can deliver a row without any text; there is
            # nothing to export for it.
            next unless defined $definition;
            $definition =~ s/\r\n/ /g;
            $xmlDoc->startTag("descrip", "type" => "definition");
            $xmlDoc->characters($definition);
            $xmlDoc->endTag("descrip");
        }

        #get the other relationships
        $rel_sth->execute($sID);
        while (my $rel = $rel_sth->fetchrow_hashref()) {
            my $relType = $rel->{relationtype_mid};
            next unless defined $relType;
            my $descripType = $descripTypeFor{$relType};
            next unless defined $descripType;

            my $relTerm = defined $rel->{spelling} ? $rel->{spelling} : "";

            # Subject fields are plain labels; every other relation points
            # at the related term through a target attribute.
            my @attributes = ("type" => $descripType);
            if ($descripType ne "subjectField") {
                push(@attributes, "target" => "t" . $rel->{meaning2_mid});
            }

            $xmlDoc->startTag("descrip", @attributes);
            $xmlDoc->characters($relTerm);
            $xmlDoc->endTag("descrip");
        }

        #print out the ntig and termGrp
        $xmlDoc->startTag("ntig", "id" => "t" . $sID);
        $xmlDoc->startTag("termGrp");
        $xmlDoc->dataElement("term", $sTerm);
        $xmlDoc->endTag("termGrp");
        $xmlDoc->endTag("ntig");
    }
    #Close the last langSet that was left open (none is open when the
    #expression query returned no rows)
    $xmlDoc->endTag("langSet") if $langSetOpen;
    $sth->finish();

    #end the concept
    $xmlDoc->endTag("termEntry");
}

#close the database connection
$dbh->disconnect();

#write any end tags required to close the document
$xmlDoc->endTag("body");
$xmlDoc->endTag("text");
$xmlDoc->endTag("martif");
$xmlDoc->end();
# Buffered write errors only surface when the handle is closed.
$output->close()
    or die("Could not close $outFile: $!");

my $end = time();
print $end . "\n";
my $dif = $end - $start;
print $dif . " seconds\n";
exit(0);
\ No newline at end of file