r19775 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r19774 | r19775 | r19776 >
Date:	00:39, 5 February 2007
Author:	erik
Status:	old
Tags:
Comment:	1) add (outdated) tbx export tool 2) more renaming
Modified paths:	/trunk/extensions/Wikidata/perl-tools/Import WiktionaryZ.pl (modified) (history) /trunk/extensions/Wikidata/perl-tools/WiktionaryZ.pm (modified) (history) /trunk/extensions/Wikidata/perl-tools/tbx-export.pl (added) (history)

Diff [purge]

Index: trunk/extensions/Wikidata/perl-tools/Import WiktionaryZ.pl
—	—	@@ -1,23 +1,23 @@
2		~~-use WiktionaryZ;~~
	2	+use OmegaWiki;
3	3	use POSIX qw(strftime);
4	4
5	5	my $startTime = time;
6	6
7		~~-# Example usage to import UMLS completely into an existing WiktionaryZ database:~~
8		~~-# my $importer=new WiktionaryZ('wikidatadb','root','MyPass');~~
	7	+# Example usage to import UMLS completely into an existing OmegaWiki database:
	8	+# my $importer=new OmegaWiki('wikidatadb','root','MyPass');
9	9	# $importer->setSourceDB('umls');
10	10	# $importer->initialize;
11	11	# $importer->importCompleteUMLS();
12	12
13		~~-# Example usage to import a part of UMLS into an existing WiktionaryZ database:~~
14		~~-# my $importer=new WiktionaryZ('wikidatadb','root','MyPass');~~
	13	+# Example usage to import a part of UMLS into an existing OmegaWiki database:
	14	+# my $importer=new OmegaWiki('wikidatadb','root','MyPass');
15	15	# $importer->setSourceDB('umls');
16	16	# $importer->initialize;
17	17	# my %sourceAbbreviations = $importer->loadSourceAbbreviations();
18	18	# delete($sourceAbbreviations{"MSH"});
19	19	# $importer->importUMLS(\%sourceAbbreviations);
20	20
21		~~-my $importer=new WiktionaryZ('wikidata_icpc','root','');~~
	21	+my $importer=new OmegaWiki('wikidata_icpc','root','');
22	22	$importer->setSourceDB('umls');
23	23	#$importer->setSourceDB('swissprot');
24	24	$importer->initialize;
Index: trunk/extensions/Wikidata/perl-tools/WiktionaryZ.pm
—	—	@@ -1,6 +1,6 @@
2		~~-# Example usage to import UMLS into an existing WiktionaryZ database:~~
3		~~-# use WiktionaryZ;~~
4		~~-# my $importer=new WiktionaryZ('wikidatadb','root','MyPass');~~
	2	+# Example usage to import UMLS into an existing OmegaWiki database:
	3	+# use OmegaWiki;
	4	+# my $importer=new OmegaWiki('wikidatadb','root','MyPass');
5	5	# $importer->setSourceDB('umls');
6	6	# $importer->initialize;
7	7	# $importer->importCompleteUMLS();
—	—	@@ -17,7 +17,7 @@
18	18	# Fully deal with alternative definitions referring to the same concept
19	19	# Deal with preferred lexical expressions, primary concepts (general weighting mechanism?)
20	20
21		~~-package WiktionaryZ;~~
	21	+package OmegaWiki;
22	22	use DBI;
23	23	use Encode;
24	24	use POSIX qw(strftime);
Index: trunk/extensions/Wikidata/perl-tools/tbx-export.pl
—	—	@@ -0,0 +1,201 @@
	2	+#!/usr/bin/perl
	3	+use strict;
	4	+use warnings;
	5	+use DBI;
	6	+use XML::Writer;
	7	+use IO::File;
	8	+use utf8;
	9	+use Encode;
	10	+
	11	+#Edit here to setup your database connection information
	12	+my $dbUser = "root";
	13	+my $dbPass = "";
	14	+my $dbName = "wiki";
	15	+my $dbHost = "127.0.0.1";
	16	+
	17	+my $start = time();
	18	+print $start . "\n";
	19	+#create and start the xml document
	20	+my $output = new IO::File(">wiktionaryz.xml");
	21	+my $xmlDoc = new XML::Writer(OUTPUT => $output,
	22	+ DATA_MODE => 1,
	23	+ DATA_INDENT => 4);
	24	+$xmlDoc->xmlDecl("UTF-8");
	25	+$xmlDoc->doctype("martif", "ISO 12200:1999A//DTD MARTIF core (DXFcdV04)//EN", "TBXcdv04.dtd");
	26	+
	27	+#build the tbx header information
	28	+$xmlDoc->startTag("martif", "type"=>"TBX", "xml:lang"=>"en");
	29	+$xmlDoc->startTag("martifHeader");
	30	+$xmlDoc->startTag("fileDesc");
	31	+$xmlDoc->startTag("titleStmt");
	32	+$xmlDoc->dataElement("title", "Terminologocial Data");
	33	+$xmlDoc->endTag("titleStmt");
	34	+$xmlDoc->startTag("sourceDesc");
	35	+$xmlDoc->dataElement("p", "from the OmegaWiki Database " . gmtime());
	36	+$xmlDoc->endTag("sourceDesc");
	37	+$xmlDoc->endTag("fileDesc");
	38	+$xmlDoc->startTag("encodingDesc");
	39	+$xmlDoc->dataElement("p", "SYSTEM TBXDCSv05b.xml", "type"=>"DCSName");
	40	+$xmlDoc->endTag("encodingDesc");
	41	+$xmlDoc->endTag("martifHeader");
	42	+$xmlDoc->startTag("text");
	43	+$xmlDoc->startTag("body");
	44	+
	45	+my $dbh = DBI->connect('dbi:mysql:database=' . $dbName . ';hostname=' . $dbHost . ';port=3306',
	46	+ $dbUser,
	47	+ $dbPass);
	48	+if (!$dbh) {
	49	+ die("Could not connect to database: " . $DBI::errstr);
	50	+}
	51	+
	52	+#uw_expression_ns - contains term and language
	53	+#uw_meaning_relations - meaning relations meaning1_mid and meaning2_mid and relation_mid are ids in uw_expression_ns
	54	+#uw_syntrans - collections
	55	+#The definitions are in the TEXT table. You will find the keys to the
	56	+#table using the TRANSLATED_CONTENT table. Each set of languages is
	57	+#identified by a SET_ID, which we also refer to as a "Translated Content
	58	+#ID" (TCID) from the UW_DEFINED_MEANING table. Using a TCID, you can find
	59	+#the different TEXT_IDs for each LANGUAGE_ID, and then get the actual
	60	+#texts from the TEXT table.
	61	+
	62	+#get all the distinct concepts (sets)
	63	+my @arrCollections;
	64	+my $sth = $dbh->prepare('SELECT DISTINCT(defined_meaning_id) FROM uw_syntrans');
	65	+$sth->execute();
	66	+while (my @ary = $sth->fetchrow_array()){
	67	+ push(@arrCollections, $ary[0]);
	68	+}
	69	+$sth->finish();
	70	+
	71	+#get the expression_id that pertains to each set
	72	+foreach (@arrCollections) {
	73	+ my @arrExpressions;
	74	+ my $sth_coll = $dbh->prepare('SELECT expression_id FROM uw_syntrans WHERE set_id=' . $_);
	75	+ $sth_coll->execute();
	76	+ while (my @ary = $sth_coll->fetchrow_array()) {
	77	+ push(@arrExpressions, $ary[0]);
	78	+ }
	79	+ $sth_coll->finish();
	80	+
	81	+ #check to see if there are expressions in the set
	82	+ if ($#arrExpressions > 0) {
	83	+ #start the concept
	84	+ #print "Processing Concept c" . $_ . "\n";
	85	+ $xmlDoc->startTag("termEntry", "id"=>"c".$_);
	86	+
	87	+ #get the information about all of the expressions in this set
	88	+ my $sql = "SELECT * FROM uw_expression_ns " .
	89	+ "LEFT JOIN language ON uw_expression_ns.language_id=language.language_id " .
	90	+ "WHERE expression_id IN (" . join(", ", @arrExpressions) . ") " .
	91	+ "GROUP BY uw_expression_ns.language_id";
	92	+ $sth = $dbh->prepare($sql);
	93	+ $sth->execute();
	94	+ my $sCurrLang = "";
	95	+ my $nCount = 0;
	96	+ while (my $result = $sth->fetchrow_hashref()) {
	97	+ my $sLangID = $result->{language_id};
	98	+ my $sLang = $result->{wikimedia_key};
	99	+ my $sID = $result->{expression_id};
	100	+ my $sTerm = $result->{spelling};
	101	+
	102	+ #determine if this is a new language set or not
	103	+ if (($sCurrLang ne $sLang) && $nCount > 0) {
	104	+ #since new language set and this is not the first language then close
	105	+ #the last langSet and open a new one
	106	+ $sCurrLang = $sLang;
	107	+ $xmlDoc->endTag("langSet");
	108	+ $xmlDoc->startTag("langSet", "xml:lang" => $sLang);
	109	+ }
	110	+ elsif (($sCurrLang ne $sLang) && $nCount == 0) {
	111	+ #this is the first language set so start a langSet
	112	+ $sCurrLang = $sLang;
	113	+ $xmlDoc->startTag("langSet", "xml:lang" => $sLang);
	114	+ }
	115	+
	116	+ #print out language level information
	117	+ #get the definition
	118	+ my $def_sql = "SELECT old_text FROM uw_defined_meaning " .
	119	+ "LEFT JOIN translated_content ON meaning_text_tcid=set_id " .
	120	+ "LEFT JOIN `text` ON text_id=old_id " .
	121	+ #"WHERE expression_id=" . $sID;
	122	+ "WHERE expression_id=" . $sID . " AND language_id=" . $sLangID;
	123	+ my $def_sth = $dbh->prepare($def_sql);
	124	+ $def_sth->execute();
	125	+ while (my $def_result = $def_sth->fetchrow_hashref()) {
	126	+ my $definition = $def_result->{old_text};
	127	+ $definition =~ s/\r\n/ /gi;
	128	+ $xmlDoc->startTag("descrip", "type"=>"definition");
	129	+ $xmlDoc->characters($definition);
	130	+ $xmlDoc->endTag("descrip");
	131	+ }
	132	+
	133	+ #get the other relationships
	134	+ my $rel_sql = "SELECT meaning2_mid, relationtype_mid, spelling " .
	135	+ "FROM uw_meaning_relations " .
	136	+ "LEFT JOIN uw_expression_ns ON meaning2_mid=expression_id " .
	137	+ "WHERE meaning1_mid=" . $sID ;
	138	+ #"WHERE meaning1_mid=" . $sID . " AND language_id=" . $sLangID;
	139	+
	140	+ my $rel_sth = $dbh->prepare($rel_sql);
	141	+ $rel_sth->execute();
	142	+ while (my $rel_result = $rel_sth->fetchrow_hashref()) {
	143	+ my $relType = $rel_result->{relationtype_mid};
	144	+ my $relTerm = $rel_result->{spelling};
	145	+ my $relID = $rel_result->{meaning2_mid};
	146	+
	147	+ #broader terms
	148	+ if ($relType eq "2" \|\| $relType eq "3") {
	149	+ $xmlDoc->startTag("descrip", "type"=>"subjectField");
	150	+ $xmlDoc->characters($relTerm);
	151	+ $xmlDoc->endTag("descrip");
	152	+ }
	153	+ elsif ($relType eq "4" \|\| $relType eq "5") {
	154	+ $xmlDoc->startTag("descrip", "type"=>"relatedConceptBroader", "target"=>"t".$relID);
	155	+ $xmlDoc->characters($relTerm);
	156	+ $xmlDoc->endTag("descrip");
	157	+ }
	158	+ #narrower terms
	159	+ elsif ($relType eq "6" \|\| $relType eq "7") {
	160	+ $xmlDoc->startTag("descrip", "type"=>"relatedConceptNarrower", "target"=>"t".$relID);
	161	+ $xmlDoc->characters($relTerm);
	162	+ $xmlDoc->endTag("descrip");
	163	+ }
	164	+ #related terms
	165	+ elsif ($relType eq "8" \|\| $relType eq "9") {
	166	+ $xmlDoc->startTag("descrip", "type"=>"relatedConcept", "target"=>"t".$relID);
	167	+ $xmlDoc->characters($relTerm);
	168	+ $xmlDoc->endTag("descrip");
	169	+ }
	170	+ }
	171	+ #print out the ntig and termGrp
	172	+ #print $sTerm . "\n";
	173	+ $xmlDoc->startTag("ntig", "id"=>"t".$sID);
	174	+ $xmlDoc->startTag("termGrp");
	175	+ $xmlDoc->dataElement("term", $sTerm);
	176	+ $xmlDoc->endTag("termGrp");
	177	+ $xmlDoc->endTag("ntig");
	178	+
	179	+ #increment the count
	180	+ $nCount++;
	181	+ }
	182	+ #Close the last langSet that was left open
	183	+ $xmlDoc->endTag("langSet");
	184	+ $sth->finish();
	185	+ #end the concept
	186	+ $xmlDoc->endTag("termEntry");
	187	+ }
	188	+}
	189	+
	190	+#close the database connection
	191	+$dbh->disconnect();
	192	+
	193	+#write any end tags required to close the document
	194	+$xmlDoc->endTag("body");
	195	+$xmlDoc->endTag("text");
	196	+$xmlDoc->endTag("martif");
	197	+$xmlDoc->end();
	198	+my $end = time();
	199	+print $end . "\n";
	200	+my $dif = $end - $start;
	201	+print $dif . " seconds\n";
	202	+exit(0);
\ No newline at end of file