Index: trunk/extensions/Wikidata/perl-tools/Import WiktionaryZ.pl |
— | — | @@ -1,55 +0,0 @@ |
2 | | -use OmegaWiki; |
3 | | -use POSIX qw(strftime); |
4 | | - |
5 | | -my $startTime = time; |
6 | | - |
7 | | -# Example usage to import UMLS completely into an existing OmegaWiki database: |
8 | | -# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
9 | | -# $importer->setSourceDB('umls'); |
10 | | -# $importer->initialize; |
11 | | -# $importer->importCompleteUMLS(); |
12 | | - |
13 | | -# Example usage to import a part of UMLS into an existing OmegaWiki database: |
14 | | -# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
15 | | -# $importer->setSourceDB('umls'); |
16 | | -# $importer->initialize; |
17 | | -# my %sourceAbbreviations = $importer->loadSourceAbbreviations(); |
18 | | -# delete($sourceAbbreviations{"MSH"}); |
19 | | -# $importer->importUMLS(\%sourceAbbreviations); |
20 | | - |
21 | | -my $importer=new OmegaWiki('wikidata_icpc','root',''); |
22 | | -$importer->setSourceDB('umls'); |
23 | | -#$importer->setSourceDB('swissprot'); |
24 | | -$importer->initialize; |
25 | | -#$importer->importCompleteUMLS(); |
26 | | - |
27 | | -#read the source abbreviations and remove those you do not wish to import |
28 | | -my %sourceAbbreviations = $importer->loadSourceAbbreviations(); |
29 | | -my @deleteList; |
30 | | -while (($key, $val) = each(%sourceAbbreviations)) { |
31 | | - |
32 | | -#remove all that contains "MSH": |
33 | | -# if (index($key,"MSH") >= 0) { |
34 | | -# push(@deleteList, $key); |
35 | | -# } |
36 | | - |
37 | | -#remove all that does not contain "ICPC": |
38 | | - if (index($key,"ICPC") < 0) { |
39 | | - push(@deleteList, $key); |
40 | | - } |
41 | | -} |
42 | | - |
43 | | -foreach $sab (@deleteList){ |
44 | | - delete($sourceAbbreviations{$sab}); |
45 | | -} |
46 | | - |
47 | | -$importer->importUMLS(\%sourceAbbreviations); |
48 | | - |
49 | | - |
50 | | -my $endTime = time; |
51 | | -print "\n"; |
52 | | -print "Import started at: " . (strftime "%H:%M:%S", localtime($startTime)) . "\n"; |
53 | | -print "Import ended at: " . (strftime "%H:%M:%S", localtime($endTime)) . "\n"; |
54 | | -print "Elapsed time: " . (strftime "%H:%M:%S", gmtime($endTime - $startTime)) . "\n"; |
55 | | - |
56 | | -exit 0; |
\ No newline at end of file |
Index: trunk/extensions/Wikidata/perl-tools/WiktionaryZ.pm |
— | — | @@ -1,1063 +0,0 @@ |
2 | | -# Example usage to import UMLS into an existing OmegaWiki database: |
3 | | -# use OmegaWiki; |
4 | | -# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
5 | | -# $importer->setSourceDB('umls'); |
6 | | -# $importer->initialize; |
7 | | -# $importer->importCompleteUMLS(); |
8 | | -# |
9 | | -# NOTE: When importing UMLS, we expect the presence of the semantic network data |
10 | | -# in the tables SRDEF and the manually created tables SEMRELHIER and SEMTYPEHIER. |
11 | | -# SEMRELHIER and SEMTYPEHIER contain information about the relations between |
12 | | -# semantic types and relation types, using RB as the code for "broader than" |
13 | | -# and RN for "narrower than". |
14 | | -# |
15 | | -# Todo for UMLS: |
16 | | -# SyntransCollection |
17 | | -# RelationCollection |
18 | | -# Fully deal with alternative definitions referring to the same concept |
19 | | -# Deal with preferred lexical expressions, primary concepts (general weighting mechanism?) |
20 | | - |
21 | | -package OmegaWiki; |
22 | | -use DBI; |
23 | | -use Encode; |
24 | | -use POSIX qw(strftime); |
25 | | - |
26 | | -sub new { |
27 | | - my $type=shift; |
28 | | - my $self={}; |
29 | | - $self->{targetdb}=shift; |
30 | | - $self->{targetuser}=shift; |
31 | | - $self->{targetpass}=shift; |
32 | | - $self->{targethost}=shift || 'localhost'; |
33 | | - $self->{targetport}=shift || '3306'; |
34 | | - $self->{targetdriver}=shift || 'mysql'; |
35 | | - bless($self, $type); |
36 | | - return($self); |
37 | | -} |
38 | | - |
39 | | -sub setSourceDB { |
40 | | - my $self=shift; |
41 | | - $self->{sourcedb}=shift; |
42 | | - $self->{sourceuser}=shift || $self->{targetuser}; |
43 | | - $self->{sourcepass}=shift || $self->{targetpass}; |
44 | | - $self->{sourcehost}=shift || $self->{targethost}; |
45 | | - $self->{sourceport}=shift || $self->{targetport}; |
46 | | - $self->{sourcedriver}=shift || $self->{targetdriver}; |
47 | | -} |
48 | | - |
49 | | -sub connectSourceDB { |
50 | | - my $self=shift; |
51 | | - my $dsn = 'dbi:'.$self->{sourcedriver}.':'.$self->{sourcedb}.':'.$self->{sourcehost}.':'.$self->{sourceport}; |
52 | | - $self->{dbs}=DBI->connect($dsn,$self->{sourceuser},$self->{sourcepass}); |
53 | | -} |
54 | | - |
55 | | -sub connectTargetDB { |
56 | | - my $self=shift; |
57 | | - my $dsn = 'dbi:'.$self->{targetdriver}.':'.$self->{targetdb}.':'.$self->{targethost}.':'.$self->{targetport}; |
58 | | - $self->{dbt}=DBI->connect($dsn,$self->{targetuser},$self->{targetpass}); |
59 | | -} |
60 | | - |
61 | | -sub connectDBs() { |
62 | | - my $self = shift; |
63 | | - |
64 | | - $self->connectSourceDB(); |
65 | | - $self->connectTargetDB(); |
66 | | -} |
67 | | - |
68 | | -sub loadLanguages() { |
69 | | - my $self = shift; |
70 | | - |
71 | | - my %la=$self->loadLangs(); |
72 | | - $self->{la}=\%la; |
73 | | - my %la_iso=$self->loadLangsIso(); |
74 | | - $self->{la_iso}=\%la_iso; |
75 | | -} |
76 | | - |
77 | | -sub initialize { |
78 | | - my $self=shift; |
79 | | - $self->connectDBs(); |
80 | | - $self->loadLanguages(); |
81 | | -} |
82 | | - |
83 | | -sub importCompleteUMLS { |
84 | | - my $self=shift; |
85 | | - my $level=shift || 0; # 0= complete; 1=reltypes+; 2=rel+ |
86 | | - |
87 | | - my %sourceAbbreviations = $self->loadSourceAbbreviations(); |
88 | | - $self->importUMLS(\%sourceAbbreviations, $level); |
89 | | -} |
90 | | - |
91 | | -sub importUMLS() { |
92 | | - my $self=shift; |
93 | | - my $sourceAbbreviationsReference = shift; |
94 | | - my $level=shift || 0; # 0= complete; 1=reltypes+; 2=rel+ |
95 | | - |
96 | | - my %sourceAbbreviations = %{$sourceAbbreviationsReference}; |
97 | | - my %cid=$self->getOrCreateCollections(\%sourceAbbreviations); |
98 | | - $self->{cid}=\%cid; |
99 | | - |
100 | | - if ($level<1){ |
101 | | - while (($sourceAbbreviation, $collectionId) = each %cid) { |
102 | | - print "Import UMLS terms for $sourceAbbreviation\n"; |
103 | | - $self->importUMLSterms($sourceAbbreviation, $collectionId); |
104 | | - } |
105 | | - } |
106 | | - |
107 | | - if($level<2) { |
108 | | - print "Import UMLS relation types 'REL'\n"; |
109 | | - $self->importUMLSrelationtypes('REL'); |
110 | | - print "Import UMLS relation types 'RELA'\n"; |
111 | | - $self->importUMLSrelationtypes('RELA'); |
112 | | - } |
113 | | - |
114 | | - if($level<3) { |
115 | | - while (($sourceAbbreviation, $collectionId) = each %cid) { |
116 | | - my %rt=$self->loadReltypes(); |
117 | | - $self->{reltypes}=\%rt; |
118 | | - print "Import UMLS relations 'REL' for $sourceAbbreviation\n"; |
119 | | - $self->importUMLSrelations('REL',$sourceAbbreviation); |
120 | | - print "Import UMLS relations 'RELA' for $sourceAbbreviation\n"; |
121 | | - $self->importUMLSrelations('RELA',$sourceAbbreviation); |
122 | | - } |
123 | | - } |
124 | | - |
125 | | - if($level<4) { |
126 | | - print "Import SN types 'STY'\n"; |
127 | | - $self->importSNtypes('STY'); |
128 | | - print "Import SN types 'RL'\n"; |
129 | | - $self->importSNtypes('RL'); |
130 | | - print "Import ST relations 'STY'\n"; |
131 | | - $self->importSTrelations('STY'); |
132 | | - print "Import ST relations 'RL'\n"; |
133 | | - $self->importSTrelations('RL'); |
134 | | - # $self->importSTrelations2(); |
135 | | - } |
136 | | - |
137 | | - if($level<5) { |
138 | | - while (($sourceAbbreviation, $collectionId) = each %cid) { |
139 | | - my %attribs=$self->loadAttributes(); |
140 | | - $self->{attribs}=\%attribs; |
141 | | - print "Import UMLS types for $sourceAbbreviation\n"; |
142 | | - $self->importUMLSstypes($sourceAbbreviation); |
143 | | - } |
144 | | - } |
145 | | -} |
146 | | - |
147 | | -sub importGEMET { |
148 | | - my $self=shift; |
149 | | - $self->connectSourceDB(); |
150 | | - $self->connectTargetDB(); |
151 | | - my %la=$self->loadLangs(); |
152 | | - $self->{la}=\%la; |
153 | | - my %cid=$self->bootstrapGemetCollection(); |
154 | | - $self->{cid}=\%cid; |
155 | | - $self->initRel($self->{cid}{'GEMETREL'}); |
156 | | - my %rt=$self->loadReltypes(); |
157 | | - $self->{reltypes}=\%rt; |
158 | | - $self->importGemetTerms(); |
159 | | - $self->importGemetRelations(); |
160 | | - $self->importGemetThemes(); |
161 | | -} |
162 | | - |
163 | | -sub importUMLSstypes { |
164 | | - my $self=shift; |
165 | | - my $sab=shift; |
166 | | - |
167 | | - my $getassocs=$self->{dbs}->prepare("select MRSTY.CUI, MRSTY.STY from MRCONSO,MRSTY where MRCONSO.SAB like ? and MRCONSO.CUI=MRSTY.CUI"); |
168 | | - $getassocs->execute($sab); |
169 | | - while(my $row=$getassocs->fetchrow_hashref()) { |
170 | | - my %rv=$self->getMidForMember($row->{CUI}); |
171 | | - my $att=$self->{attribs}{$row->{STY}}; |
172 | | - #print "$rv{mid} is a $row->{STY} ($att)\n"; |
173 | | - $self->addRelation($rv{rid},0,$rv{mid},$att, my $checkfordupes=1); |
174 | | - } |
175 | | -} |
176 | | - |
177 | | -sub getCollections(){ |
178 | | - my $self=shift; |
179 | | - my %cid; |
180 | | - $cid{'CRISP'}=$self->findCollection($self->findMeaning($self->findItem('CRISP Thesaurus, 2005',$self->{la}{'en'}))); |
181 | | - $cid{'STY'}=$self->findCollection($self->findMeaning($self->findItem('Semantic Network 2005AC Semantic Types',$self->{la}{'en'}))); |
182 | | - $cid{'RL'}=$self->findCollection($self->findMeaning($self->findItem('Semantic Network 2005AC Relation Types',$self->{la}{'en'}))); |
183 | | - $cid{'REL'}=$self->findCollection($self->findMeaning($self->findItem('UMLS Relation Types 2005',$self->{la}{'en'}))); |
184 | | - $cid{'RELA'}=$self->findCollection($self->findMeaning($self->findItem('UMLS Relation Attributes 2005',$self->{la}{'en'}))); |
185 | | - $cid{'ICPC'}=$self->findCollection($self->findMeaning($self->findItem('The International Classification of Primary Care (ICPC), 1993',$self->{la}{'en'}))); |
186 | | - $cid{'MESH'}=$self->findCollection($self->findMeaning($self->findItem('Medical Subject Headings (MeSH), 2005',$self->{la}{'en'}))); |
187 | | -# $cid{'SP'}=$self->findCollection($self->findMeaning($self->findItem('Swiss-Prot',$self->{la}{'en'}))); |
188 | | - return %cid; |
189 | | -} |
190 | | - |
191 | | -sub findCollection() { |
192 | | - my $self=shift; |
193 | | - my $mid=shift; |
194 | | - my $findcoll=$self->{dbt}->prepare("select collection_id from uw_collection_ns where collection_mid=? and is_latest=1"); |
195 | | - $findcoll->execute($mid); |
196 | | - my $row=$findcoll->fetchrow_hashref(); |
197 | | - return $row->{collection_id}; |
198 | | -} |
199 | | - |
200 | | -sub getCollection { |
201 | | - my $self = shift; |
202 | | - my $expression = shift; |
203 | | - |
204 | | - return $self->findCollection($self->findMeaning($self->findExpressionId($expression,$self->{la}{'en'}))); |
205 | | -} |
206 | | - |
207 | | -sub bootstrapCollection { |
208 | | - my $self = shift; |
209 | | - my $expression = shift; |
210 | | - my $collectionType = shift; |
211 | | - |
212 | | - %rv=$self->addExpression($expression,$self->{la}{'en'}); |
213 | | - return $self->addCollection($rv{mid},$collectionType); |
214 | | -} |
215 | | - |
216 | | -sub getOrCreateCollection { |
217 | | - my $self = shift; |
218 | | - my $expression = shift; |
219 | | - |
220 | | - my $result = $self->getCollection($expression); |
221 | | - |
222 | | - if (!$result) { |
223 | | - $result = $self->bootstrapCollection($expression, ''); |
224 | | - } |
225 | | - |
226 | | - return $result; |
227 | | -} |
228 | | - |
229 | | -sub getOrCreateCollections { |
230 | | - my $self = shift; |
231 | | - my $sourceAbbreviationsReference = shift; |
232 | | - my %sourceAbbreviations = %{$sourceAbbreviationsReference}; |
233 | | - my %cid; |
234 | | - |
235 | | - while (($key, $value) = each %sourceAbbreviations) { |
236 | | - $cid{$key} = $self->getOrCreateCollection($value); |
237 | | - } |
238 | | - |
239 | | - return %cid; |
240 | | -} |
241 | | - |
242 | | -sub loadSourceAbbreviations { |
243 | | - my $self = shift; |
244 | | - my %sab; |
245 | | - |
246 | | - my $dataset = $self->{dbs}->prepare("select * from mrsab"); |
247 | | - $dataset->execute(); |
248 | | - while (my $row = $dataset->fetchrow_hashref()) { |
249 | | - $sab{$row->{RSAB}} = $row->{SON}; |
250 | | - } |
251 | | - return %sab; |
252 | | -} |
253 | | - |
254 | | -# SEMTYPEHIER and SEMRELHIER contain only the is_a relationships, whereas |
255 | | -# srstr contains all others |
256 | | -# FIXME: only use SRSTR |
257 | | -sub importSTrelations2 { |
258 | | - my $self=shift; |
259 | | - my $getrels=$self->{dbs}->prepare("select * from srstr where rel!='isa'"); |
260 | | - $getrels->execute(); |
261 | | - while(my $row=$getrels->fetchrow_hashref()) { |
262 | | - my %rv1=$self->getMidForMember($row->{TYPE1},$self->{cid}{'STY'}); |
263 | | - my %rv2=$self->getMidForMember($row->{TYPE2},$self->{cid}{'STY'}); |
264 | | - my $rtmid=$self->{reltypes}{$row->{REL}}; |
265 | | - #print "Adding relation $row->{REL} ($rtmid) between $row->{TYPE1} and $row->{TYPE2}\n"; |
266 | | - $self->addRelation($rv1{rid},$rtmid,$rv1{mid},$rv2{mid},my $checkfordupes=1); |
267 | | - } |
268 | | -} |
269 | | - |
270 | | - |
271 | | -sub importSTrelations { |
272 | | - my $self=shift; |
273 | | - my $which=shift; |
274 | | - my $table; |
275 | | - my $field1; |
276 | | - my $field2; |
277 | | - if($which eq 'STY') { |
278 | | - $table='semtypehier'; |
279 | | - $field1='SEMTYPE1'; |
280 | | - $field2='SEMTYPE2'; |
281 | | - } elsif($which eq 'RL') { |
282 | | - $table='semrelhier'; |
283 | | - $field1='RELTYPE1'; |
284 | | - $field2='RELTYPE2'; |
285 | | - } |
286 | | - |
287 | | - my $gettypehier=$self->{dbs}->prepare("select * from $table"); |
288 | | - $gettypehier->execute(); |
289 | | - while(my $typehier=$gettypehier->fetchrow_hashref()) { |
290 | | - my %rv1=$self->getMidForMember($typehier->{$field1},$self->{cid}{$which}); |
291 | | - my %rv2=$self->getMidForMember($typehier->{$field2},$self->{cid}{$which}); |
292 | | - my $rtmid=$self->{reltypes}{$typehier->{RELATION}}; |
293 | | - print "Adding relation $typehier->{RELATION} ($rtmid) between $typehier->{$field1} and $typehier->{$field2}\n"; |
294 | | - $self->addRelation($rv1{rid},$rtmid,$rv1{mid},$rv2{mid},my $checkfordupes=1); |
295 | | - } |
296 | | -} |
297 | | - |
298 | | -# $member_id - the collection-internal identifier for this member |
299 | | -# $cid The collection in which to search for this member (optional) |
300 | | -# Returns the DefinedMeaningID and the revision id |
301 | | -sub getMidForMember { |
302 | | - my $self=shift; |
303 | | - my $member_id=shift; |
304 | | - my $cid=shift; |
305 | | - my %rv; |
306 | | - my $getmid; |
307 | | - if($cid) { |
308 | | - $getmid=$self->{dbt}->prepare("select member_mid,revision_id from uw_collection_contents where collection_id=? and internal_member_id=? and is_latest_set=1 limit 1"); |
309 | | - $getmid->execute($cid,$member_id); |
310 | | - } else { |
311 | | - $getmid=$self->{dbt}->prepare("select member_mid,revision_id from uw_collection_contents where internal_member_id=? and is_latest_set=1 limit 1"); |
312 | | - $getmid->execute($member_id); |
313 | | - } |
314 | | - my $member_mid=$getmid->fetchrow_hashref(); |
315 | | - $rv{mid}=$member_mid->{member_mid}; |
316 | | - $rv{rid}=$member_mid->{revision_id}; |
317 | | - return %rv; |
318 | | - |
319 | | -} |
320 | | - |
321 | | -sub loadReltypes { |
322 | | - my $self=shift; |
323 | | - my %reltypes; |
324 | | - # Get the relation type |
325 | | - $getreltype=$self->{dbt}->prepare("select member_mid,internal_member_id from uw_collection_contents,uw_collection_ns where uw_collection_ns.collection_type='RELT' and uw_collection_ns.collection_id=uw_collection_contents.collection_id"); |
326 | | - $getreltype->execute(); |
327 | | - while (my $reltype=$getreltype->fetchrow_hashref()) { |
328 | | - $reltypes{$reltype->{internal_member_id}}=$reltype->{member_mid}; |
329 | | - } |
330 | | - return %reltypes; |
331 | | -} |
332 | | - |
333 | | -sub loadAttributes { |
334 | | - my $self=shift; |
335 | | - my %attributes; |
336 | | - $getatt=$self->{dbt}->prepare("select member_mid,internal_member_id from uw_collection_contents,uw_collection_ns where uw_collection_ns.collection_type='ATTR' and uw_collection_ns.collection_id=uw_collection_contents.collection_id"); |
337 | | - $getatt->execute(); |
338 | | - while (my $att=$getatt->fetchrow_hashref()) { |
339 | | - $attributes{$att->{internal_member_id}}=$att->{member_mid}; |
340 | | - } |
341 | | - return %attributes; |
342 | | -} |
343 | | - |
344 | | - |
345 | | -# Get all SRDEF attributes |
346 | | -# Get relations between SRDEF |
347 | | -sub importSNtypes { |
348 | | - my $self=shift; |
349 | | - my $type=shift; |
350 | | - $getsemtypes=$self->{dbs}->prepare("select semtypeab,type,definition from srdef where type=?"); |
351 | | - $getsemtypes->execute($type); |
352 | | - while (my $semtype=$getsemtypes->fetchrow_hashref()) { |
353 | | - my $type_expression=$semtype->{semtypeab}; |
354 | | - my $type_code=$type_expression; |
355 | | - $type_expression=~s/_/ /g; |
356 | | - $type_expression=lc($type_expression); |
357 | | - my %rv=$self->addExpression($type_expression,$self->{la}{'en'},0,$self->{cid}{$type},$type_code); |
358 | | - $self->addMeaningText($rv{'rid'},$rv{'mid'},$semtype->{definition},undef,$self->{la}{'en'}); |
359 | | - #print $type_expression." - $self->{cid}{$type} - $type_code\n"; |
360 | | - } |
361 | | -} |
362 | | - |
363 | | -sub importUMLSrelations { |
364 | | - my $self=shift; |
365 | | - my $which=shift; # REL or RELA |
366 | | - my $source=shift; # SAB as MySQL LIKE string |
367 | | - my $getrels; |
368 | | - |
369 | | - if($which eq 'REL') { |
370 | | - $getrels=$self->{dbs}->prepare("select cui1,cui2,rel from MRREL where sab like ?"); |
371 | | - } elsif($which eq 'RELA') { |
372 | | - $getrels=$self->{dbs}->prepare("select cui1,cui2,rela from MRREL where sab like ? and rela!=''"); |
373 | | - } |
374 | | - $getrels->execute($source); |
375 | | - while(my $rel=$getrels->fetchrow_hashref()) { |
376 | | - my $relid=$rel->{lc($which)}; |
377 | | - # These mean the same thing |
378 | | - if($relid eq 'CHD') { |
379 | | - $relid='RN'; |
380 | | - } elsif($relid eq 'PAR') { |
381 | | - $relid='RB'; |
382 | | - } |
383 | | - $getmid=$self->{dbt}->prepare("select member_mid,revision_id from uw_collection_contents where internal_member_id=? and is_latest_set=1 limit 1"); |
384 | | - # Note that the direction in UMLS is opposite to ours |
385 | | - $getmid->execute($rel->{cui2}); |
386 | | - my $mid1=$getmid->fetchrow_hashref(); |
387 | | - $getmid->execute($rel->{cui1}); |
388 | | - my $mid2=$getmid->fetchrow_hashref(); |
389 | | - # FIXME: We are ignoring term relations for now! |
390 | | - if(($mid1->{member_mid} && $mid2->{member_mid}) && ($mid1->{member_mid} != $mid2->{member_mid}) && $self->{reltypes}{$relid}) { |
391 | | - # Add the relation |
392 | | - #print "Found relation ".$relid." (".$self->{reltypes}{$relid}.") between ".$mid1->{member_mid}." and ".$mid2->{member_mid}.".\n"; |
393 | | - $self->addRelation($mid1->{revision_id},$self->{reltypes}{$relid},$mid1->{member_mid},$mid2->{member_mid},my $checkfordupes=1); |
394 | | - } else { |
395 | | - if(!$mid1->{member_mid} && $mid2->{member_mid}) { |
396 | | - print "Did not find MID for ".$rel->{cui1}."!\n"; |
397 | | - } elsif($mid1->{member_mid} && !$mid2->{member_mid}) { |
398 | | - print "Did not find MID for ".$rel->{cui2}."!\n"; |
399 | | - } elsif(!$mid1->{member_mid} && !$mid2->{member_mid}) { |
400 | | - print "Did not find MIDs for ".$rel->{cui1}." and ".$rel->{cui2}."!\n"; |
401 | | - } |
402 | | - } |
403 | | - } |
404 | | - |
405 | | -} |
406 | | - |
407 | | - |
408 | | -sub bootstrapGemetCollection { |
409 | | - my $self=shift; |
410 | | - my %cid; |
411 | | - %rv=$self->addExpression('GEMET Environmental Thesaurus Relation Types',$self->{la}{'en'}); |
412 | | - $cid{'GEMETREL'}=$self->addCollection($rv{mid},'RELT'); |
413 | | - %rv=$self->addExpression('GEMET Environmental Thesaurus Relation Types',$self->{la}{'en'}); |
414 | | - $cid{'GEMET'}=$self->addCollection($rv{mid},''); |
415 | | - return %cid; |
416 | | -} |
417 | | - |
418 | | - |
419 | | -sub bootstrapCollections { |
420 | | - my $self=shift; |
421 | | - my %cid; |
422 | | - my %rv; |
423 | | - |
424 | | - %rv=$self->addExpression('CRISP Thesaurus, 2005',$self->{la}{'en'}); |
425 | | - $cid{'CRISP'}=$self->addCollection($rv{mid},''); |
426 | | - %rv=$self->addExpression('Semantic Network 2005AC Semantic Types',$self->{la}{'en'}); |
427 | | - $cid{'STY'}=$self->addCollection($rv{mid},'ATTR'); |
428 | | - %rv=$self->addExpression('Semantic Network 2005AC Relation Types',$self->{la}{'en'}); |
429 | | - $cid{'RL'}=$self->addCollection($rv{mid},'RELT'); |
430 | | - %rv=$self->addExpression('UMLS Relation Types 2005',$self->{la}{'en'}); |
431 | | - $cid{'REL'}=$self->addCollection($rv{mid},'RELT'); |
432 | | - %rv=$self->addExpression('UMLS Relation Attributes 2005',$self->{la}{'en'}); |
433 | | - $cid{'RELA'}=$self->addCollection($rv{mid},'RELT'); |
434 | | - %rv=$self->addExpression('The International Classification of Primary Care (ICPC), 1993',$self->{la}{'en'}); |
435 | | - $cid{'ICPC'}=$self->addCollection($rv{mid},''); |
436 | | - %rv=$self->addExpression('Medical Subject Headings (MeSH), 2005',$self->{la}{'en'}); |
437 | | - $cid{'MESH'}=$self->addCollection($rv{mid},''); |
438 | | -# %rv=$self->addExpression('Swiss-Prot',$self->{la}{'en'}); |
439 | | -# $cid{'SP'}=$self->addCollection($rv{mid},''); |
440 | | - return %cid; |
441 | | -} |
442 | | - |
443 | | -sub addCollection { |
444 | | - my $self=shift; |
445 | | - my $mid=shift; |
446 | | - my $collection_type=shift; |
447 | | - my $addcollection=$self->{dbt}->prepare('INSERT INTO uw_collection_ns(collection_mid,is_latest,collection_type) values(?,1,?)'); |
448 | | - $addcollection->execute($mid,$collection_type); |
449 | | - my $cid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
450 | | - my $updatefirstver=$self->{dbt}->prepare('UPDATE uw_collection_ns set first_ver=? where collection_id=?'); |
451 | | - $updatefirstver->execute($cid,$cid); |
452 | | - return $cid; |
453 | | -} |
454 | | - |
455 | | -sub importUMLSrelationtypes { |
456 | | - my $self=shift; |
457 | | - my $which=shift; |
458 | | - my $getreltypes; |
459 | | - if($which eq 'REL') { |
460 | | - # CHD and PAR are to be interpreted as RN and RB, SUBX is not used |
461 | | - $getreltypes=$self->{dbs}->prepare("select * from rel where ABBREV!='CHD' and ABBREV!='PAR' and ABBREV!='SUBX'"); |
462 | | - } elsif($which eq 'RELA') { |
463 | | - $getreltypes=$self->{dbs}->prepare("select * from rela"); |
464 | | - } |
465 | | - $getreltypes->execute(); |
466 | | - while(my $reltype=$getreltypes->fetchrow_hashref()) { |
467 | | - my %rv=$self->addExpression($reltype->{FULL},$self->{la}{'en'},0,$self->{cid}{$which},$reltype->{ABBREV}); |
468 | | - } |
469 | | -} |
470 | | - |
471 | | -sub importUMLSterms { |
472 | | - my $self=shift; |
473 | | - my $sab=shift; # the source abbreviation which to import |
474 | | - my $cid=shift; # which collection to associate the defined meanings with |
475 | | - |
476 | | - $getterm=$self->{dbs}->prepare("select str,cui,lat from MRCONSO where sab like ?"); |
477 | | - $getterm->execute($sab); |
478 | | - my %textmid; |
479 | | - while(my $r=$getterm->fetchrow_hashref()) { |
480 | | - my %rv; |
481 | | - my $dupe=0; |
482 | | - my %cuimid=$self->getMidForMember($r->{cui}); |
483 | | - |
484 | | - # Create new expression / Defined Meaning |
485 | | - if(!$cuimid{mid}) { |
486 | | - %rv=$self->addExpression($r->{str},$self->{la_iso}{lc($r->{lat})},0,$cid,$r->{cui}); |
487 | | - # If this is the first time we encounter this CUI, import the definitions |
488 | | - # Note that we'll take any definitions, regardless of the SABs specified! |
489 | | - if($rv{mid}!=-1) { |
490 | | - $getdefs=$self->{dbs}->prepare("select def from MRDEF where cui=?"); |
491 | | - $getdefs->execute($r->{cui}); |
492 | | - while(my $d=$getdefs->fetchrow_hashref()) { |
493 | | - # UMLS only has English definitions |
494 | | - $self->addMeaningText($rv{rid},$rv{mid},$d->{def},0,$self->{la}{'en'}); |
495 | | - } |
496 | | - $textmid{$rv{mid}}=1; |
497 | | - } |
498 | | - # Add as SynTrans to existing Defined Meaning |
499 | | - } else { |
500 | | - %rv=$self->addExpression($r->{str},$self->{la_iso}{lc($r->{lat})},$cuimid{mid}); |
501 | | - } |
502 | | - } |
503 | | -} |
504 | | - |
505 | | - |
506 | | -sub importGemetTerms { |
507 | | - my $self=shift; |
508 | | - my $cid=shift; |
509 | | - # Get all English terms as base |
510 | | - $getterm=$self->{dbs}->prepare("select * from term where langcode=?"); |
511 | | - $getterm->execute('en'); |
512 | | - while($r=$getterm->fetchrow_hashref()) { |
513 | | - # Add English term as defined meaning |
514 | | - my %rv=$self->addExpression($r->{name},$self->{la}{'en'},0,); |
515 | | - |
516 | | - # All translations |
517 | | - $gettrans=$self->{dbs}->prepare("select name,langcode from term where id_concept=? and langcode!='en'"); |
518 | | - $gettrans->execute($r->{id_concept}); |
519 | | - # Add them with the same meaning ID |
520 | | - while($t=$gettrans->fetchrow_hashref()) { |
521 | | - print "Language: $t->{langcode}\n"; |
522 | | - %tv=$self->addExpression($t->{name},$self->{la}{$t->{langcode}},$rv{mid}); |
523 | | - } |
524 | | - # All definitions |
525 | | - $getdef=$self->{dbs}->prepare("select definition,langcode from scope where id_concept=?"); |
526 | | - $getdef->execute($r->{id_concept}); |
527 | | - my $tcid=0; |
528 | | - while($d=$getdef->fetchrow_hashref()) { |
529 | | - if(!$tcid) { |
530 | | - my %mv=$self->addMeaningText($rv{rid},$rv{mid},$d->{definition},0,$self->{la}{$d->{langcode}}); |
531 | | - $tcid=$mv{tcid}; |
532 | | - |
533 | | - } else { |
534 | | - $self->addMeaningText($rv{rid},$rv{mid},$d->{definition},$tcid,$self->{la}{$d->{langcode}}); |
535 | | - |
536 | | - } |
537 | | - } |
538 | | - } |
539 | | -} |
540 | | - |
541 | | - |
542 | | -sub importGemetRelations { |
543 | | - my $self=shift; |
544 | | - # Import GEMET relations |
545 | | - my $getrels=$self->{dbs}->prepare("select * from relation"); |
546 | | - $getrels->execute(); |
547 | | - while(my $rrow=$getrels->fetchrow_hashref()) { |
548 | | - %rv_A=$self->findGemetItem($rrow->{id_concept}); |
549 | | - %rv_B=$self->findGemetItem($rrow->{id_relation}); |
550 | | - if($rv_A{mid} && $rv_B{mid}) { |
551 | | - $self->addRelation($rv_A{rid},$self->{reltypes}{$rrow->{id_type}},$rv_A{mid},$rv_B{mid}); |
552 | | - } |
553 | | - } |
554 | | -} |
555 | | - |
556 | | -sub importGemetThemes { |
557 | | - my $self=shift; |
558 | | - # Get all themes |
559 | | - my $getthemes=$self->{dbs}->prepare("select * from theme"); |
560 | | - my $gettheme_set=$self->{dbs}->prepare("select * from theme where id_theme=?"); |
561 | | - $getthemes->execute(); |
562 | | - while(my $theme_row=$getthemes->fetchrow_hashref()) { |
563 | | - my $theme=$theme_row->{description}; |
564 | | - my @themes=split(/[,;]( ){0,1}/,$theme); |
565 | | - foreach(@themes) { |
566 | | - $_=~s/^ *$//i; |
567 | | - if($_) { |
568 | | - # Does this theme have a expression? |
569 | | - my $t=$_; |
570 | | - my %it=$self->findLatestRevision($t,$self->{la}{$theme_row->{langcode}}); |
571 | | - if($it{liid}) { |
572 | | - # Get the meaning |
573 | | - print "NEW THEME: $t - retrieving existing MID for LIID... ".$it{liid}; |
574 | | - $it{mid}=$self->findMeaning($rv{liid}); |
575 | | - print $it{mid}."\n"; |
576 | | - #print $t. " is a dupe! - $dupes\n"; |
577 | | - #$dupes++; |
578 | | - } else { |
579 | | - # Do we have any of its translations? |
580 | | - # We can only add those if the theme does |
581 | | - # not contain a , - otherwise we can't match! |
582 | | - my $tra_mid=0; |
583 | | - if(!($theme_row->{description}=~m/[,;]/i)) { |
584 | | - print "NEW THEME: $t - no record, looking for its known translations in GEMET\n"; |
585 | | - #print "Checking for translations of ".$theme_row->{description}."\n"; |
586 | | - $gettheme_set->execute($theme_row->{id_theme}); |
587 | | - while((my $tra_row=$gettheme_set->fetchrow_hashref()) && !$tra_mid) { |
588 | | - if($tra_lid=$self->findExpressionId($tra_row->{description},$self->{la}{$tra_row->{langcode}})) { |
589 | | - $tra_mid=$self->findMeaning($tra_lid); |
590 | | - |
591 | | - } |
592 | | - } |
593 | | - } else { |
594 | | - print "NEW THEME: $t - split from the original GEMET data\n"; |
595 | | - } |
596 | | - # Let's make one |
597 | | - if($tra_mid) { |
598 | | - print "Adding new term as translation of $tra_mid\n"; |
599 | | - %it = $self->addExpression($t,$self->{la}{$theme_row->{langcode}},$tra_mid); |
600 | | - } else { |
601 | | - print "Adding new term independently, we do not know its translations.\n"; |
602 | | - %it = $self->addExpression($t,$self->{la}{$theme_row->{langcode}}); |
603 | | - } |
604 | | - |
605 | | - |
606 | | - } |
607 | | - |
608 | | - if(!$have_rel{$theme_row->{id_theme}}) { |
609 | | - # Get all items which have this relation |
610 | | - my $getconcepts=$self->{dbs}->prepare('select id_concept from concept_theme where id_theme=?'); |
611 | | - $getconcepts->execute($theme_row->{id_theme}); |
612 | | - while(my $concrow=$getconcepts->fetchrow_hashref()) { |
613 | | - # Get LIID,RID->meaning for the item |
614 | | - my %tr=$self->findGemetItem($concrow->{id_concept}); |
615 | | - if($tr{rid}) { |
616 | | - $self->addRelation($tr{rid},$self->{reltypes}{it},$tr{mid},$it{mid}); |
617 | | - print "Tied up a relation.."; |
618 | | - } else { |
619 | | - print "Missing record to tie the relation to.."; |
620 | | - } |
621 | | - } |
622 | | - print "\n"; |
623 | | - $have_rel{$theme_row->{id_theme}}=1; |
624 | | - } |
625 | | - |
626 | | - } |
627 | | - } |
628 | | - } |
629 | | - #Split theme into parts |
630 | | -} |
631 | | - |
632 | | -sub findGemetItem { |
633 | | - my $self=shift; |
634 | | - my $concept_id=shift; |
635 | | - # get a word, language |
636 | | - my $getword=$self->{dbs}->prepare("select langcode,name from term where id_concept=? LIMIT 1"); |
637 | | - $getword->execute($concept_id); |
638 | | - my $wordrow=$getword->fetchrow_hashref(); |
639 | | - |
640 | | - # find an expression + meaning |
641 | | - my %rv=$self->findLatestRevision($wordrow->{name},$self->{la}{$wordrow->{langcode}}); |
642 | | - $rv{mid}=$self->findMeaning($rv{liid}); |
643 | | - return %rv; |
644 | | -} |
645 | | - |
646 | | -sub addRelation { |
647 | | - my $self=shift; |
648 | | - my $revid=shift; |
649 | | - my $rtid=shift; |
650 | | - my $mid_A=shift; |
651 | | - my $mid_B=shift; |
652 | | - my $checkfordupes=shift; |
653 | | - |
654 | | - if($checkfordupes) { |
655 | | - my $checkRelationDuplicates=$self->{dbt}->prepare('select 1 as one from uw_meaning_relations where meaning1_mid=? and meaning2_mid=? and relationtype_mid=? and is_latest_set=1 limit 1'); |
656 | | - $checkRelationDuplicates->execute($mid_A,$mid_B,$rtid); |
657 | | - #print "Checking dupe $mid_A, $mid_B, relation type $rtid\n"; |
658 | | - my $dupecheck=$checkRelationDuplicates->fetchrow_hashref(); |
659 | | - if($dupecheck->{one}) { |
660 | | - print "Duplicate relation, not adding.\n"; |
661 | | - return false; |
662 | | - } |
663 | | - } |
664 | | - |
665 | | - my $newkey= $self->getSetIdWhere('uw_meaning_relations','meaning1_mid',$mid_A) || $self->getMaxId('set_id','uw_meaning_relations'); |
666 | | - my $addrel=$self->{dbt}->prepare('insert into uw_meaning_relations(set_id,meaning1_mid,meaning2_mid,relationtype_mid,is_latest_set,first_set,revision_id) values(?,?,?,?,?,?,?)'); |
667 | | - $addrel->execute($newkey,$mid_A,$mid_B,$rtid,1,$newkey,$revid); |
668 | | - |
669 | | - print "newkey: $newkey\n"; |
670 | | - print "mid_A: $mid_A\n"; |
671 | | - print "mid_B: $mid_B\n"; |
672 | | - print "rtid: $rtid\n"; |
673 | | - print "revid: $revid\n"; |
674 | | -} |
675 | | - |
676 | | - |
677 | | -sub findMeaning { |
678 | | - my $self=shift; |
679 | | - my $liid=shift; |
680 | | - # Search syntrans table |
681 | | - my $getsyn=$self->{dbt}->prepare("select defined_meaning_id from uw_syntrans where expression_id=?"); |
682 | | - $getsyn->execute($liid); |
683 | | - my $syn_row=$getsyn->fetchrow_hashref(); |
684 | | - if($syn_row->{defined_meaning_id}) { |
685 | | - return $syn_row->{defined_meaning_id}; |
686 | | - } |
687 | | - my $getdm=$self->{dbt}->prepare("select defined_meaning_id from uw_defined_meaning where expression_id=? limit 1"); |
688 | | - $getdm->execute($liid); |
689 | | - my $dm_row=$getdm->fetchrow_hashref(); |
690 | | - if($dm_row->{defined_meaning_id}) { |
691 | | - return $dm_row->{defined_meaning_id}; |
692 | | - } |
693 | | - return 0; |
694 | | -} |
695 | | - |
696 | | -# If there already is a meaning text for this DefinedMeaning, it will add the MeaningText as an alternative definition |
697 | | -sub addMeaningText { |
698 | | - my $self=shift; |
699 | | - my $rid=shift; |
700 | | - my $mid=shift; |
701 | | - my $meaningtext=shift; # optional |
702 | | - my $meaningtext_set=shift; # optional TCID set to join with |
703 | | - my $lid=shift; # ID, not code |
704 | | - my %rv; |
705 | | - |
706 | | - # Add text row entry |
707 | | - my $maketext=$self->{dbt}->prepare('insert into text(old_text) values(?)'); |
708 | | - $maketext->execute($meaningtext); |
709 | | - # Get text row ID |
710 | | - $tid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
711 | | - # Get new or existing translated content set ID |
712 | | - $tcid=$meaningtext_set || $self->getMaxId('set_id','translated_content'); |
713 | | - # Create new translated content set |
714 | | - my $maketc=$self->{dbt}->prepare('insert into translated_content(set_id,language_id,text_id,first_set,revision_id) values(?,?,?,?,?)'); |
715 | | - $maketc->execute($tcid,$lid,$tid,$tcid,$rid); |
716 | | - $rv{tcid}=$tcid; |
717 | | - |
718 | | - # THIS DOESN'T WORK FOR DEFINITIONS IN MULTIPLE LANGUAGES |
719 | | - # Check if a meaning text has already been set |
720 | | - my $lookformeaning=$self->{dbt}->prepare('select meaning_text_tcid from uw_defined_meaning where defined_meaning_id=? and is_latest_ver=1'); |
721 | | - $lookformeaning->execute($mid); |
722 | | - my $mrow=$lookformeaning->fetchrow_hashref(); |
723 | | - if($mrow->{meaning_text_tcid}) { |
724 | | - # There is a meaning text - the new one is only an alternative |
725 | | - my $altset=$self->getSetIdWhere('uw_alt_meaningtexts','meaning_mid',$mid) || $self->getMaxId('set_id','uw_alt_meaningtexts'); |
726 | | - my $addaltmeaning=$self->{dbt}->prepare('insert into uw_alt_meaningtexts(set_id,meaning_mid,meaning_text_tcid,is_latest_set,first_set,revision_id) values(?,?,?,?,?,?)'); |
727 | | - $addaltmeaning->execute($altset,$mid,$tcid,1,$altset,$rid) |
728 | | - } else { |
729 | | - my $updatemeaning=$self->{dbt}->prepare('update uw_defined_meaning set meaning_text_tcid=? where defined_meaning_id=?'); |
730 | | - $updatemeaning->execute($tcid,$mid); |
731 | | - } |
732 | | - return %rv; |
733 | | -} |
734 | | - |
735 | | - |
736 | | -# If the expression already exists, add a new DefinedMeaning - unless this is a translation or synonym; if a record already exists in SynTrans with this expression _and_ $translation_of as a DefinedMeaning, do not do anything |
737 | | -sub addExpression { |
738 | | - my $self=shift; |
739 | | - # return MID, RID, LID, TCID! |
740 | | - my $expression=shift; |
741 | | - my $lid=shift; # ID, not code |
742 | | - my $translation_of=shift; # 0 or MID (!), optional |
743 | | - my $collection_id=shift; # optional |
744 | | - my $collection_internal_member_id=shift; # what does the collection use to refer to this member? |
745 | | - my %rv; |
746 | | - my $isdupe=0; |
747 | | - my %firv=$self->findLatestRevision($expression,$lid); |
748 | | - if($firv{liid}) { $isdupe=1; } |
749 | | - |
750 | | - if(!$isdupe) { |
751 | | - |
752 | | - #create page |
753 | | - my $pt=$self->canonize($expression); |
754 | | - $makepage=$self->{dbt}->prepare('insert into page(page_namespace,page_title,page_is_new,page_title_language_id,page_touched) values(?,?,?,?,?)'); |
755 | | - $makepage->execute(16,$pt,1,$lid,$self->mwtimestamp()); |
756 | | - $pid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
757 | | - print "PID: $pid\n"; |
758 | | - |
759 | | - $rv{pid}=$pid; |
760 | | - |
761 | | - #create revision |
762 | | - $makerev=$self->{dbt}->prepare('insert into revision(rev_page,rev_comment,rev_user,rev_user_text,rev_timestamp) values(?,?,?,?,?)'); |
763 | | - $makerev->execute($pid,'Initial import',2,'GEMET',$self->mwtimestamp()); |
764 | | - |
765 | | - #get revision_id |
766 | | - $rid=$self->getId('select rev_id from revision where rev_page=?',$pid); |
767 | | - $rv{rid}=$rid; |
768 | | - |
769 | | - #update page to link to revision |
770 | | - $updatepage=$self->{dbt}->prepare('update page set page_latest=? where page_id=?'); |
771 | | - $updatepage->execute($rid,$pid); |
772 | | - |
773 | | - #create expression |
774 | | - $makeitem=$self->{dbt}->prepare('insert into uw_expression_ns(spelling,language_id,is_latest) values(?,?,1)'); |
775 | | - $makeitem->execute($expression,$lid); |
776 | | - $liid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
777 | | - $rv{liid}=$liid; |
778 | | - |
779 | | - # update firstver |
780 | | - $updateitem=$self->{dbt}->prepare('update uw_expression_ns set first_ver=? where expression_id=?'); |
781 | | - $updateitem->execute($liid,$liid); |
782 | | - |
783 | | - #update revision to link to expression |
784 | | - $updaterev=$self->{dbt}->prepare('update revision set rev_data_id=? where rev_id=?'); |
785 | | - $updaterev->execute($liid,$rid); |
786 | | - |
787 | | - } else { |
788 | | - |
789 | | - $rid=$firv{rid}; |
790 | | - $liid=$firv{liid}; |
791 | | - $rv{rid}=$rid; |
792 | | - $rv{liid}=$liid; |
793 | | - |
794 | | - } |
795 | | - |
796 | | - #create definedmeaning |
797 | | - if(!$translation_of) { |
798 | | - $makemean=$self->{dbt}->prepare('insert into uw_defined_meaning(expression_id,revision_id) values(?,?)'); |
799 | | - $makemean->execute($liid,$rid); |
800 | | - # We always want a syntrans record, so in this case it links to its own |
801 | | - # def. meaning |
802 | | - $translation_of=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
803 | | - $mid=$translation_of; |
804 | | - $rv{mid}=$mid; |
805 | | - $updatemeaningver=$self->{dbt}->prepare('update uw_defined_meaning set first_ver=? where defined_meaning_id=?'); |
806 | | - $updatemeaningver->execute($mid,$mid); |
807 | | - if($collection_id) { |
808 | | - $addtocoll=$self->{dbt}->prepare('insert into uw_collection_contents(set_id, collection_id, member_mid, is_latest_set, first_Set, revision_id, internal_member_id) values(?,?,?,?,?,?,?)'); |
809 | | - #fixme set association |
810 | | - $addtocoll->execute(1,$collection_id,$mid,1,1,$rid,$collection_internal_member_id); |
811 | | - } |
812 | | - } |
813 | | - |
814 | | - # Check if we already have this specific record |
815 | | - $checkdupes=$self->{dbt}->prepare('select set_id from uw_syntrans where defined_meaning_id=? and expression_id=?'); |
816 | | - $checkdupes->execute($translation_of,$liid); |
817 | | - my $duperow=$checkdupes->fetchrow_hashref(); |
818 | | - my $dupeid=$duperow->{set_id}; |
819 | | - if(!$dupeid) { |
820 | | - |
821 | | - # Check if this is part of a set |
822 | | - $getset=$self->{dbt}->prepare('select set_id from uw_syntrans where defined_meaning_id=? and is_latest_set=1'); |
823 | | - $getset->execute($mid); |
824 | | - $row=$getset->fetchrow_hashref(); |
825 | | - my $setid=$row->{set_id} || $self->getMaxId('set_id','uw_syntrans'); |
826 | | - # Add syntrans record |
827 | | - $maketrans=$self->{dbt}->prepare('insert into uw_syntrans(set_id,defined_meaning_id,expression_id,first_set,revision_id,is_latest_set,endemic_meaning) values(?,?,?,?,?,1,1)'); |
828 | | - $maketrans->execute($setid,$translation_of,$liid,$setid,$rid); |
829 | | - $rv{setid}=$setid; |
830 | | - $rv{mid}=$translation_of; |
831 | | - } else{ |
832 | | - $rv{setid}=$dupeid; # Dupe |
833 | | - $rv{mid}=-1; # Dupe |
834 | | - } |
835 | | - return %rv; |
836 | | - |
837 | | -} |
838 | | - |
839 | | -sub findLatestRevision { |
840 | | - my $self = shift; |
841 | | - my $expressionSpelling = shift; |
842 | | - my $languageId = shift; |
843 | | - |
844 | | - my $expressionId = $self->findExpressionId($expressionSpelling, $languageId); |
845 | | - if ($expressionId != 0) { |
846 | | - my $getRevisionId = $self->{dbt}->prepare('select rev_id from revision where rev_data_id=?'); |
847 | | - $getRevisionId->execute($expressionId); |
848 | | - my %revision; |
849 | | - $revision{liid} = $expressionId; |
850 | | - $revision{rid} = $getRevisionId->fetchrow_hashref->{rev_id}; |
851 | | - return %revision; |
852 | | - } else { |
853 | | - return 0; |
854 | | - } |
855 | | -} |
856 | | - |
857 | | -sub findExpressionId { |
858 | | - my $self = shift; |
859 | | - my $expressionSpelling = shift; |
860 | | - my $languageId = shift; |
861 | | - |
862 | | - my $getItem = $self->{dbt}->prepare("select expression_id from uw_expression_ns where spelling=binary ? and language_id=? and is_latest=1"); |
863 | | - $getItem->execute($expressionSpelling, $languageId); |
864 | | - my $itemRow = $getItem->fetchrow_hashref(); |
865 | | - if ($itemRow) { |
866 | | - return $itemRow->{expression_id}; |
867 | | - } else { |
868 | | - return 0; |
869 | | - } |
870 | | -} |
871 | | - |
872 | | -sub getMaxId { |
873 | | - my $self=shift; |
874 | | - my $field=shift; |
875 | | - my $table=shift; |
876 | | - $getmax=$self->{dbt}->prepare("select max($field) as maxset from $table"); |
877 | | - $getmax->execute(); |
878 | | - my $row=$getmax->fetchrow_hashref(); |
879 | | - return $row->{maxset}+1; |
880 | | -} |
881 | | - |
882 | | -sub getSetIdWhere { |
883 | | - my $self=shift; |
884 | | - my $table=shift; |
885 | | - my $wherefield=shift; |
886 | | - my $wherekey=shift; |
887 | | - $getmax=$self->{dbt}->prepare("select set_id from $table WHERE $wherefield=? AND is_latest_set=1 limit 1"); |
888 | | - $getmax->execute($wherekey); |
889 | | - my $row=$getmax->fetchrow_hashref(); |
890 | | - return $row->{set_id}; |
891 | | -} |
892 | | - |
893 | | - |
894 | | -sub getId { |
895 | | - my $self=shift; |
896 | | - my $prep=shift; |
897 | | - $prep=~m/select (.*?) from/i; |
898 | | - my $field=$1; |
899 | | - my $getlang=$self->{dbt}->prepare($prep); |
900 | | - $getlang->execute(@_); |
901 | | - my $row=$getlang->fetchrow_hashref(); |
902 | | - my $id=$row->{$field}; |
903 | | - return $id; |
904 | | -} |
905 | | - |
906 | | -sub mwtimestamp { |
907 | | - my $self=shift; |
908 | | - use POSIX qw(strftime); |
909 | | - return(strftime "%Y%m%d%H%M%S", localtime); |
910 | | -} |
911 | | - |
912 | | - |
913 | | -sub canonize { |
914 | | - my $self=shift; |
915 | | - my $title=shift; |
916 | | - #$title=ucfirst($title); |
917 | | - $title=~s/ /_/ig; |
918 | | - return $title; |
919 | | -} |
920 | | - |
921 | | -sub initlangs { |
922 | | - my $self=shift; |
923 | | - %langs=( |
924 | | - en_en=>'English', |
925 | | - en_de=>'Englisch', |
926 | | - 'en-US_de'=>'Englisch (USA)', |
927 | | - 'en-US_en'=>'English (United States)', |
928 | | - bg_en=>'Bulgarian', |
929 | | - bg_de=>'Bulgarisch', |
930 | | - cs_en=>'Czech', |
931 | | - cs_de=>'Tschechisch', |
932 | | - da_en=>'Dansk', |
933 | | - da_de=>'D?isch', |
934 | | - de_en=>'German', |
935 | | - de_de=>'Deutsch', |
936 | | - es_en=>'Spanish', |
937 | | - es_de=>'Spanisch', |
938 | | - et_en=>'Estonian', |
939 | | - et_de=>'Estnisch', |
940 | | - eu_en=>'Basque', |
941 | | - eu_de=>'Baskisch', |
942 | | - fi_en=>'Finnish', |
943 | | - fi_de=>'Finnisch', |
944 | | - fr_en=>'French', |
945 | | - fr_de=>'Franz?isch', |
946 | | - hu_en=>'Hungarian', |
947 | | - hu_de=>'Ungarisch', |
948 | | - it_en=>'Italian', |
949 | | - it_de=>'Italienisch', |
950 | | - nl_en=>'Dutch', |
951 | | - nl_de=>'Niederl?disch', |
952 | | - no_en=>'Norwegian', |
953 | | - no_de=>'Norwegisch', |
954 | | - pl_en=>'Polish', |
955 | | - pl_de=>'Polnisch', |
956 | | - pt_en=>'Portuguese', |
957 | | - pt_de=>'Portugiesisch', |
958 | | - ru_en=>'Russian', |
959 | | - ru_de=>'Russisch', |
960 | | - sk_en=>'Slovak', |
961 | | - sk_de=>'Slowakische Sprache', |
962 | | - sl_en=>'Slovenian', |
963 | | - sl_de=>'Slowenisch', |
964 | | - el_en=>'Greek', |
965 | | - el_de=>'Griechisch', |
966 | | - sv_en=>'Swedish', |
967 | | - sv_de=>'Schwedisch'); |
968 | | - foreach(keys(%langs)) { |
969 | | - $key=$_; |
970 | | - $key=~m/(.*?)_(.*)/i; |
971 | | - $lang=$1; |
972 | | - #print "Lang: $lang\n"; |
973 | | - $wordlang=$2; |
974 | | - if($wordlang eq 'en') { |
975 | | - $addwm=$self->{dbt}->prepare("insert into language(wikimedia_key) values(?)"); |
976 | | - $addwm->execute($lang); |
977 | | - } |
978 | | - } |
979 | | - foreach(keys(%langs)) { |
980 | | - $key=$_; |
981 | | - $key=~m/(.*?)_(.*)/i; |
982 | | - $lang=$1; |
983 | | - #print "Lang: $lang\n"; |
984 | | - $wordlang=$2; |
985 | | - $langword_u=$langs{$key}; |
986 | | - $langword=encode("utf8",$langword_u); |
987 | | - $newwm=$self->{dbt}->prepare("select language_id from language where wikimedia_key=?"); |
988 | | - $newwm->execute($lang); |
989 | | - my $row=$newwm->fetchrow_hashref(); |
990 | | - $newwm->execute('en'); |
991 | | - my $en_row=$newwm->fetchrow_hashref(); |
992 | | - $newwm->execute('de'); |
993 | | - my $de_row=$newwm->fetchrow_hashref(); |
994 | | - $newword=$self->{dbt}->prepare("insert into language_names values (?,?,?)"); |
995 | | - if($wordlang eq 'en') { |
996 | | - $newword->execute($row->{language_id},$en_row->{language_id},$langword); |
997 | | - } elsif($wordlang eq 'de') { |
998 | | - $newword->execute($row->{language_id},$de_row->{language_id},$langword); |
999 | | - } |
1000 | | - } |
1001 | | -} |
1002 | | - |
1003 | | -sub initRel { |
1004 | | - my $self=shift; |
1005 | | - my $cid=shift; |
1006 | | - %rel_types=( |
1007 | | - bt_en=>'broader terms', |
1008 | | - bt_de=>'breitere Begriffe', |
1009 | | - nt_en=>'narrower terms', |
1010 | | - nt_de=>'engere Begriffe', |
1011 | | - rt_en=>'related terms', |
1012 | | - rt_de=>'verwandte Begriffe', |
1013 | | - it_en=>'is part of theme', |
1014 | | - it_de=>'ist Themenbestandteil von' |
1015 | | - ); |
1016 | | - |
1017 | | - %rel_definitions=( |
1018 | | - bt_en=>'Those terms in a thesaurus which are broader than others', |
1019 | | - bt_de=>'Die Begriffe in einem Thesaurus, die breiter sind als andere', |
1020 | | - nt_en=>'Those terms in a thesaurus which are narrower than others', |
1021 | | - nt_de=>'Die Begriffe in einem Thesaurus, die enger sind als andere', |
1022 | | - rt_en=>'Those terms in a thesaurus which are related to others', |
1023 | | - rt_de=>'Die Begriffe in einem Thesaurus, die mit anderen verwandt sind', |
1024 | | - it_en=>'Those terms in a thesaurus or dictionary which are associated with a topic', |
1025 | | - it_de=>'Die Begriffe in einem Thesaurus oder Woerterbuch, die mit einem Thema assoziiert sind'); |
1026 | | - |
1027 | | - foreach(keys(%rel_types)) { |
1028 | | - $key=$_; |
1029 | | - $key=~m/(..)_(..)/i; |
1030 | | - $ident=$1; |
1031 | | - $lang=$2; |
1032 | | - if($lang eq 'de') { |
1033 | | - $en_key="$ident\_en"; |
1034 | | - my %rv=$self->addExpression($rel_types{$en_key},$self->{la}{'en'},0,$cid,$ident); |
1035 | | - $self->addMeaningText($rv{rid},$rv{mid},$rel_definitions{$en_key},0,$self->{la}{'en'}); |
1036 | | - my %dv=$self->addExpression($rel_types{$key},$self->{la}{'de'},$rv{'mid'}); |
1037 | | - $self->addMeaningText($dv{rid},$dv{mid},$rel_definitions{$key},$rv{'tcid'},$self->{la}{'de'}); |
1038 | | - } |
1039 | | - } |
1040 | | -} |
1041 | | - |
1042 | | -sub loadLangs { |
1043 | | - my $self=shift; |
1044 | | - my %la; |
1045 | | - $getlangs=$self->{dbt}->prepare('select language_id,wikimedia_key from language'); |
1046 | | - $getlangs->execute(); |
1047 | | - while($langrow=$getlangs->fetchrow_hashref()) { |
1048 | | - $la{$langrow->{wikimedia_key}}=$langrow->{language_id}; |
1049 | | - } |
1050 | | - return %la; |
1051 | | -} |
1052 | | - |
1053 | | -sub loadLangsIso { |
1054 | | - my $self=shift; |
1055 | | - my %la_iso; |
1056 | | - $getlangs=$self->{dbt}->prepare('select language_id,iso639_2 from language'); |
1057 | | - $getlangs->execute(); |
1058 | | - while($langrow=$getlangs->fetchrow_hashref()) { |
1059 | | - $la_iso{$langrow->{iso639_2}}=$langrow->{language_id}; |
1060 | | - } |
1061 | | - return %la_iso; |
1062 | | -} |
1063 | | - |
1064 | | -return(1); |
Index: trunk/extensions/Wikidata/perl-tools/Import OmegaWiki.pl |
— | — | @@ -0,0 +1,55 @@ |
| 2 | +use OmegaWiki; |
| 3 | +use POSIX qw(strftime); |
| 4 | + |
| 5 | +my $startTime = time; |
| 6 | + |
| 7 | +# Example usage to import UMLS completely into an existing OmegaWiki database: |
| 8 | +# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
| 9 | +# $importer->setSourceDB('umls'); |
| 10 | +# $importer->initialize; |
| 11 | +# $importer->importCompleteUMLS(); |
| 12 | + |
| 13 | +# Example usage to import a part of UMLS into an existing OmegaWiki database: |
| 14 | +# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
| 15 | +# $importer->setSourceDB('umls'); |
| 16 | +# $importer->initialize; |
| 17 | +# my %sourceAbbreviations = $importer->loadSourceAbbreviations(); |
| 18 | +# delete($sourceAbbreviations{"MSH"}); |
| 19 | +# $importer->importUMLS(\%sourceAbbreviations); |
| 20 | + |
| 21 | +my $importer=new OmegaWiki('wikidata_icpc','root',''); |
| 22 | +$importer->setSourceDB('umls'); |
| 23 | +#$importer->setSourceDB('swissprot'); |
| 24 | +$importer->initialize; |
| 25 | +#$importer->importCompleteUMLS(); |
| 26 | + |
| 27 | +#read the source abbreviations and remove those you do not wish to import |
| 28 | +my %sourceAbbreviations = $importer->loadSourceAbbreviations(); |
| 29 | +my @deleteList; |
| 30 | +while (($key, $val) = each(%sourceAbbreviations)) { |
| 31 | + |
| 32 | +#remove all that contains "MSH": |
| 33 | +# if (index($key,"MSH") >= 0) { |
| 34 | +# push(@deleteList, $key); |
| 35 | +# } |
| 36 | + |
| 37 | +#remove all that does not contain "ICPC": |
| 38 | + if (index($key,"ICPC") < 0) { |
| 39 | + push(@deleteList, $key); |
| 40 | + } |
| 41 | +} |
| 42 | + |
| 43 | +foreach $sab (@deleteList){ |
| 44 | + delete($sourceAbbreviations{$sab}); |
| 45 | +} |
| 46 | + |
| 47 | +$importer->importUMLS(\%sourceAbbreviations); |
| 48 | + |
| 49 | + |
| 50 | +my $endTime = time; |
| 51 | +print "\n"; |
| 52 | +print "Import started at: " . (strftime "%H:%M:%S", localtime($startTime)) . "\n"; |
| 53 | +print "Import ended at: " . (strftime "%H:%M:%S", localtime($endTime)) . "\n"; |
| 54 | +print "Elapsed time: " . (strftime "%H:%M:%S", gmtime($endTime - $startTime)) . "\n"; |
| 55 | + |
| 56 | +exit 0; |
\ No newline at end of file |
Index: trunk/extensions/Wikidata/perl-tools/OmegaWiki.pm |
— | — | @@ -0,0 +1,1063 @@ |
| 2 | +# Example usage to import UMLS into an existing OmegaWiki database: |
| 3 | +# use OmegaWiki; |
| 4 | +# my $importer=new OmegaWiki('wikidatadb','root','MyPass'); |
| 5 | +# $importer->setSourceDB('umls'); |
| 6 | +# $importer->initialize; |
| 7 | +# $importer->importCompleteUMLS(); |
| 8 | +# |
| 9 | +# NOTE: When importing UMLS, we expect the presence of the semantic network data |
| 10 | +# in the tables SRDEF and the manually created tables SEMRELHIER and SEMTYPEHIER. |
| 11 | +# SEMRELHIER and SEMTYPEHIER contain information about the relations between |
| 12 | +# semantic types and relation types, using RB as the code for "broader than" |
| 13 | +# and RN for "narrower than". |
| 14 | +# |
| 15 | +# Todo for UMLS: |
| 16 | +# SyntransCollection |
| 17 | +# RelationCollection |
| 18 | +# Fully deal with alternative definitions referring to the same concept |
| 19 | +# Deal with preferred lexical expressions, primary concepts (general weighting mechanism?) |
| 20 | + |
| 21 | +package OmegaWiki; |
| 22 | +use DBI; |
| 23 | +use Encode; |
| 24 | +use POSIX qw(strftime); |
| 25 | + |
| 26 | +sub new { |
| 27 | + my $type=shift; |
| 28 | + my $self={}; |
| 29 | + $self->{targetdb}=shift; |
| 30 | + $self->{targetuser}=shift; |
| 31 | + $self->{targetpass}=shift; |
| 32 | + $self->{targethost}=shift || 'localhost'; |
| 33 | + $self->{targetport}=shift || '3306'; |
| 34 | + $self->{targetdriver}=shift || 'mysql'; |
| 35 | + bless($self, $type); |
| 36 | + return($self); |
| 37 | +} |
| 38 | + |
| 39 | +sub setSourceDB { |
| 40 | + my $self=shift; |
| 41 | + $self->{sourcedb}=shift; |
| 42 | + $self->{sourceuser}=shift || $self->{targetuser}; |
| 43 | + $self->{sourcepass}=shift || $self->{targetpass}; |
| 44 | + $self->{sourcehost}=shift || $self->{targethost}; |
| 45 | + $self->{sourceport}=shift || $self->{targetport}; |
| 46 | + $self->{sourcedriver}=shift || $self->{targetdriver}; |
| 47 | +} |
| 48 | + |
| 49 | +sub connectSourceDB { |
| 50 | + my $self=shift; |
| 51 | + my $dsn = 'dbi:'.$self->{sourcedriver}.':'.$self->{sourcedb}.':'.$self->{sourcehost}.':'.$self->{sourceport}; |
| 52 | + $self->{dbs}=DBI->connect($dsn,$self->{sourceuser},$self->{sourcepass}); |
| 53 | +} |
| 54 | + |
| 55 | +sub connectTargetDB { |
| 56 | + my $self=shift; |
| 57 | + my $dsn = 'dbi:'.$self->{targetdriver}.':'.$self->{targetdb}.':'.$self->{targethost}.':'.$self->{targetport}; |
| 58 | + $self->{dbt}=DBI->connect($dsn,$self->{targetuser},$self->{targetpass}); |
| 59 | +} |
| 60 | + |
| 61 | +sub connectDBs() { |
| 62 | + my $self = shift; |
| 63 | + |
| 64 | + $self->connectSourceDB(); |
| 65 | + $self->connectTargetDB(); |
| 66 | +} |
| 67 | + |
| 68 | +sub loadLanguages() { |
| 69 | + my $self = shift; |
| 70 | + |
| 71 | + my %la=$self->loadLangs(); |
| 72 | + $self->{la}=\%la; |
| 73 | + my %la_iso=$self->loadLangsIso(); |
| 74 | + $self->{la_iso}=\%la_iso; |
| 75 | +} |
| 76 | + |
| 77 | +sub initialize { |
| 78 | + my $self=shift; |
| 79 | + $self->connectDBs(); |
| 80 | + $self->loadLanguages(); |
| 81 | +} |
| 82 | + |
| 83 | +sub importCompleteUMLS { |
| 84 | + my $self=shift; |
| 85 | + my $level=shift || 0; # 0= complete; 1=reltypes+; 2=rel+ |
| 86 | + |
| 87 | + my %sourceAbbreviations = $self->loadSourceAbbreviations(); |
| 88 | + $self->importUMLS(\%sourceAbbreviations, $level); |
| 89 | +} |
| 90 | + |
| 91 | +sub importUMLS() { |
| 92 | + my $self=shift; |
| 93 | + my $sourceAbbreviationsReference = shift; |
| 94 | + my $level=shift || 0; # 0= complete; 1=reltypes+; 2=rel+ |
| 95 | + |
| 96 | + my %sourceAbbreviations = %{$sourceAbbreviationsReference}; |
| 97 | + my %cid=$self->getOrCreateCollections(\%sourceAbbreviations); |
| 98 | + $self->{cid}=\%cid; |
| 99 | + |
| 100 | + if ($level<1){ |
| 101 | + while (($sourceAbbreviation, $collectionId) = each %cid) { |
| 102 | + print "Import UMLS terms for $sourceAbbreviation\n"; |
| 103 | + $self->importUMLSterms($sourceAbbreviation, $collectionId); |
| 104 | + } |
| 105 | + } |
| 106 | + |
| 107 | + if($level<2) { |
| 108 | + print "Import UMLS relation types 'REL'\n"; |
| 109 | + $self->importUMLSrelationtypes('REL'); |
| 110 | + print "Import UMLS relation types 'RELA'\n"; |
| 111 | + $self->importUMLSrelationtypes('RELA'); |
| 112 | + } |
| 113 | + |
| 114 | + if($level<3) { |
| 115 | + while (($sourceAbbreviation, $collectionId) = each %cid) { |
| 116 | + my %rt=$self->loadReltypes(); |
| 117 | + $self->{reltypes}=\%rt; |
| 118 | + print "Import UMLS relations 'REL' for $sourceAbbreviation\n"; |
| 119 | + $self->importUMLSrelations('REL',$sourceAbbreviation); |
| 120 | + print "Import UMLS relations 'RELA' for $sourceAbbreviation\n"; |
| 121 | + $self->importUMLSrelations('RELA',$sourceAbbreviation); |
| 122 | + } |
| 123 | + } |
| 124 | + |
| 125 | + if($level<4) { |
| 126 | + print "Import SN types 'STY'\n"; |
| 127 | + $self->importSNtypes('STY'); |
| 128 | + print "Import SN types 'RL'\n"; |
| 129 | + $self->importSNtypes('RL'); |
| 130 | + print "Import ST relations 'STY'\n"; |
| 131 | + $self->importSTrelations('STY'); |
| 132 | + print "Import ST relations 'RL'\n"; |
| 133 | + $self->importSTrelations('RL'); |
| 134 | + # $self->importSTrelations2(); |
| 135 | + } |
| 136 | + |
| 137 | + if($level<5) { |
| 138 | + while (($sourceAbbreviation, $collectionId) = each %cid) { |
| 139 | + my %attribs=$self->loadAttributes(); |
| 140 | + $self->{attribs}=\%attribs; |
| 141 | + print "Import UMLS types for $sourceAbbreviation\n"; |
| 142 | + $self->importUMLSstypes($sourceAbbreviation); |
| 143 | + } |
| 144 | + } |
| 145 | +} |
| 146 | + |
| 147 | +sub importGEMET { |
| 148 | + my $self=shift; |
| 149 | + $self->connectSourceDB(); |
| 150 | + $self->connectTargetDB(); |
| 151 | + my %la=$self->loadLangs(); |
| 152 | + $self->{la}=\%la; |
| 153 | + my %cid=$self->bootstrapGemetCollection(); |
| 154 | + $self->{cid}=\%cid; |
| 155 | + $self->initRel($self->{cid}{'GEMETREL'}); |
| 156 | + my %rt=$self->loadReltypes(); |
| 157 | + $self->{reltypes}=\%rt; |
| 158 | + $self->importGemetTerms(); |
| 159 | + $self->importGemetRelations(); |
| 160 | + $self->importGemetThemes(); |
| 161 | +} |
| 162 | + |
| 163 | +sub importUMLSstypes { |
| 164 | + my $self=shift; |
| 165 | + my $sab=shift; |
| 166 | + |
| 167 | + my $getassocs=$self->{dbs}->prepare("select MRSTY.CUI, MRSTY.STY from MRCONSO,MRSTY where MRCONSO.SAB like ? and MRCONSO.CUI=MRSTY.CUI"); |
| 168 | + $getassocs->execute($sab); |
| 169 | + while(my $row=$getassocs->fetchrow_hashref()) { |
| 170 | + my %rv=$self->getMidForMember($row->{CUI}); |
| 171 | + my $att=$self->{attribs}{$row->{STY}}; |
| 172 | + #print "$rv{mid} is a $row->{STY} ($att)\n"; |
| 173 | + $self->addRelation($rv{rid},0,$rv{mid},$att, my $checkfordupes=1); |
| 174 | + } |
| 175 | +} |
| 176 | + |
| 177 | +sub getCollections(){ |
| 178 | + my $self=shift; |
| 179 | + my %cid; |
| 180 | + $cid{'CRISP'}=$self->findCollection($self->findMeaning($self->findItem('CRISP Thesaurus, 2005',$self->{la}{'en'}))); |
| 181 | + $cid{'STY'}=$self->findCollection($self->findMeaning($self->findItem('Semantic Network 2005AC Semantic Types',$self->{la}{'en'}))); |
| 182 | + $cid{'RL'}=$self->findCollection($self->findMeaning($self->findItem('Semantic Network 2005AC Relation Types',$self->{la}{'en'}))); |
| 183 | + $cid{'REL'}=$self->findCollection($self->findMeaning($self->findItem('UMLS Relation Types 2005',$self->{la}{'en'}))); |
| 184 | + $cid{'RELA'}=$self->findCollection($self->findMeaning($self->findItem('UMLS Relation Attributes 2005',$self->{la}{'en'}))); |
| 185 | + $cid{'ICPC'}=$self->findCollection($self->findMeaning($self->findItem('The International Classification of Primary Care (ICPC), 1993',$self->{la}{'en'}))); |
| 186 | + $cid{'MESH'}=$self->findCollection($self->findMeaning($self->findItem('Medical Subject Headings (MeSH), 2005',$self->{la}{'en'}))); |
| 187 | +# $cid{'SP'}=$self->findCollection($self->findMeaning($self->findItem('Swiss-Prot',$self->{la}{'en'}))); |
| 188 | + return %cid; |
| 189 | +} |
| 190 | + |
| 191 | +sub findCollection() { |
| 192 | + my $self=shift; |
| 193 | + my $mid=shift; |
| 194 | + my $findcoll=$self->{dbt}->prepare("select collection_id from uw_collection_ns where collection_mid=? and is_latest=1"); |
| 195 | + $findcoll->execute($mid); |
| 196 | + my $row=$findcoll->fetchrow_hashref(); |
| 197 | + return $row->{collection_id}; |
| 198 | +} |
| 199 | + |
| 200 | +sub getCollection { |
| 201 | + my $self = shift; |
| 202 | + my $expression = shift; |
| 203 | + |
| 204 | + return $self->findCollection($self->findMeaning($self->findExpressionId($expression,$self->{la}{'en'}))); |
| 205 | +} |
| 206 | + |
| 207 | +sub bootstrapCollection { |
| 208 | + my $self = shift; |
| 209 | + my $expression = shift; |
| 210 | + my $collectionType = shift; |
| 211 | + |
| 212 | + %rv=$self->addExpression($expression,$self->{la}{'en'}); |
| 213 | + return $self->addCollection($rv{mid},$collectionType); |
| 214 | +} |
| 215 | + |
| 216 | +sub getOrCreateCollection { |
| 217 | + my $self = shift; |
| 218 | + my $expression = shift; |
| 219 | + |
| 220 | + my $result = $self->getCollection($expression); |
| 221 | + |
| 222 | + if (!$result) { |
| 223 | + $result = $self->bootstrapCollection($expression, ''); |
| 224 | + } |
| 225 | + |
| 226 | + return $result; |
| 227 | +} |
| 228 | + |
| 229 | +sub getOrCreateCollections { |
| 230 | + my $self = shift; |
| 231 | + my $sourceAbbreviationsReference = shift; |
| 232 | + my %sourceAbbreviations = %{$sourceAbbreviationsReference}; |
| 233 | + my %cid; |
| 234 | + |
| 235 | + while (($key, $value) = each %sourceAbbreviations) { |
| 236 | + $cid{$key} = $self->getOrCreateCollection($value); |
| 237 | + } |
| 238 | + |
| 239 | + return %cid; |
| 240 | +} |
| 241 | + |
| 242 | +sub loadSourceAbbreviations { |
| 243 | + my $self = shift; |
| 244 | + my %sab; |
| 245 | + |
| 246 | + my $dataset = $self->{dbs}->prepare("select * from mrsab"); |
| 247 | + $dataset->execute(); |
| 248 | + while (my $row = $dataset->fetchrow_hashref()) { |
| 249 | + $sab{$row->{RSAB}} = $row->{SON}; |
| 250 | + } |
| 251 | + return %sab; |
| 252 | +} |
| 253 | + |
| 254 | +# SEMTYPEHIER and SEMRELHIER contain only the is_a relationships, whereas |
| 255 | +# srstr contains all others |
| 256 | +# FIXME: only use SRSTR |
| 257 | +sub importSTrelations2 { |
| 258 | + my $self=shift; |
| 259 | + my $getrels=$self->{dbs}->prepare("select * from srstr where rel!='isa'"); |
| 260 | + $getrels->execute(); |
| 261 | + while(my $row=$getrels->fetchrow_hashref()) { |
| 262 | + my %rv1=$self->getMidForMember($row->{TYPE1},$self->{cid}{'STY'}); |
| 263 | + my %rv2=$self->getMidForMember($row->{TYPE2},$self->{cid}{'STY'}); |
| 264 | + my $rtmid=$self->{reltypes}{$row->{REL}}; |
| 265 | + #print "Adding relation $row->{REL} ($rtmid) between $row->{TYPE1} and $row->{TYPE2}\n"; |
| 266 | + $self->addRelation($rv1{rid},$rtmid,$rv1{mid},$rv2{mid},my $checkfordupes=1); |
| 267 | + } |
| 268 | +} |
| 269 | + |
| 270 | + |
| 271 | +sub importSTrelations { |
| 272 | + my $self=shift; |
| 273 | + my $which=shift; |
| 274 | + my $table; |
| 275 | + my $field1; |
| 276 | + my $field2; |
| 277 | + if($which eq 'STY') { |
| 278 | + $table='semtypehier'; |
| 279 | + $field1='SEMTYPE1'; |
| 280 | + $field2='SEMTYPE2'; |
| 281 | + } elsif($which eq 'RL') { |
| 282 | + $table='semrelhier'; |
| 283 | + $field1='RELTYPE1'; |
| 284 | + $field2='RELTYPE2'; |
| 285 | + } |
| 286 | + |
| 287 | + my $gettypehier=$self->{dbs}->prepare("select * from $table"); |
| 288 | + $gettypehier->execute(); |
| 289 | + while(my $typehier=$gettypehier->fetchrow_hashref()) { |
| 290 | + my %rv1=$self->getMidForMember($typehier->{$field1},$self->{cid}{$which}); |
| 291 | + my %rv2=$self->getMidForMember($typehier->{$field2},$self->{cid}{$which}); |
| 292 | + my $rtmid=$self->{reltypes}{$typehier->{RELATION}}; |
| 293 | + print "Adding relation $typehier->{RELATION} ($rtmid) between $typehier->{$field1} and $typehier->{$field2}\n"; |
| 294 | + $self->addRelation($rv1{rid},$rtmid,$rv1{mid},$rv2{mid},my $checkfordupes=1); |
| 295 | + } |
| 296 | +} |
| 297 | + |
| 298 | +# $member_id - the collection-internal identifier for this member |
| 299 | +# $cid The collection in which to search for this member (optional) |
| 300 | +# Returns the DefinedMeaningID and the revision id |
| 301 | +sub getMidForMember { |
| 302 | + my $self=shift; |
| 303 | + my $member_id=shift; |
| 304 | + my $cid=shift; |
| 305 | + my %rv; |
| 306 | + my $getmid; |
| 307 | + if($cid) { |
| 308 | + $getmid=$self->{dbt}->prepare("select member_mid,revision_id from uw_collection_contents where collection_id=? and internal_member_id=? and is_latest_set=1 limit 1"); |
| 309 | + $getmid->execute($cid,$member_id); |
| 310 | + } else { |
| 311 | + $getmid=$self->{dbt}->prepare("select member_mid,revision_id from uw_collection_contents where internal_member_id=? and is_latest_set=1 limit 1"); |
| 312 | + $getmid->execute($member_id); |
| 313 | + } |
| 314 | + my $member_mid=$getmid->fetchrow_hashref(); |
| 315 | + $rv{mid}=$member_mid->{member_mid}; |
| 316 | + $rv{rid}=$member_mid->{revision_id}; |
| 317 | + return %rv; |
| 318 | + |
| 319 | +} |
| 320 | + |
| 321 | +sub loadReltypes { |
| 322 | + my $self=shift; |
| 323 | + my %reltypes; |
| 324 | + # Get the relation type |
| 325 | + $getreltype=$self->{dbt}->prepare("select member_mid,internal_member_id from uw_collection_contents,uw_collection_ns where uw_collection_ns.collection_type='RELT' and uw_collection_ns.collection_id=uw_collection_contents.collection_id"); |
| 326 | + $getreltype->execute(); |
| 327 | + while (my $reltype=$getreltype->fetchrow_hashref()) { |
| 328 | + $reltypes{$reltype->{internal_member_id}}=$reltype->{member_mid}; |
| 329 | + } |
| 330 | + return %reltypes; |
| 331 | +} |
| 332 | + |
| 333 | +sub loadAttributes { |
| 334 | + my $self=shift; |
| 335 | + my %attributes; |
| 336 | + $getatt=$self->{dbt}->prepare("select member_mid,internal_member_id from uw_collection_contents,uw_collection_ns where uw_collection_ns.collection_type='ATTR' and uw_collection_ns.collection_id=uw_collection_contents.collection_id"); |
| 337 | + $getatt->execute(); |
| 338 | + while (my $att=$getatt->fetchrow_hashref()) { |
| 339 | + $attributes{$att->{internal_member_id}}=$att->{member_mid}; |
| 340 | + } |
| 341 | + return %attributes; |
| 342 | +} |
| 343 | + |
| 344 | + |
| 345 | +# Get all SRDEF attributes |
| 346 | +# Get relations between SRDEF |
| 347 | +sub importSNtypes { |
| 348 | + my $self=shift; |
| 349 | + my $type=shift; |
| 350 | + $getsemtypes=$self->{dbs}->prepare("select semtypeab,type,definition from srdef where type=?"); |
| 351 | + $getsemtypes->execute($type); |
| 352 | + while (my $semtype=$getsemtypes->fetchrow_hashref()) { |
| 353 | + my $type_expression=$semtype->{semtypeab}; |
| 354 | + my $type_code=$type_expression; |
| 355 | + $type_expression=~s/_/ /g; |
| 356 | + $type_expression=lc($type_expression); |
| 357 | + my %rv=$self->addExpression($type_expression,$self->{la}{'en'},0,$self->{cid}{$type},$type_code); |
| 358 | + $self->addMeaningText($rv{'rid'},$rv{'mid'},$semtype->{definition},undef,$self->{la}{'en'}); |
| 359 | + #print $type_expression." - $self->{cid}{$type} - $type_code\n"; |
| 360 | + } |
| 361 | +} |
| 362 | + |
| 363 | +sub importUMLSrelations { |
| 364 | + my $self=shift; |
| 365 | + my $which=shift; # REL or RELA |
| 366 | + my $source=shift; # SAB as MySQL LIKE string |
| 367 | + my $getrels; |
| 368 | + |
| 369 | + if($which eq 'REL') { |
| 370 | + $getrels=$self->{dbs}->prepare("select cui1,cui2,rel from MRREL where sab like ?"); |
| 371 | + } elsif($which eq 'RELA') { |
| 372 | + $getrels=$self->{dbs}->prepare("select cui1,cui2,rela from MRREL where sab like ? and rela!=''"); |
| 373 | + } |
| 374 | + $getrels->execute($source); |
| 375 | + while(my $rel=$getrels->fetchrow_hashref()) { |
| 376 | + my $relid=$rel->{lc($which)}; |
| 377 | + # These mean the same thing |
| 378 | + if($relid eq 'CHD') { |
| 379 | + $relid='RN'; |
| 380 | + } elsif($relid eq 'PAR') { |
| 381 | + $relid='RB'; |
| 382 | + } |
| 383 | + $getmid=$self->{dbt}->prepare("select member_mid,revision_id from uw_collection_contents where internal_member_id=? and is_latest_set=1 limit 1"); |
| 384 | + # Note that the direction in UMLS is opposite to ours |
| 385 | + $getmid->execute($rel->{cui2}); |
| 386 | + my $mid1=$getmid->fetchrow_hashref(); |
| 387 | + $getmid->execute($rel->{cui1}); |
| 388 | + my $mid2=$getmid->fetchrow_hashref(); |
| 389 | + # FIXME: We are ignoring term relations for now! |
| 390 | + if(($mid1->{member_mid} && $mid2->{member_mid}) && ($mid1->{member_mid} != $mid2->{member_mid}) && $self->{reltypes}{$relid}) { |
| 391 | + # Add the relation |
| 392 | + #print "Found relation ".$relid." (".$self->{reltypes}{$relid}.") between ".$mid1->{member_mid}." and ".$mid2->{member_mid}.".\n"; |
| 393 | + $self->addRelation($mid1->{revision_id},$self->{reltypes}{$relid},$mid1->{member_mid},$mid2->{member_mid},my $checkfordupes=1); |
| 394 | + } else { |
| 395 | + if(!$mid1->{member_mid} && $mid2->{member_mid}) { |
| 396 | + print "Did not find MID for ".$rel->{cui1}."!\n"; |
| 397 | + } elsif($mid1->{member_mid} && !$mid2->{member_mid}) { |
| 398 | + print "Did not find MID for ".$rel->{cui2}."!\n"; |
| 399 | + } elsif(!$mid1->{member_mid} && !$mid2->{member_mid}) { |
| 400 | + print "Did not find MIDs for ".$rel->{cui1}." and ".$rel->{cui2}."!\n"; |
| 401 | + } |
| 402 | + } |
| 403 | + } |
| 404 | + |
| 405 | +} |
| 406 | + |
| 407 | + |
| 408 | +sub bootstrapGemetCollection { |
| 409 | + my $self=shift; |
| 410 | + my %cid; |
| 411 | + %rv=$self->addExpression('GEMET Environmental Thesaurus Relation Types',$self->{la}{'en'}); |
| 412 | + $cid{'GEMETREL'}=$self->addCollection($rv{mid},'RELT'); |
| 413 | + %rv=$self->addExpression('GEMET Environmental Thesaurus Relation Types',$self->{la}{'en'}); |
| 414 | + $cid{'GEMET'}=$self->addCollection($rv{mid},''); |
| 415 | + return %cid; |
| 416 | +} |
| 417 | + |
| 418 | + |
| 419 | +sub bootstrapCollections { |
| 420 | + my $self=shift; |
| 421 | + my %cid; |
| 422 | + my %rv; |
| 423 | + |
| 424 | + %rv=$self->addExpression('CRISP Thesaurus, 2005',$self->{la}{'en'}); |
| 425 | + $cid{'CRISP'}=$self->addCollection($rv{mid},''); |
| 426 | + %rv=$self->addExpression('Semantic Network 2005AC Semantic Types',$self->{la}{'en'}); |
| 427 | + $cid{'STY'}=$self->addCollection($rv{mid},'ATTR'); |
| 428 | + %rv=$self->addExpression('Semantic Network 2005AC Relation Types',$self->{la}{'en'}); |
| 429 | + $cid{'RL'}=$self->addCollection($rv{mid},'RELT'); |
| 430 | + %rv=$self->addExpression('UMLS Relation Types 2005',$self->{la}{'en'}); |
| 431 | + $cid{'REL'}=$self->addCollection($rv{mid},'RELT'); |
| 432 | + %rv=$self->addExpression('UMLS Relation Attributes 2005',$self->{la}{'en'}); |
| 433 | + $cid{'RELA'}=$self->addCollection($rv{mid},'RELT'); |
| 434 | + %rv=$self->addExpression('The International Classification of Primary Care (ICPC), 1993',$self->{la}{'en'}); |
| 435 | + $cid{'ICPC'}=$self->addCollection($rv{mid},''); |
| 436 | + %rv=$self->addExpression('Medical Subject Headings (MeSH), 2005',$self->{la}{'en'}); |
| 437 | + $cid{'MESH'}=$self->addCollection($rv{mid},''); |
| 438 | +# %rv=$self->addExpression('Swiss-Prot',$self->{la}{'en'}); |
| 439 | +# $cid{'SP'}=$self->addCollection($rv{mid},''); |
| 440 | + return %cid; |
| 441 | +} |
| 442 | + |
| 443 | +sub addCollection { |
| 444 | + my $self=shift; |
| 445 | + my $mid=shift; |
| 446 | + my $collection_type=shift; |
| 447 | + my $addcollection=$self->{dbt}->prepare('INSERT INTO uw_collection_ns(collection_mid,is_latest,collection_type) values(?,1,?)'); |
| 448 | + $addcollection->execute($mid,$collection_type); |
| 449 | + my $cid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
| 450 | + my $updatefirstver=$self->{dbt}->prepare('UPDATE uw_collection_ns set first_ver=? where collection_id=?'); |
| 451 | + $updatefirstver->execute($cid,$cid); |
| 452 | + return $cid; |
| 453 | +} |
| 454 | + |
| 455 | +sub importUMLSrelationtypes { |
| 456 | + my $self=shift; |
| 457 | + my $which=shift; |
| 458 | + my $getreltypes; |
| 459 | + if($which eq 'REL') { |
| 460 | + # CHD and PAR are to be interpreted as RN and RB, SUBX is not used |
| 461 | + $getreltypes=$self->{dbs}->prepare("select * from rel where ABBREV!='CHD' and ABBREV!='PAR' and ABBREV!='SUBX'"); |
| 462 | + } elsif($which eq 'RELA') { |
| 463 | + $getreltypes=$self->{dbs}->prepare("select * from rela"); |
| 464 | + } |
| 465 | + $getreltypes->execute(); |
| 466 | + while(my $reltype=$getreltypes->fetchrow_hashref()) { |
| 467 | + my %rv=$self->addExpression($reltype->{FULL},$self->{la}{'en'},0,$self->{cid}{$which},$reltype->{ABBREV}); |
| 468 | + } |
| 469 | +} |
| 470 | + |
| 471 | +sub importUMLSterms { |
| 472 | + my $self=shift; |
| 473 | + my $sab=shift; # the source abbreviation which to import |
| 474 | + my $cid=shift; # which collection to associate the defined meanings with |
| 475 | + |
| 476 | + $getterm=$self->{dbs}->prepare("select str,cui,lat from MRCONSO where sab like ?"); |
| 477 | + $getterm->execute($sab); |
| 478 | + my %textmid; |
| 479 | + while(my $r=$getterm->fetchrow_hashref()) { |
| 480 | + my %rv; |
| 481 | + my $dupe=0; |
| 482 | + my %cuimid=$self->getMidForMember($r->{cui}); |
| 483 | + |
| 484 | + # Create new expression / Defined Meaning |
| 485 | + if(!$cuimid{mid}) { |
| 486 | + %rv=$self->addExpression($r->{str},$self->{la_iso}{lc($r->{lat})},0,$cid,$r->{cui}); |
| 487 | + # If this is the first time we encounter this CUI, import the definitions |
| 488 | + # Note that we'll take any definitions, regardless of the SABs specified! |
| 489 | + if($rv{mid}!=-1) { |
| 490 | + $getdefs=$self->{dbs}->prepare("select def from MRDEF where cui=?"); |
| 491 | + $getdefs->execute($r->{cui}); |
| 492 | + while(my $d=$getdefs->fetchrow_hashref()) { |
| 493 | + # UMLS only has English definitions |
| 494 | + $self->addMeaningText($rv{rid},$rv{mid},$d->{def},0,$self->{la}{'en'}); |
| 495 | + } |
| 496 | + $textmid{$rv{mid}}=1; |
| 497 | + } |
| 498 | + # Add as SynTrans to existing Defined Meaning |
| 499 | + } else { |
| 500 | + %rv=$self->addExpression($r->{str},$self->{la_iso}{lc($r->{lat})},$cuimid{mid}); |
| 501 | + } |
| 502 | + } |
| 503 | +} |
| 504 | + |
| 505 | + |
| 506 | +sub importGemetTerms { |
| 507 | + my $self=shift; |
| 508 | + my $cid=shift; |
| 509 | + # Get all English terms as base |
| 510 | + $getterm=$self->{dbs}->prepare("select * from term where langcode=?"); |
| 511 | + $getterm->execute('en'); |
| 512 | + while($r=$getterm->fetchrow_hashref()) { |
| 513 | + # Add English term as defined meaning |
| 514 | + my %rv=$self->addExpression($r->{name},$self->{la}{'en'},0,); |
| 515 | + |
| 516 | + # All translations |
| 517 | + $gettrans=$self->{dbs}->prepare("select name,langcode from term where id_concept=? and langcode!='en'"); |
| 518 | + $gettrans->execute($r->{id_concept}); |
| 519 | + # Add them with the same meaning ID |
| 520 | + while($t=$gettrans->fetchrow_hashref()) { |
| 521 | + print "Language: $t->{langcode}\n"; |
| 522 | + %tv=$self->addExpression($t->{name},$self->{la}{$t->{langcode}},$rv{mid}); |
| 523 | + } |
| 524 | + # All definitions |
| 525 | + $getdef=$self->{dbs}->prepare("select definition,langcode from scope where id_concept=?"); |
| 526 | + $getdef->execute($r->{id_concept}); |
| 527 | + my $tcid=0; |
| 528 | + while($d=$getdef->fetchrow_hashref()) { |
| 529 | + if(!$tcid) { |
| 530 | + my %mv=$self->addMeaningText($rv{rid},$rv{mid},$d->{definition},0,$self->{la}{$d->{langcode}}); |
| 531 | + $tcid=$mv{tcid}; |
| 532 | + |
| 533 | + } else { |
| 534 | + $self->addMeaningText($rv{rid},$rv{mid},$d->{definition},$tcid,$self->{la}{$d->{langcode}}); |
| 535 | + |
| 536 | + } |
| 537 | + } |
| 538 | + } |
| 539 | +} |
| 540 | + |
| 541 | + |
| 542 | +sub importGemetRelations { |
| 543 | + my $self=shift; |
| 544 | + # Import GEMET relations |
| 545 | + my $getrels=$self->{dbs}->prepare("select * from relation"); |
| 546 | + $getrels->execute(); |
| 547 | + while(my $rrow=$getrels->fetchrow_hashref()) { |
| 548 | + %rv_A=$self->findGemetItem($rrow->{id_concept}); |
| 549 | + %rv_B=$self->findGemetItem($rrow->{id_relation}); |
| 550 | + if($rv_A{mid} && $rv_B{mid}) { |
| 551 | + $self->addRelation($rv_A{rid},$self->{reltypes}{$rrow->{id_type}},$rv_A{mid},$rv_B{mid}); |
| 552 | + } |
| 553 | + } |
| 554 | +} |
| 555 | + |
| 556 | +sub importGemetThemes { |
| 557 | + my $self=shift; |
| 558 | + # Get all themes |
| 559 | + my $getthemes=$self->{dbs}->prepare("select * from theme"); |
| 560 | + my $gettheme_set=$self->{dbs}->prepare("select * from theme where id_theme=?"); |
| 561 | + $getthemes->execute(); |
| 562 | + while(my $theme_row=$getthemes->fetchrow_hashref()) { |
| 563 | + my $theme=$theme_row->{description}; |
| 564 | + my @themes=split(/[,;]( ){0,1}/,$theme); |
| 565 | + foreach(@themes) { |
| 566 | + $_=~s/^ *$//i; |
| 567 | + if($_) { |
| 568 | + # Does this theme have a expression? |
| 569 | + my $t=$_; |
| 570 | + my %it=$self->findLatestRevision($t,$self->{la}{$theme_row->{langcode}}); |
| 571 | + if($it{liid}) { |
| 572 | + # Get the meaning |
| 573 | + print "NEW THEME: $t - retrieving existing MID for LIID... ".$it{liid}; |
| 574 | + $it{mid}=$self->findMeaning($rv{liid}); |
| 575 | + print $it{mid}."\n"; |
| 576 | + #print $t. " is a dupe! - $dupes\n"; |
| 577 | + #$dupes++; |
| 578 | + } else { |
| 579 | + # Do we have any of its translations? |
| 580 | + # We can only add those if the theme does |
| 581 | + # not contain a , - otherwise we can't match! |
| 582 | + my $tra_mid=0; |
| 583 | + if(!($theme_row->{description}=~m/[,;]/i)) { |
| 584 | + print "NEW THEME: $t - no record, looking for its known translations in GEMET\n"; |
| 585 | + #print "Checking for translations of ".$theme_row->{description}."\n"; |
| 586 | + $gettheme_set->execute($theme_row->{id_theme}); |
| 587 | + while((my $tra_row=$gettheme_set->fetchrow_hashref()) && !$tra_mid) { |
| 588 | + if($tra_lid=$self->findExpressionId($tra_row->{description},$self->{la}{$tra_row->{langcode}})) { |
| 589 | + $tra_mid=$self->findMeaning($tra_lid); |
| 590 | + |
| 591 | + } |
| 592 | + } |
| 593 | + } else { |
| 594 | + print "NEW THEME: $t - split from the original GEMET data\n"; |
| 595 | + } |
| 596 | + # Let's make one |
| 597 | + if($tra_mid) { |
| 598 | + print "Adding new term as translation of $tra_mid\n"; |
| 599 | + %it = $self->addExpression($t,$self->{la}{$theme_row->{langcode}},$tra_mid); |
| 600 | + } else { |
| 601 | + print "Adding new term independently, we do not know its translations.\n"; |
| 602 | + %it = $self->addExpression($t,$self->{la}{$theme_row->{langcode}}); |
| 603 | + } |
| 604 | + |
| 605 | + |
| 606 | + } |
| 607 | + |
| 608 | + if(!$have_rel{$theme_row->{id_theme}}) { |
| 609 | + # Get all items which have this relation |
| 610 | + my $getconcepts=$self->{dbs}->prepare('select id_concept from concept_theme where id_theme=?'); |
| 611 | + $getconcepts->execute($theme_row->{id_theme}); |
| 612 | + while(my $concrow=$getconcepts->fetchrow_hashref()) { |
| 613 | + # Get LIID,RID->meaning for the item |
| 614 | + my %tr=$self->findGemetItem($concrow->{id_concept}); |
| 615 | + if($tr{rid}) { |
| 616 | + $self->addRelation($tr{rid},$self->{reltypes}{it},$tr{mid},$it{mid}); |
| 617 | + print "Tied up a relation.."; |
| 618 | + } else { |
| 619 | + print "Missing record to tie the relation to.."; |
| 620 | + } |
| 621 | + } |
| 622 | + print "\n"; |
| 623 | + $have_rel{$theme_row->{id_theme}}=1; |
| 624 | + } |
| 625 | + |
| 626 | + } |
| 627 | + } |
| 628 | + } |
| 629 | + #Split theme into parts |
| 630 | +} |
| 631 | + |
| 632 | +sub findGemetItem { |
| 633 | + my $self=shift; |
| 634 | + my $concept_id=shift; |
| 635 | + # get a word, language |
| 636 | + my $getword=$self->{dbs}->prepare("select langcode,name from term where id_concept=? LIMIT 1"); |
| 637 | + $getword->execute($concept_id); |
| 638 | + my $wordrow=$getword->fetchrow_hashref(); |
| 639 | + |
| 640 | + # find an expression + meaning |
| 641 | + my %rv=$self->findLatestRevision($wordrow->{name},$self->{la}{$wordrow->{langcode}}); |
| 642 | + $rv{mid}=$self->findMeaning($rv{liid}); |
| 643 | + return %rv; |
| 644 | +} |
| 645 | + |
| 646 | +sub addRelation { |
| 647 | + my $self=shift; |
| 648 | + my $revid=shift; |
| 649 | + my $rtid=shift; |
| 650 | + my $mid_A=shift; |
| 651 | + my $mid_B=shift; |
| 652 | + my $checkfordupes=shift; |
| 653 | + |
| 654 | + if($checkfordupes) { |
| 655 | + my $checkRelationDuplicates=$self->{dbt}->prepare('select 1 as one from uw_meaning_relations where meaning1_mid=? and meaning2_mid=? and relationtype_mid=? and is_latest_set=1 limit 1'); |
| 656 | + $checkRelationDuplicates->execute($mid_A,$mid_B,$rtid); |
| 657 | + #print "Checking dupe $mid_A, $mid_B, relation type $rtid\n"; |
| 658 | + my $dupecheck=$checkRelationDuplicates->fetchrow_hashref(); |
| 659 | + if($dupecheck->{one}) { |
| 660 | + print "Duplicate relation, not adding.\n"; |
| 661 | + return false; |
| 662 | + } |
| 663 | + } |
| 664 | + |
| 665 | + my $newkey= $self->getSetIdWhere('uw_meaning_relations','meaning1_mid',$mid_A) || $self->getMaxId('set_id','uw_meaning_relations'); |
| 666 | + my $addrel=$self->{dbt}->prepare('insert into uw_meaning_relations(set_id,meaning1_mid,meaning2_mid,relationtype_mid,is_latest_set,first_set,revision_id) values(?,?,?,?,?,?,?)'); |
| 667 | + $addrel->execute($newkey,$mid_A,$mid_B,$rtid,1,$newkey,$revid); |
| 668 | + |
| 669 | + print "newkey: $newkey\n"; |
| 670 | + print "mid_A: $mid_A\n"; |
| 671 | + print "mid_B: $mid_B\n"; |
| 672 | + print "rtid: $rtid\n"; |
| 673 | + print "revid: $revid\n"; |
| 674 | +} |
| 675 | + |
| 676 | + |
| 677 | +sub findMeaning { |
| 678 | + my $self=shift; |
| 679 | + my $liid=shift; |
| 680 | + # Search syntrans table |
| 681 | + my $getsyn=$self->{dbt}->prepare("select defined_meaning_id from uw_syntrans where expression_id=?"); |
| 682 | + $getsyn->execute($liid); |
| 683 | + my $syn_row=$getsyn->fetchrow_hashref(); |
| 684 | + if($syn_row->{defined_meaning_id}) { |
| 685 | + return $syn_row->{defined_meaning_id}; |
| 686 | + } |
| 687 | + my $getdm=$self->{dbt}->prepare("select defined_meaning_id from uw_defined_meaning where expression_id=? limit 1"); |
| 688 | + $getdm->execute($liid); |
| 689 | + my $dm_row=$getdm->fetchrow_hashref(); |
| 690 | + if($dm_row->{defined_meaning_id}) { |
| 691 | + return $dm_row->{defined_meaning_id}; |
| 692 | + } |
| 693 | + return 0; |
| 694 | +} |
| 695 | + |
| 696 | +# If there already is a meaning text for this DefinedMeaning, it will add the MeaningText as an alternative definition |
| 697 | +sub addMeaningText { |
| 698 | + my $self=shift; |
| 699 | + my $rid=shift; |
| 700 | + my $mid=shift; |
| 701 | + my $meaningtext=shift; # optional |
| 702 | + my $meaningtext_set=shift; # optional TCID set to join with |
| 703 | + my $lid=shift; # ID, not code |
| 704 | + my %rv; |
| 705 | + |
| 706 | + # Add text row entry |
| 707 | + my $maketext=$self->{dbt}->prepare('insert into text(old_text) values(?)'); |
| 708 | + $maketext->execute($meaningtext); |
| 709 | + # Get text row ID |
| 710 | + $tid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
| 711 | + # Get new or existing translated content set ID |
| 712 | + $tcid=$meaningtext_set || $self->getMaxId('set_id','translated_content'); |
| 713 | + # Create new translated content set |
| 714 | + my $maketc=$self->{dbt}->prepare('insert into translated_content(set_id,language_id,text_id,first_set,revision_id) values(?,?,?,?,?)'); |
| 715 | + $maketc->execute($tcid,$lid,$tid,$tcid,$rid); |
| 716 | + $rv{tcid}=$tcid; |
| 717 | + |
| 718 | + # THIS DOESN'T WORK FOR DEFINITIONS IN MULTIPLE LANGUAGES |
| 719 | + # Check if a meaning text has already been set |
| 720 | + my $lookformeaning=$self->{dbt}->prepare('select meaning_text_tcid from uw_defined_meaning where defined_meaning_id=? and is_latest_ver=1'); |
| 721 | + $lookformeaning->execute($mid); |
| 722 | + my $mrow=$lookformeaning->fetchrow_hashref(); |
| 723 | + if($mrow->{meaning_text_tcid}) { |
| 724 | + # There is a meaning text - the new one is only an alternative |
| 725 | + my $altset=$self->getSetIdWhere('uw_alt_meaningtexts','meaning_mid',$mid) || $self->getMaxId('set_id','uw_alt_meaningtexts'); |
| 726 | + my $addaltmeaning=$self->{dbt}->prepare('insert into uw_alt_meaningtexts(set_id,meaning_mid,meaning_text_tcid,is_latest_set,first_set,revision_id) values(?,?,?,?,?,?)'); |
| 727 | + $addaltmeaning->execute($altset,$mid,$tcid,1,$altset,$rid) |
| 728 | + } else { |
| 729 | + my $updatemeaning=$self->{dbt}->prepare('update uw_defined_meaning set meaning_text_tcid=? where defined_meaning_id=?'); |
| 730 | + $updatemeaning->execute($tcid,$mid); |
| 731 | + } |
| 732 | + return %rv; |
| 733 | +} |
| 734 | + |
| 735 | + |
| 736 | +# If the expression already exists, add a new DefinedMeaning - unless this is a translation or synonym; if a record already exists in SynTrans with this expression _and_ $translation_of as a DefinedMeaning, do not do anything |
| 737 | +sub addExpression { |
| 738 | + my $self=shift; |
| 739 | + # return MID, RID, LID, TCID! |
| 740 | + my $expression=shift; |
| 741 | + my $lid=shift; # ID, not code |
| 742 | + my $translation_of=shift; # 0 or MID (!), optional |
| 743 | + my $collection_id=shift; # optional |
| 744 | + my $collection_internal_member_id=shift; # what does the collection use to refer to this member? |
| 745 | + my %rv; |
| 746 | + my $isdupe=0; |
| 747 | + my %firv=$self->findLatestRevision($expression,$lid); |
| 748 | + if($firv{liid}) { $isdupe=1; } |
| 749 | + |
| 750 | + if(!$isdupe) { |
| 751 | + |
| 752 | + #create page |
| 753 | + my $pt=$self->canonize($expression); |
| 754 | + $makepage=$self->{dbt}->prepare('insert into page(page_namespace,page_title,page_is_new,page_title_language_id,page_touched) values(?,?,?,?,?)'); |
| 755 | + $makepage->execute(16,$pt,1,$lid,$self->mwtimestamp()); |
| 756 | + $pid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
| 757 | + print "PID: $pid\n"; |
| 758 | + |
| 759 | + $rv{pid}=$pid; |
| 760 | + |
| 761 | + #create revision |
| 762 | + $makerev=$self->{dbt}->prepare('insert into revision(rev_page,rev_comment,rev_user,rev_user_text,rev_timestamp) values(?,?,?,?,?)'); |
| 763 | + $makerev->execute($pid,'Initial import',2,'GEMET',$self->mwtimestamp()); |
| 764 | + |
| 765 | + #get revision_id |
| 766 | + $rid=$self->getId('select rev_id from revision where rev_page=?',$pid); |
| 767 | + $rv{rid}=$rid; |
| 768 | + |
| 769 | + #update page to link to revision |
| 770 | + $updatepage=$self->{dbt}->prepare('update page set page_latest=? where page_id=?'); |
| 771 | + $updatepage->execute($rid,$pid); |
| 772 | + |
| 773 | + #create expression |
| 774 | + $makeitem=$self->{dbt}->prepare('insert into uw_expression_ns(spelling,language_id,is_latest) values(?,?,1)'); |
| 775 | + $makeitem->execute($expression,$lid); |
| 776 | + $liid=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
| 777 | + $rv{liid}=$liid; |
| 778 | + |
| 779 | + # update firstver |
| 780 | + $updateitem=$self->{dbt}->prepare('update uw_expression_ns set first_ver=? where expression_id=?'); |
| 781 | + $updateitem->execute($liid,$liid); |
| 782 | + |
| 783 | + #update revision to link to expression |
| 784 | + $updaterev=$self->{dbt}->prepare('update revision set rev_data_id=? where rev_id=?'); |
| 785 | + $updaterev->execute($liid,$rid); |
| 786 | + |
| 787 | + } else { |
| 788 | + |
| 789 | + $rid=$firv{rid}; |
| 790 | + $liid=$firv{liid}; |
| 791 | + $rv{rid}=$rid; |
| 792 | + $rv{liid}=$liid; |
| 793 | + |
| 794 | + } |
| 795 | + |
| 796 | + #create definedmeaning |
| 797 | + if(!$translation_of) { |
| 798 | + $makemean=$self->{dbt}->prepare('insert into uw_defined_meaning(expression_id,revision_id) values(?,?)'); |
| 799 | + $makemean->execute($liid,$rid); |
| 800 | + # We always want a syntrans record, so in this case it links to its own |
| 801 | + # def. meaning |
| 802 | + $translation_of=$self->{dbt}->last_insert_id(undef,undef,undef,undef); |
| 803 | + $mid=$translation_of; |
| 804 | + $rv{mid}=$mid; |
| 805 | + $updatemeaningver=$self->{dbt}->prepare('update uw_defined_meaning set first_ver=? where defined_meaning_id=?'); |
| 806 | + $updatemeaningver->execute($mid,$mid); |
| 807 | + if($collection_id) { |
| 808 | + $addtocoll=$self->{dbt}->prepare('insert into uw_collection_contents(set_id, collection_id, member_mid, is_latest_set, first_Set, revision_id, internal_member_id) values(?,?,?,?,?,?,?)'); |
| 809 | + #fixme set association |
| 810 | + $addtocoll->execute(1,$collection_id,$mid,1,1,$rid,$collection_internal_member_id); |
| 811 | + } |
| 812 | + } |
| 813 | + |
| 814 | + # Check if we already have this specific record |
| 815 | + $checkdupes=$self->{dbt}->prepare('select set_id from uw_syntrans where defined_meaning_id=? and expression_id=?'); |
| 816 | + $checkdupes->execute($translation_of,$liid); |
| 817 | + my $duperow=$checkdupes->fetchrow_hashref(); |
| 818 | + my $dupeid=$duperow->{set_id}; |
| 819 | + if(!$dupeid) { |
| 820 | + |
| 821 | + # Check if this is part of a set |
| 822 | + $getset=$self->{dbt}->prepare('select set_id from uw_syntrans where defined_meaning_id=? and is_latest_set=1'); |
| 823 | + $getset->execute($mid); |
| 824 | + $row=$getset->fetchrow_hashref(); |
| 825 | + my $setid=$row->{set_id} || $self->getMaxId('set_id','uw_syntrans'); |
| 826 | + # Add syntrans record |
| 827 | + $maketrans=$self->{dbt}->prepare('insert into uw_syntrans(set_id,defined_meaning_id,expression_id,first_set,revision_id,is_latest_set,endemic_meaning) values(?,?,?,?,?,1,1)'); |
| 828 | + $maketrans->execute($setid,$translation_of,$liid,$setid,$rid); |
| 829 | + $rv{setid}=$setid; |
| 830 | + $rv{mid}=$translation_of; |
| 831 | + } else{ |
| 832 | + $rv{setid}=$dupeid; # Dupe |
| 833 | + $rv{mid}=-1; # Dupe |
| 834 | + } |
| 835 | + return %rv; |
| 836 | + |
| 837 | +} |
| 838 | + |
| 839 | +sub findLatestRevision { |
| 840 | + my $self = shift; |
| 841 | + my $expressionSpelling = shift; |
| 842 | + my $languageId = shift; |
| 843 | + |
| 844 | + my $expressionId = $self->findExpressionId($expressionSpelling, $languageId); |
| 845 | + if ($expressionId != 0) { |
| 846 | + my $getRevisionId = $self->{dbt}->prepare('select rev_id from revision where rev_data_id=?'); |
| 847 | + $getRevisionId->execute($expressionId); |
| 848 | + my %revision; |
| 849 | + $revision{liid} = $expressionId; |
| 850 | + $revision{rid} = $getRevisionId->fetchrow_hashref->{rev_id}; |
| 851 | + return %revision; |
| 852 | + } else { |
| 853 | + return 0; |
| 854 | + } |
| 855 | +} |
| 856 | + |
| 857 | +sub findExpressionId { |
| 858 | + my $self = shift; |
| 859 | + my $expressionSpelling = shift; |
| 860 | + my $languageId = shift; |
| 861 | + |
| 862 | + my $getItem = $self->{dbt}->prepare("select expression_id from uw_expression_ns where spelling=binary ? and language_id=? and is_latest=1"); |
| 863 | + $getItem->execute($expressionSpelling, $languageId); |
| 864 | + my $itemRow = $getItem->fetchrow_hashref(); |
| 865 | + if ($itemRow) { |
| 866 | + return $itemRow->{expression_id}; |
| 867 | + } else { |
| 868 | + return 0; |
| 869 | + } |
| 870 | +} |
| 871 | + |
| 872 | +sub getMaxId { |
| 873 | + my $self=shift; |
| 874 | + my $field=shift; |
| 875 | + my $table=shift; |
| 876 | + $getmax=$self->{dbt}->prepare("select max($field) as maxset from $table"); |
| 877 | + $getmax->execute(); |
| 878 | + my $row=$getmax->fetchrow_hashref(); |
| 879 | + return $row->{maxset}+1; |
| 880 | +} |
| 881 | + |
| 882 | +sub getSetIdWhere { |
| 883 | + my $self=shift; |
| 884 | + my $table=shift; |
| 885 | + my $wherefield=shift; |
| 886 | + my $wherekey=shift; |
| 887 | + $getmax=$self->{dbt}->prepare("select set_id from $table WHERE $wherefield=? AND is_latest_set=1 limit 1"); |
| 888 | + $getmax->execute($wherekey); |
| 889 | + my $row=$getmax->fetchrow_hashref(); |
| 890 | + return $row->{set_id}; |
| 891 | +} |
| 892 | + |
| 893 | + |
| 894 | +sub getId { |
| 895 | + my $self=shift; |
| 896 | + my $prep=shift; |
| 897 | + $prep=~m/select (.*?) from/i; |
| 898 | + my $field=$1; |
| 899 | + my $getlang=$self->{dbt}->prepare($prep); |
| 900 | + $getlang->execute(@_); |
| 901 | + my $row=$getlang->fetchrow_hashref(); |
| 902 | + my $id=$row->{$field}; |
| 903 | + return $id; |
| 904 | +} |
| 905 | + |
| 906 | +sub mwtimestamp { |
| 907 | + my $self=shift; |
| 908 | + use POSIX qw(strftime); |
| 909 | + return(strftime "%Y%m%d%H%M%S", localtime); |
| 910 | +} |
| 911 | + |
| 912 | + |
| 913 | +sub canonize { |
| 914 | + my $self=shift; |
| 915 | + my $title=shift; |
| 916 | + #$title=ucfirst($title); |
| 917 | + $title=~s/ /_/ig; |
| 918 | + return $title; |
| 919 | +} |
| 920 | + |
| 921 | +sub initlangs { |
| 922 | + my $self=shift; |
| 923 | + %langs=( |
| 924 | + en_en=>'English', |
| 925 | + en_de=>'Englisch', |
| 926 | + 'en-US_de'=>'Englisch (USA)', |
| 927 | + 'en-US_en'=>'English (United States)', |
| 928 | + bg_en=>'Bulgarian', |
| 929 | + bg_de=>'Bulgarisch', |
| 930 | + cs_en=>'Czech', |
| 931 | + cs_de=>'Tschechisch', |
| 932 | + da_en=>'Dansk', |
| 933 | + da_de=>'D?isch', |
| 934 | + de_en=>'German', |
| 935 | + de_de=>'Deutsch', |
| 936 | + es_en=>'Spanish', |
| 937 | + es_de=>'Spanisch', |
| 938 | + et_en=>'Estonian', |
| 939 | + et_de=>'Estnisch', |
| 940 | + eu_en=>'Basque', |
| 941 | + eu_de=>'Baskisch', |
| 942 | + fi_en=>'Finnish', |
| 943 | + fi_de=>'Finnisch', |
| 944 | + fr_en=>'French', |
| 945 | + fr_de=>'Franz?isch', |
| 946 | + hu_en=>'Hungarian', |
| 947 | + hu_de=>'Ungarisch', |
| 948 | + it_en=>'Italian', |
| 949 | + it_de=>'Italienisch', |
| 950 | + nl_en=>'Dutch', |
| 951 | + nl_de=>'Niederl?disch', |
| 952 | + no_en=>'Norwegian', |
| 953 | + no_de=>'Norwegisch', |
| 954 | + pl_en=>'Polish', |
| 955 | + pl_de=>'Polnisch', |
| 956 | + pt_en=>'Portuguese', |
| 957 | + pt_de=>'Portugiesisch', |
| 958 | + ru_en=>'Russian', |
| 959 | + ru_de=>'Russisch', |
| 960 | + sk_en=>'Slovak', |
| 961 | + sk_de=>'Slowakische Sprache', |
| 962 | + sl_en=>'Slovenian', |
| 963 | + sl_de=>'Slowenisch', |
| 964 | + el_en=>'Greek', |
| 965 | + el_de=>'Griechisch', |
| 966 | + sv_en=>'Swedish', |
| 967 | + sv_de=>'Schwedisch'); |
| 968 | + foreach(keys(%langs)) { |
| 969 | + $key=$_; |
| 970 | + $key=~m/(.*?)_(.*)/i; |
| 971 | + $lang=$1; |
| 972 | + #print "Lang: $lang\n"; |
| 973 | + $wordlang=$2; |
| 974 | + if($wordlang eq 'en') { |
| 975 | + $addwm=$self->{dbt}->prepare("insert into language(wikimedia_key) values(?)"); |
| 976 | + $addwm->execute($lang); |
| 977 | + } |
| 978 | + } |
| 979 | + foreach(keys(%langs)) { |
| 980 | + $key=$_; |
| 981 | + $key=~m/(.*?)_(.*)/i; |
| 982 | + $lang=$1; |
| 983 | + #print "Lang: $lang\n"; |
| 984 | + $wordlang=$2; |
| 985 | + $langword_u=$langs{$key}; |
| 986 | + $langword=encode("utf8",$langword_u); |
| 987 | + $newwm=$self->{dbt}->prepare("select language_id from language where wikimedia_key=?"); |
| 988 | + $newwm->execute($lang); |
| 989 | + my $row=$newwm->fetchrow_hashref(); |
| 990 | + $newwm->execute('en'); |
| 991 | + my $en_row=$newwm->fetchrow_hashref(); |
| 992 | + $newwm->execute('de'); |
| 993 | + my $de_row=$newwm->fetchrow_hashref(); |
| 994 | + $newword=$self->{dbt}->prepare("insert into language_names values (?,?,?)"); |
| 995 | + if($wordlang eq 'en') { |
| 996 | + $newword->execute($row->{language_id},$en_row->{language_id},$langword); |
| 997 | + } elsif($wordlang eq 'de') { |
| 998 | + $newword->execute($row->{language_id},$de_row->{language_id},$langword); |
| 999 | + } |
| 1000 | + } |
| 1001 | +} |
| 1002 | + |
| 1003 | +sub initRel { |
| 1004 | + my $self=shift; |
| 1005 | + my $cid=shift; |
| 1006 | + %rel_types=( |
| 1007 | + bt_en=>'broader terms', |
| 1008 | + bt_de=>'breitere Begriffe', |
| 1009 | + nt_en=>'narrower terms', |
| 1010 | + nt_de=>'engere Begriffe', |
| 1011 | + rt_en=>'related terms', |
| 1012 | + rt_de=>'verwandte Begriffe', |
| 1013 | + it_en=>'is part of theme', |
| 1014 | + it_de=>'ist Themenbestandteil von' |
| 1015 | + ); |
| 1016 | + |
| 1017 | + %rel_definitions=( |
| 1018 | + bt_en=>'Those terms in a thesaurus which are broader than others', |
| 1019 | + bt_de=>'Die Begriffe in einem Thesaurus, die breiter sind als andere', |
| 1020 | + nt_en=>'Those terms in a thesaurus which are narrower than others', |
| 1021 | + nt_de=>'Die Begriffe in einem Thesaurus, die enger sind als andere', |
| 1022 | + rt_en=>'Those terms in a thesaurus which are related to others', |
| 1023 | + rt_de=>'Die Begriffe in einem Thesaurus, die mit anderen verwandt sind', |
| 1024 | + it_en=>'Those terms in a thesaurus or dictionary which are associated with a topic', |
| 1025 | + it_de=>'Die Begriffe in einem Thesaurus oder Woerterbuch, die mit einem Thema assoziiert sind'); |
| 1026 | + |
| 1027 | + foreach(keys(%rel_types)) { |
| 1028 | + $key=$_; |
| 1029 | + $key=~m/(..)_(..)/i; |
| 1030 | + $ident=$1; |
| 1031 | + $lang=$2; |
| 1032 | + if($lang eq 'de') { |
| 1033 | + $en_key="$ident\_en"; |
| 1034 | + my %rv=$self->addExpression($rel_types{$en_key},$self->{la}{'en'},0,$cid,$ident); |
| 1035 | + $self->addMeaningText($rv{rid},$rv{mid},$rel_definitions{$en_key},0,$self->{la}{'en'}); |
| 1036 | + my %dv=$self->addExpression($rel_types{$key},$self->{la}{'de'},$rv{'mid'}); |
| 1037 | + $self->addMeaningText($dv{rid},$dv{mid},$rel_definitions{$key},$rv{'tcid'},$self->{la}{'de'}); |
| 1038 | + } |
| 1039 | + } |
| 1040 | +} |
| 1041 | + |
| 1042 | +sub loadLangs { |
| 1043 | + my $self=shift; |
| 1044 | + my %la; |
| 1045 | + $getlangs=$self->{dbt}->prepare('select language_id,wikimedia_key from language'); |
| 1046 | + $getlangs->execute(); |
| 1047 | + while($langrow=$getlangs->fetchrow_hashref()) { |
| 1048 | + $la{$langrow->{wikimedia_key}}=$langrow->{language_id}; |
| 1049 | + } |
| 1050 | + return %la; |
| 1051 | +} |
| 1052 | + |
| 1053 | +sub loadLangsIso { |
| 1054 | + my $self=shift; |
| 1055 | + my %la_iso; |
| 1056 | + $getlangs=$self->{dbt}->prepare('select language_id,iso639_2 from language'); |
| 1057 | + $getlangs->execute(); |
| 1058 | + while($langrow=$getlangs->fetchrow_hashref()) { |
| 1059 | + $la_iso{$langrow->{iso639_2}}=$langrow->{language_id}; |
| 1060 | + } |
| 1061 | + return %la_iso; |
| 1062 | +} |
| 1063 | + |
| 1064 | +return(1); |