r30036 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r30035‎ \| r30036 \| r30037 >
Date:	21:24, 21 January 2008
Author:	dale
Status:	old
Tags:
Comment:	updated scraping and database syncing maintenance tools
Modified paths:	/trunk/extensions/MetavidWiki/includes/MV_Index.php (modified) (history) /trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.inc.php (added) (history) /trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.php (modified) (history) /trunk/extensions/MetavidWiki/maintenance/ogg_thumb_insert.sh (modified) (history) /trunk/extensions/MetavidWiki/maintenance/scrape_and_insert.php (modified) (history)

Diff [purge]

Index: trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.inc.php
—	—	@@ -0,0 +1,604 @@
	2	+<?php
	3	+/*
	4	+ * metavid2mvWiki.inc.php Created on Jan 19, 2008
	5	+ *
	6	+ * All Metavid Wiki code is Released under the GPL2
	7	+ * for more info visit http://metavid.ucsc.edu/code
	8	+ *
	9	+ * @author Michael Dale
	10	+ * @email dale@ucsc.edu
	11	+ * @url http://metavid.ucsc.edu
	12	+ */
	13	+ function upTempalte_Ht_en($force = false) { // Creates the Template:Ht_en wiki page; when the page already exists nothing is written unless $force is true.
	14	+ $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Ht_en');
	15	+ if (!$wgTemplateTitle->exists() \|\| $force) { // only write when the template is missing or the caller forces a rewrite
	16	+ do_update_wiki_page($wgTemplateTitle, '<noinclude>
	17	+ This is the default Template for the display of transcript text.
	18	+ </noinclude><includeonly>{{ #if: {{{PersonName\|}}} \| {{ #ifexist: Image:{{{PersonName}}}.jpg \| [[Image:{{{PersonName}}}.jpg\|44px\|left]]\|[[Image:Missing person.jpg\|44px\|left]]}} \|}}{{ #if:{{{PersonName\|}}}\|[[{{{PersonName}}}]]: \|}}{{{BodyText}}}
	19	+ </includeonly>'); // template text: optional speaker image and name link (when PersonName is set), followed by the BodyText parameter
	20	+ }
	21	+}
	22	+function upTemplate_person($force = false) { // Builds the 'Congress Person' template text from $valid_attributes and saves it; skipped when the page exists unless $force is true.
	23	+ global $valid_attributes; // iterated below as $dbKey => array(name, description)
	24	+ $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Congress Person');
	25	+ if (!$wgTemplateTitle->exists() \|\| $force) {
	26	+ $wgTemplateArticle = new Article($wgTemplateTitle); // NOTE(review): not referenced again in this function
	27	+ $template_body = '<noinclude>Congress Person template simplifies
	28	+ the structure of articles about Congress People.
	29	+ <pre>{{Congress Person\|' . "\n";
	30	+ foreach ($valid_attributes as $dbKey => $attr) { // usage example inside <noinclude>: one "name=description" line per attribute
	31	+ list ($name, $desc) = $attr;
	32	+ $template_body .= $name . '=' . $desc . "\|\n";
	33	+ }
	34	+ $template_body .= '}}</pre>' .
	35	+ 'The order of the fields is not relevant. The template name (Congress Person) should be given as the \'\'first\'\' thing on a page.
	36	+ </noinclude>' .
	37	+ '<includeonly>' . "\n";
	38	+ //include the image if present:
	39	+ $template_body .= '{{ #if: { Image:{{PAGENAME}}.jpg}\| [[Image:{{PAGENAME}}.jpg]] \|}}' . "\n"; // NOTE(review): the single-brace "{ Image:...}" condition looks incomplete; confirm the intended wikitext
	40	+ foreach ($valid_attributes as $dbKey => $attr) { // <includeonly> section: one annotation per attribute, each wrapped in a #if on its template parameter
	41	+ list ($name, $desc) = $attr;
	42	+ //raw semantic data (@@todo make pretty template table thing)
	43	+ $template_body .= "{{ #if: {{{" . $name . "}}}\| [[$name:={{{" . $name . "}}}\| ]] \|}} \n";
	44	+ }
	45	+ $template_body .= '[[Category:Congress Person]] [[Category:Person]]
	46	+ </includeonly>';
	47	+ echo "updated 'Congress Person' template\n"; // printed before the save call on the next line runs
	48	+ do_update_wiki_page($wgTemplateTitle, $template_body);
	49	+ }
	50	+}
	51	+function do_people_insert() { // Imports all rows of `metavid`.`people`: writes one wiki page per person, then downloads and uploads each person's photo. Takes no arguments and returns nothing; dies when the table is empty.
	52	+ global $valid_attributes, $states_ary;
	53	+ $dbr = wfGetDB(DB_SLAVE);
	54	+
	55	+ //make sure the 'Congress Person' template page exists
	56	+ upTemplate_person();
	57	+ //do people query:
	58	+ $res = $dbr->query("SELECT * FROM `metavid`.`people`");
	59	+ if ($dbr->numRows($res) == 0)
	60	+ die('could not find people: ' . "\n");
	61	+ $person_ary = array ();
	62	+ while ($person = $dbr->fetchObject($res)) {
	63	+ $person_ary[] = $person;
	64	+ }
	65	+ foreach ($person_ary as $person) {
	66	+ $person_title = Title :: newFromUrl($person->name_clean);
	67	+ //semantic data via template:
	68	+ $page_body = '{{Congress Person\|' . "\n";
	69	+ foreach ($valid_attributes as $dbKey => $attr) {
	70	+ list ($name, $desc) = $attr;
	71	+ if (trim($person-> $dbKey) != '') {
	72	+ if ($dbKey == 'state') //swap the stored state value for its $states_ary entry
	73	+ $person->state = $states_ary[$person->state];
	74	+ $page_body .= "\|{$name}={$person->$dbKey}\| \n";
	75	+ }
	76	+ }
	77	+ //add in the full name attribute:
	78	+ $page_body .= "\|Full Name=" . $person->title . ' ' . $person->first .
	79	+ ' ' . $person->middle . ' ' . $person->last . "\| \n";
	80	+ $page_body .= '}}';
	81	+ //add basic info text after the template call:
	82	+ $full_name = $person->title . ' ' . $person->first .
	83	+ ' ' . $person->middle . ' ' . $person->last;
	84	+ if (trim($full_name) == '') //no name parts at all: fall back to name_clean
	85	+ $full_name = $person->name_clean;
	86	+
	87	+ $page_body .= "\n" .'Basic Person page For <b>' . $full_name . "</b><br>\n".
	88	+ "Text Spoken By [[Special:MediaSearch/person/{$person->name_clean}\|$full_name]] ";
	89	+ ;
	90	+ do_update_wiki_page($person_title, $page_body);
	91	+ }
	92	+ foreach ($person_ary as $person) {
	93	+ //download/upload all the photos:
	94	+ $imgTitle = Title :: makeTitle(NS_IMAGE, $person->name_clean . '.jpg');
	95	+ //if(!$imgTitle->exists()){
	96	+ global $wgTmpDirectory;
	97	+ $url = 'http://www.opensecrets.org/politicians/img/pix/' . $person->osid . '.jpg';
	98	+ //print $wgTmpDirectory . "\n";
	99	+ # Skip images that are already in the wiki (checked before any temp file is created, so a skip leaves nothing behind)
	100	+ $image = wfLocalFile($imgTitle);
	101	+ if ($image->exists()) {
	102	+ echo ($imgTitle->getDBkey() . " already in the wiki\n");
	103	+ continue;
	104	+ }
	105	+
	106	+ //copy the remote photo into a temp file, trying up to 10 times:
	107	+ $local_file = tempnam($wgTmpDirectory, 'WEBUPLOAD');
	108	+ $copied = false;
	109	+ for ($ct = 0; $ct < 10; $ct++) {
	110	+ if (@ copy($url, $local_file)) {
	111	+ print "copy success\n";
	112	+ $copied = true;
	113	+ break;
	114	+ }
	115	+ print ("failed to copy $url to local_file (tring again) \n");
	116	+ if ($ct == 9)
	117	+ print 'complete failure' . "\n";
	118	+ }
	119	+ if (!$copied) { @unlink($local_file); continue; } //never publish the empty temp file left behind by a failed download
	120	+ # Stash the file
	121	+ echo ("Saving " . $imgTitle->getDBkey() . "...");
	122	+ $image = wfLocalFile($imgTitle);
	123	+
	124	+ $archive = $image->publish($local_file);
	125	+ if (WikiError :: isError($archive)) {
	126	+ echo ("failed.\n");
	127	+ continue;
	128	+ }
	129	+ echo ("importing...");
	130	+ $comment = 'Image file for [[' . $person->name_clean . ']]';
	131	+ $license = '';
	132	+
	133	+ if ($image->recordUpload($archive, $comment, $license)) {
	134	+ # We're done!
	135	+ echo ("done.\n");
	136	+ } else {
	137	+ echo ("failed.\n");
	138	+ }
	139	+ //}
	140	+ }
	141	+}
	142	+//$i=0;
	143	+function do_stream_attr_check($old_stream) { // Copies the start time and duration from the old metavid stream row onto the MV stream of the same name, then saves it.
	144	+ global $i; // NOTE(review): only referenced by the commented-out debug lines in this function
	145	+ $mvStream = & mvGetMVStream(array (
	146	+ 'name' => $old_stream->name
	147	+ ));
	148	+ //print "doding stream attr check: ";
	149	+ //print_r($old_stream);
	150	+
	151	+ if ($mvStream->date_start_time != $old_stream->adj_start_time) {
	152	+ $mvStream->date_start_time = $old_stream->adj_start_time;
	153	+ }
	154	+ if ($mvStream->duration != ($old_stream->adj_end_time - $old_stream->adj_start_time)) { // duration is adj_end_time minus adj_start_time
	155	+ $mvStream->duration = ($old_stream->adj_end_time - $old_stream->adj_start_time);
	156	+ }
	157	+ $mvStream->updateStreamDB(); // called on every run, whether or not either field changed
	158	+ print "\nran stream db update: " .$mvStream->duration . ' ' . $mvStream->date_start_time."\n";
	159	+ //if($i==3)die;
	160	+ //$i++;
	161	+}
	162	+function do_stream_file_check(& $old_stream) { // Rebuilds the `mv_stream_files` rows of a stream from the old row's `trascoded` value ('low', 'high' or 'all'); does nothing when it is 'none'.
	163	+ global $mvgIP; // NOTE(review): not referenced in this function
	164	+ $mvStream = & mvGetMVStream(array (
	165	+ 'name' => $old_stream->name
	166	+ ));
	167	+ $file_list = $mvStream->getFileList(); // only referenced by the commented-out archive.org check at the end
	168	+
	169	+ if ($old_stream->trascoded != 'none') { // NOTE(review): for any value other than 'low'/'high'/'all', $set is never assigned but the DELETE below still runs
	170	+ //print "transcode is: " . $old_stream->trascoded;
	171	+ if ($old_stream->trascoded == 'low')
	172	+ $set = array (
	173	+ 'mv_ogg_low_quality'
	174	+ );
	175	+ if ($old_stream->trascoded == 'high')
	176	+ $set = array (
	177	+ 'mv_ogg_high_quality'
	178	+ );
	179	+ if ($old_stream->trascoded == 'all')
	180	+ $set = array (
	181	+ 'mv_ogg_high_quality',
	182	+ 'mv_ogg_low_quality'
	183	+ );
	184	+ //print "set: " . print_r($set);
	185	+ //remove old file pointers:
	186	+ $dbw = wfGetDB(DB_WRITE);
	187	+ $sql = "DELETE FROM `mv_stream_files` WHERE `stream_id`=".$mvStream->id; // drops every existing file row of this stream before re-inserting
	188	+ $dbw->query($sql);
	189	+ //update files:
	190	+ foreach ($set as $qf) { // one insert per quality key selected above
	191	+ do_insert_stream_file($mvStream, $old_stream, $qf);
	192	+ }
	193	+ }
	194	+ //check for archive.org stuff too..
	195	+ /*if($old_stream->archive_org!=''){
	196	+ $found=false;
	197	+ foreach($file_list as $file){
	198	+ if($file->path_type =='ext_archive_org'){
	199	+ $found=true;
	200	+ }
	201	+ }
	202	+ if(!$found)do_insert_stream_file($mvStream, $old_stream, 'mv_archive_org_link');
	203	+ }*/
	204	+}
	205	+function do_insert_stream_file($mvStream, $old_stream, $quality_msg) { // Inserts one `mv_stream_files` row for the given quality key; returns '' without inserting for any other key.
	206	+ global $mvVideoArchivePaths; // indexed below by $old_stream->archive_server to get the path prefix
	207	+ $dbw = wfGetDB(DB_WRITE);
	208	+ if ($quality_msg == 'mv_ogg_low_quality') {
	209	+ $path = $mvVideoArchivePaths[$old_stream->archive_server] . $mvStream->name. '.ogg';
	210	+ } else if ($quality_msg == 'mv_ogg_high_quality') {
	211	+ $path = $mvVideoArchivePaths[$old_stream->archive_server] .$mvStream->name.'.HQ.ogg';
	212	+ }else{
	213	+ return ''; // unknown quality key: nothing is inserted
	214	+ }
	215	+ //get file duration from nfo file (if avaliable ):
	216	+ $nfo_url = $path . '.nfo';
	217	+ $nfo_txt = file($nfo_url); // array of lines, or false when the file cannot be read
	218	+ if($nfo_txt){
	219	+ if( isset($nfo_txt[0])){
	220	+ list($na, $len) = explode('n:', $nfo_txt[0]); // takes the piece after the first 'n:' on line 1; NOTE(review): $len is unset when the line contains no 'n:' — confirm the nfo format
	221	+ $len = trim($len);
	222	+ //trim leading zero
	223	+ if($len[0]=='0')$len=substr($len,1);
	224	+ //trim sub frame times:
	225	+ if(strpos($len, '.')!==false){
	226	+ $len = substr($len, 0, strpos($len, '.'));
	227	+ }
	228	+ $dur=ntp2seconds($len);
	229	+ }else{
	230	+ echo "empty nfo file: $nfo_url \n";
	231	+ $dur=0;
	232	+ }
	233	+ }else{
	234	+ echo "missing nfo file: $nfo_url \n"; // also reached when file() returns an empty array
	235	+ $dur=0;
	236	+ }
	237	+
	238	+ $sql = "INSERT INTO `mv_stream_files` (`stream_id`, `file_desc_msg`, `path`, `duration`)" .
	239	+ " VALUES ('{$mvStream->id}', '{$quality_msg}', " ." '{$path}', {$dur} )"; // NOTE(review): values are interpolated into the SQL string without escaping
	240	+ $dbw->query($sql);
	241	+}
	242	+//@@todo convert to MV_EditStream
	243	+function do_add_stream(& $mvTitle, & $stream) { // Registers a new 'metavid_file' stream via MV_SpecialCRUDStream, named after $mvTitle and described by mv_semantic_stream_desc().
	244	+ $MV_SpecialAddStream = new MV_SpecialCRUDStream('add');
	245	+ $MV_SpecialAddStream->stream_name = $mvTitle->getStreamName();
	246	+ $MV_SpecialAddStream->stream_type = 'metavid_file';
	247	+ $MV_SpecialAddStream->stream_desc = mv_semantic_stream_desc($mvTitle, $stream); // generated wiki text for the stream page
	248	+ //add the stream:
	249	+ $MV_SpecialAddStream->add_stream();
	250	+}
	251	+function do_stream_insert($mode, $stream_name = '') { // Syncs old metavid streams into the wiki. $mode 'all' selects in_sync streams, 'files' selects transcoded ones, anything else selects by $stream_name; dies when no row matches.
	252	+ global $mvgIP, $MVStreams, $options; // $options keys 'noimage' and 'skiptext' are checked below
	253	+ $dbr = wfGetDB(DB_SLAVE);
	254	+ if ($mode == 'all'){
	255	+ $sql = "SELECT * FROM `metavid`.`streams` WHERE `sync_status`='in_sync'";
	256	+ }else if($mode=='files') {
	257	+ $sql = "SELECT * FROM `metavid`.`streams` WHERE `trascoded` != 'none'";
	258	+ }else{
	259	+ $sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE '{$stream_name}'"; // NOTE(review): $stream_name is interpolated unescaped; LIKE wildcards in it are honoured
	260	+ }
	261	+ $res = $dbr->query($sql);
	262	+ if ($dbr->numRows($res) == 0)
	263	+ die('could not find stream: ' . $stream_name . "\n");
	264	+ //load all stream names:
	265	+ while ($row = $dbr->fetchObject($res)) {
	266	+ $streams[] = $row; // $streams is created here; the die() above guarantees at least one row
	267	+ }
	268	+ print "working on " . count($streams) . ' streams'."\n";
	269	+ foreach ($streams as $stream) {
	270	+ //init the stream
	271	+ $MVStreams[$stream->name] = new MV_Stream($stream); // cached globally; do_proccess_images() reads it back by stream name
	272	+ //check if the stream has already been added to the wiki (if not add it)
	273	+ $mvTitle = new MV_Title('MvStream:' . $stream->name);
	274	+ if (!$mvTitle->doesStreamExist()) {
	275	+ //print 'do stream desc'."\n";
	276	+ do_add_stream($mvTitle, $stream);
	277	+ echo "stream " . $mvTitle->getStreamName() . " added \n";
	278	+ } else {
	279	+ do_update_wiki_page($stream->name, mv_semantic_stream_desc($mvTitle, $stream), MV_NS_STREAM); // existing stream: refresh its description page
	280	+ //$updated = ' updated' echo "stream " . $mvTitle->getStreamName() . " already present $updated\n";
	281	+ }
	282	+ //add duration and start_time attr
	283	+ do_stream_attr_check($stream);
	284	+
	285	+ //do insert/copy all media images
	286	+ if(!isset($options['noimage'])){
	287	+ do_proccess_images($stream);
	288	+ }
	289	+
	290	+ //check for files (make sure they match with metavid db values
	291	+ do_stream_file_check($stream);
	292	+
	293	+ if(!isset($options['skiptext'])){
	294	+ //proccess all stream text:
	295	+ do_proccess_text($stream);
	296	+ }
	297	+ }
	298	+}
	299	+function do_proccess_text($stream){ // Turns the stream's timed caption rows into one 'Ht_en:<stream>/<start>/<end>' page per row in MV_NS_MVD, prefixed with a [[Spoken By::...]] annotation when a nearby speaker is found. $stream is an old metavid stream row; returns nothing.
	300	+ $dbr = wfGetDB(DB_SLAVE);
	301	+ /* for now use the stream search table (in the future should put in our orphaned person data)
	302	+ * should be able to do quick checks against the index. */
	303	+ $sql = "SELECT (`time`+" . CC_OFFSET . ") as time, `value` " .
	304	+ "FROM `metavid`.`stream_attr_time_text`
	305	+ WHERE `stream_fk`=" . $stream->id . "
	306	+ AND `time` >= " . $stream->adj_start_time . "
	307	+ AND `time` <= " . $stream->adj_end_time . "
	308	+ ORDER BY `time` ASC ";
	309	+
	310	+ //$sql = "SELECT * FROM `metavid`.`stream_search` WHERE `stream_fk`={$stream->id}";
	311	+ $page_res = $dbr->query($sql);
	312	+ if ($dbr->numRows($page_res) == 0)
	313	+ echo 'No pages for stream' . $stream->name . "\n";
	314	+ $pages = array ();
	315	+ while ($page = $dbr->fetchObject($page_res)) {
	316	+ $pages[] = $page;
	317	+ }
	318	+ print "Checking ".count($pages) . " text pages\n";
	319	+ $i=$j=0;
	320	+ foreach ($pages as $inx => $page) {
	321	+ //status updates (one progress line every 50 pages):
	322	+ if($i==50){
	323	+ print "on $j of ". count($pages) . "\n";
	324	+ $i=0;
	325	+ }
	326	+ $i++;
	327	+ $j++;
	328	+ $start_time = $page->time - $stream->adj_start_time; //offset in seconds from the adjusted stream start
	329	+ if ($start_time < 0) //clamp the numeric offset itself so $start_time stays a number for the arithmetic and seconds2ntp() calls below
	330	+ $start_time = 0;
	331	+ if (($inx +1) == count($pages)) { //last caption: runs to the end of the stream
	332	+ $end_time = $stream->adj_end_time - $stream->adj_start_time;
	333	+ } else { //otherwise it ends where the next caption starts
	334	+ $end_time = $pages[$inx +1]->time - $stream->adj_start_time;
	335	+ }
	336	+ if (($end_time - $start_time) > 40) //cap every segment at 40 seconds
	337	+ $end_time = $start_time +40;
	338	+ //skip if end_time is negative
	339	+ if ($end_time < 0)
	340	+ continue;
	341	+ //now pull up the person for the given stream time:`metavid`.`people`.`name_clean`
	342	+ $sql = "SELECT * , abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} ) AS `distance` " .
	343	+ "FROM `metavid`.`people_attr_stream_time` " .
	344	+ "LEFT JOIN `metavid`.`people` ON `metavid`.`people_attr_stream_time`.`people_fk` = `metavid`.`people`.`id` " .
	345	+ "WHERE `metavid`.`people_attr_stream_time`.`stream_fk` ={$stream->id} " .
	346	+ //have a negative threshold of 4 seconds
	347	+ "AND (`metavid`.`people_attr_stream_time`.`time`-{$page->time})>-4 " .
	348	+ //have a total distance threshold of 90 seconds
	349	+ "AND abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} )< 90 " .
	350	+ "ORDER BY `distance` ASC " .
	351	+ "LIMIT 1 ";
	352	+ $person_res = $dbr->query($sql);
	353	+
	354	+ $page_title = $stream->name . '/' . seconds2ntp($start_time) . '/' . seconds2ntp($end_time);
	355	+ //print $page_title . "\n";
	356	+ $page_body = '';
	357	+ if ($dbr->numRows($person_res) != 0) {
	358	+ $person = $dbr->fetchObject($person_res);
	359	+ $person_name = utf8_encode($person->name_clean); // NOTE(review): do_update_wiki_page() runs utf8_encode() over the whole text again; confirm non-ASCII names are not double-encoded
	360	+ $page_body .= "\n[[Spoken By::{$person_name}]] ";
	361	+ }
	362	+ $page_body .= trim(str_replace("\n", ' ', strtolower($page->value))); //caption text lower-cased and flattened onto one line
	363	+
	364	+ //print $page_title . "\n";
	365	+ //die;
	366	+ //print $page_body . "\n\n";
	367	+ do_update_wiki_page('Ht_en:' . $page_title, $page_body, MV_NS_MVD);
	368	+ }
	369	+}
	370	+/*
	371	+ * for each image add it to the image directory
	372	+ */
	373	+function do_proccess_images($stream) { // For every `metavid`.`image_archive` row of the stream: records the frame in `mv_stream_images` if it is not there yet and copies the jpg into the stream's local image directory.
	374	+ global $mvLocalImgLoc, $MVStreams, $wgDBname; // NOTE(review): $mvLocalImgLoc is not referenced in this function
	375	+ $dbr =& wfGetDB(DB_SLAVE);
	376	+ $dbw =& wfGetDB(DB_MASTER);
	377	+
	378	+ //get all images for the current stream:
	379	+ $sql = "SELECT * FROM `metavid`.`image_archive`
	380	+ WHERE `stream_fk`= {$stream->id}";
	381	+ $image_res = $dbr->query($sql);
	382	+ $img_count = $dbr->numRows($image_res);
	383	+ print "Found " . $img_count . " images for stream " . $stream->name . "\n";
	384	+ //grab from metavid and copy to local directory structure:
	385	+ $i=$j= 0;
	386	+ while ($row = $dbr->fetchObject($image_res)) {
	387	+ $relative_time = $row->time - $stream->adj_start_time; // image time as an offset from the adjusted stream start
	388	+ //status updates:
	389	+ if ($i == 10) { // one progress line every 10 images
	390	+ print "On image $j of $img_count time: " . seconds2ntp($relative_time) . "\n";
	391	+ $i = 0;
	392	+ }
	393	+ $j++;
	394	+ $i++;
	395	+ //get streamImage obj:
	396	+ $mv_stream_id = $MVStreams[$stream->name]->getStreamId(); // expects the caller to have stored an MV_Stream under this name in $MVStreams
	397	+ $local_img_dir = MV_StreamImage :: getLocalImageDir($mv_stream_id);
	398	+ $metavid_img_url = 'http://metavid.ucsc.edu/image_media/' . $row->id . '.jpg';
	399	+
	400	+ $local_img_file = $local_img_dir . '/' . $relative_time . '.jpg'; // local file is named after the relative time
	401	+ //check if the image already exist in the new table
	402	+ $sql = "SELECT * FROM `$wgDBname`.`mv_stream_images` " .
	403	+ "WHERE `stream_id`={$mv_stream_id} " .
	404	+ "AND `time`=$relative_time";
	405	+ $img_check = $dbr->query($sql);
	406	+ $doInsert = true;
	407	+ if ($dbr->numRows($img_check) != 0) {
	408	+ //make sure its there:
	409	+ if (is_file($local_img_file)) {
	410	+ //print "skiped stream_id:" . $mv_stream_id . " time: " . $relative_time . "\n";
	411	+ continue; // row and file both present: nothing to do
	412	+ } else {
	413	+ //grab but don't insert:
	414	+ $doInsert = false; // row exists but the file is missing: download only
	415	+ }
	416	+ }
	417	+ if ($doInsert) {
	418	+ //insert:
	419	+ $dbw->insert('mv_stream_images', array (
	420	+ 'stream_id' => $MVStreams[$stream->name]->getStreamId(), 'time' => $relative_time));
	421	+ $img_id = $dbw->insertId(); // NOTE(review): $img_id is not used afterwards
	422	+ //$grab = exec('cd ' . $img_path . '; wget ' . $im_url);
	423	+ }
	424	+
	425	+ if (is_file($local_img_file)) { // file already on disk (only possible here after a fresh insert): skip the download
	426	+ echo "skipped $local_img_file \n";
	427	+ continue;
	428	+ }
	429	+ if (!copy($metavid_img_url, $local_img_file)) {
	430	+ echo "failed to copy $metavid_img_url to $local_img_file...\n";
	431	+ } else {
	432	+ //all good don't report anything'
	433	+ }
	434	+ }
	435	+}
	436	+
	437	+function do_update_wiki_page($wgTitle, $wikiText, $ns = null, $forceUpdate=false) { // Saves $wikiText to a page with the summary 'metavid bot insert'. $wgTitle is a Title object, or a title string resolved in namespace $ns. $forceUpdate overrides the three skip rules applied to existing pages.
	438	+ global $botUserName; // compared below against the user of the page's latest revision
	439	+ if (!is_object($wgTitle)) {
	440	+ $wgTitle = Title :: makeTitle($ns, $wgTitle);
	441	+ }
	442	+ //make sure the text is utf8 encoded:
	443	+ $wikiText = utf8_encode($wikiText); // NOTE(review): utf8_encode() treats its input as ISO-8859-1; text that is already UTF-8 would be re-encoded — confirm callers
	444	+
	445	+ $wgArticle = new Article($wgTitle);
	446	+ if(!mvDoMvPage($wgTitle, $wgArticle, false)){ // rejected title: clean up any existing page and index entry, then return without editing
	447	+ print "bad title: ".$wgTitle->getDBkey()." no edit";
	448	+ if($wgTitle->exists()){
	449	+ print "remove article";
	450	+ $wgArticle->doDeleteArticle( 'bad title' );
	451	+ }
	452	+ //some how mvdIndex and mvd pages got out of sync do a seperate check for the mvd:
	453	+ if(MV_Index::getMVDbyTitle($wgArticle->mTitle->getDBkey())!=null){
	454	+ print ', rm mvd';
	455	+ MV_Index::remove_by_wiki_title($wgArticle->mTitle->getDBkey());
	456	+ }
	457	+ print "\n";
	458	+ return ;
	459	+ }
	460	+ if ($wgTitle->exists()) {
	461	+ //if last edit!=mvBot skip (don't overwite peoples improvments')
	462	+ $rev = & Revision::newFromTitle($wgTitle);
	463	+ if( $botUserName!= $rev->getRawUserText()){
	464	+ print ' skiped page edited by user:'.$rev->getRawUserText()."\n"; // printed even when $forceUpdate lets the edit go ahead
	465	+ if(!$forceUpdate)return ;
	466	+ }
	467	+ //proc article:
	468	+ $cur_text = $wgArticle->getContent();
	469	+ //if its a redirect skip
	470	+ if(substr($cur_text, 0, strlen('#REDIRECT') )=='#REDIRECT'){ // case-sensitive prefix match on the current page text
	471	+ print ' skiped page moved by user:'.$rev->getRawUserText()."\n";
	472	+ if(!$forceUpdate)return ;
	473	+ }
	474	+ //check if text is identical:
	475	+ if (trim($cur_text) == trim($wikiText)) {
	476	+ if(!$forceUpdate)return ;
	477	+ }
	478	+ }
	479	+ //got here do the edit:
	480	+ $sum_txt = 'metavid bot insert';
	481	+ $wgArticle->doEdit($wikiText, $sum_txt);
	482	+ print "did edit on " . $wgTitle->getDBkey() . "\n";
	483	+ //die;
	484	+}
	485	+//given a stream name it pulls all metavid stream data and builds semantic wiki page
	486	+function mv_semantic_stream_desc(& $mvTitle, & $stream) { // Builds and returns the wiki text of a stream page: official-record links, media links, the stream's attribute annotations, duration/original date, and a sync-status category. $mvTitle supplies the stream name; $stream is the old metavid stream row.
	487	+ global $start_time, $end_time; // filled in by the mv_proccess_attr() calls below when the stream has adj_start_time / adj_end_time attributes
	488	+ /*$sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE '" . $mvTitle->getStreamName() . "'";
	489	+ $dbr = wfGetDB(DB_SLAVE);
	490	+ $res = $dbr->query($sql);
	491	+ //echo "\n" . $sql . "\n";
	492	+ $stream = $dbr->fetchObject($res);*/
	493	+ $stream_id = $stream->id;
	494	+ $out = '';
	495	+ $pout = mv_proccess_attr('stream_attr_varchar', $stream_id);
	496	+ $pout .= mv_proccess_attr('stream_attr_int', $stream_id);
	497	+ //add links/generic text at the start
	498	+ $out .= '==Official Record==' . "\n";
	499	+ $date = date('Ymd', $start_time);
	500	+ $cspan_date = date('Y-m-d', $start_time);
	501	+ $ch_type = ''; //chamber code taken from the stream name: 'h' for house, 's' for senate (senate wins when both match)
	502	+ if (strpos($mvTitle->getStreamName(), 'house') !== false)
	503	+ $ch_type = 'h';
	504	+ if (strpos($mvTitle->getStreamName(), 'senate') !== false)
	505	+ $ch_type = 's';
	506	+ if ($ch_type != '') {
	507	+ $out .= '*[[GovTrack]] Congressional Record' .
	508	+ '[http://www.govtrack.us/congress/recordindex.xpd?date=' . $date .
	509	+ '&where=' . $ch_type .
	510	+ ']' . "\n\n";
	511	+ $out .= '*[[THOMAS]] Congressional Record ' .
	512	+ '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .
	513	+ ']' . "\n\n";
	514	+ $out .= '*[[THOMAS]] Extension of Remarks ' . // NOTE(review): the URL below is identical to the Congressional Record link above; confirm the intended query
	515	+ '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .
	516	+ ']' . "\n\n";
	517	+ }
	518	+ if ($stream->archive_org != '') {
	519	+ $out .= '==More Media Sources=='."\n";
	520	+ $out .= '*[[Archive.org]] hosted original copy ' .
	521	+ '[http://www.archive.org/details/mv_' . $stream->name . ']' . "\n";
	522	+ }
	523	+ //all streams get the C-SPAN Congressional Chronicle link:
	524	+ $out .= '*[[CSPAN]]\'s Congressional Chronicle ' .
	525	+ '[http://www.c-spanarchives.org/congress/?q=node/69850&date=' . $cspan_date . '&hors=' . $ch_type . ']';
	526	+ $out .= "\n\n";
	527	+ $out .= $pout;
	528	+ $out .= '[[stream_duration:=' . ($end_time - $start_time) . '\| ]]' . "\n";
	529	+ if($stream->org_start_time){
	530	+ $out .= '[[original_date:='.$stream->org_start_time.'\| ]]';
	531	+ }
	532	+
	533	+ //add stream category (based on sync status)
	534	+ switch($stream->sync_status){
	535	+ case 'not_checked':
	536	+ $out.="\n\n".'[[Category:Stream Unchecked]]';
	537	+ break;
	538	+ case 'impossible':
	539	+ $out.="\n\n".'[[Category:Stream Out of Sync]]';
	540	+ break;
	541	+ case 'in_sync':
	542	+ $out.="\n\n".'[[Category:Stream Basic Sync]]';
	543	+ //other options [stream high quality sync ];
	544	+ break;
	545	+ }
	546	+
	547	+ return $out;
	548	+}
	549	+function mv_proccess_attr($table, $stream_id) { // Reads the stream's rows from `metavid`.`$table` and returns one "[[type:=value| ]]" line for each row whose type getTypeTitle() recognises.
	550	+ global $start_time, $end_time; // side effect: set from the adj_start_time / adj_end_time rows when present
	551	+ $dbr = wfGetDB(DB_SLAVE);
	552	+ $sql = "SELECT * FROM `metavid`.`$table` WHERE `stream_fk`=$stream_id";
	553	+ $res = $dbr->query($sql);
	554	+ $out = '';
	555	+ while ($var = $dbr->fetchObject($res)) {
	556	+ $type_title = getTypeTitle($var->type); // array(kind, title) for known types, '' otherwise
	557	+ if ($var->type == 'adj_start_time')
	558	+ $start_time = $var->value;
	559	+ if ($var->type == 'adj_end_time')
	560	+ $end_time = $var->value;
	561	+ if ($type_title != '') {
	562	+ $reltype = ($type_title[0] == 'rel') ? '::' : ':='; // NOTE(review): computed but never used; the next line always emits ':=' — confirm whether '::' was intended for 'rel' types
	563	+ $out .= '[[' . $var->type . ':=' . $var->value . '\| ]]' . "\n";
	564	+ }
	565	+ }
	566	+ return $out;
	567	+}
	568	+function getTypeTitle($type) { // Maps a stream attribute type key to array(kind, display title), where kind is 'rel' or 'atr'; returns '' for any other key.
	569	+ switch ($type) {
	570	+ case 'cspan_type' :
	571	+ return array (
	572	+ 'rel',
	573	+ 'Government Event'
	574	+ );
	575	+ break; // unreachable after the return above (same for every break in this switch)
	576	+ case 'cspan_title' :
	577	+ return array (
	578	+ 'atr',
	579	+ 'C-SPAN Title'
	580	+ );
	581	+ break;
	582	+ case 'cspan_desc' :
	583	+ return array (
	584	+ 'atr',
	585	+ 'C-SPAN Description'
	586	+ );
	587	+ break;
	588	+ case 'adj_start_time' :
	589	+ return array (
	590	+ 'atr',
	591	+ 'Unix Start Time'
	592	+ );
	593	+ break;
	594	+ case 'adj_end_time' :
	595	+ return array (
	596	+ 'atr',
	597	+ 'Unix End Time'
	598	+ );
	599	+ break;
	600	+ default :
	601	+ return ''; // unknown type: callers test the result against ''
	602	+ break;
	603	+ }
	604	+}
	605	+?>
Index: trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.php
—	—	@@ -17,6 +17,7 @@
18	18	$cur_path = $IP = dirname(__FILE__);
19	19	//include commandLine.inc from the mediaWiki maintance dir:
20	20	require_once ('../../../maintenance/commandLine.inc');
	21	+require_once ('metavid2mvWiki.inc.php');
21	22	/*
22	23	* assume the wiki user has access to the metavid table and that the
23	24	* metavid table is titled `metavid`
—	—	@@ -236,597 +237,6 @@
237	238	do_stream_insert('stream', $args[0]);
238	239	break;
239	240	}
240		~~-function upTempalte_Ht_en($force = false) {~~
241		~~- $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Ht_en');~~
242		~~- if (!$wgTemplateTitle->exists() \|\| $force) {~~
243		~~- do_update_wiki_page($wgTemplateTitle, '<noinclude>~~
244		~~- This is the default Template for the display of transcript text.~~
245		- </noinclude><includeonly>{{ #if: {{{PersonName\|}}} \| {{ #ifexist: Image:{{{PersonName}}}.jpg \| [[Image:{{{PersonName}}}.jpg\|44px\|left]]\|[[Image:Missing person.jpg\|44px\|left]]}} \|}}{{ #if:{{{PersonName\|}}}\|[[{{{PersonName}}}]]: \|}}{{{BodyText}}}
246		~~- </includeonly>');~~
247		~~- }~~
248		-}
249		~~-function upTemplate_person($force = false) {~~
250		~~- global $valid_attributes;~~
251		~~- $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Congress Person');~~
252		~~- if (!$wgTemplateTitle->exists() \|\| $force) {~~
253		~~- $wgTemplateArticle = new Article($wgTemplateTitle);~~
254		~~- $template_body = '<noinclude>Congress Person template simplifies~~
255		~~- the structure of articles about Congress People.~~
256		~~- <pre>{{Congress Person\|' . "\n";~~
257		~~- foreach ($valid_attributes as $dbKey => $attr) {~~
258		~~- list ($name, $desc) = $attr;~~
259		~~- $template_body .= $name . '=' . $desc . "\|\n";~~
260		~~- }~~
261		~~- $template_body .= '}}</pre>' .~~
262		~~- 'The order of the fields is not relevant. The template name (Congress Person) should be given as the \'\'first\'\' thing on a page.~~
263		~~- </noinclude>' .~~
264		~~- '<includeonly>' . "\n";~~
265		~~- //include the image if present:~~
266		~~- $template_body .= '{{ #if: { Image:{{PAGENAME}}.jpg}\| [[Image:{{PAGENAME}}.jpg]] \|}}' . "\n";~~
267		~~- foreach ($valid_attributes as $dbKey => $attr) {~~
268		~~- list ($name, $desc) = $attr;~~
269		~~- //raw semantic data (@@todo make pretty template table thing)~~
270		~~- $template_body .= "{{ #if: {{{" . $name . "}}}\| [[$name:={{{" . $name . "}}}\| ]] \|}} \n";~~
271		~~- }~~
272		~~- $template_body .= '[[Category:Congress Person]] [[Category:Person]]~~
273		~~- </includeonly>';~~
274		~~- echo "updated 'Congress Person' template\n";~~
275		~~- do_update_wiki_page($wgTemplateTitle, $template_body);~~
276		~~- }~~
277		-}
278		~~-function do_people_insert() {~~
279		~~- global $valid_attributes, $states_ary;~~
280		~~- $dbr = wfGetDB(DB_SLAVE);~~
281	241
282		~~- //check person~~
283		~~- upTemplate_person();~~
284		~~- //do people query:~~
285		~~- $res = $dbr->query("SELECT * FROM `metavid`.`people`");~~
286		~~- if ($dbr->numRows($res) == 0)~~
287		~~- die('could not find people: ' . "\n");~~
288		~~- $person_ary = array ();~~
289		~~- while ($person = $dbr->fetchObject($res)) {~~
290		~~- $person_ary[] = $person;~~
291		~~- }~~
292		~~- foreach ($person_ary as $person) {~~
293		~~- $person_title = Title :: newFromUrl($person->name_clean);~~
294		~~- //semantic data via template:~~
295		~~- $page_body = '{{Congress Person\|' . "\n";~~
296		~~- foreach ($valid_attributes as $dbKey => $attr) {~~
297		~~- list ($name, $desc) = $attr;~~
298		~~- if (trim($person-> $dbKey) != '') {~~
299		~~- if ($dbKey == 'state')~~
300		~~- $person->state = $states_ary[$person->state];~~
301		~~- $page_body .= "\|{$name}={$person->$dbKey}\| \n";~~
302		~~- }~~
303		~~- }~~
304		~~- //add in the full name attribute:~~
305		~~- $page_body .= "\|Full Name=" . $person->title . ' ' . $person->first .~~
306		~~- ' ' . $person->middle . ' ' . $person->last . "\| \n";~~
307		~~- $page_body .= '}}';~~
308		~~- //add in basic info to be overwitten by tranclude (from~~
309		~~- $full_name = $person->title . ' ' . $person->first .~~
310		~~- ' ' . $person->middle . ' ' . $person->last;~~
311		~~- if (trim($full_name) == '')~~
312		~~- $full_name = $person->name_clean;~~
313		-
314		~~- $page_body .= "\n" .'Basic Person page For <b>' . $full_name . "</b><br>\n".~~
315		~~- "Text Spoken By [[Special:MediaSearch/person/{$person->name_clean}\|$full_name]] ";~~
316		~~- ;~~
317		~~- do_update_wiki_page($person_title, $page_body);~~
318		~~- }~~
319		~~- foreach ($person_ary as $person) {~~
320		~~- //download/upload all the photos:~~
321		~~- $imgTitle = Title :: makeTitle(NS_IMAGE, $person->name_clean . '.jpg');~~
322		~~- //if(!$imgTitle->exists()){~~
323		~~- global $wgTmpDirectory;~~
324		~~- $url = 'http://www.opensecrets.org/politicians/img/pix/' . $person->osid . '.jpg';~~
325		~~- //print $wgTmpDirectory . "\n";~~
326		~~- $local_file = tempnam($wgTmpDirectory, 'WEBUPLOAD');~~
327		~~- //copy file:~~
328		-
329		~~- # Check if already there existence~~
330		~~- $image = wfLocalFile($imgTitle);~~
331		~~- if ($image->exists()) {~~
332		~~- echo ($imgTitle->getDBkey() . " already in the wiki\n");~~
333		~~- continue;~~
334		~~- }~~
335		-
336		~~- for ($ct = 0; $ct < 10; $ct++) {~~
337		~~- if (!@ copy($url, $local_file)) {~~
338		~~- print ("failed to copy $url to local_file (tring again) \n");~~
339		~~- } else {~~
340		~~- print "copy success\n";~~
341		~~- $ct = 10;~~
342		~~- }~~
343		~~- if ($ct == 9)~~
344		~~- print 'complete failure' . "\n";~~
345		~~- }~~
346		-
347		~~- # Stash the file~~
348		~~- echo ("Saving " . $imgTitle->getDBkey() . "...");~~
349		~~- $image = wfLocalFile($imgTitle);~~
350		-
351		~~- $archive = $image->publish($local_file);~~
352		~~- if (WikiError :: isError($archive)) {~~
353		~~- echo ("failed.\n");~~
354		~~- continue;~~
355		~~- }~~
356		~~- echo ("importing...");~~
357		~~- $comment = 'Image file for [[' . $person->name_clean . ']]';~~
358		~~- $license = '';~~
359		-
360		~~- if ($image->recordUpload($archive, $comment, $license)) {~~
361		~~- # We're done!~~
362		~~- echo ("done.\n");~~
363		~~- } else {~~
364		~~- echo ("failed.\n");~~
365		~~- }~~
366		~~- //}~~
367		~~- }~~
368		-}
369		~~-//$i=0;~~
370		~~-function do_stream_attr_check($old_stream) {~~
371		~~- global $i;~~
372		~~- $mvStream = & mvGetMVStream(array (~~
373		~~- 'name' => $old_stream->name~~
374		~~- ));~~
375		~~- //print "doding stream attr check: ";~~
376		~~- //print_r($old_stream);~~
377		-
378		~~- if ($mvStream->date_start_time != $old_stream->adj_start_time) {~~
379		~~- $mvStream->date_start_time = $old_stream->adj_start_time;~~
380		~~- }~~
381		~~- if ($mvStream->duration != ($old_stream->adj_end_time - $old_stream->adj_start_time)) {~~
382		~~- $mvStream->duration = ($old_stream->adj_end_time - $old_stream->adj_start_time);~~
383		~~- }~~
384		~~- $mvStream->updateStreamDB();~~
385		~~- print "\nran stream db update: " .$mvStream->duration . ' ' . $mvStream->date_start_time."\n";~~
386		~~- //if($i==3)die;~~
387		~~- //$i++;~~
388		-}
389		~~-function do_stream_file_check(& $old_stream) {~~
390		~~- global $mvgIP;~~
391		~~- $mvStream = & mvGetMVStream(array (~~
392		~~- 'name' => $old_stream->name~~
393		~~- ));~~
394		~~- $file_list = $mvStream->getFileList();~~
395		-
396		~~- if ($old_stream->trascoded != 'none') {~~
397		~~- //print "transcode is: " . $old_stream->trascoded;~~
398		~~- if ($old_stream->trascoded == 'low')~~
399		~~- $set = array (~~
400		~~- 'mv_ogg_low_quality'~~
401		~~- );~~
402		~~- if ($old_stream->trascoded == 'high')~~
403		~~- $set = array (~~
404		~~- 'mv_ogg_high_quality'~~
405		~~- );~~
406		~~- if ($old_stream->trascoded == 'all')~~
407		~~- $set = array (~~
408		~~- 'mv_ogg_high_quality',~~
409		~~- 'mv_ogg_low_quality'~~
410		~~- );~~
411		~~- //print "set: " . print_r($set);~~
412		~~- //remove old file pointers:~~
413		~~- $dbw = wfGetDB(DB_WRITE);~~
414		~~- $sql = "DELETE FROM `mv_stream_files` WHERE `stream_id`=".$mvStream->id;~~
415		~~- $dbw->query($sql);~~
416		~~- //update files:~~
417		~~- foreach ($set as $qf) {~~
418		~~- do_insert_stream_file($mvStream, $old_stream, $qf);~~
419		~~- }~~
420		~~- }~~
421		~~- //check for archive.org stuff too..~~
422		~~- /*if($old_stream->archive_org!=''){~~
423		~~- $found=false;~~
424		~~- foreach($file_list as $file){~~
425		~~- if($file->path_type =='ext_archive_org'){~~
426		~~- $found=true;~~
427		~~- }~~
428		~~- }~~
429		~~- if(!$found)do_insert_stream_file($mvStream, $old_stream, 'mv_archive_org_link');~~
430		~~- }*/~~
431		-}
432		~~-function do_insert_stream_file($mvStream, $old_stream, $quality_msg) {~~
433		~~- global $mvVideoArchivePaths;~~
434		~~- $dbw = wfGetDB(DB_WRITE);~~
435		~~- if ($quality_msg == 'mv_ogg_low_quality') {~~
436		~~- $path = $mvVideoArchivePaths[$old_stream->archive_server] . $mvStream->name. '.ogg';~~
437		~~- } else if ($quality_msg == 'mv_ogg_high_quality') {~~
438		~~- $path = $mvVideoArchivePaths[$old_stream->archive_server] .$mvStream->name.'.HQ.ogg';~~
439		~~- }else{~~
440		~~- return '';~~
441		~~- }~~
442		~~- //get file duration from nfo file (if avaliable ):~~
443		~~- $nfo_url = $path . '.nfo';~~
444		~~- $nfo_txt = file($nfo_url);~~
445		~~- if($nfo_txt){~~
446		~~- if( isset($nfo_txt[0])){~~
447		~~- list($na, $len) = explode('n:', $nfo_txt[0]);~~
448		~~- $len = trim($len);~~
449		~~- //trim leading zero~~
450		~~- if($len[0]=='0')$len=substr($len,1);~~
451		~~- //trim sub frame times:~~
452		~~- if(strpos($len, '.')!==false){~~
453		~~- $len = substr($len, 0, strpos($len, '.'));~~
454		~~- }~~
455		~~- $dur=ntp2seconds($len);~~
456		~~- }else{~~
457		~~- echo "empty nfo file: $nfo_url \n";~~
458		~~- $dur=0;~~
459		~~- }~~
460		~~- }else{~~
461		~~- echo "missing nfo file: $nfo_url \n";~~
462		~~- $dur=0;~~
463		~~- }~~
464		-
465		~~- $sql = "INSERT INTO `mv_stream_files` (`stream_id`, `file_desc_msg`, `path`, `duration`)" .~~
466		~~- " VALUES ('{$mvStream->id}', '{$quality_msg}', " ." '{$path}', {$dur} )";~~
467		~~- $dbw->query($sql);~~
468		-}
469		~~-//@@todo convert to MV_EditStream~~
470		~~-function do_add_stream(& $mvTitle, & $stream) {~~
471		~~- $MV_SpecialAddStream = new MV_SpecialCRUDStream('add');~~
472		~~- $MV_SpecialAddStream->stream_name = $mvTitle->getStreamName();~~
473		~~- $MV_SpecialAddStream->stream_type = 'metavid_file';~~
474		~~- $MV_SpecialAddStream->stream_desc = mv_semantic_stream_desc($mvTitle, $stream);~~
475		~~- //add the stream:~~
476		~~- $MV_SpecialAddStream->add_stream();~~
477		-}
478		~~-function do_stream_insert($mode, $stream_name = '') {~~
479		~~- global $mvgIP, $MVStreams, $options;~~
480		~~- $dbr = wfGetDB(DB_SLAVE);~~
481		~~- if ($mode == 'all'){~~
482		~~- $sql = "SELECT * FROM `metavid`.`streams` WHERE `sync_status`='in_sync'";~~
483		~~- }else if($mode=='files') {~~
484		~~- $sql = "SELECT * FROM `metavid`.`streams` WHERE `trascoded` != 'none'";~~
485		~~- }else{~~
486		~~- $sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE '{$stream_name}'";~~
487		~~- }~~
488		~~- $res = $dbr->query($sql);~~
489		~~- if ($dbr->numRows($res) == 0)~~
490		~~- die('could not find stream: ' . $stream_name . "\n");~~
491		~~- //load all stream names:~~
492		~~- while ($row = $dbr->fetchObject($res)) {~~
493		~~- $streams[] = $row;~~
494		~~- }~~
495		~~- print "working on " . count($streams) . ' streams'."\n";~~
496		~~- foreach ($streams as $stream) {~~
497		~~- //init the stream~~
498		~~- $MVStreams[$stream->name] = new MV_Stream($stream);~~
499		~~- //check if the stream has already been added to the wiki (if not add it)~~
500		~~- $mvTitle = new MV_Title('MvStream:' . $stream->name);~~
501		~~- if (!$mvTitle->doesStreamExist()) {~~
502		~~- //print 'do stream desc'."\n";~~
503		~~- do_add_stream($mvTitle, $stream);~~
504		~~- echo "stream " . $mvTitle->getStreamName() . " added \n";~~
505		~~- } else {~~
506		~~- do_update_wiki_page($stream->name, mv_semantic_stream_desc($mvTitle, $stream), MV_NS_STREAM);~~
507		~~- //$updated = ' updated' echo "stream " . $mvTitle->getStreamName() . " already present $updated\n";~~
508		~~- }~~
509		~~- //add duration and start_time attr~~
510		~~- do_stream_attr_check($stream);~~
511		-
512		~~- //do insert/copy all media images~~
513		~~- if(!isset($options['noimage'])){~~
514		~~- do_proccess_images($stream);~~
515		~~- }~~
516		-
517		~~- //check for files (make sure they match with metavid db values~~
518		~~- do_stream_file_check($stream);~~
519		-
520		~~- if(!isset($options['skiptext'])){~~
521		~~- //proccess all stream text:~~
522		~~- do_proccess_text($stream);~~
523		~~- }~~
524		~~- }~~
525		-}
526		~~-function do_proccess_text($stream){~~
527		~~- $dbr = wfGetDB(DB_SLAVE);~~
528		~~- /* for now use the stream search table (in the future should put in our orphaned person data)~~
529		~~- * should be able to do quick checks against the index. */~~
530		~~- $sql = "SELECT (`time`+" . CC_OFFSET . ") as time, `value` " .~~
531		~~- "FROM `metavid`.`stream_attr_time_text`~~
532		~~- WHERE `stream_fk`=" . $stream->id . "~~
533		~~- AND `time` >= " . $stream->adj_start_time . "~~
534		~~- AND `time` <= " . $stream->adj_end_time . "~~
535		~~- ORDER BY `time` ASC ";~~
536		-
537		~~- //$sql = "SELECT * FROM `metavid`.`stream_search` WHERE `stream_fk`={$stream->id}";~~
538		~~- $page_res = $dbr->query($sql);~~
539		~~- if ($dbr->numRows($page_res) == 0)~~
540		~~- echo 'No pages for stream' . $stream->name . "\n";~~
541		~~- $pages = array ();~~
542		~~- while ($page = $dbr->fetchObject($page_res)) {~~
543		~~- $pages[] = $page;~~
544		~~- }~~
545		~~- print "Checking ".count($pages) . " text pages\n";~~
546		~~- $i=$j=0;~~
547		~~- foreach ($pages as $inx => $page) {~~
548		~~- //status updates:~~
549		~~- if($i==50){~~
550		~~- print "on $j of ". count($pages) . "\n";~~
551		~~- $i=0;~~
552		~~- }~~
553		~~- $i++;~~
554		~~- $j++;~~
555		~~- $start_time = $page->time - $stream->adj_start_time;~~
556		~~- if (seconds2ntp($start_time) < 0)~~
557		~~- $start_time = '0:00:00';~~
558		~~- if (($inx +1) == count($pages)) {~~
559		~~- $end_time = $stream->adj_end_time - $stream->adj_start_time;~~
560		~~- } else {~~
561		~~- $end_time = $pages[$inx +1]->time - $stream->adj_start_time;~~
562		~~- }~~
563		~~- if (($end_time - $start_time) > 40)~~
564		~~- $end_time = $start_time +40;~~
565		~~- //skip if end_time <1~~
566		~~- if ($end_time < 0)~~
567		~~- continue;~~
568		~~- //now pull up the person for the given stream time:`metavid`.`people`.`name_clean`~~
569		~~- $sql = "SELECT * , abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} ) AS `distance` " .~~
570		~~- "FROM `metavid`.`people_attr_stream_time` " .~~
571		~~- "LEFT JOIN `metavid`.`people` ON `metavid`.`people_attr_stream_time`.`people_fk` = `metavid`.`people`.`id` " .~~
572		~~- "WHERE `metavid`.`people_attr_stream_time`.`stream_fk` ={$stream->id} " .~~
573		~~- //have a negative threshold of 4 seconds~~
574		~~- "AND (`metavid`.`people_attr_stream_time`.`time`-{$page->time})>-4 " .~~
575		~~- //have a total distance threshold of 30 seconds~~
576		~~- "AND abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} )< 90 " .~~
577		~~- "ORDER BY `distance` ASC " .~~
578		~~- "LIMIT 1 ";~~
579		~~- $person_res = $dbr->query($sql);~~
580		-
581		~~- $page_title = $stream->name . '/' . seconds2ntp($start_time) . '/' . seconds2ntp($end_time);~~
582		~~- //print $page_title . "\n";~~
583		~~- $page_body = '';~~
584		~~- if ($dbr->numRows($person_res) != 0) {~~
585		~~- $person = $dbr->fetchObject($person_res);~~
586		~~- $person_name = utf8_encode($person->name_clean);~~
587		~~- $page_body .= "\n[[Spoken By::{$person_name}]] ";~~
588		~~- }~~
589		~~- $page_body .= trim(str_replace("\n", ' ', strtolower($page->value)));~~
590		-
591		~~- //print $page_title . "\n";~~
592		~~- //die;~~
593		~~- //print $page_body . "\n\n";~~
594		~~- do_update_wiki_page('Ht_en:' . $page_title, $page_body, MV_NS_MVD);~~
595		~~- }~~
596		-}
597		-/*
598		~~- * for each image add it to the image directory~~
599		~~- */~~
600		~~-function do_proccess_images($stream) {~~
601		~~- global $mvLocalImgLoc, $MVStreams, $wgDBname;~~
602		~~- $dbr = wfGetDB(DB_SLAVE);~~
603		~~- $dbw = wfGetDB(DB_MASTER);~~
604		-
605		~~- //get all images for the current stream:~~
606		~~- $sql = "SELECT * FROM `metavid`.`image_archive`~~
607		~~- WHERE `stream_fk`= {$stream->id}";~~
608		~~- $image_res = $dbr->query($sql);~~
609		~~- $img_count = $dbr->numRows($image_res);~~
610		~~- print "Found " . $img_count . " images for stream " . $stream->name . "\n";~~
611		~~- //grab from metavid and copy to local directory structure:~~
612		~~- $i=$j= 0;~~
613		~~- while ($row = $dbr->fetchObject($image_res)) {~~
614		~~- $relative_time = $row->time - $stream->adj_start_time;~~
615		~~- //status updates:~~
616		~~- if ($i == 10) {~~
617		~~- print "On image $j of $img_count time: " . seconds2ntp($relative_time) . "\n";~~
618		~~- $i = 0;~~
619		~~- }~~
620		~~- $j++;~~
621		~~- $i++;~~
622		~~- //get streamImage obj:~~
623		~~- $mv_stream_id = $MVStreams[$stream->name]->getStreamId();~~
624		~~- $local_img_dir = MV_StreamImage :: getLocalImageDir($mv_stream_id);~~
625		~~- $metavid_img_url = 'http://metavid.ucsc.edu/image_media/' . $row->id . '.jpg';~~
626		-
627		~~- $local_img_file = $local_img_dir . '/' . $relative_time . '.jpg';~~
628		~~- //check if the image already exist in the new table~~
629		~~- $sql = "SELECT * FROM `$wgDBname`.`mv_stream_images` " .~~
630		~~- "WHERE `stream_id`={$mv_stream_id} " .~~
631		~~- "AND `time`=$relative_time";~~
632		~~- $img_check = $dbr->query($sql);~~
633		~~- $doInsert = true;~~
634		~~- if ($dbr->numRows($img_check) != 0) {~~
635		~~- //make sure its there:~~
636		~~- if (is_file($local_img_file)) {~~
637		~~- //print "skiped stream_id:" . $mv_stream_id . " time: " . $relative_time . "\n";~~
638		~~- continue;~~
639		~~- } else {~~
640		~~- //grab but don't insert:~~
641		~~- $doInsert = false;~~
642		~~- }~~
643		~~- }~~
644		~~- if ($doInsert) {~~
645		~~- //insert:~~
646		~~- $dbw->insert('mv_stream_images', array (~~
647		~~- 'stream_id' => $MVStreams[$stream->name]->getStreamId(), 'time' => $relative_time));~~
648		~~- $img_id = $dbw->insertId();~~
649		~~- //$grab = exec('cd ' . $img_path . '; wget ' . $im_url);~~
650		~~- }~~
651		-
652		~~- if (is_file($local_img_file)) {~~
653		~~- echo "skipped $local_img_file \n";~~
654		~~- continue;~~
655		~~- }~~
656		~~- if (!copy($metavid_img_url, $local_img_file)) {~~
657		~~- echo "failed to copy $metavid_img_url to $local_img_file...\n";~~
658		~~- } else {~~
659		~~- //all good don't report anything'~~
660		~~- }~~
661		~~- }~~
662		-}
663		-
664		~~-function do_update_wiki_page($wgTitle, $wikiText, $ns = null, $forceUpdate=false) {~~
665		~~- global $botUserName;~~
666		~~- if (!is_object($wgTitle)) {~~
667		~~- $wgTitle = Title :: makeTitle($ns, $wgTitle);~~
668		~~- }~~
669		~~- //make sure the text is utf8 encoded:~~
670		~~- $wikiText = utf8_encode($wikiText);~~
671		-
672		~~- $wgArticle = new Article($wgTitle);~~
673		~~- if(!mvDoMvPage($wgTitle, $wgArticle, false)){~~
674		~~- print "bad title: ".$wgTitle->getDBkey()." no edit";~~
675		~~- if($wgTitle->exists()){~~
676		~~- print "remove article";~~
677		~~- $wgArticle->doDeleteArticle( 'bad title' );~~
678		~~- }~~
679		~~- //some how mvdIndex and mvd pages got out of sync do a seperate check for the mvd:~~
680		~~- if(MV_Index::getMVDbyTitle($wgArticle->mTitle->getDBkey())!=null){~~
681		~~- print ', rm mvd';~~
682		~~- MV_Index::remove_by_wiki_title($wgArticle->mTitle->getDBkey());~~
683		~~- }~~
684		~~- print "\n";~~
685		~~- return ;~~
686		~~- }~~
687		~~- if ($wgTitle->exists()) {~~
688		~~- //if last edit!=mvBot skip (don't overwite peoples improvments')~~
689		~~- $rev = & Revision::newFromTitle($wgTitle);~~
690		~~- if( $botUserName!= $rev->getRawUserText()){~~
691		~~- print ' skiped page edited by user:'.$rev->getRawUserText()."\n";~~
692		~~- if(!$forceUpdate)return ;~~
693		~~- }~~
694		~~- //proc article:~~
695		~~- $cur_text = $wgArticle->getContent();~~
696		~~- //if its a redirect skip~~
697		~~- if(substr($cur_text, 0, strlen('#REDIRECT') )=='#REDIRECT'){~~
698		~~- print ' skiped page moved by user:'.$rev->getRawUserText()."\n";~~
699		~~- if(!$forceUpdate)return ;~~
700		~~- }~~
701		~~- //check if text is identical:~~
702		~~- if (trim($cur_text) == trim($wikiText)) {~~
703		~~- if(!$forceUpdate)return ;~~
704		~~- }~~
705		~~- }~~
706		~~- //got here do the edit:~~
707		~~- $sum_txt = 'metavid bot insert';~~
708		~~- $wgArticle->doEdit($wikiText, $sum_txt);~~
709		~~- print "did edit on " . $wgTitle->getDBkey() . "\n";~~
710		~~- //die;~~
711		-}
712		~~-//given a stream name it pulls all metavid stream data and builds semantic wiki page~~
713		~~-function mv_semantic_stream_desc(& $mvTitle, & $stream) {~~
714		~~- global $start_time, $end_time;~~
715		~~- /$sql = "SELECT FROM `metavid`.`streams` WHERE `name` LIKE '" . $mvTitle->getStreamName() . "'";~~
716		~~- $dbr = wfGetDB(DB_SLAVE);~~
717		~~- $res = $dbr->query($sql);~~
718		~~- //echo "\n" . $sql . "\n";~~
719		~~- $stream = $dbr->fetchObject($res);*/~~
720		~~- $stream_id = $stream->id;~~
721		~~- $out = '';~~
722		~~- $pout = mv_proccess_attr('stream_attr_varchar', $stream_id);~~
723		~~- $pout .= mv_proccess_attr('stream_attr_int', $stream_id);~~
724		~~- //add links/generic text at the start~~
725		~~- $out .= '==Official Record==' . "\n";~~
726		~~- $date = date('Ymd', $start_time);~~
727		~~- $cspan_date = date('Y-m-d', $start_time);~~
728		~~- $ch_type = '';~~
729		~~- if (strpos($mvTitle->getStreamName(), 'house') !== false)~~
730		~~- $ch_type = 'h';~~
731		~~- if (strpos($mvTitle->getStreamName(), 'senate') !== false)~~
732		~~- $ch_type = 's';~~
733		~~- if ($ch_type != '') {~~
734		~~- $out .= '*[[GovTrack]] Congressional Record' .~~
735		~~- '[http://www.govtrack.us/congress/recordindex.xpd?date=' . $date .~~
736		~~- '&where=' . $ch_type .~~
737		~~- ']' . "\n\n";~~
738		~~- $out .= '*[[THOMAS]] Congressional Record ' .~~
739		~~- '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .~~
740		~~- ']' . "\n\n";~~
741		~~- $out .= '*[[THOMAS]] Extension of Remarks ' .~~
742		~~- '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .~~
743		~~- ']' . "\n\n";~~
744		~~- }~~
745		~~- if ($stream->archive_org != '') {~~
746		~~- $out .= '==More Media Sources=='."\n";~~
747		~~- $out .= '*[[Archive.org]] hosted original copy ' .~~
748		~~- '[http://www.archive.org/details/mv_' . $stream->name . ']' . "\n";~~
749		~~- }~~
750		~~- //all streams have congretional cronical:~~
751		~~- $out .= '*[[CSPAN]]\'s Congressional Chronicle ' .~~
752		~~- '[http://www.c-spanarchives.org/congress/?q=node/69850&date=' . $cspan_date . '&hors=' . $ch_type . ']';~~
753		~~- $out .= "\n\n";~~
754		~~- $out .= $pout;~~
755		~~- $out .= '[[stream_duration:=' . ($end_time - $start_time) . '\| ]]' . "\n";~~
756		~~- if($stream->org_start_time){~~
757		~~- $out .= '[[original_date:='.$stream->org_start_time.'\| ]]';~~
758		~~- }~~
759		-
760		~~- //add stream category (based on sync status)~~
761		~~- switch($stream->sync_status){~~
762		~~- case 'not_checked':~~
763		~~- $out.="\n\n".'[[Category:Stream Unchecked]]';~~
764		~~- break;~~
765		~~- case 'impossible':~~
766		~~- $out.="\n\n".'[[Category:Stream Out of Sync]]';~~
767		~~- break;~~
768		~~- case 'in_sync':~~
769		~~- $out.="\n\n".'[[Category:Stream Basic Sync]]';~~
770		~~- //other options [stream high quality sync ];~~
771		~~- break;~~
772		~~- }~~
773		-
774		~~- return $out;~~
775		-}
776		~~-function mv_proccess_attr($table, $stream_id) {~~
777		~~- global $start_time, $end_time;~~
778		~~- $dbr = wfGetDB(DB_SLAVE);~~
779		~~- $sql = "SELECT * FROM `metavid`.`$table` WHERE `stream_fk`=$stream_id";~~
780		~~- $res = $dbr->query($sql);~~
781		~~- $out = '';~~
782		~~- while ($var = $dbr->fetchObject($res)) {~~
783		~~- $type_title = getTypeTitle($var->type);~~
784		~~- if ($var->type == 'adj_start_time')~~
785		~~- $start_time = $var->value;~~
786		~~- if ($var->type == 'adj_end_time')~~
787		~~- $end_time = $var->value;~~
788		~~- if ($type_title != '') {~~
789		~~- $reltype = ($type_title[0] == 'rel') ? '::' : ':=';~~
790		~~- $out .= '[[' . $var->type . ':=' . $var->value . '\| ]]' . "\n";~~
791		~~- }~~
792		~~- }~~
793		~~- return $out;~~
794		-}
795		~~-function getTypeTitle($type) {~~
796		~~- switch ($type) {~~
797		~~- case 'cspan_type' :~~
798		~~- return array (~~
799		~~- 'rel',~~
800		~~- 'Government Event'~~
801		~~- );~~
802		~~- break;~~
803		~~- case 'cspan_title' :~~
804		~~- return array (~~
805		~~- 'atr',~~
806		~~- 'C-SPAN Title'~~
807		~~- );~~
808		~~- break;~~
809		~~- case 'cspan_desc' :~~
810		~~- return array (~~
811		~~- 'atr',~~
812		~~- 'C-SPAN Description'~~
813		~~- );~~
814		~~- break;~~
815		~~- case 'adj_start_time' :~~
816		~~- return array (~~
817		~~- 'atr',~~
818		~~- 'Unix Start Time'~~
819		~~- );~~
820		~~- break;~~
821		~~- case 'adj_end_time' :~~
822		~~- return array (~~
823		~~- 'atr',~~
824		~~- 'Unix End Time'~~
825		~~- );~~
826		~~- break;~~
827		~~- default :~~
828		~~- return '';~~
829		~~- break;~~
830		~~- }~~
831		-}
832	242	?>
833	243
Index: trunk/extensions/MetavidWiki/maintenance/ogg_thumb_insert.sh
—	—	@@ -1,15 +1,77 @@
2	2	#!/bin/bash
3	3
4		~~-streamid=${1};~~
5		~~-filename=${2};~~
6		~~-interval=${3};~~
	4	+###########################################################################
	5	+#
	6	+# DESCRIPTION
	7	+#
	8	+# This script can extract jpg frames from your ogg files at a specified
	9	+# interval. It can also insert this information into your mvWiki
	10	+# database.
	11	+#
	12	+# USAGE
	13	+#
	14	+# ./ogg_thumb_insert.sh stream_id filename interval
	15	+#
	16	+# EXAMPLE
	17	+#
	18	+# ./ogg_thumb_insert.sh 17 /var/www/localhost/htdocs/media/stream.ogg 20
	19	+#
	20	+# The previous example should extract frames every 20 seconds from the
	21	+# file named stream.ogg. It will place them in the appropriate stream
	22	+# directory which by default is '../stream_images/7/17/'. It should also
	23	+# insert information about the frame into the 'mv_stream_images' table.
	24	+#
	25	+###########################################################################
	26	+#
	27	+# This script relies on a number of programs being in your path, and is
	28	+# intended to be executed from the 'maintenance' directory.
	29	+#
	30	+# Requirements:
	31	+#
	32	+# ffmpeg
	33	+# mysql
	34	+# imagemagick
	35	+# ogginfo
	36	+# grep
	37	+# sed
	38	+# awk
	39	+# gawk
	40	+# echo
	41	+# wc
	42	+# bc
	43	+# seq
	44	+# mkdir
	45	+#
	46	+###########################################################################
	47	+#
	48	+# Use at your own risk. There is very little error checking.
	49	+#
	50	+###########################################################################
	51	+# This quick hack brought to you by Seth McClain smcclain@opengov.org
	52	+###########################################################################
7	53
	54	+
	55	+## REMOVE THE FOLLOWING TWO LINES BEFORE EXECUTING ##
	56	+echo "Please be sure to edit this file and change some variables before executing it";
	57	+exit
	58	+## REMOVE THE PREVIOUS TWO LINES BEFORE EXECUTING ##
	59	+
	60	+
	61	+## The following variables need to be set to allow the script access to your
	62	+## MySQL database
	63	+
8	64	table="mv_stream_images";
9	65	db="mvwiki";
10	66	user="user";
11	67	pw="password";
12	68	hostname="localhost";
13	69
	70	+## Do not edit below this line
	71	+
	72	+streamid=${1};
	73	+filename=${2};
	74	+interval=${3};
	75	+
14	76	chars=`echo -n ${streamid} \| wc -c`;
15	77	dots=`for i in \`seq 1 ${chars}\`; do echo -n .; done \| sed -e s/^.//`
16	78	dir=`echo ${streamid} \| sed -e s/^${dots}//`
—	—	@@ -25,7 +87,7 @@
26	88
27	89	for i in `seq 1 ${interval} ${duration}`
28	90	do
29		~~- #echo "insert into ${table}(stream_id, time) values(${streamid}, ${i});" \| mysql -u ${user} --password=${pw} ${db}~~
	91	+ echo "insert into ${table}(stream_id, time) values(${streamid}, ${i});" \| mysql -u ${user} --password=${pw} ${db}
30	92	ffmpeg -ss ${i} -i ${filename} -vcodec mjpeg -vframes 1 -an -f rawvideo -s 320x240 -y ${filedir}/${i}_320x240.jpg
31	93	done
32	94
Index: trunk/extensions/MetavidWiki/maintenance/scrape_and_insert.php
—	—	@@ -38,6 +38,7 @@
39	39	switch($args[0]){
40	40	case 'cspan_chronicle':
41	41	$MV_CspanScraper = new MV_CspanScraper();
	42	+ $MV_CspanScraper->doScrapeInsert();
42	43	break;
43	44	}
44	45	}
—	—	@@ -96,15 +97,52 @@
97	98	$href='';
98	99	$href_match=array();
99	100	preg_match('/href="(.*)"/',$matches[5][$k], $href_match);
100		~~- if(count($href_match)!=0)$href=$href_match[1];~~
	101	+ if(count($href_match)!=0)$href=$href_match[1];
	102	+
	103	+ $porg = str_replace('<br>',' ',$matches[4][$k]);
	104	+ $porg = preg_replace('/[D\|R]+\-\[.*\]/', '', $porg);
	105	+ $pparts = explode(',',$porg);
	106	+ $pname = trim($pparts[1]) . '_' . trim($pparts[0]);
101	107	$person_time_ary[]= array(
102	108	'start_time'=>strip_tags($matches[1][$k]),
103	109	'length'=>$matches[3][$k],
104	110	'person_title'=>str_replace('<br>',' ',$matches[4][$k]),
	111	+ 'spoken_by'=>$pname,
105	112	'href'=>$href
106	113	);
107		~~- }~~
108		~~- print_r($person_time_ary);~~
	114	+ }
	115	+ //group people in page matches
	116	+ $g_person_time_ary=array();
	117	+ $prev_person=null;
	118	+ foreach($person_time_ary as $ptag){
	119	+ $g_person_time_ary[$ptag['spoken_by']][]=$ptag;
	120	+ }
	121	+
	122	+ //retrieve rows to find match:
	123	+ $dbr =& wfGetDB(DB_SLAVE);
	124	+ $mvd_res = MV_Index::getMVDInRange($stream->id, null, null, $mvd_type='ht_en',false,$smw_properties=array('Spoken_by'), '');
	125	+ $g_row_matches=array();
	126	+ //group people in db matches:
	127	+ while ($row = $dbr->fetchObject($mvd_res)) {
	128	+ if(!isset($row->Spoken_by))continue;
	129	+ if(!isset($g_row_matches[strtolower($row->Spoken_by)])){
	130	+ $g_row_matches[strtolower($row->Spoken_by)]=get_object_vars($row);
	131	+ $g_row_matches[strtolower($row->Spoken_by)]['end_time_sec']=$row->end_time;
	132	+ }else{
	133	+ $g_row_matches[strtolower($row->Spoken_by)]['end_time_sec']+=$row->end_time;
	134	+ }
	135	+ $cspan_person = next($g_person_time_ary);
	136	+ }
	137	+ //add in sync offset data for $g_person_time_ary
	138	+ reset($g_person_time_ary);
	139	+ foreach($g_row_matches as $rp=>$rperson){
	140	+
	141	+ }
	142	+ //find match person1->person2->person3
	143	+
	144	+
	145	+ //average switch time to get offset of stream
	146	+ //use offset to insert all $person_time_array data
109	147	}
110	148	}
111	149	}
—	—	@@ -132,8 +170,7 @@
133	171	$page = file_get_contents($url);
134	172	if($page===false){
135	173	echo("error retriving $url retrying...\n");
136		~~- sleep(5);~~
137		~~- //@@todo: this may eventually overflow the stack:~~
	174	+ sleep(5);
138	175	return $this->doRequest($url);
139	176	}
140	177	if($page!=''){
Index: trunk/extensions/MetavidWiki/includes/MV_Index.php
—	—	@@ -102,17 +102,36 @@
103	103	}
104	104	/*
105	105	* getMVDInRange returns the mvd titles that are in the given range
	106	+ * param list got kind of crazy long... @@todo refactor into a request object or something cleaner
106	107	*/
107		~~- function getMVDInRange($stream_id, $start_time=null, $end_time=null, $mvd_type='all',$getText=false){~~
	108	+ function getMVDInRange($stream_id, $start_time=null, $end_time=null, $mvd_type='all',$getText=false,$smw_properties=array(), $limit='LIMIT 0, 200'){
108	109	global $mvIndexTableName, $mvDefaultClipLength;
109	110	$dbr =& wfGetDB(DB_SLAVE);
110	111
111		~~- $sql = "SELECT `mv_page_id` as `id`, `mvd_type`, `wiki_title`, `stream_id`, `start_time`, `end_time` " .~~
112		~~- "FROM {$dbr->tableName($mvIndexTableName)} " .~~
113		~~- "WHERE `stream_id`={$stream_id} ";~~
	112	+ $sql_sel = "SELECT `mv_page_id` as `id`, `mvd_type`, `wiki_title`, `stream_id`, `start_time`, `end_time` ";
	113	+ $sql_from=" FROM {$dbr->tableName($mvIndexTableName)} ";
	114	+ if(count($smw_properties)!=0){
	115	+ foreach($smw_properties as $prop_name){
	116	+ $sql_sel.=", `$prop_name`.`object_title` as `$prop_name`";
	117	+ $sql_from.="LEFT JOIN `smw_relations` as `$prop_name` ON (`mv_mvd_index`.`mv_page_id`=`$prop_name`.`subject_id` " .
	118	+ "AND `$prop_name`.`relation_title`='$prop_name') ";
	119	+ }
	120	+ }
	121	+ $sql = $sql_sel . $sql_from;
	122	+ $sql.= "WHERE `stream_id`={$stream_id} ";
114	123	if($mvd_type!='all'){
115	124	//check if mvd_type is array:
116		~~- $sql.="AND `mvd_type`='{$mvd_type}' ";~~
	125	+ if(is_array($mvd_type)){
	126	+ $sql.='AND (';
	127	+ $or='';
	128	+ foreach($mvd_type as $mtype){
	129	+ $sql.=$or."`mvd_type'='{$mtype}' ";
	130	+ $or='OR ';
	131	+ }
	132	+ $sql.=')';
	133	+ }else{
	134	+ $sql.="AND `mvd_type`='{$mvd_type}' ";
	135	+ }
117	136	}
118	138	//get any data that covers this range:
119	138	if($end_time)$sql.=" AND `start_time` <= " . $end_time;
—	—	@@ -120,7 +139,7 @@
121	140	//add in ordering
122	141	$sql.=' ORDER BY `start_time` ASC ';
123	142	//add in limit of 200 for now
124		~~- $sql.=' LIMIT 0, 200';~~
	143	+ $sql.=$limit;
125	144	//echo $sql;
126	145	$result =& $dbr->query( $sql, 'MV_Index:time_index_query');
127	146	return $result;

Status & tagging log

15:24, 12 September 2011 Meno25 (talk | contribs) changed the status of r30036 [removed: ok added: old]