r30036 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r30036 (previous: r30035 | next: r30037)
Date: 21:24, 21 January 2008
Author: dale
Status: old
Tags:
Comment: updated scraping and database syncing maintenance tools
Modified paths:
  • /trunk/extensions/MetavidWiki/includes/MV_Index.php (modified)
  • /trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.inc.php (added)
  • /trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.php (modified)
  • /trunk/extensions/MetavidWiki/maintenance/ogg_thumb_insert.sh (modified)
  • /trunk/extensions/MetavidWiki/maintenance/scrape_and_insert.php (modified)

Diff

Index: trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.inc.php
@@ -0,0 +1,604 @@
 2+<?php
 3+/*
 4+ * metavid2mvWiki.inc.php Created on Jan 19, 2008
 5+ *
 6+ * All Metavid Wiki code is Released under the GPL2
 7+ * for more info visit http:/metavid.ucsc.edu/code
 8+ *
 9+ * @author Michael Dale
 10+ * @email dale@ucsc.edu
 11+ * @url http://metavid.ucsc.edu
 12+ */
 13+ function upTempalte_Ht_en($force = false) {
 14+ $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Ht_en');
 15+ if (!$wgTemplateTitle->exists() || $force) {
 16+ do_update_wiki_page($wgTemplateTitle, '<noinclude>
 17+ This is the default Template for the display of transcript text.
 18+ </noinclude><includeonly>{{ #if: {{{PersonName|}}} | {{ #ifexist: Image:{{{PersonName}}}.jpg | [[Image:{{{PersonName}}}.jpg|44px|left]]|[[Image:Missing person.jpg|44px|left]]}} |}}{{ #if:{{{PersonName|}}}|[[{{{PersonName}}}]]: |}}{{{BodyText}}}
 19+ </includeonly>');
 20+ }
 21+}
 22+function upTemplate_person($force = false) {
 23+ global $valid_attributes;
 24+ $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Congress Person');
 25+ if (!$wgTemplateTitle->exists() || $force) {
 26+ $wgTemplateArticle = new Article($wgTemplateTitle);
 27+ $template_body = '<noinclude>Congress Person template simplifies
 28+ the structure of articles about Congress People.
 29+ <pre>{{Congress Person|' . "\n";
 30+ foreach ($valid_attributes as $dbKey => $attr) {
 31+ list ($name, $desc) = $attr;
 32+ $template_body .= $name . '=' . $desc . "|\n";
 33+ }
 34+ $template_body .= '}}</pre>' .
 35+ 'The order of the fields is not relevant. The template name (Congress Person) should be given as the \'\'first\'\' thing on a page.
 36+ </noinclude>' .
 37+ '<includeonly>' . "\n";
 38+ //include the image if present:
 39+ $template_body .= '{{ #if: { Image:{{PAGENAME}}.jpg}| [[Image:{{PAGENAME}}.jpg]] |}}' . "\n";
 40+ foreach ($valid_attributes as $dbKey => $attr) {
 41+ list ($name, $desc) = $attr;
 42+ //raw semantic data (@@todo make pretty template table thing)
 43+ $template_body .= "{{ #if: {{{" . $name . "}}}| [[$name:={{{" . $name . "}}}| ]] |}} \n";
 44+ }
 45+ $template_body .= '[[Category:Congress Person]] [[Category:Person]]
 46+ </includeonly>';
 47+ echo "updated 'Congress Person' template\n";
 48+ do_update_wiki_page($wgTemplateTitle, $template_body);
 49+ }
 50+}
 51+function do_people_insert() {
 52+ global $valid_attributes, $states_ary;
 53+ $dbr = wfGetDB(DB_SLAVE);
 54+
 55+ //check person
 56+ upTemplate_person();
 57+ //do people query:
 58+ $res = $dbr->query("SELECT * FROM `metavid`.`people`");
 59+ if ($dbr->numRows($res) == 0)
 60+ die('could not find people: ' . "\n");
 61+ $person_ary = array ();
 62+ while ($person = $dbr->fetchObject($res)) {
 63+ $person_ary[] = $person;
 64+ }
 65+ foreach ($person_ary as $person) {
 66+ $person_title = Title :: newFromUrl($person->name_clean);
 67+ //semantic data via template:
 68+ $page_body = '{{Congress Person|' . "\n";
 69+ foreach ($valid_attributes as $dbKey => $attr) {
 70+ list ($name, $desc) = $attr;
 71+ if (trim($person-> $dbKey) != '') {
 72+ if ($dbKey == 'state')
 73+ $person->state = $states_ary[$person->state];
 74+ $page_body .= "|{$name}={$person->$dbKey}| \n";
 75+ }
 76+ }
 77+ //add in the full name attribute:
 78+ $page_body .= "|Full Name=" . $person->title . ' ' . $person->first .
 79+ ' ' . $person->middle . ' ' . $person->last . "| \n";
 80+ $page_body .= '}}';
 81+ //add in basic info to be overwitten by tranclude (from
 82+ $full_name = $person->title . ' ' . $person->first .
 83+ ' ' . $person->middle . ' ' . $person->last;
 84+ if (trim($full_name) == '')
 85+ $full_name = $person->name_clean;
 86+
 87+ $page_body .= "\n" .'Basic Person page For <b>' . $full_name . "</b><br>\n".
 88+ "Text Spoken By [[Special:MediaSearch/person/{$person->name_clean}|$full_name]] ";
 89+ ;
 90+ do_update_wiki_page($person_title, $page_body);
 91+ }
 92+ foreach ($person_ary as $person) {
 93+ //download/upload all the photos:
 94+ $imgTitle = Title :: makeTitle(NS_IMAGE, $person->name_clean . '.jpg');
 95+ //if(!$imgTitle->exists()){
 96+ global $wgTmpDirectory;
 97+ $url = 'http://www.opensecrets.org/politicians/img/pix/' . $person->osid . '.jpg';
 98+ //print $wgTmpDirectory . "\n";
 99+ $local_file = tempnam($wgTmpDirectory, 'WEBUPLOAD');
 100+ //copy file:
 101+
 102+ # Check if already there existence
 103+ $image = wfLocalFile($imgTitle);
 104+ if ($image->exists()) {
 105+ echo ($imgTitle->getDBkey() . " already in the wiki\n");
 106+ continue;
 107+ }
 108+
 109+ for ($ct = 0; $ct < 10; $ct++) {
 110+ if (!@ copy($url, $local_file)) {
 111+ print ("failed to copy $url to local_file (tring again) \n");
 112+ } else {
 113+ print "copy success\n";
 114+ $ct = 10;
 115+ }
 116+ if ($ct == 9)
 117+ print 'complete failure' . "\n";
 118+ }
 119+
 120+ # Stash the file
 121+ echo ("Saving " . $imgTitle->getDBkey() . "...");
 122+ $image = wfLocalFile($imgTitle);
 123+
 124+ $archive = $image->publish($local_file);
 125+ if (WikiError :: isError($archive)) {
 126+ echo ("failed.\n");
 127+ continue;
 128+ }
 129+ echo ("importing...");
 130+ $comment = 'Image file for [[' . $person->name_clean . ']]';
 131+ $license = '';
 132+
 133+ if ($image->recordUpload($archive, $comment, $license)) {
 134+ # We're done!
 135+ echo ("done.\n");
 136+ } else {
 137+ echo ("failed.\n");
 138+ }
 139+ //}
 140+ }
 141+}
 142+//$i=0;
 143+function do_stream_attr_check($old_stream) {
 144+ global $i;
 145+ $mvStream = & mvGetMVStream(array (
 146+ 'name' => $old_stream->name
 147+ ));
 148+ //print "doding stream attr check: ";
 149+ //print_r($old_stream);
 150+
 151+ if ($mvStream->date_start_time != $old_stream->adj_start_time) {
 152+ $mvStream->date_start_time = $old_stream->adj_start_time;
 153+ }
 154+ if ($mvStream->duration != ($old_stream->adj_end_time - $old_stream->adj_start_time)) {
 155+ $mvStream->duration = ($old_stream->adj_end_time - $old_stream->adj_start_time);
 156+ }
 157+ $mvStream->updateStreamDB();
 158+ print "\nran stream db update: " .$mvStream->duration . ' ' . $mvStream->date_start_time."\n";
 159+ //if($i==3)die;
 160+ //$i++;
 161+}
 162+function do_stream_file_check(& $old_stream) {
 163+ global $mvgIP;
 164+ $mvStream = & mvGetMVStream(array (
 165+ 'name' => $old_stream->name
 166+ ));
 167+ $file_list = $mvStream->getFileList();
 168+
 169+ if ($old_stream->trascoded != 'none') {
 170+ //print "transcode is: " . $old_stream->trascoded;
 171+ if ($old_stream->trascoded == 'low')
 172+ $set = array (
 173+ 'mv_ogg_low_quality'
 174+ );
 175+ if ($old_stream->trascoded == 'high')
 176+ $set = array (
 177+ 'mv_ogg_high_quality'
 178+ );
 179+ if ($old_stream->trascoded == 'all')
 180+ $set = array (
 181+ 'mv_ogg_high_quality',
 182+ 'mv_ogg_low_quality'
 183+ );
 184+ //print "set: " . print_r($set);
 185+ //remove old file pointers:
 186+ $dbw = wfGetDB(DB_WRITE);
 187+ $sql = "DELETE FROM `mv_stream_files` WHERE `stream_id`=".$mvStream->id;
 188+ $dbw->query($sql);
 189+ //update files:
 190+ foreach ($set as $qf) {
 191+ do_insert_stream_file($mvStream, $old_stream, $qf);
 192+ }
 193+ }
 194+ //check for archive.org stuff too..
 195+ /*if($old_stream->archive_org!=''){
 196+ $found=false;
 197+ foreach($file_list as $file){
 198+ if($file->path_type =='ext_archive_org'){
 199+ $found=true;
 200+ }
 201+ }
 202+ if(!$found)do_insert_stream_file($mvStream, $old_stream, 'mv_archive_org_link');
 203+ }*/
 204+}
 205+function do_insert_stream_file($mvStream, $old_stream, $quality_msg) {
 206+ global $mvVideoArchivePaths;
 207+ $dbw = wfGetDB(DB_WRITE);
 208+ if ($quality_msg == 'mv_ogg_low_quality') {
 209+ $path = $mvVideoArchivePaths[$old_stream->archive_server] . $mvStream->name. '.ogg';
 210+ } else if ($quality_msg == 'mv_ogg_high_quality') {
 211+ $path = $mvVideoArchivePaths[$old_stream->archive_server] .$mvStream->name.'.HQ.ogg';
 212+ }else{
 213+ return '';
 214+ }
 215+ //get file duration from nfo file (if avaliable ):
 216+ $nfo_url = $path . '.nfo';
 217+ $nfo_txt = file($nfo_url);
 218+ if($nfo_txt){
 219+ if( isset($nfo_txt[0])){
 220+ list($na, $len) = explode('n:', $nfo_txt[0]);
 221+ $len = trim($len);
 222+ //trim leading zero
 223+ if($len[0]=='0')$len=substr($len,1);
 224+ //trim sub frame times:
 225+ if(strpos($len, '.')!==false){
 226+ $len = substr($len, 0, strpos($len, '.'));
 227+ }
 228+ $dur=ntp2seconds($len);
 229+ }else{
 230+ echo "empty nfo file: $nfo_url \n";
 231+ $dur=0;
 232+ }
 233+ }else{
 234+ echo "missing nfo file: $nfo_url \n";
 235+ $dur=0;
 236+ }
 237+
 238+ $sql = "INSERT INTO `mv_stream_files` (`stream_id`, `file_desc_msg`, `path`, `duration`)" .
 239+ " VALUES ('{$mvStream->id}', '{$quality_msg}', " ." '{$path}', {$dur} )";
 240+ $dbw->query($sql);
 241+}
 242+//@@todo convert to MV_EditStream
 243+function do_add_stream(& $mvTitle, & $stream) {
 244+ $MV_SpecialAddStream = new MV_SpecialCRUDStream('add');
 245+ $MV_SpecialAddStream->stream_name = $mvTitle->getStreamName();
 246+ $MV_SpecialAddStream->stream_type = 'metavid_file';
 247+ $MV_SpecialAddStream->stream_desc = mv_semantic_stream_desc($mvTitle, $stream);
 248+ //add the stream:
 249+ $MV_SpecialAddStream->add_stream();
 250+}
 251+function do_stream_insert($mode, $stream_name = '') {
 252+ global $mvgIP, $MVStreams, $options;
 253+ $dbr = wfGetDB(DB_SLAVE);
 254+ if ($mode == 'all'){
 255+ $sql = "SELECT * FROM `metavid`.`streams` WHERE `sync_status`='in_sync'";
 256+ }else if($mode=='files') {
 257+ $sql = "SELECT * FROM `metavid`.`streams` WHERE `trascoded` != 'none'";
 258+ }else{
 259+ $sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE '{$stream_name}'";
 260+ }
 261+ $res = $dbr->query($sql);
 262+ if ($dbr->numRows($res) == 0)
 263+ die('could not find stream: ' . $stream_name . "\n");
 264+ //load all stream names:
 265+ while ($row = $dbr->fetchObject($res)) {
 266+ $streams[] = $row;
 267+ }
 268+ print "working on " . count($streams) . ' streams'."\n";
 269+ foreach ($streams as $stream) {
 270+ //init the stream
 271+ $MVStreams[$stream->name] = new MV_Stream($stream);
 272+ //check if the stream has already been added to the wiki (if not add it)
 273+ $mvTitle = new MV_Title('MvStream:' . $stream->name);
 274+ if (!$mvTitle->doesStreamExist()) {
 275+ //print 'do stream desc'."\n";
 276+ do_add_stream($mvTitle, $stream);
 277+ echo "stream " . $mvTitle->getStreamName() . " added \n";
 278+ } else {
 279+ do_update_wiki_page($stream->name, mv_semantic_stream_desc($mvTitle, $stream), MV_NS_STREAM);
 280+ //$updated = ' updated' echo "stream " . $mvTitle->getStreamName() . " already present $updated\n";
 281+ }
 282+ //add duration and start_time attr
 283+ do_stream_attr_check($stream);
 284+
 285+ //do insert/copy all media images
 286+ if(!isset($options['noimage'])){
 287+ do_proccess_images($stream);
 288+ }
 289+
 290+ //check for files (make sure they match with metavid db values
 291+ do_stream_file_check($stream);
 292+
 293+ if(!isset($options['skiptext'])){
 294+ //proccess all stream text:
 295+ do_proccess_text($stream);
 296+ }
 297+ }
 298+}
 299+function do_proccess_text($stream){
 300+ $dbr = wfGetDB(DB_SLAVE);
 301+ /* for now use the stream search table (in the future should put in our orphaned person data)
 302+ * should be able to do quick checks against the index. */
 303+ $sql = "SELECT (`time`+" . CC_OFFSET . ") as time, `value` " .
 304+ "FROM `metavid`.`stream_attr_time_text`
 305+ WHERE `stream_fk`=" . $stream->id . "
 306+ AND `time` >= " . $stream->adj_start_time . "
 307+ AND `time` <= " . $stream->adj_end_time . "
 308+ ORDER BY `time` ASC ";
 309+
 310+ //$sql = "SELECT * FROM `metavid`.`stream_search` WHERE `stream_fk`={$stream->id}";
 311+ $page_res = $dbr->query($sql);
 312+ if ($dbr->numRows($page_res) == 0)
 313+ echo 'No pages for stream' . $stream->name . "\n";
 314+ $pages = array ();
 315+ while ($page = $dbr->fetchObject($page_res)) {
 316+ $pages[] = $page;
 317+ }
 318+ print "Checking ".count($pages) . " text pages\n";
 319+ $i=$j=0;
 320+ foreach ($pages as $inx => $page) {
 321+ //status updates:
 322+ if($i==50){
 323+ print "on $j of ". count($pages) . "\n";
 324+ $i=0;
 325+ }
 326+ $i++;
 327+ $j++;
 328+ $start_time = $page->time - $stream->adj_start_time;
 329+ if (seconds2ntp($start_time) < 0)
 330+ $start_time = '0:00:00';
 331+ if (($inx +1) == count($pages)) {
 332+ $end_time = $stream->adj_end_time - $stream->adj_start_time;
 333+ } else {
 334+ $end_time = $pages[$inx +1]->time - $stream->adj_start_time;
 335+ }
 336+ if (($end_time - $start_time) > 40)
 337+ $end_time = $start_time +40;
 338+ //skip if end_time <1
 339+ if ($end_time < 0)
 340+ continue;
 341+ //now pull up the person for the given stream time:`metavid`.`people`.`name_clean`
 342+ $sql = "SELECT * , abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} ) AS `distance` " .
 343+ "FROM `metavid`.`people_attr_stream_time` " .
 344+ "LEFT JOIN `metavid`.`people` ON `metavid`.`people_attr_stream_time`.`people_fk` = `metavid`.`people`.`id` " .
 345+ "WHERE `metavid`.`people_attr_stream_time`.`stream_fk` ={$stream->id} " .
 346+ //have a negative threshold of 4 seconds
 347+ "AND (`metavid`.`people_attr_stream_time`.`time`-{$page->time})>-4 " .
 348+ //have a total distance threshold of 30 seconds
 349+ "AND abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} )< 90 " .
 350+ "ORDER BY `distance` ASC " .
 351+ "LIMIT 1 ";
 352+ $person_res = $dbr->query($sql);
 353+
 354+ $page_title = $stream->name . '/' . seconds2ntp($start_time) . '/' . seconds2ntp($end_time);
 355+ //print $page_title . "\n";
 356+ $page_body = '';
 357+ if ($dbr->numRows($person_res) != 0) {
 358+ $person = $dbr->fetchObject($person_res);
 359+ $person_name = utf8_encode($person->name_clean);
 360+ $page_body .= "\n[[Spoken By::{$person_name}]] ";
 361+ }
 362+ $page_body .= trim(str_replace("\n", ' ', strtolower($page->value)));
 363+
 364+ //print $page_title . "\n";
 365+ //die;
 366+ //print $page_body . "\n\n";
 367+ do_update_wiki_page('Ht_en:' . $page_title, $page_body, MV_NS_MVD);
 368+ }
 369+}
 370+/*
 371+ * for each image add it to the image directory
 372+ */
 373+function do_proccess_images($stream) {
 374+ global $mvLocalImgLoc, $MVStreams, $wgDBname;
 375+ $dbr =& wfGetDB(DB_SLAVE);
 376+ $dbw =& wfGetDB(DB_MASTER);
 377+
 378+ //get all images for the current stream:
 379+ $sql = "SELECT * FROM `metavid`.`image_archive`
 380+ WHERE `stream_fk`= {$stream->id}";
 381+ $image_res = $dbr->query($sql);
 382+ $img_count = $dbr->numRows($image_res);
 383+ print "Found " . $img_count . " images for stream " . $stream->name . "\n";
 384+ //grab from metavid and copy to local directory structure:
 385+ $i=$j= 0;
 386+ while ($row = $dbr->fetchObject($image_res)) {
 387+ $relative_time = $row->time - $stream->adj_start_time;
 388+ //status updates:
 389+ if ($i == 10) {
 390+ print "On image $j of $img_count time: " . seconds2ntp($relative_time) . "\n";
 391+ $i = 0;
 392+ }
 393+ $j++;
 394+ $i++;
 395+ //get streamImage obj:
 396+ $mv_stream_id = $MVStreams[$stream->name]->getStreamId();
 397+ $local_img_dir = MV_StreamImage :: getLocalImageDir($mv_stream_id);
 398+ $metavid_img_url = 'http://metavid.ucsc.edu/image_media/' . $row->id . '.jpg';
 399+
 400+ $local_img_file = $local_img_dir . '/' . $relative_time . '.jpg';
 401+ //check if the image already exist in the new table
 402+ $sql = "SELECT * FROM `$wgDBname`.`mv_stream_images` " .
 403+ "WHERE `stream_id`={$mv_stream_id} " .
 404+ "AND `time`=$relative_time";
 405+ $img_check = $dbr->query($sql);
 406+ $doInsert = true;
 407+ if ($dbr->numRows($img_check) != 0) {
 408+ //make sure its there:
 409+ if (is_file($local_img_file)) {
 410+ //print "skiped stream_id:" . $mv_stream_id . " time: " . $relative_time . "\n";
 411+ continue;
 412+ } else {
 413+ //grab but don't insert:
 414+ $doInsert = false;
 415+ }
 416+ }
 417+ if ($doInsert) {
 418+ //insert:
 419+ $dbw->insert('mv_stream_images', array (
 420+ 'stream_id' => $MVStreams[$stream->name]->getStreamId(), 'time' => $relative_time));
 421+ $img_id = $dbw->insertId();
 422+ //$grab = exec('cd ' . $img_path . '; wget ' . $im_url);
 423+ }
 424+
 425+ if (is_file($local_img_file)) {
 426+ echo "skipped $local_img_file \n";
 427+ continue;
 428+ }
 429+ if (!copy($metavid_img_url, $local_img_file)) {
 430+ echo "failed to copy $metavid_img_url to $local_img_file...\n";
 431+ } else {
 432+ //all good don't report anything'
 433+ }
 434+ }
 435+}
 436+
 437+function do_update_wiki_page($wgTitle, $wikiText, $ns = null, $forceUpdate=false) {
 438+ global $botUserName;
 439+ if (!is_object($wgTitle)) {
 440+ $wgTitle = Title :: makeTitle($ns, $wgTitle);
 441+ }
 442+ //make sure the text is utf8 encoded:
 443+ $wikiText = utf8_encode($wikiText);
 444+
 445+ $wgArticle = new Article($wgTitle);
 446+ if(!mvDoMvPage($wgTitle, $wgArticle, false)){
 447+ print "bad title: ".$wgTitle->getDBkey()." no edit";
 448+ if($wgTitle->exists()){
 449+ print "remove article";
 450+ $wgArticle->doDeleteArticle( 'bad title' );
 451+ }
 452+ //some how mvdIndex and mvd pages got out of sync do a seperate check for the mvd:
 453+ if(MV_Index::getMVDbyTitle($wgArticle->mTitle->getDBkey())!=null){
 454+ print ', rm mvd';
 455+ MV_Index::remove_by_wiki_title($wgArticle->mTitle->getDBkey());
 456+ }
 457+ print "\n";
 458+ return ;
 459+ }
 460+ if ($wgTitle->exists()) {
 461+ //if last edit!=mvBot skip (don't overwite peoples improvments')
 462+ $rev = & Revision::newFromTitle($wgTitle);
 463+ if( $botUserName!= $rev->getRawUserText()){
 464+ print ' skiped page edited by user:'.$rev->getRawUserText()."\n";
 465+ if(!$forceUpdate)return ;
 466+ }
 467+ //proc article:
 468+ $cur_text = $wgArticle->getContent();
 469+ //if its a redirect skip
 470+ if(substr($cur_text, 0, strlen('#REDIRECT') )=='#REDIRECT'){
 471+ print ' skiped page moved by user:'.$rev->getRawUserText()."\n";
 472+ if(!$forceUpdate)return ;
 473+ }
 474+ //check if text is identical:
 475+ if (trim($cur_text) == trim($wikiText)) {
 476+ if(!$forceUpdate)return ;
 477+ }
 478+ }
 479+ //got here do the edit:
 480+ $sum_txt = 'metavid bot insert';
 481+ $wgArticle->doEdit($wikiText, $sum_txt);
 482+ print "did edit on " . $wgTitle->getDBkey() . "\n";
 483+ //die;
 484+}
 485+//given a stream name it pulls all metavid stream data and builds semantic wiki page
 486+function mv_semantic_stream_desc(& $mvTitle, & $stream) {
 487+ global $start_time, $end_time;
 488+ /*$sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE '" . $mvTitle->getStreamName() . "'";
 489+ $dbr = wfGetDB(DB_SLAVE);
 490+ $res = $dbr->query($sql);
 491+ //echo "\n" . $sql . "\n";
 492+ $stream = $dbr->fetchObject($res);*/
 493+ $stream_id = $stream->id;
 494+ $out = '';
 495+ $pout = mv_proccess_attr('stream_attr_varchar', $stream_id);
 496+ $pout .= mv_proccess_attr('stream_attr_int', $stream_id);
 497+ //add links/generic text at the start
 498+ $out .= '==Official Record==' . "\n";
 499+ $date = date('Ymd', $start_time);
 500+ $cspan_date = date('Y-m-d', $start_time);
 501+ $ch_type = '';
 502+ if (strpos($mvTitle->getStreamName(), 'house') !== false)
 503+ $ch_type = 'h';
 504+ if (strpos($mvTitle->getStreamName(), 'senate') !== false)
 505+ $ch_type = 's';
 506+ if ($ch_type != '') {
 507+ $out .= '*[[GovTrack]] Congressional Record' .
 508+ '[http://www.govtrack.us/congress/recordindex.xpd?date=' . $date .
 509+ '&where=' . $ch_type .
 510+ ']' . "\n\n";
 511+ $out .= '*[[THOMAS]] Congressional Record ' .
 512+ '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .
 513+ ']' . "\n\n";
 514+ $out .= '*[[THOMAS]] Extension of Remarks ' .
 515+ '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .
 516+ ']' . "\n\n";
 517+ }
 518+ if ($stream->archive_org != '') {
 519+ $out .= '==More Media Sources=='."\n";
 520+ $out .= '*[[Archive.org]] hosted original copy ' .
 521+ '[http://www.archive.org/details/mv_' . $stream->name . ']' . "\n";
 522+ }
 523+ //all streams have congretional cronical:
 524+ $out .= '*[[CSPAN]]\'s Congressional Chronicle ' .
 525+ '[http://www.c-spanarchives.org/congress/?q=node/69850&date=' . $cspan_date . '&hors=' . $ch_type . ']';
 526+ $out .= "\n\n";
 527+ $out .= $pout;
 528+ $out .= '[[stream_duration:=' . ($end_time - $start_time) . '| ]]' . "\n";
 529+ if($stream->org_start_time){
 530+ $out .= '[[original_date:='.$stream->org_start_time.'| ]]';
 531+ }
 532+
 533+ //add stream category (based on sync status)
 534+ switch($stream->sync_status){
 535+ case 'not_checked':
 536+ $out.="\n\n".'[[Category:Stream Unchecked]]';
 537+ break;
 538+ case 'impossible':
 539+ $out.="\n\n".'[[Category:Stream Out of Sync]]';
 540+ break;
 541+ case 'in_sync':
 542+ $out.="\n\n".'[[Category:Stream Basic Sync]]';
 543+ //other options [stream high quality sync ];
 544+ break;
 545+ }
 546+
 547+ return $out;
 548+}
 549+function mv_proccess_attr($table, $stream_id) {
 550+ global $start_time, $end_time;
 551+ $dbr = wfGetDB(DB_SLAVE);
 552+ $sql = "SELECT * FROM `metavid`.`$table` WHERE `stream_fk`=$stream_id";
 553+ $res = $dbr->query($sql);
 554+ $out = '';
 555+ while ($var = $dbr->fetchObject($res)) {
 556+ $type_title = getTypeTitle($var->type);
 557+ if ($var->type == 'adj_start_time')
 558+ $start_time = $var->value;
 559+ if ($var->type == 'adj_end_time')
 560+ $end_time = $var->value;
 561+ if ($type_title != '') {
 562+ $reltype = ($type_title[0] == 'rel') ? '::' : ':=';
 563+ $out .= '[[' . $var->type . ':=' . $var->value . '| ]]' . "\n";
 564+ }
 565+ }
 566+ return $out;
 567+}
 568+function getTypeTitle($type) {
 569+ switch ($type) {
 570+ case 'cspan_type' :
 571+ return array (
 572+ 'rel',
 573+ 'Government Event'
 574+ );
 575+ break;
 576+ case 'cspan_title' :
 577+ return array (
 578+ 'atr',
 579+ 'C-SPAN Title'
 580+ );
 581+ break;
 582+ case 'cspan_desc' :
 583+ return array (
 584+ 'atr',
 585+ 'C-SPAN Description'
 586+ );
 587+ break;
 588+ case 'adj_start_time' :
 589+ return array (
 590+ 'atr',
 591+ 'Unix Start Time'
 592+ );
 593+ break;
 594+ case 'adj_end_time' :
 595+ return array (
 596+ 'atr',
 597+ 'Unix End Time'
 598+ );
 599+ break;
 600+ default :
 601+ return '';
 602+ break;
 603+ }
 604+}
 605+?>
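
The file above only defines the sync helpers; the metavid2mvWiki.php hunk below wires it in with a require_once. A minimal sketch of how the wrapper is expected to drive these functions, assuming the same commandLine.inc bootstrap and $args dispatch shown in that hunk (the do_people_insert() call is illustrative, not taken from this diff):

<?php
// Sketch only (not part of this revision): the include above supplies the
// helpers; the maintenance wrapper loads it and dispatches on $args.
$cur_path = $IP = dirname(__FILE__);
require_once('../../../maintenance/commandLine.inc');   // MediaWiki CLI bootstrap
require_once('metavid2mvWiki.inc.php');                 // functions added above

// e.g. re-sync a single stream by name, as in the wrapper's existing dispatch:
do_stream_insert('stream', $args[0]);
// or (illustrative) rebuild person pages and the "Congress Person" template:
do_people_insert();
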
Index: trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.php
@@ -17,6 +17,7 @@
1818 $cur_path = $IP = dirname(__FILE__);
1919 //include commandLine.inc from the mediaWiki maintance dir:
2020 require_once ('../../../maintenance/commandLine.inc');
 21+require_once ('metavid2mvWiki.inc.php');
2122 /*
2223 * assume the wiki user has access to the metavid table and that the
2324 * metavid table is titled `metavid`
@@ -236,597 +237,6 @@
237238 do_stream_insert('stream', $args[0]);
238239 break;
239240 }
240 -function upTempalte_Ht_en($force = false) {
241 - $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Ht_en');
242 - if (!$wgTemplateTitle->exists() || $force) {
243 - do_update_wiki_page($wgTemplateTitle, '<noinclude>
244 - This is the default Template for the display of transcript text.
245 - </noinclude><includeonly>{{ #if: {{{PersonName|}}} | {{ #ifexist: Image:{{{PersonName}}}.jpg | [[Image:{{{PersonName}}}.jpg|44px|left]]|[[Image:Missing person.jpg|44px|left]]}} |}}{{ #if:{{{PersonName|}}}|[[{{{PersonName}}}]]: |}}{{{BodyText}}}
246 - </includeonly>');
247 - }
248 -}
249 -function upTemplate_person($force = false) {
250 - global $valid_attributes;
251 - $wgTemplateTitle = Title :: makeTitle(NS_TEMPLATE, 'Congress Person');
252 - if (!$wgTemplateTitle->exists() || $force) {
253 - $wgTemplateArticle = new Article($wgTemplateTitle);
254 - $template_body = '<noinclude>Congress Person template simplifies
255 - the structure of articles about Congress People.
256 - <pre>{{Congress Person|' . "\n";
257 - foreach ($valid_attributes as $dbKey => $attr) {
258 - list ($name, $desc) = $attr;
259 - $template_body .= $name . '=' . $desc . "|\n";
260 - }
261 - $template_body .= '}}</pre>' .
262 - 'The order of the fields is not relevant. The template name (Congress Person) should be given as the \'\'first\'\' thing on a page.
263 - </noinclude>' .
264 - '<includeonly>' . "\n";
265 - //include the image if present:
266 - $template_body .= '{{ #if: { Image:{{PAGENAME}}.jpg}| [[Image:{{PAGENAME}}.jpg]] |}}' . "\n";
267 - foreach ($valid_attributes as $dbKey => $attr) {
268 - list ($name, $desc) = $attr;
269 - //raw semantic data (@@todo make pretty template table thing)
270 - $template_body .= "{{ #if: {{{" . $name . "}}}| [[$name:={{{" . $name . "}}}| ]] |}} \n";
271 - }
272 - $template_body .= '[[Category:Congress Person]] [[Category:Person]]
273 - </includeonly>';
274 - echo "updated 'Congress Person' template\n";
275 - do_update_wiki_page($wgTemplateTitle, $template_body);
276 - }
277 -}
278 -function do_people_insert() {
279 - global $valid_attributes, $states_ary;
280 - $dbr = wfGetDB(DB_SLAVE);
281241
282 - //check person
283 - upTemplate_person();
284 - //do people query:
285 - $res = $dbr->query("SELECT * FROM `metavid`.`people`");
286 - if ($dbr->numRows($res) == 0)
287 - die('could not find people: ' . "\n");
288 - $person_ary = array ();
289 - while ($person = $dbr->fetchObject($res)) {
290 - $person_ary[] = $person;
291 - }
292 - foreach ($person_ary as $person) {
293 - $person_title = Title :: newFromUrl($person->name_clean);
294 - //semantic data via template:
295 - $page_body = '{{Congress Person|' . "\n";
296 - foreach ($valid_attributes as $dbKey => $attr) {
297 - list ($name, $desc) = $attr;
298 - if (trim($person-> $dbKey) != '') {
299 - if ($dbKey == 'state')
300 - $person->state = $states_ary[$person->state];
301 - $page_body .= "|{$name}={$person->$dbKey}| \n";
302 - }
303 - }
304 - //add in the full name attribute:
305 - $page_body .= "|Full Name=" . $person->title . ' ' . $person->first .
306 - ' ' . $person->middle . ' ' . $person->last . "| \n";
307 - $page_body .= '}}';
308 - //add in basic info to be overwitten by tranclude (from
309 - $full_name = $person->title . ' ' . $person->first .
310 - ' ' . $person->middle . ' ' . $person->last;
311 - if (trim($full_name) == '')
312 - $full_name = $person->name_clean;
313 -
314 - $page_body .= "\n" .'Basic Person page For <b>' . $full_name . "</b><br>\n".
315 - "Text Spoken By [[Special:MediaSearch/person/{$person->name_clean}|$full_name]] ";
316 - ;
317 - do_update_wiki_page($person_title, $page_body);
318 - }
319 - foreach ($person_ary as $person) {
320 - //download/upload all the photos:
321 - $imgTitle = Title :: makeTitle(NS_IMAGE, $person->name_clean . '.jpg');
322 - //if(!$imgTitle->exists()){
323 - global $wgTmpDirectory;
324 - $url = 'http://www.opensecrets.org/politicians/img/pix/' . $person->osid . '.jpg';
325 - //print $wgTmpDirectory . "\n";
326 - $local_file = tempnam($wgTmpDirectory, 'WEBUPLOAD');
327 - //copy file:
328 -
329 - # Check if already there existence
330 - $image = wfLocalFile($imgTitle);
331 - if ($image->exists()) {
332 - echo ($imgTitle->getDBkey() . " already in the wiki\n");
333 - continue;
334 - }
335 -
336 - for ($ct = 0; $ct < 10; $ct++) {
337 - if (!@ copy($url, $local_file)) {
338 - print ("failed to copy $url to local_file (tring again) \n");
339 - } else {
340 - print "copy success\n";
341 - $ct = 10;
342 - }
343 - if ($ct == 9)
344 - print 'complete failure' . "\n";
345 - }
346 -
347 - # Stash the file
348 - echo ("Saving " . $imgTitle->getDBkey() . "...");
349 - $image = wfLocalFile($imgTitle);
350 -
351 - $archive = $image->publish($local_file);
352 - if (WikiError :: isError($archive)) {
353 - echo ("failed.\n");
354 - continue;
355 - }
356 - echo ("importing...");
357 - $comment = 'Image file for [[' . $person->name_clean . ']]';
358 - $license = '';
359 -
360 - if ($image->recordUpload($archive, $comment, $license)) {
361 - # We're done!
362 - echo ("done.\n");
363 - } else {
364 - echo ("failed.\n");
365 - }
366 - //}
367 - }
368 -}
369 -//$i=0;
370 -function do_stream_attr_check($old_stream) {
371 - global $i;
372 - $mvStream = & mvGetMVStream(array (
373 - 'name' => $old_stream->name
374 - ));
375 - //print "doding stream attr check: ";
376 - //print_r($old_stream);
377 -
378 - if ($mvStream->date_start_time != $old_stream->adj_start_time) {
379 - $mvStream->date_start_time = $old_stream->adj_start_time;
380 - }
381 - if ($mvStream->duration != ($old_stream->adj_end_time - $old_stream->adj_start_time)) {
382 - $mvStream->duration = ($old_stream->adj_end_time - $old_stream->adj_start_time);
383 - }
384 - $mvStream->updateStreamDB();
385 - print "\nran stream db update: " .$mvStream->duration . ' ' . $mvStream->date_start_time."\n";
386 - //if($i==3)die;
387 - //$i++;
388 -}
389 -function do_stream_file_check(& $old_stream) {
390 - global $mvgIP;
391 - $mvStream = & mvGetMVStream(array (
392 - 'name' => $old_stream->name
393 - ));
394 - $file_list = $mvStream->getFileList();
395 -
396 - if ($old_stream->trascoded != 'none') {
397 - //print "transcode is: " . $old_stream->trascoded;
398 - if ($old_stream->trascoded == 'low')
399 - $set = array (
400 - 'mv_ogg_low_quality'
401 - );
402 - if ($old_stream->trascoded == 'high')
403 - $set = array (
404 - 'mv_ogg_high_quality'
405 - );
406 - if ($old_stream->trascoded == 'all')
407 - $set = array (
408 - 'mv_ogg_high_quality',
409 - 'mv_ogg_low_quality'
410 - );
411 - //print "set: " . print_r($set);
412 - //remove old file pointers:
413 - $dbw = wfGetDB(DB_WRITE);
414 - $sql = "DELETE FROM `mv_stream_files` WHERE `stream_id`=".$mvStream->id;
415 - $dbw->query($sql);
416 - //update files:
417 - foreach ($set as $qf) {
418 - do_insert_stream_file($mvStream, $old_stream, $qf);
419 - }
420 - }
421 - //check for archive.org stuff too..
422 - /*if($old_stream->archive_org!=''){
423 - $found=false;
424 - foreach($file_list as $file){
425 - if($file->path_type =='ext_archive_org'){
426 - $found=true;
427 - }
428 - }
429 - if(!$found)do_insert_stream_file($mvStream, $old_stream, 'mv_archive_org_link');
430 - }*/
431 -}
432 -function do_insert_stream_file($mvStream, $old_stream, $quality_msg) {
433 - global $mvVideoArchivePaths;
434 - $dbw = wfGetDB(DB_WRITE);
435 - if ($quality_msg == 'mv_ogg_low_quality') {
436 - $path = $mvVideoArchivePaths[$old_stream->archive_server] . $mvStream->name. '.ogg';
437 - } else if ($quality_msg == 'mv_ogg_high_quality') {
438 - $path = $mvVideoArchivePaths[$old_stream->archive_server] .$mvStream->name.'.HQ.ogg';
439 - }else{
440 - return '';
441 - }
442 - //get file duration from nfo file (if avaliable ):
443 - $nfo_url = $path . '.nfo';
444 - $nfo_txt = file($nfo_url);
445 - if($nfo_txt){
446 - if( isset($nfo_txt[0])){
447 - list($na, $len) = explode('n:', $nfo_txt[0]);
448 - $len = trim($len);
449 - //trim leading zero
450 - if($len[0]=='0')$len=substr($len,1);
451 - //trim sub frame times:
452 - if(strpos($len, '.')!==false){
453 - $len = substr($len, 0, strpos($len, '.'));
454 - }
455 - $dur=ntp2seconds($len);
456 - }else{
457 - echo "empty nfo file: $nfo_url \n";
458 - $dur=0;
459 - }
460 - }else{
461 - echo "missing nfo file: $nfo_url \n";
462 - $dur=0;
463 - }
464 -
465 - $sql = "INSERT INTO `mv_stream_files` (`stream_id`, `file_desc_msg`, `path`, `duration`)" .
466 - " VALUES ('{$mvStream->id}', '{$quality_msg}', " ." '{$path}', {$dur} )";
467 - $dbw->query($sql);
468 -}
469 -//@@todo convert to MV_EditStream
470 -function do_add_stream(& $mvTitle, & $stream) {
471 - $MV_SpecialAddStream = new MV_SpecialCRUDStream('add');
472 - $MV_SpecialAddStream->stream_name = $mvTitle->getStreamName();
473 - $MV_SpecialAddStream->stream_type = 'metavid_file';
474 - $MV_SpecialAddStream->stream_desc = mv_semantic_stream_desc($mvTitle, $stream);
475 - //add the stream:
476 - $MV_SpecialAddStream->add_stream();
477 -}
478 -function do_stream_insert($mode, $stream_name = '') {
479 - global $mvgIP, $MVStreams, $options;
480 - $dbr = wfGetDB(DB_SLAVE);
481 - if ($mode == 'all'){
482 - $sql = "SELECT * FROM `metavid`.`streams` WHERE `sync_status`='in_sync'";
483 - }else if($mode=='files') {
484 - $sql = "SELECT * FROM `metavid`.`streams` WHERE `trascoded` != 'none'";
485 - }else{
486 - $sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE '{$stream_name}'";
487 - }
488 - $res = $dbr->query($sql);
489 - if ($dbr->numRows($res) == 0)
490 - die('could not find stream: ' . $stream_name . "\n");
491 - //load all stream names:
492 - while ($row = $dbr->fetchObject($res)) {
493 - $streams[] = $row;
494 - }
495 - print "working on " . count($streams) . ' streams'."\n";
496 - foreach ($streams as $stream) {
497 - //init the stream
498 - $MVStreams[$stream->name] = new MV_Stream($stream);
499 - //check if the stream has already been added to the wiki (if not add it)
500 - $mvTitle = new MV_Title('MvStream:' . $stream->name);
501 - if (!$mvTitle->doesStreamExist()) {
502 - //print 'do stream desc'."\n";
503 - do_add_stream($mvTitle, $stream);
504 - echo "stream " . $mvTitle->getStreamName() . " added \n";
505 - } else {
506 - do_update_wiki_page($stream->name, mv_semantic_stream_desc($mvTitle, $stream), MV_NS_STREAM);
507 - //$updated = ' updated' echo "stream " . $mvTitle->getStreamName() . " already present $updated\n";
508 - }
509 - //add duration and start_time attr
510 - do_stream_attr_check($stream);
511 -
512 - //do insert/copy all media images
513 - if(!isset($options['noimage'])){
514 - do_proccess_images($stream);
515 - }
516 -
517 - //check for files (make sure they match with metavid db values
518 - do_stream_file_check($stream);
519 -
520 - if(!isset($options['skiptext'])){
521 - //proccess all stream text:
522 - do_proccess_text($stream);
523 - }
524 - }
525 -}
526 -function do_proccess_text($stream){
527 - $dbr = wfGetDB(DB_SLAVE);
528 - /* for now use the stream search table (in the future should put in our orphaned person data)
529 - * should be able to do quick checks against the index. */
530 - $sql = "SELECT (`time`+" . CC_OFFSET . ") as time, `value` " .
531 - "FROM `metavid`.`stream_attr_time_text`
532 - WHERE `stream_fk`=" . $stream->id . "
533 - AND `time` >= " . $stream->adj_start_time . "
534 - AND `time` <= " . $stream->adj_end_time . "
535 - ORDER BY `time` ASC ";
536 -
537 - //$sql = "SELECT * FROM `metavid`.`stream_search` WHERE `stream_fk`={$stream->id}";
538 - $page_res = $dbr->query($sql);
539 - if ($dbr->numRows($page_res) == 0)
540 - echo 'No pages for stream' . $stream->name . "\n";
541 - $pages = array ();
542 - while ($page = $dbr->fetchObject($page_res)) {
543 - $pages[] = $page;
544 - }
545 - print "Checking ".count($pages) . " text pages\n";
546 - $i=$j=0;
547 - foreach ($pages as $inx => $page) {
548 - //status updates:
549 - if($i==50){
550 - print "on $j of ". count($pages) . "\n";
551 - $i=0;
552 - }
553 - $i++;
554 - $j++;
555 - $start_time = $page->time - $stream->adj_start_time;
556 - if (seconds2ntp($start_time) < 0)
557 - $start_time = '0:00:00';
558 - if (($inx +1) == count($pages)) {
559 - $end_time = $stream->adj_end_time - $stream->adj_start_time;
560 - } else {
561 - $end_time = $pages[$inx +1]->time - $stream->adj_start_time;
562 - }
563 - if (($end_time - $start_time) > 40)
564 - $end_time = $start_time +40;
565 - //skip if end_time <1
566 - if ($end_time < 0)
567 - continue;
568 - //now pull up the person for the given stream time:`metavid`.`people`.`name_clean`
569 - $sql = "SELECT * , abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} ) AS `distance` " .
570 - "FROM `metavid`.`people_attr_stream_time` " .
571 - "LEFT JOIN `metavid`.`people` ON `metavid`.`people_attr_stream_time`.`people_fk` = `metavid`.`people`.`id` " .
572 - "WHERE `metavid`.`people_attr_stream_time`.`stream_fk` ={$stream->id} " .
573 - //have a negative threshold of 4 seconds
574 - "AND (`metavid`.`people_attr_stream_time`.`time`-{$page->time})>-4 " .
575 - //have a total distance threshold of 30 seconds
576 - "AND abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} )< 90 " .
577 - "ORDER BY `distance` ASC " .
578 - "LIMIT 1 ";
579 - $person_res = $dbr->query($sql);
580 -
581 - $page_title = $stream->name . '/' . seconds2ntp($start_time) . '/' . seconds2ntp($end_time);
582 - //print $page_title . "\n";
583 - $page_body = '';
584 - if ($dbr->numRows($person_res) != 0) {
585 - $person = $dbr->fetchObject($person_res);
586 - $person_name = utf8_encode($person->name_clean);
587 - $page_body .= "\n[[Spoken By::{$person_name}]] ";
588 - }
589 - $page_body .= trim(str_replace("\n", ' ', strtolower($page->value)));
590 -
591 - //print $page_title . "\n";
592 - //die;
593 - //print $page_body . "\n\n";
594 - do_update_wiki_page('Ht_en:' . $page_title, $page_body, MV_NS_MVD);
595 - }
596 -}
597 -/*
598 - * for each image add it to the image directory
599 - */
600 -function do_proccess_images($stream) {
601 - global $mvLocalImgLoc, $MVStreams, $wgDBname;
602 - $dbr = wfGetDB(DB_SLAVE);
603 - $dbw = wfGetDB(DB_MASTER);
604 -
605 - //get all images for the current stream:
606 - $sql = "SELECT * FROM `metavid`.`image_archive`
607 - WHERE `stream_fk`= {$stream->id}";
608 - $image_res = $dbr->query($sql);
609 - $img_count = $dbr->numRows($image_res);
610 - print "Found " . $img_count . " images for stream " . $stream->name . "\n";
611 - //grab from metavid and copy to local directory structure:
612 - $i=$j= 0;
613 - while ($row = $dbr->fetchObject($image_res)) {
614 - $relative_time = $row->time - $stream->adj_start_time;
615 - //status updates:
616 - if ($i == 10) {
617 - print "On image $j of $img_count time: " . seconds2ntp($relative_time) . "\n";
618 - $i = 0;
619 - }
620 - $j++;
621 - $i++;
622 - //get streamImage obj:
623 - $mv_stream_id = $MVStreams[$stream->name]->getStreamId();
624 - $local_img_dir = MV_StreamImage :: getLocalImageDir($mv_stream_id);
625 - $metavid_img_url = 'http://metavid.ucsc.edu/image_media/' . $row->id . '.jpg';
626 -
627 - $local_img_file = $local_img_dir . '/' . $relative_time . '.jpg';
628 - //check if the image already exist in the new table
629 - $sql = "SELECT * FROM `$wgDBname`.`mv_stream_images` " .
630 - "WHERE `stream_id`={$mv_stream_id} " .
631 - "AND `time`=$relative_time";
632 - $img_check = $dbr->query($sql);
633 - $doInsert = true;
634 - if ($dbr->numRows($img_check) != 0) {
635 - //make sure its there:
636 - if (is_file($local_img_file)) {
637 - //print "skiped stream_id:" . $mv_stream_id . " time: " . $relative_time . "\n";
638 - continue;
639 - } else {
640 - //grab but don't insert:
641 - $doInsert = false;
642 - }
643 - }
644 - if ($doInsert) {
645 - //insert:
646 - $dbw->insert('mv_stream_images', array (
647 - 'stream_id' => $MVStreams[$stream->name]->getStreamId(), 'time' => $relative_time));
648 - $img_id = $dbw->insertId();
649 - //$grab = exec('cd ' . $img_path . '; wget ' . $im_url);
650 - }
651 -
652 - if (is_file($local_img_file)) {
653 - echo "skipped $local_img_file \n";
654 - continue;
655 - }
656 - if (!copy($metavid_img_url, $local_img_file)) {
657 - echo "failed to copy $metavid_img_url to $local_img_file...\n";
658 - } else {
659 - //all good don't report anything'
660 - }
661 - }
662 -}
663 -
664 -function do_update_wiki_page($wgTitle, $wikiText, $ns = null, $forceUpdate=false) {
665 - global $botUserName;
666 - if (!is_object($wgTitle)) {
667 - $wgTitle = Title :: makeTitle($ns, $wgTitle);
668 - }
669 - //make sure the text is utf8 encoded:
670 - $wikiText = utf8_encode($wikiText);
671 -
672 - $wgArticle = new Article($wgTitle);
673 - if(!mvDoMvPage($wgTitle, $wgArticle, false)){
674 - print "bad title: ".$wgTitle->getDBkey()." no edit";
675 - if($wgTitle->exists()){
676 - print "remove article";
677 - $wgArticle->doDeleteArticle( 'bad title' );
678 - }
679 - //some how mvdIndex and mvd pages got out of sync do a seperate check for the mvd:
680 - if(MV_Index::getMVDbyTitle($wgArticle->mTitle->getDBkey())!=null){
681 - print ', rm mvd';
682 - MV_Index::remove_by_wiki_title($wgArticle->mTitle->getDBkey());
683 - }
684 - print "\n";
685 - return ;
686 - }
687 - if ($wgTitle->exists()) {
688 - //if last edit!=mvBot skip (don't overwite peoples improvments')
689 - $rev = & Revision::newFromTitle($wgTitle);
690 - if( $botUserName!= $rev->getRawUserText()){
691 - print ' skiped page edited by user:'.$rev->getRawUserText()."\n";
692 - if(!$forceUpdate)return ;
693 - }
694 - //proc article:
695 - $cur_text = $wgArticle->getContent();
696 - //if its a redirect skip
697 - if(substr($cur_text, 0, strlen('#REDIRECT') )=='#REDIRECT'){
698 - print ' skiped page moved by user:'.$rev->getRawUserText()."\n";
699 - if(!$forceUpdate)return ;
700 - }
701 - //check if text is identical:
702 - if (trim($cur_text) == trim($wikiText)) {
703 - if(!$forceUpdate)return ;
704 - }
705 - }
706 - //got here do the edit:
707 - $sum_txt = 'metavid bot insert';
708 - $wgArticle->doEdit($wikiText, $sum_txt);
709 - print "did edit on " . $wgTitle->getDBkey() . "\n";
710 - //die;
711 -}
712 -//given a stream name it pulls all metavid stream data and builds semantic wiki page
713 -function mv_semantic_stream_desc(& $mvTitle, & $stream) {
714 - global $start_time, $end_time;
715 - /*$sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE '" . $mvTitle->getStreamName() . "'";
716 - $dbr = wfGetDB(DB_SLAVE);
717 - $res = $dbr->query($sql);
718 - //echo "\n" . $sql . "\n";
719 - $stream = $dbr->fetchObject($res);*/
720 - $stream_id = $stream->id;
721 - $out = '';
722 - $pout = mv_proccess_attr('stream_attr_varchar', $stream_id);
723 - $pout .= mv_proccess_attr('stream_attr_int', $stream_id);
724 - //add links/generic text at the start
725 - $out .= '==Official Record==' . "\n";
726 - $date = date('Ymd', $start_time);
727 - $cspan_date = date('Y-m-d', $start_time);
728 - $ch_type = '';
729 - if (strpos($mvTitle->getStreamName(), 'house') !== false)
730 - $ch_type = 'h';
731 - if (strpos($mvTitle->getStreamName(), 'senate') !== false)
732 - $ch_type = 's';
733 - if ($ch_type != '') {
734 - $out .= '*[[GovTrack]] Congressional Record' .
735 - '[http://www.govtrack.us/congress/recordindex.xpd?date=' . $date .
736 - '&where=' . $ch_type .
737 - ']' . "\n\n";
738 - $out .= '*[[THOMAS]] Congressional Record ' .
739 - '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .
740 - ']' . "\n\n";
741 - $out .= '*[[THOMAS]] Extension of Remarks ' .
742 - '[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')' .
743 - ']' . "\n\n";
744 - }
745 - if ($stream->archive_org != '') {
746 - $out .= '==More Media Sources=='."\n";
747 - $out .= '*[[Archive.org]] hosted original copy ' .
748 - '[http://www.archive.org/details/mv_' . $stream->name . ']' . "\n";
749 - }
750 - //all streams have congretional cronical:
751 - $out .= '*[[CSPAN]]\'s Congressional Chronicle ' .
752 - '[http://www.c-spanarchives.org/congress/?q=node/69850&date=' . $cspan_date . '&hors=' . $ch_type . ']';
753 - $out .= "\n\n";
754 - $out .= $pout;
755 - $out .= '[[stream_duration:=' . ($end_time - $start_time) . '| ]]' . "\n";
756 - if($stream->org_start_time){
757 - $out .= '[[original_date:='.$stream->org_start_time.'| ]]';
758 - }
759 -
760 - //add stream category (based on sync status)
761 - switch($stream->sync_status){
762 - case 'not_checked':
763 - $out.="\n\n".'[[Category:Stream Unchecked]]';
764 - break;
765 - case 'impossible':
766 - $out.="\n\n".'[[Category:Stream Out of Sync]]';
767 - break;
768 - case 'in_sync':
769 - $out.="\n\n".'[[Category:Stream Basic Sync]]';
770 - //other options [stream high quality sync ];
771 - break;
772 - }
773 -
774 - return $out;
775 -}
776 -function mv_proccess_attr($table, $stream_id) {
777 - global $start_time, $end_time;
778 - $dbr = wfGetDB(DB_SLAVE);
779 - $sql = "SELECT * FROM `metavid`.`$table` WHERE `stream_fk`=$stream_id";
780 - $res = $dbr->query($sql);
781 - $out = '';
782 - while ($var = $dbr->fetchObject($res)) {
783 - $type_title = getTypeTitle($var->type);
784 - if ($var->type == 'adj_start_time')
785 - $start_time = $var->value;
786 - if ($var->type == 'adj_end_time')
787 - $end_time = $var->value;
788 - if ($type_title != '') {
789 - $reltype = ($type_title[0] == 'rel') ? '::' : ':=';
790 - $out .= '[[' . $var->type . ':=' . $var->value . '| ]]' . "\n";
791 - }
792 - }
793 - return $out;
794 -}
795 -function getTypeTitle($type) {
796 - switch ($type) {
797 - case 'cspan_type' :
798 - return array (
799 - 'rel',
800 - 'Government Event'
801 - );
802 - break;
803 - case 'cspan_title' :
804 - return array (
805 - 'atr',
806 - 'C-SPAN Title'
807 - );
808 - break;
809 - case 'cspan_desc' :
810 - return array (
811 - 'atr',
812 - 'C-SPAN Description'
813 - );
814 - break;
815 - case 'adj_start_time' :
816 - return array (
817 - 'atr',
818 - 'Unix Start Time'
819 - );
820 - break;
821 - case 'adj_end_time' :
822 - return array (
823 - 'atr',
824 - 'Unix End Time'
825 - );
826 - break;
827 - default :
828 - return '';
829 - break;
830 - }
831 -}
832242 ?>
833243
Index: trunk/extensions/MetavidWiki/maintenance/ogg_thumb_insert.sh
@@ -1,15 +1,77 @@
22 #!/bin/bash
33
4 -streamid=${1};
5 -filename=${2};
6 -interval=${3};
 4+###########################################################################
 5+#
 6+# DESCRIPTION
 7+#
 8+# This script can extract jpg frames from your ogg files at a specified
 9+# interval. It can also insert this information into your mvWiki
 10+# database.
 11+#
 12+# USAGE
 13+#
 14+# ./ogg_thumb_insert.sh stream_id filename interval
 15+#
 16+# EXAMPLE
 17+#
 18+# ./ogg_thumb_insert.sh 17 /var/www/localhost/htdocs/media/stream.ogg 20
 19+#
 20+# The previous example should extract frames every 20 seconds into the
 21+# file named stream.ogg. It will place them in the appropriate stream
 22+# directory which by default is '../stream_images/7/17/'. It should also
 23+# insert information about the frame into the 'mv_stream_images' table.
 24+#
 25+###########################################################################
 26+#
 27+# This script relies on a number of programs being in your path, and is
 28+# intended to be executed from the 'maintenance' directory.
 29+#
 30+# Requirements:
 31+#
 32+# ffmpeg
 33+# mysql
 34+# imagemagick
 35+# ogginfo
 36+# grep
 37+# sed
 38+# awk
 39+# gawk
 40+# echo
 41+# wc
 42+# bc
 43+# seq
 44+# mkdir
 45+#
 46+###########################################################################
 47+#
 48+# Use at your own risk. There is very little error checking.
 49+#
 50+###########################################################################
 51+# This quick hack brought to you by Seth McClain smcclain@opengov.org
 52+###########################################################################
753
 54+
 55+## REMOVE THE FOLLOWING TWO LINES BEFORE EXECUTING ##
 56+echo "Please be sure to edit this file and change some variables before executing it";
 57+exit
 58+## REMOVE THE PREVIOUS TWO LINES BEFORE EXECUTING ##
 59+
 60+
 61+## The following variables need to be set to allow the script access to your
 62+## MySQL database
 63+
864 table="mv_stream_images";
965 db="mvwiki";
1066 user="user";
1167 pw="password";
1268 hostname="localhost";
1369
 70+## Do not edit below this line
 71+
 72+streamid=${1};
 73+filename=${2};
 74+interval=${3};
 75+
1476 chars=`echo -n ${streamid} | wc -c`;
1577 dots=`for i in \`seq 1 ${chars}\`; do echo -n .; done | sed -e s/^.//`
1678 dir=`echo ${streamid} | sed -e s/^${dots}//`
@@ -25,7 +87,7 @@
2688
2789 for i in `seq 1 ${interval} ${duration}`
2890 do
29 - #echo "insert into ${table}(stream_id, time) values(${streamid}, ${i});" | mysql -u ${user} --password=${pw} ${db}
 91+ echo "insert into ${table}(stream_id, time) values(${streamid}, ${i});" | mysql -u ${user} --password=${pw} ${db}
3092 ffmpeg -ss ${i} -i ${filename} -vcodec mjpeg -vframes 1 -an -f rawvideo -s 320x240 -y ${filedir}/${i}_320x240.jpg
3193 done
3294
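
The database half of each loop iteration above matches the mv_stream_images insert that do_proccess_images() performs in metavid2mvWiki.inc.php. A rough PHP sketch of the same per-frame record, using the stream id and interval from the usage example (placeholder values; the script itself derives the duration from the ogg file):

<?php
// Illustrative sketch: one mv_stream_images row per extracted frame,
// mirroring the shell loop's "insert into mv_stream_images(stream_id, time)".
$stream_id = 17;      // stream id from the usage example above
$interval  = 20;      // seconds between extracted frames
$duration  = 3600;    // placeholder; the script reads the real length from the ogg
$dbw = wfGetDB(DB_MASTER);
for ($t = 1; $t <= $duration; $t += $interval) {
    $dbw->insert('mv_stream_images', array(
        'stream_id' => $stream_id,
        'time'      => $t   // offset (in seconds) into the stream
    ));
}
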
Index: trunk/extensions/MetavidWiki/maintenance/scrape_and_insert.php
@@ -38,6 +38,7 @@
3939 switch($args[0]){
4040 case 'cspan_chronicle':
4141 $MV_CspanScraper = new MV_CspanScraper();
 42+ $MV_CspanScraper->doScrapeInsert();
4243 break;
4344 }
4445 }
@@ -96,15 +97,52 @@
9798 $href='';
9899 $href_match=array();
99100 preg_match('/href="(.*)"/',$matches[5][$k], $href_match);
100 - if(count($href_match)!=0)$href=$href_match[1];
 101+ if(count($href_match)!=0)$href=$href_match[1];
 102+
 103+ $porg = str_replace('<br>',' ',$matches[4][$k]);
 104+ $porg = preg_replace('/[D|R]+\-\[.*\]/', '', $porg);
 105+ $pparts = explode(',',$porg);
 106+ $pname = trim($pparts[1]) . '_' . trim($pparts[0]);
101107 $person_time_ary[]= array(
102108 'start_time'=>strip_tags($matches[1][$k]),
103109 'length'=>$matches[3][$k],
104110 'person_title'=>str_replace('<br>',' ',$matches[4][$k]),
 111+ 'spoken_by'=>$pname,
105112 'href'=>$href
106113 );
107 - }
108 - print_r($person_time_ary);
 114+ }
 115+ //group people in page matches
 116+ $g_person_time_ary=array();
 117+ $prev_person=null;
 118+ foreach($person_time_ary as $ptag){
 119+ $g_person_time_ary[$ptag['spoken_by']][]=$ptag;
 120+ }
 121+
 122+ //retrive rows to find match:
 123+ $dbr =& wfGetDB(DB_SLAVE);
 124+ $mvd_res = MV_Index::getMVDInRange($stream->id, null, null, $mvd_type='ht_en',false,$smw_properties=array('Spoken_by'), '');
 125+ $g_row_matches=array();
 126+ //group peole in db matches:
 127+ while ($row = $dbr->fetchObject($mvd_res)) {
 128+ if(!isset($row->Spoken_by))continue;
 129+ if(!isset($g_row_matches[strtolower($row->Spoken_by)])){
 130+ $g_row_matches[strtolower($row->Spoken_by)]=get_object_vars($row);
 131+ $g_row_matches[strtolower($row->Spoken_by)]['end_time_sec']=$row->end_time;
 132+ }else{
 133+ $g_row_matches[strtolower($row->Spoken_by)]['end_time_sec']+=$row->end_time;
 134+ }
 135+ $cspan_person = next($g_person_time_ary);
 136+ }
 137+ //add in sync offset data for $g_person_time_ary
 138+ reset($g_person_time_ary);
 139+ foreach($g_row_matches as $rp=>$rperson){
 140+
 141+ }
 142+ //find match person1->person2->person3
 143+
 144+
 145+ //average switch time to get offset of stream
 146+ //use offset to insert all $person_time_array data
109147 }
110148 }
111149 }
@@ -132,8 +170,7 @@
133171 $page = file_get_contents($url);
134172 if($page===false){
135173 echo("error retriving $url retrying...\n");
136 - sleep(5);
137 - //@@todo: this may eventually overflow the stack:
 174+ sleep(5);
138175 return $this->doRequest($url);
139176 }
140177 if($page!=''){
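
The 'spoken_by' value added above is built by stripping the party/state suffix from the chronicle's person column and flipping "Last, First" into "First_Last". A small worked example of that normalization, assuming an input of the form "Last, First D-[ST]" (the exact source format is an assumption here):

<?php
// Worked example of the normalization added in the hunk above (input assumed).
$raw   = 'Smith, John D-[CA]';                         // hypothetical chronicle entry
$porg  = str_replace('<br>', ' ', $raw);               // same cleanup as the scraper
$porg  = preg_replace('/[D|R]+\-\[.*\]/', '', $porg);  // strip party/state suffix
$parts = explode(',', $porg);
$pname = trim($parts[1]) . '_' . trim($parts[0]);      // "John_Smith"
echo $pname . "\n";
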
Index: trunk/extensions/MetavidWiki/includes/MV_Index.php
@@ -102,17 +102,36 @@
103103 }
104104 /*
105105 * getMVDInRange returns the mvd titles that are in the given range
 106+ * param list got kind of crazy long... @@todo refactor int a request object or something cleaner
106107 */
107 - function getMVDInRange($stream_id, $start_time=null, $end_time=null, $mvd_type='all',$getText=false){
 108+ function getMVDInRange($stream_id, $start_time=null, $end_time=null, $mvd_type='all',$getText=false,$smw_properties=array(), $limit='LIMIT 0, 200'){
108109 global $mvIndexTableName, $mvDefaultClipLength;
109110 $dbr =& wfGetDB(DB_SLAVE);
110111
111 - $sql = "SELECT `mv_page_id` as `id`, `mvd_type`, `wiki_title`, `stream_id`, `start_time`, `end_time` " .
112 - "FROM {$dbr->tableName($mvIndexTableName)} " .
113 - "WHERE `stream_id`={$stream_id} ";
 112+ $sql_sel = "SELECT `mv_page_id` as `id`, `mvd_type`, `wiki_title`, `stream_id`, `start_time`, `end_time` ";
 113+ $sql_from=" FROM {$dbr->tableName($mvIndexTableName)} ";
 114+ if(count($smw_properties)!=0){
 115+ foreach($smw_properties as $prop_name){
 116+ $sql_sel.=", `$prop_name`.`object_title` as `$prop_name`";
 117+ $sql_from.="LEFT JOIN `smw_relations` as `$prop_name` ON (`mv_mvd_index`.`mv_page_id`=`$prop_name`.`subject_id` " .
 118+ "AND `$prop_name`.`relation_title`='$prop_name') ";
 119+ }
 120+ }
 121+ $sql = $sql_sel . $sql_from;
 122+ $sql.= "WHERE `stream_id`={$stream_id} ";
114123 if($mvd_type!='all'){
115124 //check if mvd_type is array:
116 - $sql.="AND `mvd_type`='{$mvd_type}' ";
 125+ if(is_array($mvd_type)){
 126+ $sql.='AND (';
 127+ $or='';
 128+ foreach($mvd_type as $mtype){
 129+ $sql.=$or."`mvd_type'='{$mtype}' ";
 130+ $or='OR ';
 131+ }
 132+ $sql.=')';
 133+ }else{
 134+ $sql.="AND `mvd_type`='{$mvd_type}' ";
 135+ }
117136 }
118137 //get any data that covers this rage:
119138 if($end_time)$sql.=" AND `start_time` <= " . $end_time;
@@ -120,7 +139,7 @@
121140 //add in ordering
122141 $sql.=' ORDER BY `start_time` ASC ';
123142 //add in limit of 200 for now
124 - $sql.=' LIMIT 0, 200';
 143+ $sql.=$limit;
125144 //echo $sql;
126145 $result =& $dbr->query( $sql, 'MV_Index:time_index_query');
127146 return $result;
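
With the extended signature, a caller can join SMW relation values into the same query and override the default limit; the getMVDInRange() call added to scrape_and_insert.php above does exactly that. A usage sketch based on that call:

<?php
// Usage sketch for the extended getMVDInRange() signature (per the hunk above).
$dbr = wfGetDB(DB_SLAVE);
$mvd_res = MV_Index::getMVDInRange(
    $stream->id,            // stream to query
    null, null,             // no start_time / end_time restriction
    'ht_en',                // only ht_en transcript MVDs
    false,                  // $getText: don't fetch page text
    array('Spoken_by'),     // LEFT JOIN these smw_relations as extra columns
    ''                      // empty string drops the default 'LIMIT 0, 200'
);
while ($row = $dbr->fetchObject($mvd_res)) {
    // $row->Spoken_by is set when the relation exists for that MVD page
}
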
