Index: trunk/extensions/MetavidWiki/maintenance/scrape_and_insert.inc.php |
— | — | @@ -608,26 +608,34 @@ |
609 | 609 | } |
610 | 610 | } |
611 | 611 | function proccessBill( $govTrackBillId, $bill_key, $openCongBillId = false, $mapLightBillId = false, $forceUpdate = false ) { |
612 | | - // get the bill title & its sponser / cosponsers: |
| 612 | + // get the bill title & its sponsor / co-sponsors: |
613 | 613 | $rawGovTrackPage = $this->doRequest( $this->govTrack_bill_url . $govTrackBillId ); |
614 | 614 | |
615 | 615 | /***************************** |
616 | | - * Proccess Bill GovTrack info |
| 616 | + * Process Bill GovTrack info |
617 | 617 | *****************************/ |
618 | 618 | print "gov_track id: " . $govTrackBillId . " from: " . $this->govTrack_bill_url . $govTrackBillId . "\n"; |
619 | 619 | |
620 | | - // get title: |
621 | | - $patern = '/property="dc:title" datatype="xsd:string" style="margin-bottom: 1em">([^<]*)<\/div>(<p style="margin-top: 1.75em; margin-bottom: 1.75em">([^<]*))?/'; |
622 | | - preg_match( $patern, $rawGovTrackPage, $title_match ); |
| 620 | + // get title: |
| 621 | + preg_match( '/<title>(.*)<\/title>/', $rawGovTrackPage, $title_match ); |
623 | 622 | if ( isset( $title_match[1] ) ) { |
| 623 | + //strip govtrack.us |
| 624 | + $title_match[1] = str_replace( '(GovTrack.us)', '', $title_match[1]); |
624 | 625 | if ( trim( $title_match[1] ) == '' ) { |
625 | 626 | print "empty title\n"; |
626 | 627 | return false; |
627 | 628 | } |
628 | 629 | $title_short = str_replace( array( '_', '...', ' [110th]', ' [109th]', ' [108th]', ' [107th]' ), array( ' ', '', '', '', '', '' ), $title_match[1] ); |
| 630 | + |
629 | 631 | $this->cur_bill_short_title = $title_short; |
630 | 632 | // set the desc if present: |
631 | | - $title_desc = ( isset( $title_match[3] ) ) ? $title_match[3]:''; |
| 633 | + preg_match( '/<meta name="description" content="([^">]*)"/', $rawGovTrackPage, $desc_match ); |
| 634 | + if(isset($desc_match[1])){ |
| 635 | + $title_desc = $desc_match[1]; |
| 636 | + }else{ |
| 637 | + die('could not find title desc: ' . $title_desc); |
| 638 | + } |
| 639 | + |
632 | 640 | $this->bill_titles[$bill_key] = $title_short; |
633 | 641 | } else { |
634 | 642 | print $this->govTrack_bill_url . $govTrackBillId . "\n" . $patern . "\n" . $rawGovTrackPage; |
— | — | @@ -635,8 +643,8 @@ |
636 | 644 | } |
637 | 645 | |
638 | 646 | // print "raw govtrack:\n $rawGovTrackPage"; |
639 | | - // get the $thomas_match |
640 | | - preg_match( '/thomas\.loc\.gov\/cgi-bin\/bdquery\/z\?(.*):/', $rawGovTrackPage, $thomas_match ); |
| 647 | + // get the $thomas_match |
| 648 | + preg_match( '/thomas\.loc\.gov\/cgi-bin\/bdquery\/z\?([^\"]*)/', $rawGovTrackPage, $thomas_match ); |
641 | 649 | // get introduced: //strange .* does not seem to work :( |
642 | 650 | preg_match( '/Introduced<\/nobr><\/td><td style="padding-left: 1em; font-size: 75%; color: #333333"><nobr>([^<]*)/m', $rawGovTrackPage, $date_intro_match ); |
643 | 651 | // print_r($date_intro_match); |
— | — | @@ -685,19 +693,26 @@ |
686 | 694 | $this->procMapLightInterest( $interest ); |
687 | 695 | $bp .= 'Supporting Interest ' . $i . '=' . $interest['name'] . "|\n"; |
688 | 696 | $i++; |
| 697 | + //process interest |
| 698 | + $this->procMapLightInterest( $interest ); |
689 | 699 | } |
690 | 700 | $i = 1; |
691 | 701 | foreach ( $bill_interest['oppose'] as $interest ) { |
692 | 702 | $bp .= 'Opposing Interest ' . $i . '=' . $interest['name'] . "|\n"; |
693 | 703 | $i++; |
| 704 | + //process interest |
| 705 | + $this->procMapLightInterest( $interest ); |
694 | 706 | } |
695 | 707 | } |
696 | 708 | $bp .= "}}\n"; |
| 709 | + |
697 | 710 | // print 'page : '.$title_short.' ' . $bp . "\n"; |
698 | 711 | // incorporated into the template: |
699 | 712 | // $body.="\n\n".'Source: [[Data Source Name:=GovTrack]] [[Data Source URL:='.$this->govTrack_bill_url . $govTrackBillId.']]'; |
700 | 713 | // set up the base bill page: |
701 | 714 | $wgBillTitle = Title::newFromText( $title_short ); |
| 715 | + //print $bp; |
| 716 | + //die; |
702 | 717 | do_update_wiki_page( $wgBillTitle, $bp ); |
703 | 718 | |
704 | 719 | // set up a redirect for the bill key, and a link for the category page: |
— | — | @@ -721,29 +736,29 @@ |
722 | 737 | // print "map info: $this->mapLightBillInfo \n"; |
723 | 738 | print str_replace( '$1', $mapLightBillId, $this->mapLightBillInfo ) . "\n\n"; |
724 | 739 | $ret_ary = array( 'support' => array(), 'oppose' => array() ); |
725 | | - $bill_page = $this->doRequest( str_replace( '$1', $mapLightBillId, $this->mapLightBillInfo ) ); |
| 740 | + $bill_url = str_replace( '$1', $mapLightBillId, $this->mapLightBillInfo ); |
| 741 | + $bill_page = $this->doRequest( $bill_url); |
726 | 742 | // $bill_page = $this->doRequest('http://maplight.org/map/us/bill/10831/default'); |
727 | 743 | // print $bill_page; |
728 | 744 | // ([^<]*)<\/a>)* |
729 | 745 | // a href="\/map\/us\/interest\/([^"]*) class="interest" |
730 | | - |
731 | | - $pat_interest = '/<li><a\shref="\/map\/us\/interest\/([^"]*)".*>([^<]*)<\/a> .*<\/li>/U'; |
732 | 746 | // class="organizations"\sid="for |
733 | 747 | // preg_match_all('/class="organizations"\sid="for.*<ul class="industries list-clear">()*/',$bill_page, $matches); |
734 | | - preg_match_all( $pat_interest, $bill_page, $matches, PREG_OFFSET_CAPTURE ); |
735 | | - // print_r($matches); |
736 | | - $aginst_pos = strpos( $bill_page, 'class="organizations" id="against"' ); |
| 748 | + print "\n". $bill_url."\n"; |
| 749 | + preg_match_all( '/href\=\"\/map\/us\/interest\/([^"]*)[^>]*>([^<]*)/', $bill_page, $matches, PREG_OFFSET_CAPTURE ); |
| 750 | + |
| 751 | + $aginst_pos = strpos( $bill_page, 'id="against"' ); |
737 | 752 | // return empty arrays if we don't have info to give back:' |
738 | 753 | if ( $aginst_pos === false )return $ret_ary; |
739 | 754 | if ( !isset( $matches[1] ) )return $ret_ary; |
740 | 755 | |
741 | 756 | foreach ( $matches[1] as $inx => $intrest ) { |
742 | 757 | if ( $intrest[1] < $aginst_pos ) { |
743 | | - $ret_ary['support'][] = array( 'key' => $intrest[0], 'name' => $matches[2][$inx][0] ); |
| 758 | + $ret_ary['support'][] = array( 'key' => $intrest[0], 'name' => htmlspecialchars_decode( $matches[2][$inx][0]) ); |
744 | 759 | } else { |
745 | | - $ret_ary['oppose'][] = array( 'key' => $intrest[0], 'name' => $matches[2][$inx][0] ); |
| 760 | + $ret_ary['oppose'][] = array( 'key' => $intrest[0], 'name' => htmlspecialchars_decode( $matches[2][$inx][0] ) ); |
746 | 761 | } |
747 | | - } |
| 762 | + } |
748 | 763 | return $ret_ary; |
749 | 764 | } |
750 | 765 | function get_bill_name_from_mapLight_id( $mapBillId, $doLookup = true ) { |
— | — | @@ -760,7 +775,7 @@ |
761 | 776 | } |
762 | 777 | if ( !isset( $this->mapLight_bill_cache[$mapBillId] ) ) { |
763 | 778 | if ( $doLookup ) { |
764 | | - print "missing bill by mapId: $mapBillId retrive it: \n"; |
| 779 | + print "missing bill by mapId: $mapBillId retrieve it: \n"; |
765 | 780 | $raw_bill_page = $this->doRequest( 'http://www.maplight.org/map/us/bill/' . $mapBillId . '/default' ); |
766 | 781 | preg_match( '/title">([^-]*)-/', $raw_bill_page, $matches ); |
767 | 782 | if ( isset( $matches[1] ) )$bill_key = trim( $matches[1] ); |
Index: trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.php |
— | — | @@ -94,6 +94,7 @@ |
95 | 95 | 'interest' will insert interests (uses people as base so run people first) |
96 | 96 | 'update_templates' will update templates & some semantic properties |
97 | 97 | 'file_check' checks inserted streams file urls/pointers |
| 98 | + 'do_stream_date_check' |
98 | 99 | |
99 | 100 | EOT; |
100 | 101 | exit (); |
— | — | @@ -133,6 +134,9 @@ |
134 | 135 | case 'file_check': |
135 | 136 | do_stream_file_check(); |
136 | 137 | break; |
| 138 | + case 'do_stream_date_check': |
| 139 | + do_stream_date_check(); |
| 140 | + break; |
137 | 141 | case 'update_templates' : |
138 | 142 | $force = ( isset( $options['force'] ) ) ? true:false; |
139 | 143 | include_once( 'metavid_gov_templates.php' ); |
— | — | @@ -147,5 +151,5 @@ |
148 | 152 | break; |
149 | 153 | default : |
150 | 154 | do_stream_insert( 'stream', $args[0] ); |
151 | | - break; |
| 155 | + break; |
152 | 156 | } |
Index: trunk/extensions/MetavidWiki/maintenance/metavid2mvWiki.inc.php |
— | — | @@ -34,6 +34,34 @@ |
35 | 35 | // if($i==3)die; |
36 | 36 | // $i++; |
37 | 37 | } |
/**
 * Check that each stream's stored start time agrees with the date embedded
 * in its name, and rewrite `date_start_time` where it does not.
 *
 * The last underscore-separated segment of the stream name is read as
 * "MM-DD-YY" (two-digit year, interpreted as 20YY). Streams whose name does
 * not end in such a segment, or whose segment is not a real calendar date,
 * are skipped silently. The corrected timestamp is 09:00:00 local time on
 * the date taken from the name.
 *
 * Prints one status line per checked stream; terminates the script when the
 * `mv_streams` table yields no rows. At most 9000 streams are examined.
 */
function do_stream_date_check() {
	$dbr = wfGetDB( DB_READ );
	$result = $dbr->select(
		'mv_streams',
		'*',
		'',
		__METHOD__,
		array( 'LIMIT' => 9000 )
	);
	if ( $dbr->numRows( $result ) == 0 ) {
		die( "do_stream_date_check: no streams found" );
	}

	// The write connection is only requested once a row actually needs fixing.
	$dbw = null;

	while ( $stream = $dbr->fetchObject( $result ) ) {
		// explode() replaces the removed split(); both delimiters are literal.
		$name_parts = explode( '_', $stream->name );
		$sd = explode( '-', $name_parts[ count( $name_parts ) - 1 ] );
		if ( count( $sd ) != 3 ) {
			continue;
		}

		// Reject segments such as "a-b-c" or "13-45-09" before they reach mktime().
		if ( !ctype_digit( $sd[0] ) || !ctype_digit( $sd[1] )
			|| !ctype_digit( $sd[2] ) || strlen( $sd[2] ) != 2
		) {
			continue;
		}
		$month = intval( $sd[0] );
		$day = intval( $sd[1] );
		$year = intval( '20' . $sd[2] );
		if ( !checkdate( $month, $day, $year ) ) {
			continue;
		}

		$sdate = mktime( 9, 0, 0, $month, $day, $year );

		// Compare the full calendar date; the month must take part in the
		// comparison or a wrong-month timestamp would be reported as ok.
		$stored_day = date( 'Y-m-d', (int)$stream->date_start_time );
		if ( $stored_day != date( 'Y-m-d', $sdate ) ) {
			if ( $dbw === null ) {
				$dbw = wfGetDB( DB_WRITE );
			}
			// Database::update() applies the same table prefix as select()
			// and quotes the values.
			// NOTE(review): assumes `id` uniquely identifies a stream row, so
			// the former "LIMIT 1" is not needed - confirm against the schema.
			$dbw->update(
				'mv_streams',
				array( 'date_start_time' => $sdate ),
				array( 'id' => $stream->id ),
				__METHOD__
			);
			print "$stream->name date updated\n";
		} else {
			print "$stream->name date is ok\n";
		}
	}
}
38 | 66 | function do_stream_file_check( $old_stream=false ) { |
39 | 67 | global $mvgIP, $mvVideoArchivePaths; |
40 | 68 | $stream_set = Array(); |
— | — | @@ -835,9 +863,13 @@ |
836 | 864 | $i++; |
837 | 865 | } |
838 | 866 | } |
839 | | - $raw_results = $mvScrape->doRequest( 'http://maplight.org/map/us/interest/' . $intrestKey . '/bills' ); |
| 867 | + $intrest_bills_url = 'http://maplight.org/map/us/interest/' . $intrestKey . '/bills'; |
| 868 | + $raw_results = $mvScrape->doRequest( $intrest_bills_url ); |
840 | 869 | // get all bills supported or opposed |
841 | 870 | preg_match_all( '/\/map\/us\/bill\/([^"]*)".*\/map\/us\/legislator.*<td>([^<]*)</U', $raw_results, $matches ); |
| 871 | + print $intrest_bills_url . "\n"; |
| 872 | + print_r($matches); |
| 873 | + die; |
842 | 874 | $sinx = $oinx = 1; |
843 | 875 | if ( isset( $matches[1][0] ) ) { |
844 | 876 | $support_count = $oppse_count = 0; |
— | — | @@ -857,7 +889,8 @@ |
858 | 890 | } |
859 | 891 | } |
860 | 892 | $page_body .= '}}'; |
861 | | - print "Interest Page: $intrestName\n"; |
| 893 | + print "Interest Page: $intrestName \n\n $page_body\n"; |
| 894 | + die; |
862 | 895 | $wTitle = Title::makeTitle( NS_MAIN, $intrestName ); |
863 | 896 | do_update_wiki_page( $wTitle, $page_body ); |
864 | 897 | } |