r109331 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r109330‎ | r109331 | r109332 >
Date:07:08, 18 January 2012
Author:khorn
Status:reverted (Comments)
Tags:
Comment:
Renaming an ever-more-badly named file.
Modified paths:
  • /var/www/trunk/extensions/CongressLookup/db_manipulator.php/zip_file_parser.php (added) (history)

Diff [purge]

Index: var/www/trunk/extensions/CongressLookup/db_manipulator.php/zip_file_parser.php
@@ -0,0 +1,556 @@
 2+<?php
 3+die( "Dying for safety, because nobody should actually use this.\nIf you disagree, you'll need to uncomment line 2.\n" );
 4+//just don't commit it uncommented. >:[
 5+
 6+$IP = getenv( 'MW_INSTALL_PATH' );
 7+if ( $IP === false ) {
 8+ $IP = dirname( _FILE_ ) . '/../../..';
 9+}
 10+
 11+//If you get errors on this next line, set (and export) your MW_INSTALL_PATH var.
 12+require_once( "$IP/maintenance/Maintenance.php" );
 13+
 14+class ZipFileParser extends Maintenance {
 15+
 16+ protected $rep_lookup_cache = array();
 17+
 18+ function execute(){
 19+ $skip_up_to = 0;
 20+ if ( !empty( $_SERVER['argv'][1] ) && is_numeric( $_SERVER['argv'][1] ) ){
 21+ $skip_up_to = $_SERVER['argv'][1];
 22+ }
 23+ $function = 'fillOutNulls';
 24+ if ( !empty( $_SERVER['argv'][1] ) && !is_numeric( $_SERVER['argv'][1] ) ){
 25+ $function = $_SERVER['argv'][1];
 26+ }
 27+
 28+ switch ( $function ){
 29+ case 'associate':
 30+ //it should go zip, state, district for the next three params.
 31+
 32+ if ( !empty( $_SERVER['argv'][2] ) ){
 33+ $zip = $_SERVER['argv'][2];
 34+ } else {
 35+ die( "Missing parameter 2: 5-digit zip code." );
 36+ }
 37+ if ( !empty( $_SERVER['argv'][3] ) ){
 38+ $state = $_SERVER['argv'][3];
 39+ } else {
 40+ die( "Missing parameter 3: state code." );
 41+ }
 42+ if ( !empty( $_SERVER['argv'][4] ) ){
 43+ $district = $_SERVER['argv'][4];
 44+ } else {
 45+ die( "Missing parameter 4: district code." );
 46+ }
 47+
 48+ $this->associate_zip5_reps( $zip, $state, $district );
 49+
 50+ break;
 51+ case 'disassociate':
 52+ //it should go zip, state, district for the next three params.
 53+
 54+ if ( !empty( $_SERVER['argv'][2] ) ){
 55+ $zip = $_SERVER['argv'][2];
 56+ } else {
 57+ die( "Missing parameter 2: 5-digit zip code." );
 58+ }
 59+ if ( !empty( $_SERVER['argv'][3] ) ){
 60+ $state = $_SERVER['argv'][3];
 61+ } else {
 62+ die( "Missing parameter 3: state code." );
 63+ }
 64+ if ( !empty( $_SERVER['argv'][4] ) ){
 65+ $district = $_SERVER['argv'][4];
 66+ } else {
 67+ die( "Missing parameter 4: district code." );
 68+ }
 69+
 70+ $this->disassociate_zip5_reps( $zip, $state, $district );
 71+
 72+ break;
 73+
 74+ case 'writeSQLFile':
 75+ if ( !empty( $_SERVER['argv'][2] ) ){
 76+ $table = $_SERVER['argv'][2];
 77+ } else {
 78+ die( "Missing parameter 2: Table name" );
 79+ }
 80+
 81+ $this->write_happy_table_dump( $table );
 82+ break;
 83+
 84+ case 'fillOutNulls':
 85+ $this->fill_out_nulls( $skip_up_to );
 86+ break;
 87+ case 'findSplits':
 88+ default:
 89+ $this->find_split_zipcodes( $skip_up_to );
 90+ break;
 91+ }
 92+
 93+ echo "Done\n";
 94+ }
 95+
 96+ function fill_out_nulls( $skip_up_to ){
 97+ //get the "missing" zipcodes
 98+ $missing_zips = $this->getMissingZipcodes();
 99+
 100+ //load the file
 101+ //I know this is a terrible way to do this, but basically, I'm going to.
 102+ //Leaving this here because my laptop failed to explode when I ran
 103+ //the thing, and that's where I'm doing all the data gymnastics
 104+ //anyway. Results have been combined into the .sql files in the data
 105+ //directory, so nobody else should ever have to do this.
 106+ $zipfile = file( 'zipcode_map.txt', FILE_SKIP_EMPTY_LINES );
 107+ $zipmap = array();
 108+ foreach ( $zipfile as $line=>$val ){
 109+ $val = explode(' ', $val);
 110+ $zipgroup = $val[0];
 111+ $val_2 = explode('-', $val[1]);
 112+
 113+ $state = $val_2[0];
 114+ $district = $val_2[1];
 115+
 116+ $zipmap[$zipgroup] = $this->get_rep_ids($state, $district);
 117+ //echo "Zipgroup = $zipgroup, State = $state, District = $district";
 118+ }
 119+
 120+ //so, now we have an array of missing zips, and a map of zips to their representative.
 121+ //how to mash them together?
 122+
 123+ ksort( $zipmap, SORT_STRING ); //this actually makes sense.
 124+ ksort( $missing_zips, SORT_NUMERIC );
 125+
 126+ $unset_counter = 0;
 127+ foreach ( $zipmap as $zipgroup => $whatever ){
 128+ $sortme = array( $zipgroup, $skip_up_to );
 129+ sort($sortme, SORT_STRING);
 130+ if ( $sortme[0] === $zipgroup ){
 131+ if (strpos( (string)$skip_up_to, (string)$zipgroup ) === 0){
 132+ continue;
 133+ } else {
 134+ unset( $zipmap[$zipgroup] );
 135+ ++$unset_counter;
 136+ }
 137+ } else {
 138+ break;
 139+ }
 140+ }
 141+ echo "Unset $unset_counter entries in the file lookup. Whee!\n";
 142+
 143+ foreach ( $missing_zips as $zip => $whatever ){
 144+ $found_zip = array();
 145+ if ( $zip < $skip_up_to ){
 146+ echo "Skipping $zip\n";
 147+ continue;
 148+ }
 149+ $start = microtime( true );
 150+ $zip = $this->make_zip_string($zip);
 151+
 152+ $counter = 0;
 153+ $foundflag = false;
 154+ $last_zipgroup = '';
 155+ foreach ( $zipmap as $zipgroup => $rep_array ){
 156+ if (!$foundflag){
 157+ if ( strpos( (string)$zip, (string)$zipgroup ) === 0 ){
 158+ echo "Found zip $zip in $zipgroup.\n";
 159+ $found_zip[$zipgroup] = $rep_array;
 160+ if ( $zipgroup != $last_zipgroup && $last_zipgroup != '' ){
 161+ unset( $zipmap[$last_zipgroup] ); //speeeeed...
 162+ }
 163+ $foundflag = true;
 164+ }
 165+ }
 166+ }
 167+
 168+ //things we have to deal with:
 169+ //split zipcodes.
 170+ if (!empty($found_zip)){
 171+ if ( count( $found_zip ) > 1 ){
 172+ echo "Split zipcode! Not dealing with this right now. $zip \n";
 173+ } elseif ( count( $found_zip ) == 0 ){
 174+ echo "Nothing there at all. Weird... $zip \n";
 175+ } else {
 176+ //drill down: We know there's only one.
 177+ foreach( $found_zip as $reps ){
 178+ if ( is_array($reps) && count( $reps ) > 0 ){
 179+ foreach ( $reps as $rep_id ){
 180+ echo "$zip Rep ID: $rep_id\n";
 181+ $this->update_rep( $zip, $rep_id );
 182+ }
 183+ } else {
 184+ echo "We got no reps in a really odd way.\n";
 185+ //TODO: Not this, like, 1000 times.
 186+ }
 187+ }
 188+ }
 189+ } else {
 190+ echo "Nothing found for $zip\n";
 191+ }
 192+ $now = microtime( true );
 193+ $duration = $now - $start;
 194+ echo "Time: " . $duration . "\n";
 195+ }
 196+
 197+ }
 198+
 199+
 200+ function find_split_zipcodes( $skip_up_to ){
 201+ //get the "missing" zipcodes
 202+ //$missing_zips = $this->getMissingZipcodes();
 203+
 204+ //load the file
 205+ //Still a terrible way to do this.
 206+ $zipfile = file( 'zipcode_map.txt', FILE_SKIP_EMPTY_LINES );
 207+ $zipmap = array();
 208+ $zip5_reps = array();
 209+ foreach ( $zipfile as $line=>$val ){
 210+ $val = explode(' ', $val);
 211+ $zipgroup = $val[0];
 212+ $extended = '';
 213+ $exploded_zipgroup = explode( '-', $zipgroup );
 214+ $original_zipgroup = $zipgroup;
 215+
 216+ if ( count( $exploded_zipgroup ) > 1 ){
 217+ $zipgroup = $exploded_zipgroup[0];
 218+ $extended = $exploded_zipgroup[1];
 219+ }
 220+
 221+ $val_2 = explode('-', $val[1]);
 222+
 223+ $state = $val_2[0];
 224+ $district = trim($val_2[1]);
 225+
 226+ $zipmap[$zipgroup][$state . '-' . $district][] = $original_zipgroup;
 227+
 228+ //echo "Zipgroup = $zipgroup, State = $state, District = $district";
 229+ }
 230+ //echo print_r( $zipmap, true );
 231+ //die();
 232+
 233+ $splits = array();
 234+ foreach ( $zipmap as $group => $districts ){
 235+ if (count($districts) > 1){
 236+ $splits[$group] = count($districts);
 237+ $this->insert_reps_by_district($group, $districts);
 238+ } else {
 239+ unset( $zipmap[$group] );
 240+ }
 241+ }
 242+
 243+ echo print_r( $splits, true );
 244+ echo count( $splits ) . " split zipcodes found, out of... I don't know. 40 thousand or something.\n";
 245+
 246+ }
 247+
 248+ function associate_zip5_reps( $zip, $state, $district ){
 249+ //first, retrieve the state and district rep.
 250+ $reps = $this->get_rep_ids( $state, $district );
 251+ if ( count( $reps ) !== 1 ){
 252+ if ( empty($reps) ){
 253+ die("No rep found in $state $district");
 254+ } else {
 255+ die("Something very funky happened just then...");
 256+ }
 257+ }
 258+
 259+ $rep = $reps[0];
 260+ $this->insert_rep( $zip, $reps[0] );
 261+
 262+ //if it's not already in there, it should put it in there.
 263+ //if it is already in there, do nothing.
 264+ }
 265+
 266+ function disassociate_zip5_reps( $zip, $state, $district ){
 267+ //first, retrieve the state and district rep.
 268+ $reps = $this->get_rep_ids( $state, $district );
 269+ if ( count( $reps ) !== 1 ){
 270+ if ( empty($reps) ){
 271+ die("No rep found in $state $district");
 272+ } else {
 273+ die("Something very funky happened just then...");
 274+ }
 275+ }
 276+
 277+ $rep = $reps[0];
 278+ $this->remove_rep( $zip, $reps[0] );
 279+
 280+ //if it's not already in there, it should put it in there.
 281+ //if it is already in there, do nothing.
 282+ }
 283+
 284+
 285+ function update_rep( $zip, $rep_id ){
 286+ $dbr = wfGetDB( DB_SLAVE );
 287+ $dbr->update( 'cl_zip5',
 288+ array(
 289+ 'clz5_rep_id' => $rep_id,
 290+ ),
 291+ array(
 292+ 'clz5_zip' => $zip,
 293+ )
 294+ );
 295+ echo "Updated $zip to include rep id $rep_id\n";
 296+ }
 297+
 298+
 299+ function remove_rep( $zip, $rep_id ){
 300+ $dbr = wfGetDB( DB_SLAVE );
 301+
 302+ $dbr->delete( 'cl_zip5',
 303+ array(
 304+ 'clz5_rep_id' => $rep_id,
 305+ 'clz5_zip' => $zip,
 306+ )
 307+ );
 308+ echo "Removed $rep_id from $zip\n";
 309+ }
 310+
 311+ function insert_rep( $zip, $rep_id ){
 312+ $dbr = wfGetDB( DB_SLAVE );
 313+
 314+ $rowCheck = $dbr->selectRow( 'cl_zip5',
 315+ array(
 316+ 'clz5_rep_id',
 317+ 'clz5_zip',
 318+ ),
 319+ array(
 320+ 'clz5_rep_id' => $rep_id,
 321+ 'clz5_zip' => $zip,
 322+ )
 323+ );
 324+
 325+ if ( $rowCheck ){
 326+ echo "Rep id $rep_id already found in $zip.";
 327+ } else {
 328+ $dbr->insert( 'cl_zip5',
 329+ array(
 330+ 'clz5_rep_id' => $rep_id,
 331+ 'clz5_zip' => $zip,
 332+ )
 333+ );
 334+ echo "Updated $zip to include rep id $rep_id\n";
 335+ }
 336+ }
 337+
 338+ function insert_reps_by_district( $zip, $districts ){
 339+ $dbr = wfGetDB( DB_SLAVE );
 340+ echo "Zip: $zip\n";
 341+ echo "Districts: " . print_r( $districts, true );
 342+
 343+ $rep_ids = array();
 344+ foreach ( $districts as $district => $z9 ){
 345+ $district = explode('-', $district);
 346+ $state = $district[0];
 347+ $district = $district[1];
 348+ $rep_ids[] = $this->get_rep_ids( $state, $district );
 349+ }
 350+ echo print_r( $rep_ids, true );
 351+
 352+ //delete everything with the current zipcode.
 353+ $dbr->delete( 'cl_zip5', array( 'clz5_zip' => $zip ) );
 354+
 355+ foreach ( $rep_ids as $rep_id ){
 356+ echo " Count of Rep IDs: " . count( $rep_id ) . "\n";
 357+ if ( count( $rep_id ) === 1 ){
 358+ $rep_id = $rep_id[0];
 359+ $dbr->insert( 'cl_zip5',
 360+ array(
 361+ 'clz5_zip' => $zip,
 362+ 'clz5_rep_id' => $rep_id,
 363+ )
 364+ );
 365+ } else {
 366+// echo " Count of Rep IDs was NOT one for $zip.\n";
 367+ }
 368+ echo "Updated $zip to include rep id $rep_id\n";
 369+ }
 370+
 371+ }
 372+
 373+ function make_zip_string( $zip ){
 374+ $zip = (string)$zip;
 375+ if ( strlen( $zip ) < 5 ){
 376+ $zeroes = 5 - strlen( $zip );
 377+ for ( $i = 0; $i < $zeroes; ++$i ){
 378+ $zip = '0' . $zip;
 379+ }
 380+ }
 381+ return $zip;
 382+ }
 383+
 384+
 385+ function getMissingZipcodes(){
 386+ $dbr = wfGetDB( DB_SLAVE );
 387+
 388+ //for the record, I'd much rather just do this directly...
 389+// $query = "select floor(z5.sz5_zip/100) as zgroup, z5.sz5_zip, z5.sz5_rep_id, z3.sz3_state from cl_zip5 z5
 390+// LEFT JOIN cl_zip3 z3 on (floor(z5.sz5_zip/100)) = z3.sz3_zip
 391+// WHERE z5.sz5_rep_id IS NULL ORDER BY z5.sz5_zip ASC";
 392+ $state_map = array();
 393+ $results = $dbr->select(
 394+ 'cl_zip3',
 395+ array(
 396+ 'clz3_zip',
 397+ 'clz3_state'
 398+ )
 399+ );
 400+ if ( $results ){
 401+ foreach ( $results as $row ){
 402+ $state_map[ $row->clz3_zip ] = $row->clz3_state;
 403+ }
 404+ }
 405+
 406+ $null_zips = array();
 407+ $results = $dbr->select(
 408+ 'cl_zip5',
 409+ array(
 410+ 'clz5_zip',
 411+ ),
 412+ array( 'clz5_rep_id' => NULL )
 413+ );
 414+ if ( $results ){
 415+ foreach ( $results as $row ){
 416+ $zip = $row->clz5_zip;
 417+ $zipgroup = (int)floor($row->clz5_zip/100);
 418+ if ( $zipgroup && array_key_exists( $zipgroup, $state_map) ){
 419+ $null_zips[$zip] = array( 'state' => $state_map[ $zipgroup ] );
 420+ } else {
 421+ $null_zips[$zip] = array( 'state' => NULL );
 422+ }
 423+ }
 424+ }
 425+
 426+ echo count($null_zips) . "\n";
 427+ return $null_zips;
 428+ }
 429+
 430+ function get_rep_ids( $state, $district ){
 431+ $district = (int)$district;
 432+ if ( array_key_exists( $state, $this->rep_lookup_cache) && array_key_exists( $district, $this->rep_lookup_cache[$state]) ){
 433+ return $this->rep_lookup_cache[$state][$district];
 434+ } else {
 435+ $dbr = wfGetDB( DB_SLAVE );
 436+
 437+ $results = $dbr->select(
 438+ 'cl_house',
 439+ array(
 440+ 'clh_id',
 441+ ),
 442+ array(
 443+ 'clh_state' => $state,
 444+ 'clh_district' => $district
 445+ )
 446+ );
 447+ if ( $results ){
 448+ foreach ( $results as $row ){
 449+ $rep_id = $row->clh_id;
 450+ $this->rep_lookup_cache[$state][$district][] = $rep_id;
 451+ }
 452+ }
 453+ if ( array_key_exists( $state, $this->rep_lookup_cache) && array_key_exists( $district, $this->rep_lookup_cache[$state])){
 454+ if ( count($this->rep_lookup_cache[$state][$district]) > 1 ){
 455+ echo "How on Earth did we find more than one for $state $district?\n" . print_r($this->rep_lookup_cache[$state][$district], true);
 456+ }
 457+ return $this->rep_lookup_cache[$state][$district];
 458+ } else {
 459+ echo "Didn't get anything for $state, $district.\n";
 460+ $this->rep_lookup_cache[$state][$district] = NULL;
 461+ return NULL;
 462+ }
 463+ }
 464+ }
 465+
 466+ function write_happy_table_dump( $table ){
 467+ $dbr = wfGetDB( DB_SLAVE );
 468+
 469+ if ( !(strpos( $table, 'cl_' ) === 0) ){
 470+ die("Table $table is not recognized by CongressLookup");
 471+ }
 472+
 473+ if (!$dbr->tableExists( $table ) ) {
 474+ die("Table $table does not exist.");
 475+ }
 476+
 477+ $ignore_me = array(
 478+ 'clz5_id',
 479+ 'clz3_id',
 480+ );
 481+
 482+
 483+ $batch_size = 140;
 484+ $batch_start_line = null;
 485+ $batch_end_line = ";\n";
 486+ $current_count = 0;
 487+
 488+
 489+ //we're running locally, so just go for it.
 490+ $results = $dbr->select( $table, '*' );
 491+ if ( $results ){
 492+ $file = fopen("../data/$table.new.sql", 'w');
 493+ if ( !$file ){
 494+ die("No dice.");
 495+ }
 496+ foreach ( $results as $row ){
 497+
 498+ $row = get_object_vars($row);
 499+ foreach ( $ignore_me as $key ){
 500+ unset( $row[$key] );
 501+ }
 502+
 503+ if ( $batch_start_line === null ){
 504+ $batch_start_line = 'REPLACE INTO /*$wgDBprefix*/' . $table . " ( ";
 505+ $keys = array();
 506+ foreach ( $row as $key => $data ){
 507+ $keys[] = '`' . $key . '`';
 508+ }
 509+ $batch_start_line .= implode( ', ', $keys );
 510+ $batch_start_line .= ") VALUES\n";
 511+ }
 512+ if ( $current_count === 0 ){
 513+ //echo "Writing $batch_start_line";
 514+ fwrite($file, $batch_start_line);
 515+ }
 516+ $line_data = '( 501, 400031 ), ' . "\n";
 517+ foreach ( $row as $key=>$data ){
 518+ if ( !is_numeric($data) ){
 519+ $row[$key] = "'$data'";
 520+ if ( $row[$key] === "''" ){
 521+ $row[$key] = 'NULL';
 522+ }
 523+ }
 524+ }
 525+
 526+ $line_data = '( ' . implode( ', ', $row ) . ' )';
 527+
 528+ ++$current_count;
 529+
 530+ if ( $current_count === $batch_size ){
 531+ $line_data .= $batch_end_line;
 532+ $current_count = 0;
 533+ } else {
 534+ $line_data .= ", \n";
 535+ }
 536+
 537+ //echo "Writing $line_data";
 538+ fwrite($file, $line_data);
 539+ }
 540+ fwrite($file, ';');
 541+
 542+ fclose($file);
 543+
 544+ } else {
 545+ die("Nothing to export.");
 546+ }
 547+
 548+
 549+
 550+ }
 551+
 552+}
 553+
 554+$maintClass = "ZipFileParser";
 555+require_once( "$IP/maintenance/doMaintenance.php" );
 556+
 557+
Property changes on: var/www/trunk/extensions/CongressLookup/db_manipulator.php/zip_file_parser.php
___________________________________________________________________
Added: svn:eol-style
1558 + native

Follow-up revisions

RevisionCommit summaryAuthorDate
r109349Revert r109330 and r109331.siebrand08:33, 18 January 2012

Comments

#Comment by Khorn (WMF) (talk | contribs)   07:15, 18 January 2012

Uhm... I'd revert this incredibly weird thing if I could find it locally. :/

#Comment by Siebrand (talk | contribs)   08:31, 18 January 2012

Let me have a go...

#Comment by Siebrand (talk | contribs)   08:34, 18 January 2012

Reverted using TortoiseSVN's repo browser.

#Comment by Khorn (WMF) (talk | contribs)   18:08, 18 January 2012

Ah, good ol' Tortoise. Thanks, Siebrand! (I should really get tortoise for my virtualbox...)

Status & tagging log