r91319 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r91318 | r91319 | r91320 >
Date: 22:57, 1 July 2011
Author: kbrown
Status: ok (Comments)
Tags:
Comment:
* follow up on r91317 to re-add spider.php and setuptables.sql with the correct lowercase names to adhere to coding conventions
* add function to spider.php to deal with replag on databases to prevent duplication by multiple instances of the spider
* fix setuptables.sql to actually contain the table definitions instead of being empty (problem in r91047)
Modified paths:
  • /trunk/extensions/ArchiveLinks/setuptables.sql (added)
  • /trunk/extensions/ArchiveLinks/spider.php (added)

Diff

Index: trunk/extensions/ArchiveLinks/setuptables.sql
@@ -0,0 +1,27 @@
+CREATE TABLE IF NOT EXISTS `el_archive_blacklist` (
+ `bl_id` int(11) unsigned NOT NULL AUTO_INCREMENT,
+ `bl_type` tinyint(4) NOT NULL,
+ `bl_url` varchar(10000) NOT NULL,
+ `bl_expiry` int(11) unsigned NOT NULL,
+ `bl_reason` varchar(255) NOT NULL,
+ PRIMARY KEY (`bl_id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=2 ;
+
+CREATE TABLE IF NOT EXISTS `el_archive_log` (
+ `log_id` int(11) unsigned NOT NULL AUTO_INCREMENT,
+ `log_result` tinyint(4) NOT NULL,
+ `log_url` varchar(10000) NOT NULL,
+ `log_time` int(11) unsigned NOT NULL,
+ `log_http_code` varchar(255) NOT NULL,
+ PRIMARY KEY (`log_id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ;
+
+CREATE TABLE IF NOT EXISTS `el_archive_queue` (
+ `queue_id` int(11) unsigned NOT NULL AUTO_INCREMENT,
+ `page_id` int(11) unsigned NOT NULL,
+ `url` varchar(10000) NOT NULL,
+ `delay_time` int(11) unsigned NOT NULL,
+ `insertion_time` int(11) unsigned NOT NULL,
+ `in_progress` varchar(50) NOT NULL,
+ PRIMARY KEY (`queue_id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=105 ;
Property changes on: trunk/extensions/ArchiveLinks/setuptables.sql
___________________________________________________________________
Added: svn:eol-style
   + native
Index: trunk/extensions/ArchiveLinks/spider.php
@@ -0,0 +1,179 @@
+<?php
+
+/**
+ * This class is for the actual spidering and will be calling wget
+ */
+$path = getenv('MW_INSTALL_PATH');
+if (strval($path) === '') {
+ $path = dirname(__FILE__) . '/../..';
+}
+
+require_once "$path/maintenance/Maintenance.php";
+
+class ArchiveLinksSpider extends Maintenance {
+
+ private $db_master;
+ private $db_slave;
+ private $db_result;
+ private $jobs;
+
+ public function execute( ) {
+ global $wgArchiveLinksConfig, $wgLoadBalancer;
+
+ $this->db_master = $this->getDB(DB_MASTER);
+ $this->db_slave = $this->getDB(DB_SLAVE);
+ $this->db_result = array();
+
+ if ( $wgArchiveLinksConfig['run_spider_in_loop'] ) {
+ /* while ( TRUE ) {
+ if ( ( $url = $this->check_queue() ) !== false ) {
+
+ }
+ sleep(1);
+ } */
+ die( 'Sorry, at the current time running the spider as a daemon isn\'t supported.' );
+ } else {
+ //for right now we will pipe everything through the replication_check_queue function just for testing purposes
+ /* if ( $wgLoadBalancer->getServerCount() > 1 ) {
+ if ( ( $url = $this->replication_check_queue() ) !== false ) {
+
+ }
+ } else {
+ if ( ( $url = $this->check_queue() ) !== false ) {
+
+ }
+ } */
+
+ if ( ( $url = $this->replication_check_queue() ) !== false ) {
+
+ }
+ }
+ return null;
+ }
+
+ /*private function check_queue( ) {
+ //need to fix this to use arrays instead of what I'm doing now
+ $this->db_result['job-fetch'] = $this->db_slave->select('el_archive_queue', '*', '`el_archive_queue`.`delay_time` <= ' . time()
+ . ' AND `el_archive_queue`.`in_progress` = 0'
+ . ' ORDER BY `el_archive_queue`.`queue_id` ASC'
+ . ' LIMIT 1');
+
+ if ( $this->db_result['job-fetch']->numRows() > 0 ) {
+ $row = $this->db_result['job-fetch']->fetchRow();
+
+ $this->delete_dups( $row['url'] );
+
+ return $row['url'];
+ } else {
+ //there are no jobs to do right now
+ return false;
+ }
+ }*/
+
+ /**
+ * This function checks a local file for a local block of jobs that is to be done;
+ * if none exists, it gets a block, creates the file, and waits to avoid any replag problems
+ */
+ private function replication_check_queue( ) {
+ global $path, $wgArchiveLinksConfig;
+ if ( file_exists( "$path/extensions/ArchiveLinks/spider-temp.txt" ) ) {
+ $file = file_get_contents( "$path/extensions/ArchiveLinks/spider-temp.txt" );
+ $file = unserialize( $file );
+ } else {
+ //we don't have any temp file, let's get a block of jobs to do and make one
+ $this->db_result['job-fetch'] = $this->db_slave->select( 'el_archive_queue', '*',
+ array(
+ 'delay_time <= "' . time() . '"',
+ 'in_progress' => '0')
+ , __METHOD__,
+ array(
+ 'LIMIT' => '15',
+ 'ORDER BY' => 'queue_id ASC'
+ ));
+ //echo $this->db_result['job-fetch'];
+
+ $this->jobs = array();
+
+ $wait_time = $this->db_slave->getLag() * 3;
+ $pid = (string) microtime() . ' - ' . getmypid();
+ $time = time();
+
+ //echo $pid;
+
+ $this->jobs['pid'] = $pid;
+ $this->jobs['execute_time'] = $wait_time + $time;
+
+ if ($this->db_result['job-fetch']->numRows() > 0) {
+ //$row = $this->db_result['job-fetch']->fetchRow();
+ while ( $row = $this->db_result['job-fetch']->fetchRow() ) {
+ //var_export($row);
+
+ if ( $row['insertion_time'] >= $row['insertion_time'] + $wait_time ) {
+ if ( $row['in_progress'] === '0') {
+ $retval = $this->reserve_job( $row );
+ } else {
+ //in_progress is not equal to 0, this means that the job was reserved some time before
+ //it could have been by a previous instance of this spider (assuming not running in a loop)
+ //or a different spider entirely; since we don't have a temp file to go on we have to assume
+ //it was a different spider (it could have been deleted by a user), we will only ignore the in_progress
+ //lock if it has been a long time (2 hours by default) since the job was initially reserved
+ $reserve_time = explode( ' ', $row['in_progress'] );
+ $reserve_time = $reserve_time[2];
+
+ array_key_exists( 'in_progress_ignore_delay', $wgArchiveLinksConfig ) ? $ignore_in_prog_time = $wgArchiveLinksConfig['in_progress_ignore_delay'] :
+ $ignore_in_prog_time = 7200;
+
+ if ( $reserve_time - $time > $ignore_in_prog_time ) {
+ $retval = $this->reserve_job( $row );
+ }
+ }
+
+ } else {
+ //let's wait for everything to replicate, add to temp file and check back later
+ $this->jobs[] = $row;
+ }
+ }
+ }
+
+ //var_dump( $this->jobs );
+
+ $this->jobs = serialize( $this->jobs );
+ //file_put_contents( "$path/extensions/ArchiveLinks/spider-temp.txt", $this->jobs );
+ }
+
+ if ( $retval !== true ) {
+ $retval = false;
+ }
+ return $retval;
+ }
+
+ private function delete_dups( $url ) {
+ //Since we queried the slave to check for dups when we inserted instead of the master, let's check
+ //that the job isn't in the queue twice; we don't want to archive it twice
+ $this->db_result['dup-check'] = $this->db_slave->select('el_archive_queue', '*', array( 'url' => $url ), __METHOD__,
+ array( 'ORDER BY' => 'queue_id ASC' ) );
+
+ if ( $this->db_result['dup-check']->numRows() > 1 ) {
+ //keep only the first job and remove all duplicates
+ $this->db_result['dup-check']->fetchRow();
+ while ( $del_row = $this->db_result['dup-check']->fetchRow() ) {
+ echo 'you have a dup ';
+ var_dump( $del_row );
+ //this is commented out for testing purposes, so I don't have to keep re-adding the duplicate to my test db
+ //in other words this has a giant "remove before flight" ribbon hanging from it...
+ //$this->db_master->delete( 'el_archive_queue', '`el_archive_queue`.`queue_id` = ' . $del_row['queue_id'] );
+ }
+ }
+ }
+
+ private function reserve_job( $row ) {
+ $this->jobs['execute_urls'][] = $row['url'];
+ $this->db_master->update( 'el_archive_queue', array( $row['in_progress'] => "\"$pid\"" ), array( 'queue_id' => $row['queue_id'] ),
+ __METHOD__ ) or die( 'can\'t reserve job' );
+ $this->delete_dups( $row['url'] );
+ return true;
+ }
+}
+
+$maintClass = 'ArchiveLinksSpider';
+require_once RUN_MAINTENANCE_IF_MAIN;
\ No newline at end of file
Property changes on: trunk/extensions/ArchiveLinks/spider.php
___________________________________________________________________
Added: svn:eol-style
   + native

Past revisions this follows up on

Revision | Commit summary | Author | Date
r91047 | add install instructions, add sql file with format of tables, add basics of s... | kbrown | 02:30, 29 June 2011
r91317 | * rename SpecialModifyBlacklist to SpecialModifyArchiveBlacklist to avoid con... | kbrown | 22:53, 1 July 2011

Comments

#Comment by NeilK (talk | contribs)   18:44, 5 August 2011

varchar(10000) is kind of overkill. In practice URLs over 4K aren't handled by browsers. But okay.

What's the point of the local file? That seems like another thing which can go wrong; why not keep all the state about what you are doing in the db?
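
A minimal sketch of the DB-only reservation the reviewer is suggesting, assuming the el_archive_queue schema from setuptables.sql; the method name reserve_job_in_db() and the affectedRows() race check are illustrative, not part of r91319:

   // Hypothetical alternative to the temp file: claim the row atomically in the DB.
   // Would live in ArchiveLinksSpider; $this->db_master is the master connection
   // already opened in execute().
   private function reserve_job_in_db( $row, $pid ) {
       // Only claim the job if no other spider instance has marked it in progress.
       $this->db_master->update(
           'el_archive_queue',
           array( 'in_progress' => $pid ),
           array( 'queue_id' => $row['queue_id'], 'in_progress' => '0' ),
           __METHOD__
       );
       // affectedRows() tells this instance whether it won the race, so no local
       // file is needed to remember which jobs it reserved.
       return $this->db_master->affectedRows() > 0;
   }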

Minor nit, instead of

   if ( $retval !== true ) { $retval = false; } return $retval;

you can say

   return ($retval === true);


delete_dups -- try not to select '*', it's bad style
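
A quick illustration of that point, assuming delete_dups() only needs the queue_id and url columns it actually touches:

   // Hypothetical rewrite of the dup-check query naming its columns instead of '*'.
   $this->db_result['dup-check'] = $this->db_slave->select(
       'el_archive_queue',
       array( 'queue_id', 'url' ),
       array( 'url' => $url ),
       __METHOD__,
       array( 'ORDER BY' => 'queue_id ASC' )
   );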

I like the way you did parse_wget_log, very clean that way

#Comment by NeilK (talk | contribs)   18:45, 5 August 2011

Needs documentation for each method, BTW, otherwise okay.
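
For example, a docblock along these lines above delete_dups() (wording is illustrative, not from the commit) would cover it:

   /**
    * Remove duplicate el_archive_queue rows for a URL, keeping only the
    * earliest-queued job so it is archived exactly once.
    *
    * @param string $url URL whose duplicate queue entries should be removed
    */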
