Index: trunk/extensions/ArchiveLinks/setuptables.sql |
— | — | @@ -0,0 +1,27 @@ |
-- Blacklist of URLs handled specially by the ArchiveLinks extension.
-- NOTE(review): the meaning of the bl_type codes is not visible in this
-- script -- confirm against the extension code before documenting them.
CREATE TABLE IF NOT EXISTS `el_archive_blacklist` (
  -- Surrogate primary key
  `bl_id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  -- Numeric entry type code
  `bl_type` tinyint(4) NOT NULL,
  -- The blacklisted URL
  `bl_url` varchar(10000) NOT NULL,
  -- Expiry of the entry; presumably a UNIX timestamp -- TODO confirm
  `bl_expiry` int(11) unsigned NOT NULL,
  -- Free-text reason for the entry
  `bl_reason` varchar(255) NOT NULL,
  PRIMARY KEY (`bl_id`)
-- No explicit AUTO_INCREMENT start value: the previous "AUTO_INCREMENT=2" was
-- left over from a dump of a test database and made fresh installs start at 2.
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
| 10 | + |
-- Log of archive attempts: one row per attempt.
-- NOTE(review): the meaning of the log_result codes is not visible in this
-- script -- confirm against the extension code.
CREATE TABLE IF NOT EXISTS el_archive_log
(
    log_id        INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,  -- surrogate primary key
    log_result    TINYINT(4)       NOT NULL,                 -- numeric result code
    log_url       VARCHAR(10000)   NOT NULL,                 -- URL the attempt was made for
    log_time      INT(11) UNSIGNED NOT NULL,                 -- presumably a UNIX timestamp -- TODO confirm
    log_http_code VARCHAR(255)     NOT NULL,                 -- HTTP status information, stored as text
    PRIMARY KEY (log_id)
)
    ENGINE = InnoDB
    DEFAULT CHARSET = utf8
    AUTO_INCREMENT = 1;
| 19 | + |
-- Queue of URLs waiting to be archived. The spider maintenance script polls
-- this table for rows whose delay_time has passed and whose in_progress
-- marker is '0', oldest queue_id first.
CREATE TABLE IF NOT EXISTS `el_archive_queue` (
  -- Surrogate primary key; also defines the processing order
  `queue_id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  -- Id of the wiki page associated with the URL
  `page_id` int(11) unsigned NOT NULL,
  -- URL to archive
  `url` varchar(10000) NOT NULL,
  -- UNIX timestamp before which the job must not be picked up
  `delay_time` int(11) unsigned NOT NULL,
  -- Time the job was queued; compared against time() by the spider
  `insertion_time` int(11) unsigned NOT NULL,
  -- '0' while the job is unreserved, otherwise the reserving spider's
  -- marker string. Defaults to '0' so newly queued rows are pickable even
  -- when the inserting code does not set the column explicitly.
  `in_progress` varchar(50) NOT NULL DEFAULT '0',
  PRIMARY KEY (`queue_id`),
  -- Supports the spider's polling query (in_progress = '0' AND delay_time <= now)
  KEY `in_progress_delay_time` (`in_progress`, `delay_time`)
-- No explicit AUTO_INCREMENT start value: the previous "AUTO_INCREMENT=105"
-- was left over from a dump of a test database.
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Property changes on: trunk/extensions/ArchiveLinks/setuptables.sql |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 29 | + native |
Index: trunk/extensions/ArchiveLinks/spider.php |
— | — | @@ -0,0 +1,179 @@ |
<?php

/**
 * Command-line entry point for the ArchiveLinks spider, which does the actual
 * spidering and will be calling wget.
 *
 * The MediaWiki installation is located through the MW_INSTALL_PATH
 * environment variable; when that is unset or empty, the installation root
 * is taken to be two directories above this file.
 */
$path = getenv( 'MW_INSTALL_PATH' );
if ( $path === false || $path === '' ) {
    // $path stays a global on purpose: the spider class reads it via "global $path"
    $path = dirname( __FILE__ ) . '/../..';
}

require_once( "$path/maintenance/Maintenance.php" );
| 13 | + |
/**
 * Maintenance script that picks pending jobs out of the el_archive_queue
 * table and reserves them for this spider process.
 *
 * Reserving a job means writing this process's marker string into the row's
 * in_progress column. The marker has the form
 * "<microtime fraction> <unix seconds> - <process id>", so the second
 * space-separated token is the time the reservation was made.
 *
 * Still work in progress: reserved URLs are collected in $this->jobs but are
 * not handed to wget yet, and running as a daemon is not supported.
 */
class ArchiveLinksSpider extends Maintenance {

    /** Master database connection, used for writes (job reservation). */
    private $db_master;

    /** Slave database connection, used for reads. */
    private $db_slave;

    /** Result sets of the most recent queries, keyed by purpose ('job-fetch', 'dup-check'). */
    private $db_result;

    /**
     * Bookkeeping for the current block of jobs: 'pid' (this process's
     * reservation marker), 'execute_time', 'execute_urls' (reserved URLs),
     * plus numerically indexed rows that were deferred because of
     * replication lag.
     */
    private $jobs;

    /**
     * Entry point called by the maintenance framework.
     *
     * Opens the database connections and makes one pass over the queue.
     * Terminates the script if daemon mode ('run_spider_in_loop') is
     * configured, because that mode is not implemented yet.
     *
     * @return null
     */
    public function execute( ) {
        global $wgArchiveLinksConfig;

        $this->db_master = $this->getDB( DB_MASTER );
        $this->db_slave = $this->getDB( DB_SLAVE );
        $this->db_result = array();

        // !empty() keeps the original truthiness test but does not raise a
        // notice when the option is missing from the configuration.
        if ( !empty( $wgArchiveLinksConfig['run_spider_in_loop'] ) ) {
            // TODO: daemon mode - poll the queue in an endless loop, sleeping between passes
            die( 'Sorry, at the current time running the spider as a daemon isn\'t supported.' );
        }

        // For right now everything is piped through replication_check_queue()
        // just for testing purposes.
        // TODO: only take the replication-aware path when the load balancer
        // reports more than one server; use a plain queue check otherwise.
        if ( $this->replication_check_queue() !== false ) {
            // TODO: hand the reserved URLs in $this->jobs['execute_urls'] to wget
        }

        return null;
    }

    /**
     * Checks a local temp file for a block of jobs that is still to be done.
     * If none exists, fetches a block of up to 15 due jobs from the slave,
     * reserves those that are old enough to have replicated, and remembers
     * the rest so they can be retried later.
     *
     * @return bool true if at least one job was reserved, false otherwise
     */
    private function replication_check_queue( ) {
        global $path, $wgArchiveLinksConfig;

        // Always defined, so every return path yields a proper boolean.
        $retval = false;
        $temp_file = "$path/extensions/ArchiveLinks/spider-temp.txt";

        if ( file_exists( $temp_file ) ) {
            $contents = file_get_contents( $temp_file );
            if ( $contents !== false ) {
                // NOTE(review): unserialize() must only ever see data written
                // by this script; consider a data-only format for this file.
                $stored = unserialize( $contents );
                if ( is_array( $stored ) ) {
                    $this->jobs = $stored;
                }
            }
            // TODO: process the stored block; nothing is reserved on this path yet
            return $retval;
        }

        // We don't have any temp file, let's get a block of jobs to do and make one.
        $time = time();
        $this->db_result['job-fetch'] = $this->db_slave->select(
            'el_archive_queue',
            '*',
            array(
                // time() is an integer, so it needs no quoting (the former
                // double quotes break under MySQL's ANSI_QUOTES mode)
                'delay_time <= ' . $time,
                'in_progress' => '0',
            ),
            __METHOD__,
            array(
                'LIMIT' => '15',
                'ORDER BY' => 'queue_id ASC',
            )
        );

        // Give replication three times the currently reported lag to catch up;
        // a lag that is unknown (false) or zero means no waiting.
        $lag = $this->db_slave->getLag();
        $wait_time = $lag > 0 ? $lag * 3 : 0;

        $this->jobs = array(
            'pid' => (string) microtime() . ' - ' . getmypid(),
            'execute_time' => $wait_time + $time,
        );

        // How long (in seconds) a foreign reservation is respected; 2 hours by default.
        $ignore_in_prog_time = 7200;
        if ( is_array( $wgArchiveLinksConfig )
            && array_key_exists( 'in_progress_ignore_delay', $wgArchiveLinksConfig )
        ) {
            $ignore_in_prog_time = $wgArchiveLinksConfig['in_progress_ignore_delay'];
        }

        while ( $row = $this->db_result['job-fetch']->fetchRow() ) {
            if ( $time < $row['insertion_time'] + $wait_time ) {
                // Inserted too recently: let's wait for everything to
                // replicate, add it to the temp file and check back later.
                $this->jobs[] = $row;
                continue;
            }

            if ( $row['in_progress'] === '0' ) {
                if ( $this->reserve_job( $row ) ) {
                    $retval = true;
                }
                continue;
            }

            // in_progress is not '0', so the job was reserved some time
            // before - by a previous instance of this spider or by a
            // different spider entirely. Without a temp file to go on we have
            // to assume it was a different spider, and only ignore the lock
            // once it has been held for a long time.
            // NOTE(review): the query above filters on in_progress = '0', so
            // this branch cannot currently be reached; stale locks are never
            // recovered until the query is widened.
            $marker_parts = explode( ' ', $row['in_progress'] );
            if ( !isset( $marker_parts[1] ) || !is_numeric( $marker_parts[1] ) ) {
                // Not a marker in our format - leave the job alone.
                continue;
            }
            $reserve_time = (int) $marker_parts[1];

            if ( $time - $reserve_time > $ignore_in_prog_time ) {
                if ( $this->reserve_job( $row ) ) {
                    $retval = true;
                }
            }
        }

        // TODO: persist the block once processing of the temp file is implemented:
        // file_put_contents( $temp_file, serialize( $this->jobs ) );

        return $retval;
    }

    /**
     * Reports duplicate queue entries for a URL.
     *
     * Duplicates were checked against the slave rather than the master when
     * the job was inserted, so the same URL may be queued more than once and
     * we don't want to archive it twice. The oldest entry (lowest queue_id)
     * is the one to keep.
     *
     * Deletion is still disabled, as left by the author for testing:
     * duplicates are only reported.
     *
     * @param string $url URL whose queue entries are examined
     */
    private function delete_dups( $url ) {
        $this->db_result['dup-check'] = $this->db_slave->select(
            'el_archive_queue',
            '*',
            array( 'url' => $url ),
            __METHOD__,
            array( 'ORDER BY' => 'queue_id ASC' )
        );

        if ( $this->db_result['dup-check']->numRows() <= 1 ) {
            return;
        }

        // Keep only the first job; everything after it is a duplicate.
        $this->db_result['dup-check']->fetchRow();
        while ( $del_row = $this->db_result['dup-check']->fetchRow() ) {
            $this->output( 'you have a dup ' . var_export( $del_row, true ) . "\n" );
            // TODO ("remove before flight"): enable the deletion once testing is done:
            // $this->db_master->delete( 'el_archive_queue',
            //     array( 'queue_id' => $del_row['queue_id'] ), __METHOD__ );
        }
    }

    /**
     * Reserves a queue row for this spider process by writing the process
     * marker into its in_progress column on the master, then checks the
     * queue for duplicates of the same URL.
     *
     * Terminates the script if the update fails.
     *
     * @param array $row Queue row as returned by fetchRow(); 'url' and 'queue_id' are used
     * @return bool always true
     */
    private function reserve_job( $row ) {
        if ( !isset( $this->jobs['pid'] ) ) {
            // Normally set by replication_check_queue(); created here so the
            // method also works when called on its own.
            $this->jobs['pid'] = (string) microtime() . ' - ' . getmypid();
        }

        $this->jobs['execute_urls'][] = $row['url'];

        // The column to set is 'in_progress' (the row's current value used to
        // be passed as the column name by mistake). The database layer quotes
        // the value itself.
        $reserved = $this->db_master->update(
            'el_archive_queue',
            array( 'in_progress' => $this->jobs['pid'] ),
            array( 'queue_id' => $row['queue_id'] ),
            __METHOD__
        );
        if ( !$reserved ) {
            die( 'can\'t reserve job' );
        }

        $this->delete_dups( $row['url'] );
        return true;
    }
}
| 178 | + |
// Name the class the maintenance framework should instantiate, then hand
// control over to it (only takes effect when this file is the script being run).
$maintClass = 'ArchiveLinksSpider';
require_once( RUN_MAINTENANCE_IF_MAIN );
\ No newline at end of file |
Property changes on: trunk/extensions/ArchiveLinks/spider.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 181 | + native |