Index: trunk/extensions/ArchiveLinks/ArchiveLinks.php |
— | — | @@ -54,6 +54,7 @@ |
55 | 55 | $wgArchiveLinksConfig = array ( |
56 | 56 | 'archive_service' => 'wikiwix', |
57 | 57 | 'use_multiple_archives' => false, |
| 58 | + 'run_spider_in_loop' => false, |
58 | 59 | ); |
59 | 60 | |
60 | 61 | class ArchiveLinks { |
— | — | @@ -121,12 +122,14 @@ |
122 | 123 | case 'wikiwix': |
123 | 124 | $link_to_archive = 'http://archive.wikiwix.com/cache/?url=' . $url; |
124 | 125 | break; |
| 126 | + case 'webcitation': |
| 127 | + $link_to_archive = 'http://webcitation.org/query?url=' . $url; |
| 128 | + break; |
125 | 129 | case 'internet_archive': |
| 130 | + default: |
126 | 131 | $link_to_archive = 'http://wayback.archive.org/web/*/' . $url; |
127 | 132 | break; |
128 | | - case 'webcitation': |
129 | | - $link_to_archive = 'http://webcitation.org/query?url=' . $url; |
130 | | - break; |
| 133 | + |
131 | 134 | } |
132 | 135 | } |
133 | 136 | //Note to self: need to fix this to use Html.php instead of direct html |
Index: trunk/extensions/ArchiveLinks/Spider.php |
— | — | @@ -0,0 +1,79 @@ |
| 2 | +<?php |
| 3 | +/** |
| 4 | + * This class is for the actual spidering and will be calling wget |
| 5 | + */ |
| 6 | + |
// Locate the MediaWiki installation: the MW_INSTALL_PATH environment variable
// wins; when it is unset or empty, fall back to two directories above this file.
$path = getenv( 'MW_INSTALL_PATH' );
if ( $path === false || $path === '' ) {
	$path = dirname( __FILE__ ) . '/../..';
}

// Pull in the Maintenance base class that ArchiveLinksSpider extends.
require_once( $path . '/maintenance/Maintenance.php' );
| 13 | + |
/**
 * Maintenance script that polls the el_archive_queue table for URLs that are
 * due to be archived.
 *
 * The archiving step itself is not implemented yet (see the "do stuff"
 * placeholders); at the moment the script only fetches the next queued URL
 * and reports duplicate queue entries for it.
 */
class ArchiveLinksSpider extends Maintenance {
	/** Database handle obtained with DB_MASTER (reserved for writes). */
	private $db_master;
	/** Database handle obtained with DB_SLAVE (used for all reads below). */
	private $db_slave;
	/** Result sets of the most recent queries, keyed by purpose ('job-fetch', 'dup-check'). */
	private $db_result;

	/**
	 * Entry point called by the maintenance framework.
	 *
	 * Polls the queue once, or forever at one-second intervals when
	 * $wgArchiveLinksConfig['run_spider_in_loop'] is truthy.
	 *
	 * @return null
	 */
	public function execute() {
		global $wgArchiveLinksConfig;

		$this->db_master = $this->getDB( DB_MASTER );
		$this->db_slave = $this->getDB( DB_SLAVE );
		$this->db_result = array();

		// !empty() so that a configuration array without this key simply means
		// "run once" instead of raising an undefined-index notice.
		$run_in_loop = !empty( $wgArchiveLinksConfig['run_spider_in_loop'] );

		do {
			$url = $this->check_queue();
			if ( $url !== false ) {
				//do stuff
			}

			if ( $run_in_loop ) {
				// Throttle polling so an empty queue does not hammer the database.
				sleep( 1 );
			}
		} while ( $run_in_loop );

		return null;
	}

	/**
	 * Fetch the oldest queue entry that is due and not already in progress.
	 *
	 * Also looks for additional queue rows carrying the same URL and reports
	 * them; actually deleting them is still disabled (see the TODO below).
	 *
	 * @return string|bool URL of the next job, or false when nothing is due
	 */
	private function check_queue() {
		// Conditions and options are passed as arrays so the database layer does
		// the quoting and assembles ORDER BY / LIMIT itself.
		$this->db_result['job-fetch'] = $this->db_slave->select(
			'el_archive_queue',
			'*',
			array(
				'delay_time <= ' . intval( time() ),
				'in_progress' => 0,
			),
			__METHOD__,
			array( 'ORDER BY' => 'queue_id ASC', 'LIMIT' => 1 )
		);

		if ( $this->db_result['job-fetch']->numRows() < 1 ) {
			//there are no jobs to do right now
			return false;
		}

		$row = $this->db_result['job-fetch']->fetchRow();

		//Since we queried the slave to check for dups when we inserted instead of the master let's check
		//that the job isn't in the queue twice, we don't want to archive it twice.
		//The URL originates from wiki content, so it must go through the database layer's
		//quoting (array condition) rather than being concatenated into the SQL text.
		//NOTE(review): this check also reads from the slave, so it may itself miss a very
		//recent duplicate - confirm whether it should use the master instead.
		$this->db_result['dup-check'] = $this->db_slave->select(
			'el_archive_queue',
			'*',
			array( 'url' => $row['url'] ),
			__METHOD__,
			array( 'ORDER BY' => 'queue_id ASC' )
		);

		if ( $this->db_result['dup-check']->numRows() > 1 ) {
			//keep only the original job (lowest queue_id) and treat the rest as duplicates
			$this->db_result['dup-check']->fetchRow();
			while ( $del_row = $this->db_result['dup-check']->fetchRow() ) {
				// Route diagnostics through Maintenance::output() so --quiet is honoured.
				$this->output( 'you have a dup ' . var_export( $del_row, true ) . "\n" );
				//TODO: deletion is disabled for testing purposes, so the duplicate does not have to be
				//re-added to the test db each run. This has a giant "remove before flight" ribbon on it...
				//$this->db_master->delete( 'el_archive_queue', array( 'queue_id' => $del_row['queue_id'] ), __METHOD__ );
			}
		}

		return $row['url'];
	}
}
| 76 | + |
// Name the Maintenance subclass defined in this file, then include the file
// named by the RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = 'ArchiveLinksSpider';
require_once( RUN_MAINTENANCE_IF_MAIN );
| 79 | + |
| 80 | +?> |
\ No newline at end of file |
Property changes on: trunk/extensions/ArchiveLinks/Spider.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 81 | + native |