Index: trunk/extensions/ArchiveLinks/SpecialViewArchive.php |
— | — | @@ -1,11 +1,14 @@ |
2 | 2 | <?php |
3 | 3 | /** |
4 | | - * This special page exists to serve the cached versions of the pages that have been archived. |
| 4 | + * This special page exists to serve the cached versions of the pages that have been archived. |
5 | 5 | */ |
6 | 6 | |
7 | 7 | class SpecialViewArchive extends SpecialPage { |
8 | | - private $db_master; |
9 | | - private $db_slave; |
| 8 | + |
| 9 | + /** |
| 10 | + * @var DatabaseBase |
| 11 | + */ |
| 12 | + private $db_master, $db_slave; |
10 | 13 | private $db_result; |
11 | 14 | |
12 | 15 | function __construct() { |
— | — | @@ -15,30 +18,30 @@ |
16 | 19 | /** |
17 | 20 | * Main function for the view archive page. This queries the resource table, disables |
18 | 21 | * output, and then displays the archived version of whichever page you'd like to view. |
19 | | - * |
20 | | - * @global $wgOut object |
| 22 | + * |
| 23 | + * @global $wgOut OutputPage |
21 | 24 | * @global $wgRequest object |
22 | 25 | * @param $par |
23 | 26 | */ |
24 | 27 | public function execute( $par ) { |
25 | 28 | global $wgOut, $wgRequest; |
26 | | - |
| 29 | + |
27 | 30 | if ( isset( $par ) || $url = $wgRequest->getText( 'archive_url' ) ) { |
28 | 31 | $this->db_master = wfGetDB( DB_MASTER ); |
29 | 32 | $this->db_slave = wfGetDB( DB_SLAVE ); |
30 | 33 | $db_result = array(); |
31 | | - |
| 34 | + |
32 | 35 | if( !isset( $url ) ) { |
33 | 36 | $url = $par; |
34 | 37 | } |
35 | | - |
| 38 | + |
36 | 39 | $this->db_result['url_location'] = $this->db_slave->select( 'el_archive_resource', '*', array( 'resource_url' => $this->db_slave->strencode( $url ) ), __METHOD__ ); |
37 | | - |
| 40 | + |
38 | 41 | if ( $this->db_result['url_location']->numRows() < 1 ) { |
39 | 42 | //This URL doesn't exist in the archive, let's say so |
40 | 43 | $this->db_result['log_check'] = $this->db_slave->select( 'el_archive_log', '*', array( 'log_url' => $this->db_slave->strencode( $url ) ), __METHOD__ ); |
41 | 44 | $this->db_result['queue_check'] = $this->db_slave->select( 'el_archive_queue', '*', array( 'url' => $this->db_slave->strencode( $url ) ), __METHOD__ ); |
42 | | - |
| 45 | + |
43 | 46 | if ( ( $num_rows = $this->db_result['queue_check']->numRows() ) === 1 ) { |
44 | 47 | $in_queue = true; |
45 | 48 | } elseif ( $num_rows > 1 ) { |
— | — | @@ -50,13 +53,13 @@ |
51 | 54 | } else { |
52 | 55 | $in_queue = false; |
53 | 56 | } |
54 | | - |
| 57 | + |
55 | 58 | if ( $this->db_result['log_check']->numRows() >= 1 ) { |
56 | 59 | $in_logs = true; |
57 | 60 | } else { |
58 | 61 | $in_logs = false; |
59 | 62 | } |
60 | | - |
| 63 | + |
61 | 64 | $this->output_form(); |
62 | 65 | $wgOut->addWikiMsg( 'archivelinks-view-archive-url-not-found' ); |
63 | 66 | /*$wgOut->addHTML( |
— | — | @@ -70,29 +73,29 @@ |
71 | 74 | } else { |
72 | 75 | //Disable the output so we don't get a skin around the archived content |
73 | 76 | $wgOut->disable(); |
74 | | - |
| 77 | + |
75 | 78 | ob_start(); |
76 | | - |
| 79 | + |
77 | 80 | echo HTML::htmlHeader(); |
78 | 81 | } |
79 | | - |
| 82 | + |
80 | 83 | } else { |
81 | 84 | //The user has not requested a URL, let's print a form so they can do so :D |
82 | 85 | $this->output_form(); |
83 | 86 | } |
84 | 87 | } |
85 | | - |
| 88 | + |
86 | 89 | /** |
87 | 90 | * Uses the HTML functions to output the appropriate form for the special page if no archived version
88 | 91 | * exists or if no query has been specified by the user yet. |
89 | | - * |
| 92 | + * |
90 | 93 | * @global $wgOut object |
91 | 94 | */ |
92 | 95 | private function output_form( ) { |
93 | 96 | global $wgOut; |
94 | 97 | $this->setHeaders(); |
95 | | - $wgOut->addWikiMsg( 'archivelinks-view-archive-desc' ); |
96 | | - |
| 98 | + $wgOut->addWikiMsg( 'archivelinks-view-archive-desc' ); |
| 99 | + |
97 | 100 | $wgOut->addHTML( |
98 | 101 | HTML::openElement( 'form', array( 'method' => 'get', 'action' => SpecialPage::getTitleFor( 'ViewArchive' )->getLocalUrl() ) ) . |
99 | 102 | HTML::openElement( 'fieldset' ) . |
— | — | @@ -104,4 +107,4 @@ |
105 | 108 | HTML::closeElement( 'form' ) |
106 | 109 | ); |
107 | 110 | } |
108 | | -} |
\ No newline at end of file |
| 111 | +} |
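For context, the special page above would be wired up through the extension's setup file; the registration below is a hypothetical sketch (it is not part of this diff), with the class name and the 'ViewArchive' title taken from the code above.

	// Hypothetical registration in ArchiveLinks.php (not shown in this diff):
	$wgAutoloadClasses['SpecialViewArchive'] = "$path/SpecialViewArchive.php";
	$wgSpecialPages['ViewArchive'] = 'SpecialViewArchive';

	// execute() accepts the URL either as a subpage parameter ($par) or as
	// the archive_url request field, so either form should work:
	//   index.php?title=Special:ViewArchive&archive_url=http://example.com/
	//   index.php/Special:ViewArchive/http://example.com/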
Index: trunk/extensions/ArchiveLinks/INSTALL |
— | — | @@ -5,9 +5,9 @@ |
6 | 6 | Configuration settings are in the array $wgArchiveLinksConfig, which is currently defined in ArchiveLinks.php for testing purposes. |
7 | 7 | |
8 | 8 | $wgArchiveLinksConfig = array ( |
9 | | - 'archive_service' => 'wikiwix', |
10 | | - 'use_multiple_archives' => false, |
11 | | - 'run_spider_in_loop' => false, |
| 9 | + 'archive_service' => 'wikiwix', |
| 10 | + 'use_multiple_archives' => false, |
| 11 | + 'run_spider_in_loop' => false, |
12 | 12 | ); |
13 | 13 | |
14 | 14 | archive_service has the following options: |
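The option list continues outside this hunk. For context, a complete block in LocalSettings.php might look like the sketch below; the require_once line is an assumption, and the generate_feed key is borrowed from ArchiveLinks.class.php later in this diff rather than from the INSTALL excerpt above.

	require_once( "$IP/extensions/ArchiveLinks/ArchiveLinks.php" );
	$wgArchiveLinksConfig = array (
		'archive_service' => 'wikiwix',
		'use_multiple_archives' => false,
		'run_spider_in_loop' => false,
		'generate_feed' => false, // consulted by queueExternalLinks()
	);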
Index: trunk/extensions/ArchiveLinks/ArchiveLinks.php |
— | — | @@ -5,8 +5,6 @@ |
6 | 6 | * in the event they go down a backup will be available.
7 | 7 | */ |
8 | 8 | |
9 | | -error_reporting( E_ALL | E_STRICT ); |
10 | | - |
11 | 9 | $path = dirname( __FILE__ ); |
12 | 10 | |
13 | 11 | $wgExtensionMessagesFiles['ArchiveLinks'] = "$path/ArchiveLinks.i18n.php"; |
— | — | @@ -44,10 +42,10 @@ |
45 | 43 | ); |
46 | 44 | |
47 | 45 | $wgExtensionCredits['other'][] = array( |
48 | | - 'path' => __FILE__, |
49 | | - 'name' => 'ArchiveLinks', |
50 | | - 'description' => 'Enables archival of external links on the wiki to prevent linkrot.', |
51 | | - 'version' => '0.1', |
52 | | - 'author' => 'Kevin Brown', |
53 | | - 'url' => '', |
54 | | -); |
\ No newline at end of file |
| 46 | + 'path' => __FILE__, |
| 47 | + 'name' => 'ArchiveLinks', |
| 48 | + 'description' => 'Enables archival of external links on the wiki to prevent linkrot.', |
| 49 | + 'version' => '0.1', |
| 50 | + 'author' => 'Kevin Brown', |
| 51 | + 'url' => '', |
| 52 | +); |
Index: trunk/extensions/ArchiveLinks/ApiQueryArchiveFeed.php |
— | — | @@ -4,52 +4,52 @@ |
5 | 5 | function __construct ( $query, $moduleName ) { |
6 | 6 | parent::__construct( $query, $moduleName, 'arl' ); |
7 | 7 | } |
8 | | - |
| 8 | + |
9 | 9 | /** |
10 | 10 | * This is the primary execute function for the API. It processes the query and returns |
11 | 11 | * a valid API result. |
12 | 12 | */ |
13 | 13 | public function execute ( ) { |
14 | 14 | $params = $this->extractRequestParams(); |
15 | | - |
| 15 | + |
16 | 16 | $this->addTables( 'el_archive_queue' ); |
17 | 17 | $this->addFields( '*' ); |
18 | 18 | $this->addWhereRange( 'queue_id', $params['dir'], $params['start'], $params['end'] ); |
19 | 19 | $this->addOption( 'LIMIT', $params['limit'] + 1 ); |
20 | | - |
| 20 | + |
21 | 21 | $res = $this->select( __METHOD__ ); |
22 | | - |
| 22 | + |
23 | 23 | $val = array( ); |
24 | 24 | $count = 0; |
25 | 25 | $result = $this->getResult(); |
26 | | - |
| 26 | + |
27 | 27 | foreach ( $res as $row ) { |
28 | 28 | //much of this is stolen from ApiQueryRecentChanges |
29 | 29 | if ( ++ $count > $params['limit'] ) { |
30 | 30 | $this->setContinueEnumParameter( 'start', $row->queue_id ); |
31 | 31 | break; |
32 | 32 | } |
33 | | - |
| 33 | + |
34 | 34 | $val['feed_id'] = $row->queue_id; |
35 | 35 | $val['time'] = $row->insertion_time; |
36 | | - $val['page_id'] = $row->page_id; |
37 | | - $val['url'] = $row->url; |
| 36 | + $val['page_id'] = $row->page_id; |
| 37 | + $val['url'] = $row->url; |
38 | 38 | |
39 | 39 | $fit = $result->addValue( array( 'query', $this->getModuleName() ), null, $val ); |
40 | | - |
| 40 | + |
41 | 41 | if ( !$fit ) { |
42 | 42 | $this->setContinueEnumParameter( 'start', $row->queue_id ); |
43 | 43 | break; |
44 | 44 | } |
45 | 45 | } |
46 | | - |
47 | | - $result = $result->setIndexedTagName_internal( array( 'query', $this->getModuleName() ), 'al' ); |
| 46 | + |
| 47 | + $result->setIndexedTagName_internal( array( 'query', $this->getModuleName() ), 'al' ); |
48 | 48 | } |
49 | | - |
| 49 | + |
50 | 50 | function getVersion() { |
51 | | - return __CLASS__; |
| 51 | + return __CLASS__ . ': $Id$'; |
52 | 52 | } |
53 | | - |
| 53 | + |
54 | 54 | function getAllowedParams() { |
55 | 55 | return array( |
56 | 56 | 'limit' => array( |
— | — | @@ -74,4 +74,4 @@ |
75 | 75 | ) |
76 | 76 | ); |
77 | 77 | } |
78 | | -} |
\ No newline at end of file |
| 78 | +} |
Property changes on: trunk/extensions/ArchiveLinks/ApiQueryArchiveFeed.php |
___________________________________________________________________ |
Added: svn:keywords |
79 | 79 | + Id |
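Assuming the module is registered as a query list module (the registration and the exact list name live outside this diff), a feed request would follow the standard API pattern, using the 'arl' parameter prefix set in the constructor above.

	// Hypothetical registration, presumably in ArchiveLinks.php;
	// the 'archivefeed' list name is an illustrative guess:
	$wgAPIListModules['archivefeed'] = 'ApiQueryArchiveFeed';

	// Example request built from getAllowedParams() ('limit', 'dir', 'start'):
	//   api.php?action=query&list=archivefeed&arllimit=10&arldir=newer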
Index: trunk/extensions/ArchiveLinks/ArchiveLinks.class.php |
— | — | @@ -4,29 +4,29 @@ |
5 | 5 | */ |
6 | 6 | |
7 | 7 | class ArchiveLinks { |
8 | | - private $db_master; |
9 | | - private $db_slave; |
10 | | - private $db_result; |
11 | | - |
12 | | - /** |
| 8 | + private $db_master; |
| 9 | + private $db_slave; |
| 10 | + private $db_result; |
| 11 | + |
| 12 | + /** |
13 | 13 | * This is the primary function for the Archive Links Extension |
14 | 14 | * It fires off of the ArticleSaveComplete hook and is primarily responsible for updating |
15 | 15 | * the appropriate tables to begin the process of archival
16 | | - * |
17 | | - * @param $article object article object from ArticleSaveComplete hook |
| 16 | + * |
| 17 | + * @param $article Article The article object from the ArticleSaveComplete hook
18 | 18 | * @return bool |
19 | 19 | */ |
20 | 20 | public static function queueExternalLinks ( &$article ) { |
21 | 21 | global $wgParser, $wgArchiveLinksConfig; |
22 | 22 | $external_links = $wgParser->getOutput(); |
23 | 23 | $external_links = $external_links->mExternalLinks; |
24 | | - |
| 24 | + |
25 | 25 | $db_master = wfGetDB( DB_MASTER ); |
26 | 26 | $db_slave = wfGetDB( DB_SLAVE ); |
27 | 27 | $db_result = array(); |
28 | | - |
| 28 | + |
29 | 29 | $db_master->begin(); |
30 | | - |
| 30 | + |
31 | 31 | if ( !isset( $wgArchiveLinksConfig['global_rearchive_time'] ) ) { |
32 | 32 | //30 days or 2,592,000 seconds... |
33 | 33 | $wgArchiveLinksConfig['global_rearchive_time'] = 2592000; |
— | — | @@ -36,24 +36,24 @@ |
37 | 37 | //200 days or 17,280,000 seconds |
38 | 38 | $wgArchiveLinksConfig['page_rearchive_time'] = 1728000; |
39 | 39 | } |
40 | | - |
| 40 | + |
41 | 41 | if( !isset( $wgArchiveLinksConfig['previous_archive_lockout_time'] ) ) { |
42 | 42 | //2 days or 172,800 seconds |
43 | 43 | $wgArchiveLinksConfig['previous_archive_lockout_time'] = 172800; |
44 | 44 | } |
45 | | - |
| 45 | + |
46 | 46 | $page_id = $article->getID(); |
47 | 47 | $time = time(); |
48 | | - |
| 48 | + |
49 | 49 | if ( $wgArchiveLinksConfig['generate_feed'] === true ) { |
50 | 50 | $old_id = $article->getTitle(); |
51 | 51 | $old_id = $old_id->getPreviousRevisionID( $page_id ); |
52 | | - |
| 52 | + |
53 | 53 | $db_result['links_on_page'] = $db_master->select( 'el_archive_link_history', '*', array( 'hist_page_id' => $page_id ), __METHOD__ ); |
54 | | - |
| 54 | + |
55 | 55 | $old_external_links = array(); |
56 | 56 | $new_external_links = array(); |
57 | | - |
| 57 | + |
58 | 58 | if ( $db_result['links_on_page']->numRows() > 0 ) { |
59 | 59 | while( $row = $db_result['links_on_page']->fetchRow() ) { |
60 | 60 | $old_external_links[] = $row['hist_url']; |
— | — | @@ -66,14 +66,14 @@ |
67 | 67 | } elseif ( count( $external_links ) > 0 ) { |
68 | 68 | $new_external_links = $external_links; |
69 | 69 | } |
70 | | - |
| 70 | + |
71 | 71 | if ( !isset( $wgArchiveLinksConfig['link_insert_max'] ) ) { |
72 | 72 | $wgArchiveLinksConfig['link_insert_max'] = 100; |
73 | 73 | } |
74 | 74 | die ( count( $new_external_links )); |
75 | 75 | if ( count( $new_external_links ) <= $wgArchiveLinksConfig['link_insert_max'] ) { |
76 | 76 | //insert the links into the queue now |
77 | | - foreach( $new_external_links as $link ) { |
| 77 | + foreach( $new_external_links as $link ) { |
78 | 78 | $db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) ); |
79 | 79 | $db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) ); |
80 | 80 | |
— | — | @@ -128,14 +128,14 @@ |
129 | 129 | } |
130 | 130 | |
131 | 131 | $db_master->commit(); |
132 | | - |
| 132 | + |
133 | 133 | return true; |
134 | 134 | } |
135 | | - |
| 135 | + |
136 | 136 | /** |
137 | 137 | * This is the function responsible for rewriting the link HTML to insert the [cache] link
138 | 138 | * after each external link on the page. This function will get called once for every external link. |
139 | | - * |
| 139 | + * |
140 | 140 | * @global $wgArchiveLinksConfig array |
141 | 141 | * @param $url string The url of the page (what would appear in href) |
142 | 142 | * @param $text string The associated text of the URL (what would go between the anchor tags)
— | — | @@ -172,26 +172,26 @@ |
173 | 173 | break; |
174 | 174 | } |
175 | 175 | } |
176 | | - |
177 | | - $link = HTML::element('a', array ( 'rel' => 'nofollow', 'class' => $attributes['class'], 'href' => $url ), $text ) |
| 176 | + |
| 177 | + $link = HTML::element('a', array ( 'rel' => 'nofollow', 'class' => $attributes['class'], 'href' => $url ), $text ) |
178 | 178 | . HTML::openElement('sup') |
179 | 179 | . HTML::openElement('small') |
180 | 180 | . ' ' |
181 | 181 | . HTML::element('a', array ( 'rel' => 'nofollow', 'href' => $link_to_archive ), '[' . wfMsg( 'archivelinks-cache-title') . ']') |
182 | 182 | . HTML::closeElement('small') |
183 | 183 | . HTML::closeElement('sup'); |
184 | | - |
| 184 | + |
185 | 185 | return false; |
186 | 186 | } else { |
187 | 187 | return true; |
188 | 188 | } |
189 | 189 | } |
190 | | - |
| 190 | + |
191 | 191 | /** |
192 | 192 | * This function is responsible for any database updates within the extension and hooks into |
193 | 193 | * update.php |
194 | | - * |
195 | | - * @param $updater object Passed by the LoadExtensionSchemaUpdates hook |
| 194 | + * |
| 195 | + * @param $updater DatabaseUpdater Passed by the LoadExtensionSchemaUpdates hook
196 | 196 | * @return bool |
197 | 197 | */ |
198 | 198 | public static function schemaUpdates ( $updater = null ) { |
— | — | @@ -202,42 +202,17 @@ |
203 | 203 | $path . '/setuptables.sql', |
204 | 204 | true |
205 | 205 | )); |
206 | | - $updater->addExtensionUpdate( array( |
207 | | - 'addTable', |
208 | | - 'el_archive_queue', |
209 | | - $path . '/setuptables.sql', |
210 | | - true |
211 | | - )); |
212 | | - $updater->addExtensionUpdate( array( |
213 | | - 'addTable', |
214 | | - 'el_archive_log', |
215 | | - $path . '/setuptables.sql', |
216 | | - true |
217 | | - )); |
218 | | - $updater->addExtensionUpdate( array( |
219 | | - 'addTable', |
220 | | - 'el_archive_resource', |
221 | | - $path . '/setuptables.sql', |
222 | | - true |
223 | | - )); |
224 | | - $updater->addExtensionUpdate( array( |
225 | | - 'addTable', |
226 | | - 'el_archive_link_blacklist', |
227 | | - $path . '/setuptables.sql', |
228 | | - true |
229 | | - )); |
230 | 206 | return true; |
231 | 207 | } |
232 | 208 | } |
233 | 209 | |
234 | 210 | class InsertURLsIntoQueue extends Job { |
235 | | - public function __construct( $title, $params ) { |
236 | | - // Replace synchroniseThreadArticleData with the an identifier for your job. |
237 | | - parent::__construct( 'insertURLsIntoQueue', $title, $params ); |
238 | | - } |
239 | | - |
240 | | - |
241 | | - public function run() { |
242 | | - |
243 | | - } |
244 | | -} |
\ No newline at end of file |
| 211 | + public function __construct( $title, $params ) { |
| 212 | + // Replace synchroniseThreadArticleData with an identifier for your job.
| 213 | + parent::__construct( 'insertURLsIntoQueue', $title, $params ); |
| 214 | + } |
| 215 | + |
| 216 | + public function run() { |
| 217 | + |
| 218 | + } |
| 219 | +} |
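The doc comments above name the ArticleSaveComplete and LoadExtensionSchemaUpdates hooks explicitly, and the link-rewriting method matches the shape of the LinkerMakeExternalLink hook (it fills $link and returns false), though its name does not appear in this hunk. A sketch of the corresponding $wgHooks wiring, which would live in the setup file:

	$wgHooks['ArticleSaveComplete'][] = 'ArchiveLinks::queueExternalLinks';
	$wgHooks['LoadExtensionSchemaUpdates'][] = 'ArchiveLinks::schemaUpdates';
	// Method name assumed; only its doc comment is visible in this diff:
	$wgHooks['LinkerMakeExternalLink'][] = 'ArchiveLinks::rewriteExternalLink';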
Index: trunk/extensions/ArchiveLinks/spider.php |
— | — | @@ -12,31 +12,33 @@ |
13 | 13 | |
14 | 14 | class ArchiveLinksSpider extends Maintenance { |
15 | 15 | |
16 | | - private $db_master; |
17 | | - private $db_slave; |
| 16 | + /** |
| 17 | + * @var DatabaseBase |
| 18 | + */ |
| 19 | + private $db_master, $db_slave; |
18 | 20 | private $db_result; |
19 | 21 | private $jobs; |
20 | 22 | private $downloaded_files; |
21 | 23 | |
22 | 24 | /** |
23 | | - * Primary function called from Maintenance.php to run the actual spider. |
| 25 | + * Primary function called from Maintenance.php to run the actual spider. |
24 | 26 | * Queries the queue and then downloads and stores each link for which archival |
25 | 27 | * has been requested |
26 | | - * |
| 28 | + * |
27 | 29 | * @global $wgArchiveLinksConfig array |
28 | 30 | * @global $wgLoadBalancer object |
29 | 31 | * @global $path string Install path of mediawiki |
30 | 32 | * @return bool |
31 | 33 | */ |
32 | 34 | public function execute( ) { |
33 | | - global $wgArchiveLinksConfig, $wgLoadBalancer, $path; |
| 35 | + global $wgArchiveLinksConfig; |
34 | 36 | |
35 | 37 | $this->db_master = $this->getDB(DB_MASTER); |
36 | 38 | $this->db_slave = $this->getDB(DB_SLAVE); |
37 | 39 | $this->db_result = array(); |
38 | 40 | |
39 | 41 | if ( $wgArchiveLinksConfig['run_spider_in_loop'] ) { |
40 | | - /* while ( TRUE ) { |
| 42 | + /* while ( TRUE ) { |
41 | 43 | if ( ( $url = $this->check_queue() ) !== false ) { |
42 | 44 | |
43 | 45 | } |
— | — | @@ -47,11 +49,11 @@ |
48 | 50 | //for right now we will pipe everything through the replication_check_queue function just for testing purposes |
49 | 51 | /*if ( $wgLoadBalancer->getServerCount() > 1 ) { |
50 | 52 | if ( ( $url = $this->replication_check_queue() ) !== false ) { |
51 | | - |
| 53 | + |
52 | 54 | } |
53 | 55 | } else { |
54 | 56 | if ( ( $url = $this->check_queue() ) !== false ) { |
55 | | - |
| 57 | + |
56 | 58 | } |
57 | 59 | }*/ |
58 | 60 | |
— | — | @@ -71,19 +73,19 @@ |
72 | 74 | } |
73 | 75 | return null; |
74 | 76 | } |
75 | | - |
| 77 | + |
76 | 78 | /** |
77 | 79 | * This function goes and checks to make sure the configuration values are valid |
78 | 80 | * Then calls wget, finds the result and updates the appropiate database tables to |
79 | | - * record it. |
80 | | - * |
| 81 | + * record it. |
| 82 | + * |
81 | 83 | * @global $wgArchiveLinksConfig array |
82 | 84 | * @global $path string |
83 | 85 | * @param $url string the URL that is to be archvied |
84 | 86 | */ |
85 | 87 | private function call_wget( $url ) { |
86 | 88 | global $wgArchiveLinksConfig, $path; |
87 | | - |
| 89 | + |
88 | 90 | //Check Configuration |
89 | 91 | if ( isset( $wgArchiveLinksConfig['file_types'] ) ) { |
90 | 92 | if ( is_array( $wgArchiveLinksConfig['file_types']) ){ |
— | — | @@ -105,8 +107,8 @@ |
106 | 108 | } elseif ( isset( $wgArchiveLinksConfig['content_path'] ) ) { |
107 | 109 | $dir = realpath( $wgArchiveLinksConfig['content_path'] ); |
108 | 110 | if ( !$dir ) { |
109 | | - die ( 'The path you have set for $wgArchiveLinksConfig[\'content_path\'] does not exist. ' . |
110 | | - 'This makes the spider a very sad panda. Please either create it or use a different setting.'); |
| 111 | + $this->error ( 'The path you have set for $wgArchiveLinksConfig[\'content_path\'] does not exist. ' . |
| 112 | + 'This makes the spider a very sad panda. Please either create it or use a different setting.'); |
111 | 113 | } |
112 | 114 | } else { |
113 | 115 | $dir = $path . '/archived_content/'; |
— | — | @@ -130,8 +132,8 @@ |
131 | 133 | //several minutes to go through all the retries, which has the potential to stall the spider unnecessarily
132 | 134 | $wgArchiveLinksConfig['retry_times'] = '3'; |
133 | 135 | } |
134 | | - |
135 | | - |
| 136 | + |
| 137 | + |
136 | 138 | //Do stuff with wget |
137 | 139 | if ( isset( $wgArchiveLinksConfig['wget_path'] ) && file_exists( $wgArchiveLinksConfig['wget_path'] ) ) { |
138 | 140 | die ( 'Support is not yet added for wget in a different directory' ); |
— | — | @@ -142,22 +144,22 @@ |
143 | 145 | $this->parse_wget_log( $log_dir, $url ); |
144 | 146 | /*foreach( $this->downloaded_files as $file ) { |
145 | 147 | if ( $file['status'] === 'success' ) { |
146 | | - |
| 148 | + |
147 | 149 | } elseif ( $file['status'] === 'failure' ) { |
148 | 150 | echo 'bar'; |
149 | 151 | } |
150 | 152 | }*/ |
151 | | - $this->db_master->insert( $this->downloaded_files[0]['url'] ); |
| 153 | + $this->db_master->insert( $this->downloaded_files[0]['url'] ); // FIXME: Missing parameters |
152 | 154 | } else { |
153 | 155 | //this is primarily designed with windows in mind and no built in wget, so yeah, *nix support should be added, in other words note to self... |
154 | | - die ( 'wget must be installed in order for the spider to function in wget mode' ); |
| 156 | + $this->error( 'wget must be installed in order for the spider to function in wget mode' ); |
155 | 157 | } |
156 | 158 | } |
157 | 159 | |
158 | 160 | /** |
159 | 161 | * This function checks the archive queue without any attempt to work around replag. |
160 | 162 | * Only one URL is taken at a time. |
161 | | - * |
| 163 | + * |
162 | 164 | * @return mixed The URL to archive on success, False on failure |
163 | 165 | */ |
164 | 166 | private function check_queue( ) { |
— | — | @@ -166,12 +168,12 @@ |
167 | 169 | array( 'delay_time' => ' >=' . time(), 'in_progress' => '0'), |
168 | 170 | __METHOD__, |
169 | 171 | array( 'ORDER BY' => 'queue_id ASC', 'LIMIT' => '1' )); |
170 | | - |
| 172 | + |
171 | 173 | if ( $this->db_result['job-fetch']->numRows() > 0 ) { |
172 | 174 | $row = $this->db_result['job-fetch']->fetchRow(); |
173 | | - |
174 | | - //$this->delete_dups( $row['url'] ); |
175 | 175 | |
| 176 | + //$this->delete_dups( $row['url'] ); |
| 177 | + |
176 | 178 | return $row['url']; |
177 | 179 | } else { |
178 | 180 | //there are no jobs to do right now |
— | — | @@ -181,10 +183,10 @@ |
182 | 184 | |
183 | 185 | /** |
184 | 186 | * This function checks a local file for a local block of jobs to be done;
185 | | - * if there is none that exists it gets a block, creates one, and waits for the |
| 187 | + * if none exists, it gets a block, creates one, and waits for the
186 | 188 | * data to propagate to avoid any replag problems. URLs are not returned directly
187 | 189 | * but are put into $this->jobs. |
188 | | - * |
| 190 | + * |
189 | 191 | * @return bool |
190 | 192 | */ |
191 | 193 | private function replication_check_queue( ) { |
— | — | @@ -194,28 +196,28 @@ |
195 | 197 | $file = unserialize( $file ); |
196 | 198 | } else { |
197 | 199 | //we don't have any temp file, lets get a block of jobs to do and make one |
198 | | - $this->db_result['job-fetch'] = $this->db_slave->select( 'el_archive_queue', '*', |
| 200 | + $this->db_result['job-fetch'] = $this->db_slave->select( 'el_archive_queue', '*', |
199 | 201 | array( |
200 | 202 | 'delay_time <= "' . time() . '"', |
201 | 203 | 'in_progress' => '0') |
202 | | - , __METHOD__, |
| 204 | + , __METHOD__, |
203 | 205 | array( |
204 | 206 | 'LIMIT' => '15', |
205 | 207 | 'ORDER BY' => 'queue_id ASC' |
206 | 208 | )); |
207 | 209 | //echo $this->db_result['job-fetch']; |
208 | | - |
| 210 | + |
209 | 211 | $this->jobs = array(); |
210 | 212 | |
211 | 213 | $wait_time = wfGetLB()->safeGetLag( $this->db_slave ) * 3; |
212 | 214 | $pid = (string) microtime() . ' - ' . getmypid(); |
213 | 215 | $time = time(); |
214 | | - |
| 216 | + |
215 | 217 | //echo $pid; |
216 | | - |
| 218 | + |
217 | 219 | $this->jobs['pid'] = $pid; |
218 | 220 | $this->jobs['execute_time'] = $wait_time + $time; |
219 | | - |
| 221 | + |
220 | 222 | if ($this->db_result['job-fetch']->numRows() > 0) { |
221 | 223 | //$row = $this->db_result['job-fetch']->fetchRow(); |
222 | 224 | while ( $row = $this->db_result['job-fetch']->fetchRow() ) { |
— | — | @@ -227,40 +229,40 @@ |
228 | 230 | } else { |
229 | 231 | //in_progress is not equal to 0, this means that the job was reserved some time before |
230 | 232 | //it could have been by a previous instance of this spider (assuming not running in a loop) |
231 | | - //or a different spider entirely, since we don't have have a temp file to go on we have to assume |
| 233 | + //or a different spider entirely; since we don't have a temp file to go on, we have to assume
232 | 234 | //it was a different spider (it could have been deleted by a user), we will only ignore the in_progress |
233 | 235 | //lock if it has been a long time (2 hours by default) since the job was initally reserved |
234 | 236 | $reserve_time = explode( ' ', $row['in_progress'] ); |
235 | 237 | $reserve_time = $reserve_time[2]; |
236 | | - |
| 238 | + |
237 | 239 | isset( $wgArchiveLinksConfig['in_progress_ignore_delay'] ) ? $ignore_in_prog_time = $wgArchiveLinksConfig['in_progress_ignore_delay'] : |
238 | 240 | $ignore_in_prog_time = 7200; |
239 | | - |
| 241 | + |
240 | 242 | if ( $time - $reserve_time - $wait_time > $ignore_in_prog_time ) { |
241 | 243 | $retval = $this->reserve_job( $row ); |
242 | 244 | } |
243 | 245 | } |
244 | | - |
| 246 | + |
245 | 247 | } else { |
246 | 248 | //let's wait for everything to replicate, add to temp file and check back later |
247 | 249 | $this->jobs[] = $row; |
248 | 250 | } |
249 | 251 | } |
250 | 252 | } |
251 | | - |
| 253 | + |
252 | 254 | //var_dump( $this->jobs ); |
253 | | - |
| 255 | + |
254 | 256 | $this->jobs = serialize( $this->jobs ); |
255 | 257 | //file_put_contents( "$path/extensions/ArchiveLinks/spider-temp.txt", $this->jobs ); |
256 | 258 | } |
257 | | - |
| 259 | + |
258 | 260 | if ( $retval !== true ) { |
259 | 261 | $retval = false; |
260 | 262 | } |
261 | 263 | return $retval; |
262 | 264 | } |
263 | | - |
264 | | - |
| 265 | + |
| 266 | + |
265 | 267 | /** |
266 | 268 | * This function checks for duplicates in the queue table; if it finds one, it keeps the oldest and deletes
267 | 269 | * everything else. |
— | — | @@ -270,9 +272,9 @@ |
271 | 273 | private function delete_dups( $url ) { |
272 | 274 | //Since we queried the slave to check for dups when we inserted instead of the master, let's check
273 | 275 | //that the job isn't in the queue twice, we don't want to archive it twice |
274 | | - $this->db_result['dup-check'] = $this->db_slave->select('el_archive_queue', '*', array( 'url' => $url ), __METHOD__, |
| 276 | + $this->db_result['dup-check'] = $this->db_slave->select('el_archive_queue', '*', array( 'url' => $url ), __METHOD__, |
275 | 277 | array( 'ORDER BY' => 'queue_id ASC' ) ); |
276 | | - |
| 278 | + |
277 | 279 | if ( $this->db_result['dup-check']->numRows() > 1 ) { |
278 | 280 | //keep only the first job and remove all duplicates |
279 | 281 | $this->db_result['dup-check']->fetchRow(); |
— | — | @@ -281,16 +283,16 @@ |
282 | 284 | var_dump( $del_row ); |
283 | 285 | //this is commented for testing purposes, so I don't have to keep re-adding the duplicate to my test db
284 | 286 | //in other words this has a giant "remove before flight" ribbon hanging from it... |
285 | | - //$this->db_master->delete( 'el_archive_queue', '`el_archive_queue`.`queue_id` = ' . $del_row['queue_id'] ); |
| 287 | + //$this->db_master->delete( 'el_archive_queue', '`el_archive_queue`.`queue_id` = ' . $del_row['queue_id'] ); |
286 | 288 | } |
287 | 289 | } |
288 | 290 | } |
289 | | - |
290 | | - |
| 291 | + |
| 292 | + |
291 | 293 | /** |
292 | 294 | * This function sets in_progress in the queue table to 1 so other instances of the spider know that
293 | 295 | * the job is in the process of being archived. |
294 | | - * |
| 296 | + * |
295 | 297 | * @param $row array The row of the database result from the database object. |
296 | 298 | * @return bool |
297 | 299 | */ |
— | — | @@ -302,7 +304,7 @@ |
303 | 305 | $this->delete_dups( $row['url'] ); |
304 | 306 | return true; |
305 | 307 | } |
306 | | - |
| 308 | + |
307 | 309 | /** |
308 | 310 | * Uses regular expressions to parse the log file of wget in non-verbose mode |
309 | 311 | * This is then returned to call_wget and updated in the db |
— | — | @@ -313,10 +315,10 @@ |
314 | 316 | */ |
315 | 317 | private function parse_wget_log( $log_path, $url ) { |
316 | 318 | $fp = fopen( $log_path, 'r' ) or die( 'can\'t find wget log file to parse' ); |
317 | | - |
| 319 | + |
318 | 320 | $this->downloaded_files = array ( ); |
319 | | - |
320 | | - $line_regexes = array ( |
| 321 | + |
| 322 | + $line_regexes = array ( |
321 | 323 | 'url' => '%^\d{4}-(?:\d{2}(?:-|:| )?){5}URL:(http://.*?) \[.+?\] ->%', |
322 | 324 | 'finish' => '%^Downloaded: \d+ files, (\d(?:.\d)?+(?:K|M)).*%', |
323 | 325 | 'sole_url' => '%^(http://.*):%', |
— | — | @@ -324,7 +326,7 @@ |
325 | 327 | 'quota_exceed' => '%^Download quota of .*? EXCEEDED!%', |
326 | 328 | 'finish_line' => '%^FINISHED --(\d{4}-(?:\d{2}(?:-|:| )){5})-%', |
327 | 329 | ); |
328 | | - |
| 330 | + |
329 | 331 | while ( $line = fgets( $fp ) ) { |
330 | 332 | foreach( $line_regexes as $line_type => $regex ) { |
331 | 333 | if ( preg_match( $regex, $line, $matches ) ) { |
— | — | @@ -365,7 +367,7 @@ |
366 | 368 | } |
367 | 369 | } |
368 | 370 | } |
369 | | - |
| 371 | + |
370 | 372 | return $this->downloaded_files; |
371 | 373 | } |
372 | 374 | } |
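Since ArchiveLinksSpider extends Maintenance, the spider would be run like any other MediaWiki maintenance script; the invocation below is the standard pattern rather than anything documented in this diff, and the sample log line illustrates the shape the 'url' regex in parse_wget_log() is written to match (values invented).

	# Assumed invocation, following the usual Maintenance bootstrap:
	#   php extensions/ArchiveLinks/spider.php
	#
	# A non-verbose wget log line of the kind parse_wget_log() expects:
	#   2011-07-25 12:00:01 URL:http://example.com/ [5621/5621] -> "index.html" [1]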
Index: trunk/extensions/ArchiveLinks/README |
— | — | @@ -1 +1 @@ |
2 | | -This a project currently under devolopment to add premementive archival to external links so that in the event that they go down a backup copy will exist. At the current time it is NOT stable and should not be used on any production wiki. |
\ No newline at end of file |
| 2 | +This is a project currently under development to add preemptive archival to external links so that, in the event they go down, a backup copy will exist. At the current time it is NOT stable and should not be used on any production wiki.