Index: trunk/extensions/ArchiveLinks/SpecialViewArchive.php |
— | — | @@ -12,6 +12,14 @@ |
13 | 13 | parent::__construct( 'ViewArchive' ); |
14 | 14 | } |
15 | 15 | |
| 16 | + /** |
| 17 | + * Main function for the view archive page. This queries the resource table, disables
| 18 | + * normal output, and then displays the archived version of the requested page.
| 19 | + * |
| 20 | + * @global $wgOut object |
| 21 | + * @global $wgRequest object |
| 22 | + * @param $par string|null Subpage component of the title, passed by SpecialPage
| 23 | + */ |
16 | 24 | public function execute( $par ) { |
17 | 25 | global $wgOut, $wgRequest; |
18 | 26 | |
— | — | @@ -74,6 +82,12 @@ |
75 | 83 | } |
76 | 84 | } |
77 | 85 | |
| 86 | + /** |
| 87 | + * Uses the HTML functions to output the appropriate form for the special page if no archived
| 88 | + * version exists or if the user has not yet specified a query.
| 89 | + * |
| 90 | + * @global $wgOut object |
| 91 | + */ |
78 | 92 | private function output_form( ) { |
79 | 93 | global $wgOut; |
80 | 94 | $this->setHeaders(); |
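
For context on the execute() documentation above: Special:ViewArchive receives the page to view either through the $par subpage component or from the web request, so a request would look roughly like the sketch below (the exact query parameter name is an assumption, not confirmed by this diff):

    index.php?title=Special:ViewArchive&archive_url=http://example.com/
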
Index: trunk/extensions/ArchiveLinks/ArchiveLinks.php |
— | — | @@ -27,10 +27,27 @@ |
28 | 28 | $wgAutoloadClasses['ApiQueryArchiveFeed'] = "$path/ApiQueryArchiveFeed.php"; |
29 | 29 | $wgAPIListModules['archivefeed'] = 'ApiQueryArchiveFeed'; |
30 | 30 | |
| 31 | +// Tests |
| 32 | +/*$wgHooks['UnitTestsList'][] = 'efArchiveLinksUnitTests'; |
| 33 | + |
| 34 | +function efArchiveLinksUnitTests( &$files ) { |
| 35 | + $files[] = dirname( __FILE__ ) . '/ArchiveLinksTests.php'; |
| 36 | + return true; |
| 37 | +}*/ |
| 38 | + |
31 | 39 | $wgArchiveLinksConfig = array( |
32 | 40 | 'archive_service' => 'internet_archive', |
33 | 41 | 'use_multiple_archives' => false, |
34 | 42 | 'run_spider_in_loop' => false, |
35 | 43 | 'in_progress_ignore_delay' => 7200, |
36 | | - 'generate_feed' => false, |
| 44 | + 'generate_feed' => true, |
| 45 | +); |
| 46 | + |
| 47 | +$wgExtensionCredits['other'][] = array( |
| 48 | + 'path' => __FILE__, |
| 49 | + 'name' => 'ArchiveLinks', |
| 50 | + 'description' => 'Enables archival of external links on the wiki to prevent linkrot.', |
| 51 | + 'version' => '0.1', |
| 52 | + 'author' => 'Kevin Brown', |
| 53 | + 'url' => '', |
37 | 54 | ); |
\ No newline at end of file |
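
As context for the configuration block above, a minimal sketch of how a wiki would load the extension and override these defaults from LocalSettings.php (the include path is illustrative; the keys are the ones defined in $wgArchiveLinksConfig above):

    // LocalSettings.php -- illustrative sketch; adjust the path to your install
    require_once( "$IP/extensions/ArchiveLinks/ArchiveLinks.php" );

    // Override any of the defaults from $wgArchiveLinksConfig
    $wgArchiveLinksConfig['archive_service'] = 'wikiwix';
    $wgArchiveLinksConfig['generate_feed'] = false;
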
Index: trunk/extensions/ArchiveLinks/SpecialModifyArchiveBlacklist.php |
— | — | @@ -6,6 +6,14 @@ |
7 | 7 | parent::__construct('ModifyArchiveBlacklist'); |
8 | 8 | } |
9 | 9 | |
| 10 | + /** |
| 11 | + * Main function for the special page. At the current time this only produces HTML for the
| 12 | + * blacklist; it should eventually support updating the blacklist as well.
| 13 | + * |
| 14 | + * @global $wgOut object |
| 15 | + * @global $wgRequest object |
| 16 | + * @param $par string|null Subpage component of the title, passed by SpecialPage
| 17 | + */ |
10 | 18 | public function execute($par) { |
11 | 19 | global $wgOut, $wgRequest; |
12 | 20 | $this->setHeaders(); |
Index: trunk/extensions/ArchiveLinks/ApiQueryArchiveFeed.php |
— | — | @@ -5,6 +5,10 @@ |
6 | 6 | parent::__construct( $query, $moduleName, 'arl' ); |
7 | 7 | } |
8 | 8 | |
| 9 | + /** |
| 10 | + * This is the primary execute function for the API. It processes the query and returns |
| 11 | + * a valid API result. |
| 12 | + */ |
9 | 13 | public function execute ( ) { |
10 | 14 | $params = $this->extractRequestParams(); |
11 | 15 | |
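
Since ArchiveLinks.php above registers this module as a list module named 'archivefeed', the feed would be queried roughly as follows; any module-specific parameters would carry the 'arl' prefix set in the constructor (parameter names beyond list= are assumptions):

    api.php?action=query&list=archivefeed&format=json
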
Index: trunk/extensions/ArchiveLinks/ArchiveLinks.class.php |
— | — | @@ -8,7 +8,14 @@ |
9 | 9 | private $db_slave; |
10 | 10 | private $db_result; |
11 | 11 | |
12 | | - |
| 12 | + /** |
| 13 | + * This is the primary function for the ArchiveLinks extension.
| 14 | + * It is fired by the ArticleSaveComplete hook and is primarily responsible for updating
| 15 | + * the appropriate tables to begin the process of archival.
| 16 | + *
| 17 | + * @param $article object Article object passed by the ArticleSaveComplete hook
| 18 | + * @return bool
| 19 | + */ |
13 | 20 | public static function queueExternalLinks ( &$article ) { |
14 | 21 | global $wgParser, $wgArchiveLinksConfig; |
15 | 22 | $external_links = $wgParser->getOutput(); |
— | — | @@ -63,7 +70,7 @@ |
64 | 71 | if ( !isset( $wgArchiveLinksConfig['link_insert_max'] ) ) { |
65 | 72 | $wgArchiveLinksConfig['link_insert_max'] = 100; |
66 | 73 | } |
67 | 74 | 
68 | 75 | if ( count( $new_external_links ) <= $wgArchiveLinksConfig['link_insert_max'] ) { |
69 | 76 | //insert the links into the queue now |
70 | 77 | foreach( $new_external_links as $link ) { |
— | — | @@ -92,7 +99,6 @@ |
93 | 100 | } else { |
94 | 101 | //insert everything as a job and do the work later to avoid lagging page save |
95 | 102 | } |
96 | | - |
97 | 103 | } else { |
98 | 104 | foreach ( $external_links as $link => $unused_value ) { |
99 | 105 | //$db_result['resource'] = $db_slave->select( 'el_archive_resource', '*', '`el_archive_resource`.`resource_url` = "' . $db_slave->strencode( $link ) . '"'); |
— | — | @@ -126,8 +132,19 @@ |
127 | 133 | return true; |
128 | 134 | } |
129 | 135 | |
| 136 | + /** |
| 137 | + * This is the function responsible for rewriting the link HTML to insert the [cache] link
| 138 | + * after each external link on the page. It is called once for every external link.
| 139 | + * |
| 140 | + * @global $wgArchiveLinksConfig array |
| 141 | + * @param $url string The URL of the page (what would appear in href)
| 142 | + * @param $text string The associated text of the URL (what would go between the anchor tags)
| 143 | + * @param $link string The complete rewritten HTML of the link
| 144 | + * @param $attributes array Any attributes for the anchor tag |
| 145 | + * @return bool |
| 146 | + */ |
130 | 147 | public static function rewriteLinks ( &$url, &$text, &$link, &$attributes ) { |
131 | | - if ( array_key_exists('rel', $attributes) && $attributes['rel'] === 'nofollow' ) { |
| 148 | + if ( isset( $attributes['rel'] ) && $attributes['rel'] === 'nofollow' ) { |
132 | 149 | global $wgArchiveLinksConfig; |
133 | 150 | if ( $wgArchiveLinksConfig['use_multiple_archives'] ) { |
134 | 151 | //need to add support for more than one archival service at once |
— | — | @@ -135,19 +152,24 @@ |
136 | 153 | } else { |
137 | 154 | switch ( $wgArchiveLinksConfig['archive_service'] ) { |
138 | 155 | case 'local': |
139 | | - //We need to have something to figure out where the filestore is... |
140 | | - $link_to_archive = urlencode( substr_replace( $url, '', 0, 7 ) ); |
141 | | - break; |
| 156 | + //We need to have something to figure out where the filestore is... |
| 157 | + $link_to_archive = urlencode( substr_replace( $url, '', 0, 7 ) ); |
| 158 | + break; |
142 | 159 | case 'wikiwix': |
143 | | - $link_to_archive = 'http://archive.wikiwix.com/cache/?url=' . $url; |
144 | | - break; |
| 160 | + $link_to_archive = 'http://archive.wikiwix.com/cache/?url=' . $url; |
| 161 | + break; |
145 | 162 | case 'webcitation': |
146 | | - $link_to_archive = 'http://webcitation.org/query?url=' . $url; |
147 | | - break; |
| 163 | + $link_to_archive = 'http://webcitation.org/query?url=' . $url; |
| 164 | + break; |
148 | 165 | case 'internet_archive': |
149 | 166 | default: |
150 | | - $link_to_archive = 'http://wayback.archive.org/web/*/' . $url; |
151 | | - break; |
| 167 | + if ( /*$wgArchiveLinksConfig['generate_feed'] === true*/ false ) { |
| 168 | + $db_slave = wfGetDB( DB_SLAVE ); |
| 169 | + $db_slave->select( 'el_archive_link_history', '*' );
| 170 | + } else { |
| 171 | + $link_to_archive = 'http://wayback.archive.org/web/*/' . $url; |
| 172 | + } |
| 173 | + break; |
152 | 174 | } |
153 | 175 | } |
154 | 176 | |
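
To make the switch above concrete, here is the mapping it implements for a sample URL; these outputs follow directly from the cases above:

    // For $url = 'http://example.com/page':
    // 'wikiwix'          => http://archive.wikiwix.com/cache/?url=http://example.com/page
    // 'webcitation'      => http://webcitation.org/query?url=http://example.com/page
    // 'internet_archive' => http://wayback.archive.org/web/*/http://example.com/page (default)
    // 'local'            => 'example.com%2Fpage' (the URL with its leading 'http://' stripped, then urlencoded)
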
— | — | @@ -165,6 +187,13 @@ |
166 | 188 | } |
167 | 189 | } |
168 | 190 | |
| 191 | + /** |
| 192 | + * This function is responsible for any database schema updates within the extension and
| 193 | + * hooks into update.php.
| 194 | + * |
| 195 | + * @param $updater object DatabaseUpdater passed by the LoadExtensionSchemaUpdates hook
| 196 | + * @return bool |
| 197 | + */ |
169 | 198 | public static function schemaUpdates ( $updater = null ) { |
170 | 199 | $path = dirname( __FILE__ ); |
171 | 200 | $updater->addExtensionUpdate( array( |
Index: trunk/extensions/ArchiveLinks/spider.php |
— | — | @@ -18,6 +18,16 @@ |
19 | 19 | private $jobs; |
20 | 20 | private $downloaded_files; |
21 | 21 | |
| 22 | + /** |
| 23 | + * Primary function called from Maintenance.php to run the actual spider. |
| 24 | + * Queries the queue and then downloads and stores each link for which archival |
| 25 | + * has been requested.
| 26 | + * |
| 27 | + * @global $wgArchiveLinksConfig array |
| 28 | + * @global $wgLoadBalancer object |
| 29 | + * @global $path string Install path of MediaWiki
| 30 | + * @return bool |
| 31 | + */ |
22 | 32 | public function execute( ) { |
23 | 33 | global $wgArchiveLinksConfig, $wgLoadBalancer, $path; |
24 | 34 | |
— | — | @@ -62,6 +72,15 @@ |
63 | 73 | return null; |
64 | 74 | } |
65 | 75 | |
| 76 | + /** |
| 77 | + * This function checks that the configuration values are valid,
| 78 | + * then calls wget, parses the result, and updates the appropriate database tables to
| 79 | + * record it.
| 80 | + * |
| 81 | + * @global $wgArchiveLinksConfig array |
| 82 | + * @global $path string |
| 84 | + * @param $url string The URL that is to be archived
| 84 | + */ |
66 | 85 | private function call_wget( $url ) { |
67 | 86 | global $wgArchiveLinksConfig, $path; |
68 | 87 | |
— | — | @@ -135,6 +154,12 @@ |
136 | 155 | } |
137 | 156 | } |
138 | 157 | |
| 158 | + /** |
| 159 | + * This function checks the archive queue without any attempt to work around replag. |
| 160 | + * Only one URL is taken at a time. |
| 161 | + * |
| 162 | + * @return mixed The URL to archive on success, false on failure
| 163 | + */ |
139 | 164 | private function check_queue( ) { |
140 | 165 | //need to fix this to use arrays instead of what I'm doing now |
141 | 166 | $this->db_result['job-fetch'] = $this->db_slave->select( 'el_archive_queue', '*', |
— | — | @@ -156,7 +181,11 @@ |
157 | 182 | |
158 | 183 | /** |
159 | 184 | * This function checks a local file for a local block of jobs that is to be done |
160 | | - * if there is none that exists it gets a block, creates one, and waits to avoid any replag problems |
| 185 | + * if none exists it fetches a block, creates the file, and waits for the
| 186 | + * data to propagate to avoid any replag problems. URLs are not returned directly
| 187 | + * but are put into $this->jobs.
| 188 | + * |
| 189 | + * @return bool |
161 | 190 | */ |
162 | 191 | private function replication_check_queue( ) { |
163 | 192 | global $path, $wgArchiveLinksConfig; |
— | — | @@ -231,6 +260,13 @@ |
232 | 261 | return $retval; |
233 | 262 | } |
234 | 263 | |
| 264 | + |
| 265 | + /** |
| 266 | + * This function checks for duplicates in the queue table; if it finds one it keeps the oldest
| 267 | + * entry and deletes everything else.
| 268 | + *
| 269 | + * @param $url string The URL to check for duplicates
| 270 | + */ |
235 | 271 | private function delete_dups( $url ) { |
236 | 272 | //Since we querried the slave to check for dups when we insterted instead of the master let's check |
237 | 273 | //that the job isn't in the queue twice, we don't want to archive it twice |
— | — | @@ -250,6 +286,14 @@ |
251 | 287 | } |
252 | 288 | } |
253 | 289 | |
| 290 | + |
| 291 | + /** |
| 292 | + * This function sets in_progress in the queue table to 1 so other instances of the spider know
| 293 | + * that the job is in the process of being archived.
| 294 | + *
| 295 | + * @param $row array A row of the database result from the database object.
| 296 | + * @return bool |
| 297 | + */ |
254 | 298 | private function reserve_job( $row ) { |
255 | 299 | // this function was pulled out of replication_check_queue, need to fix the vars in here |
256 | 300 | $this->jobs['execute_urls'][] = $row['url']; |
— | — | @@ -259,6 +303,14 @@ |
260 | 304 | return true; |
261 | 305 | } |
262 | 306 | |
| 307 | + /** |
| 308 | + * Uses regular expressions to parse the log file produced by wget in non-verbose mode.
| 309 | + * The result is returned to call_wget and recorded in the database.
| 310 | + * |
| 311 | + * @param $log_path string Path to the wget log file |
| 312 | + * @param $url string URL of the page that was archived |
| 313 | + * @return array |
| 314 | + */ |
263 | 315 | private function parse_wget_log( $log_path, $url ) { |
264 | 316 | //We have a die statement here, PHP error unnecessary |
265 | 317 | @$fp = fopen( $log_path, 'r' ) or die( 'can\'t find wget log file to parse' ); |
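
For reference, wget's non-verbose (-nv) log lines take roughly the form

    2011-07-20 12:00:00 URL:http://example.com/ [1234/1234] -> "file" [1]

so the kind of match parse_wget_log performs might look like the following sketch; the pattern and variable names are illustrative assumptions, not the actual implementation:

    // Illustrative only: capture the fetched URL and saved path from a -nv log line
    if ( preg_match( '/URL:(\S+) \[\d+(?:\/\d+)?\] -> "(.+?)"/', $line, $matches ) ) {
        $fetched_url = $matches[1]; // the URL wget downloaded
        $saved_path = $matches[2];  // the path where wget stored the file
    }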