Index: trunk/extensions/ArchiveLinks/spider.php |
— | — | @@ -16,6 +16,7 @@ |
17 | 17 | private $db_slave; |
18 | 18 | private $db_result; |
19 | 19 | private $jobs; |
| 20 | + private $downloaded_files; |
20 | 21 | |
21 | 22 | public function execute( ) { |
22 | 23 | global $wgArchiveLinksConfig, $wgLoadBalancer, $path; |
— | — | @@ -63,56 +64,71 @@ |
64 | 65 | |
65 | 66 | private function call_wget( $url ) { |
66 | 67 | global $wgArchiveLinksConfig, $path; |
| 68 | + |
| 69 | + //Check Configuration |
| 70 | + if ( isset( $wgArchiveLinksConfig['file_types'] ) ) { |
| 71 | + if ( is_array( $wgArchiveLinksConfig['file_types']) ){ |
| 72 | + $accept_file_types = '-A ' . implode( ',', $wgArchiveLinksConfig['file_types'] ); |
| 73 | + } else { |
| 74 | + $accept_file_types = '-A ' . $wgArchiveLinksConfig['file_types']; |
| 75 | + } |
| 76 | + } else { |
| 77 | + //we should set a default, for now we will disable this for testing purposes, but this should be closed sometime later... |
| 78 | + $accept_file_types = ''; |
| 79 | + } |
| 80 | + //At the current time we are only adding support for the local filestore, but swift support is something that will be added later |
| 81 | + //Add shutup operator for PHP notice, it's okay if this is not set as it's an optional config value |
| 82 | + switch( @$wgArchiveLinksConfig['filestore'] ) { |
| 83 | + case 'local': |
| 84 | + default: |
| 85 | + if ( isset( $wgArchiveLinksConfig['subfolder_name'] ) ) { |
| 86 | + $dir = $path . $wgArchiveLinksConfig['subfolder_name']; |
| 87 | + } elseif ( isset( $wgArchiveLinksConfig['content_path'] ) ) { |
| 88 | + $dir = realpath( $wgArchiveLinksConfig['content_path'] ); |
| 89 | + if ( !$dir ) { |
| 90 | + die ( 'The path you have set for $wgArchiveLinksConfig[\'content_path\'] does not exist. ' . |
| 91 | + 'This makes the spider a very sad panda. Please either create it or use a different setting.'); |
| 92 | + } |
| 93 | + } else { |
| 94 | + $dir = $path . '/archived_content/'; |
| 95 | + } |
 | 96 | + 					$dir = $dir . '/' . sha1( time() . ' - ' . $url ); |
 | 97 | + 					mkdir( $dir, 0755, TRUE ); |
| 98 | + $log_dir = $dir . '/log.txt'; |
| 99 | + $log_dir_esc = escapeshellarg($log_dir); |
| 100 | + $dir = escapeshellarg( $dir ); |
| 101 | + $sanitized_url = escapeshellarg( $url ); |
| 102 | + } |
| 103 | + |
| 104 | + if ( ! isset( $wgArchiveLinksConfig['wget_quota'] ) ) { |
| 105 | + //We'll set the default max quota for any specific web page for 8 mb, which is kind of a lot but should allow for large images |
| 106 | + $wgArchiveLinksConfig['wget_quota'] = '8m'; |
| 107 | + } |
| 108 | + |
| 109 | + if ( !isset( $wgArchiveLinksConfig['retry_times'] ) ) { |
| 110 | + //by default wget is set to retry something 20 times which is probably *way* too high for our purposes |
| 111 | + //this has the potential to really slow it down as --waitretry is set to 10 seconds by default, meaning that it would take |
 | 112 | + 			//several minutes to go through all the retries which has the potential to stall the spider unnecessarily |
| 113 | + $wgArchiveLinksConfig['retry_times'] = '3'; |
| 114 | + } |
| 115 | + |
| 116 | + |
| 117 | + //Do stuff with wget |
67 | 118 | if ( isset( $wgArchiveLinksConfig['wget_path'] ) && file_exists( $wgArchiveLinksConfig['wget_path'] ) ) { |
68 | 119 | die ( 'Support is not yet added for wget in a different directory' ); |
69 | 120 | } elseif ( file_exists( "$path/wget.exe" ) ) { |
70 | | - if ( isset( $wgArchiveLinksConfig['file_types'] ) ) { |
71 | | - if ( is_array( $wgArchiveLinksConfig['file_types']) ){ |
72 | | - $accept_file_types = '-A ' . implode( ',', $wgArchiveLinksConfig['file_types'] ); |
73 | | - } else { |
74 | | - $accept_file_types = '-A ' . $wgArchiveLinksConfig['file_types']; |
| 121 | + wfShellExec( "cd $path" ); |
 | 122 | + 			//echo "\n\nwget.exe -nv -p -H -E -k -t {$wgArchiveLinksConfig['retry_times']} -Q {$wgArchiveLinksConfig['wget_quota']} -o $log_dir_esc -P $dir $accept_file_types $sanitized_url\n\n"; |
| 123 | + wfShellExec( "wget.exe -nv -p -H -E -k -t {$wgArchiveLinksConfig['retry_times']} -Q {$wgArchiveLinksConfig['wget_quota']} -o $log_dir_esc -P $dir $accept_file_types $sanitized_url" ); |
| 124 | + $this->parse_wget_log( $log_dir, $url ); |
| 125 | + /*foreach( $this->downloaded_files as $file ) { |
| 126 | + if ( $file['status'] === 'success' ) { |
| 127 | + |
| 128 | + } elseif ( $file['status'] === 'failure' ) { |
| 129 | + echo 'bar'; |
75 | 130 | } |
76 | | - } else { |
77 | | - //we should set a default, for now we will disable this for testing purposes, but this should be closed sometime later... |
78 | | - $accept_file_types = ''; |
79 | | - } |
80 | | - //At the current time we are only adding support for the local filestore, but swift support is something that will be added later |
81 | | - //Add shutup operator for PHP notice, it's okay if this is not set as it's an optional config value |
82 | | - switch( @$wgArchiveLinksConfig['filestore'] ) { |
83 | | - case 'local': |
84 | | - default: |
85 | | - if ( isset( $wgArchiveLinksConfig['subfolder_name'] ) ) { |
86 | | - $dir = $path . $wgArchiveLinksConfig['subfolder_name']; |
87 | | - } elseif ( isset( $wgArchiveLinksConfig['content_path'] ) ) { |
88 | | - $dir = realpath( $wgArchiveLinksConfig['content_path'] ); |
89 | | - if ( !$dir ) { |
90 | | - die ( 'The path you have set for $wgArchiveLinksConfig[\'content_path\'] does not exist. ' . |
91 | | - 'This makes the spider a very sad panda. Please either create it or use a different setting.'); |
92 | | - } |
93 | | - } else { |
94 | | - $dir = $path . '/archived_content/'; |
95 | | - } |
96 | | - $dir = $dir . sha1( time() . ' - ' . $url ); |
97 | | - mkdir( $dir, 0644, TRUE ); |
98 | | - $dir = escapeshellarg( $dir ); |
99 | | - $sanitized_url = escapeshellarg( $url ); |
100 | | - } |
101 | | - |
102 | | - if ( ! isset( $wgArchiveLinksConfig['wget_quota'] ) ) { |
103 | | - //We'll set the default max quota for any specific web page for 8 mb, which is kind of a lot but should allow for large images |
104 | | - $quota = '8m'; |
105 | | - } |
106 | | - |
107 | | - if ( !isset( $wgArchiveLinksConfig['retry_times'] ) ) { |
108 | | - //by default wget is set to retry something 20 times which is probably *way* too high for our purposes |
109 | | - //this has the potential to really slow it down as --waitretry is set to 10 seconds by default, meaning that it would take |
110 | | - //serveral minutes to go through all the retries which has the potential to stall the spider unnecessarily |
111 | | - $wgArchiveLinksConfig['retry_times'] = '3'; |
112 | | - } |
113 | | - |
114 | | - shell_exec( "cd $path" ); |
115 | | - shell_exec( "wget.exe -nv -p -H -E -k -t {$wgArchiveLinksConfig['retry_times']} -Q{$wgArchiveLinksConfig['retry_times']} -o $dir/log.txt -P $dir $accept_file_types $sanitized_url" ); |
116 | | - $this->parse_wget_log( "$dir/log.txt", $url ); |
| 131 | + }*/ |
| 132 | + $this->db_master->insert( $this->downloaded_files[0]['url'] ); |
117 | 133 | } else { |
118 | 134 | //this is primarily designed with windows in mind and no built in wget, so yeah, *nix support should be added, in other words note to self... |
119 | 135 | die ( 'wget must be installed in order for the spider to function in wget mode' ); |
— | — | @@ -121,11 +137,11 @@ |
122 | 138 | |
123 | 139 | private function check_queue( ) { |
124 | 140 | //need to fix this to use arrays instead of what I'm doing now |
125 | | - $this->db_result['job-fetch'] = $this->db_slave->select('el_archive_queue', '*', '`el_archive_queue`.`delay_time` <= ' . time() |
126 | | - . ' AND `el_archive_queue`.`in_progress` = 0' |
127 | | - . ' ORDER BY `el_archive_queue`.`queue_id` ASC' |
128 | | - . ' LIMIT 1'); |
129 | | - |
| 141 | + $this->db_result['job-fetch'] = $this->db_slave->select( 'el_archive_queue', '*', |
 | 142 | + 				array( 'delay_time <= ' . time(), 'in_progress' => '0'), |
| 143 | + __METHOD__, |
| 144 | + array( 'ORDER BY' => 'queue_id ASC', 'LIMIT' => '1' )); |
| 145 | + |
130 | 146 | if ( $this->db_result['job-fetch']->numRows() > 0 ) { |
131 | 147 | $row = $this->db_result['job-fetch']->fetchRow(); |
132 | 148 | |
— | — | @@ -244,37 +260,52 @@ |
245 | 261 | } |
246 | 262 | |
247 | 263 | private function parse_wget_log( $log_path, $url ) { |
248 | | - $fp = fopen( $log_path, 'r' ) or die( 'can\'t find wget log file to parse' ); |
| 264 | + //We have a die statement here, PHP error unnecessary |
| 265 | + @$fp = fopen( $log_path, 'r' ) or die( 'can\'t find wget log file to parse' ); |
249 | 266 | |
250 | | - $downloaded_files = array ( 'failed' => array(), 'success' => array() ); |
| 267 | + $this->downloaded_files = array ( ); |
251 | 268 | |
| 269 | + $line_regexes = array ( |
| 270 | + 'url' => '%^\d{4}-(?:\d{2}(?:-|:| )?){5}URL:(http://.*?) \[.+?\] ->%', |
 | 271 | + 			'finish' => '%^Downloaded: \d+ files, (\d+(?:\.\d+)?(?:K|M)).*%', |
| 272 | + 'sole_url' => '%^(http://.*):%', |
| 273 | + 'error' => '%^\d{4}-(?:\d{2}-?){2} (?:\d{2}:?){3} ERROR (\d){3}:(.+)%', |
| 274 | + 'quota_exceed' => '%^Download quota of .*? EXCEEDED!%', |
| 275 | + 'finish_line' => '%^FINISHED --(\d{4}-(?:\d{2}(?:-|:| )){5})-%', |
| 276 | + ); |
| 277 | + |
252 | 278 | while ( $line = fgets( $fp ) ) { |
253 | | - $line_regexes = array ( |
254 | | - 'url' => '%\^d{4}-(?:\d{2}-?){2} (?:\d{2}:?){3} URL:(http://.*) \[.+\] ->%', |
255 | | - 'finish' => '%^Downloaded: \d+ files, (\d+(?:K|M)).*%', |
256 | | - 'sole_url' => '%^(http://.*):%', |
257 | | - 'error' => '%^\d{4}-(?:\d{2}-?){2} (?:\d{2}:?){3} ERROR (\d){3}:(.+)%', |
258 | | - |
259 | | - ); |
260 | 279 | foreach( $line_regexes as $line_type => $regex ) { |
261 | 280 | if ( preg_match( $regex, $line, $matches ) ) { |
262 | 281 | switch ( $line_type ) { |
263 | 282 | case 'url': |
264 | | - $downloaded_files['success'][] = $matches[1]; |
| 283 | + $this->downloaded_files[] = array ( |
| 284 | + 'status' => 'success', |
| 285 | + 'url' => $matches[1] |
| 286 | + ); |
265 | 287 | $last_line = 'url'; |
266 | 288 | break; |
267 | 289 | case 'sole_url': |
268 | | - $downloaded_files['failed'][]['url'] = $matches[1]; |
| 290 | + $this->downloaded_files[] = array ( |
| 291 | + 'status' => 'failed', |
| 292 | + 'url' => $matches[1] |
| 293 | + ); |
269 | 294 | break; |
270 | 295 | case 'error': |
271 | | - end( $downloaded_files['failed'] ); |
272 | | - $array_key = key( $downloaded_files['failed'] ); |
273 | | - $downloaded_files['failed'][$array_key]['error_code'] = $matches[1]; |
274 | | - $downloaded_files['failed'][$array_key]['error_text'] = $matches[2]; |
 | 296 | + 						//this is a continuation of the previous line, so just add stuff to that |
| 297 | + end( $this->downloaded_files ); |
| 298 | + $array_key = key( $this->downloaded_files ); |
| 299 | + $this->downloaded_files[$array_key]['error_code'] = $matches[1]; |
| 300 | + $this->downloaded_files[$array_key]['error_text'] = $matches[2]; |
275 | 301 | break; |
276 | 302 | case 'finish': |
277 | 303 | $finish_time = $matches[1]; |
278 | 304 | break; |
| 305 | + case 'finish_line': |
| 306 | + //this is kind of useless, it contains the date/time stamp of when the download finished |
| 307 | + break; |
 | 308 | + 					case 'quota_exceed': |
| 309 | + break; |
279 | 310 | default: |
280 | 311 | //we missed a line type, this is mainly for testing purposes and shouldn't happen when parsing the log |
281 | 312 | echo "\n\nUNKNOWN LINE: $line\n\n"; |
— | — | @@ -283,6 +314,8 @@ |
284 | 315 | } |
285 | 316 | } |
286 | 317 | } |
| 318 | + |
| 319 | + return $this->downloaded_files; |
287 | 320 | } |
288 | 321 | } |
289 | 322 | |