r91671 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91670‎ | r91671 | r91672 >
Date:19:45, 7 July 2011
Author:kbrown
Status:deferred (Comments)
Tags:
Comment:
* add a spidering function for wget in spider.php
* add rel="nofollow" to Cache links
Modified paths:
  • /trunk/extensions/ArchiveLinks/ArchiveLinks.class.php (modified) (history)
  • /trunk/extensions/ArchiveLinks/spider.php (modified) (history)

Diff [purge]

Index: trunk/extensions/ArchiveLinks/ArchiveLinks.class.php
@@ -79,7 +79,7 @@
8080 . HTML::openElement('sup')
8181 . HTML::openElement('small')
8282 . ' '
83 - . HTML::element('a', array ( 'href' => $link_to_archive ), '[' . wfMsg( 'archivelinks-cache-title') . ']')
 83+ . HTML::element('a', array ( 'rel' => 'nofollow', 'href' => $link_to_archive ), '[' . wfMsg( 'archivelinks-cache-title') . ']')
8484 . HTML::closeElement('small')
8585 . HTML::closeElement('sup');
8686
Index: trunk/extensions/ArchiveLinks/spider.php
@@ -5,7 +5,7 @@
66 */
77 $path = getenv('MW_INSTALL_PATH');
88 if (strval($path) === '') {
9 - $path = dirname(__FILE__) . '/../..';
 9+ $path = realpath( dirname(__FILE__) . '/../..' );
1010 }
1111
1212 require_once "$path/maintenance/Maintenance.php";
@@ -18,7 +18,7 @@
1919 private $jobs;
2020
2121 public function execute( ) {
22 - global $wgArchiveLinksConfig, $wgLoadBalancer;
 22+ global $wgArchiveLinksConfig, $wgLoadBalancer, $path;
2323
2424 $this->db_master = $this->getDB(DB_MASTER);
2525 $this->db_slave = $this->getDB(DB_SLAVE);
@@ -34,24 +34,72 @@
3535 die( 'Sorry, at the current time running the spider as a daemon isn\'t supported.' );
3636 } else {
3737 //for right now we will pipe everything through the replication_check_queue function just for testing purposes
38 - /* if ( $wgLoadBalancer->getServerCount() > 1 ) {
39 - if ( ( $url = $this->replication_check_queue() ) !== false ) {
40 -
41 - }
 38+ /*if ( $wgLoadBalancer->getServerCount() > 1 ) {
 39+ if ( ( $url = $this->replication_check_queue() ) !== false ) {
 40+
 41+ }
4242 } else {
43 - if ( ( $url = $this->check_queue() ) !== false ) {
 43+ if ( ( $url = $this->check_queue() ) !== false ) {
 44+
 45+ }
 46+ }*/
4447
45 - }
46 - } */
47 -
48 - if ( ( $url = $this->replication_check_queue() ) !== false ) {
49 -
 48+ if ( ( $url = $this->check_queue() ) !== false ) {
 49+ switch( $wgArchiveLinksConfig['download_lib'] ) {
 50+ case 'curl':
 51+ die( 'At the current time support for libcurl is not available.' );
 52+ case 'wget':
 53+ default:
 54+ $this->call_wget( $url );
 55+ }
5056 }
5157 }
5258 return null;
5359 }
 60+
 61+ private function call_wget( $url ) {
 62+ global $wgArchiveLinksConfig;
 63+ if ( array_key_exists( 'path_to_wget', $wgArchiveLinksConfig ) && file_exists( $wgArchiveLinksConfig['path_to_wget'] ) ) {
 64+ die ( 'Support is not yet added for wget in a different directory' );
 65+ } elseif ( file_exists( "$path/wget.exe" ) ) {
 66+ if ( $wgArchiveLinksConfig['file_types_to_archive'] ) {
 67+ if ( is_array( $wgArchiveLinksConfig['file_types_to_archive']) ){
 68+ $accept_file_types = '-A ' . implode( ',', $wgArchiveLinksConfig['filetypes_to_archive'] );
 69+ } else {
 70+ $accept_file_types = '-A ' . $wgArchiveLinksConfig['file_types_to_archive'];
 71+ }
 72+ } else {
 73+ $accept_file_types = '';
 74+ }
 75+ //At the current time we are only adding support for the local filestore, but swift support is something that will be added later
 76+ switch( $wgArchiveLinksConfig['filestore_to_use'] ) {
 77+ case 'local':
 78+ default:
 79+ if ( $wgArchiveLinksConfig['subfolder_name'] ) {
 80+ $content_dir = 'extensions/ArchiveLinks/' . $wgArchiveLinksConfig['subfolder_name'];
 81+ } elseif ( $wgArchiveLinksConfig['content_path'] ) {
 82+ $content_dir = realpath( $wgArchiveLinksConfig['content_path'] );
 83+ if ( !$content_dir ) {
 84+ die ( 'The path you have set for $wgArchiveLinksConfig[\'content_path\'] does not exist.' .
 85+ 'This makes the spider a very sad panda. Please either create it or use a different setting.');
 86+ }
 87+ } else {
 88+ $content_dir = 'extensions/ArchiveLinks/' . 'archived_content/';
 89+ }
 90+ $dir = $path . $content_dir . sha1( time() . ' - ' . $url );
 91+ $dir = escapeshellarg( $dir );
 92+ $sanitized_url = escapeshellarg( $url );
 93+ }
5494
55 - /*private function check_queue( ) {
 95+ shell_exec( "cd $path" );
 96+ shell_exec( "wget.exe -nH -p -H -E -k -o \"./log.txt\" -Q2m -P $dir $accept_file_types $sanitized_url" );
 97+ } else {
 98+ //this is primarily designed with windows in mind and no built in wget, so yeah, *nix support should be added, in other words note to self...
 99+ die ( 'wget must be installed in order for the spider to function in wget mode' );
 100+ }
 101+ }
 102+
 103+ private function check_queue( ) {
56104 //need to fix this to use arrays instead of what I'm doing now
57105 $this->db_result['job-fetch'] = $this->db_slave->select('el_archive_queue', '*', '`el_archive_queue`.`delay_time` <= ' . time()
58106 . ' AND `el_archive_queue`.`in_progress` = 0'
@@ -61,14 +109,14 @@
62110 if ( $this->db_result['job-fetch']->numRows() > 0 ) {
63111 $row = $this->db_result['job-fetch']->fetchRow();
64112
65 - $this->delete_dups( $row['url'] );
 113+ //$this->delete_dups( $row['url'] );
66114
67115 return $row['url'];
68116 } else {
69117 //there are no jobs to do right now
70118 return false;
71119 }
72 - }*/
 120+ }
73121
74122 /**
75123 * This function checks a local file for a local block of jobs that is to be done
@@ -123,7 +171,7 @@
124172 array_key_exists( 'in_progress_ignore_delay', $wgArchiveLinksConfig ) ? $ignore_in_prog_time = $wgArchiveLinksConfig['in_progress_ignore_delay'] :
125173 $ignore_in_prog_time = 7200;
126174
127 - if ( $reserve_time - $time > $ignore_in_prog_time ) {
 175+ if ( $reserve_time + $ingore_in_prog_time + $wait_time > $ignore_in_prog_time + $wait_time ) {
128176 $retval = $this->reserve_job( $row );
129177 }
130178 }

Comments

#Comment by NeilK (talk | contribs)   18:55, 5 August 2011

You don't even have plans to support curl, the download_lib thing is a waste of space. Add it when you support curl.

You already brought this up yourself, but replace shell_exec() with wfShellExec() - that seems not to be fixed in current version

The path to wget should be a global and should default to where it usually is on Ubuntu Linux (/usr/bin/wget), see how we do it with $wgImageMagickConvertCommand in DefaultSettings.php

Re: $ignore_in_prog_time, Testing for config variables and putting default values deep in code is bad, try to establish the config as early as possible -- think of config as an input to your function, not something you create in the middle of the function. Spelling mistake with $ingore / $ignore. At least try it once before committing. i see you fix this in later revs though.

#Comment by Kevin Brown (talk | contribs)   17:14, 19 August 2011

shell_exec is replaced, the other things need to be fixed, but as discussed that was the majority of the reason for the fixme, setting to new.

Status & tagging log