r92539 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r92538‎ | r92539 | r92540 >
Date:16:00, 19 July 2011
Author:kbrown
Status:deferred (Comments)
Tags:
Comment:
* fix SQL injection vulnerability in ArchiveLinks.class.php
* start adding support for feed
* add new table for history of what has appeared in the feed in the past
Modified paths:
  • /trunk/extensions/ArchiveLinks/ArchiveLinks.class.php (modified) (history)
  • /trunk/extensions/ArchiveLinks/setuptables.sql (modified) (history)

Diff [purge]

Index: trunk/extensions/ArchiveLinks/setuptables.sql
@@ -25,3 +25,11 @@
2626 `in_progress` varchar(50) NOT NULL,
2727 PRIMARY KEY (`queue_id`)
2828 ) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=105 ;
 29+
 30+CREATE TABLE IF NOT EXISTS `el_archive_link_history` (
 31+ `hist_id` int(11) unsigned NOT NULL,
 32+ `hist_page_id` int(11) unsigned NOT NULL,
 33+ `hist_url` varchar(10000) NOT NULL,
 34+ `hist_insertion_time` int(11) unsigned NOT NULL,
 35+ PRIMARY KEY (`hist_id`)
 36+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
\ No newline at end of file
Index: trunk/extensions/ArchiveLinks/ArchiveLinks.class.php
@@ -5,7 +5,7 @@
66
77 class ArchiveLinks {
88 public static function queueExternalLinks ( &$article ) {
9 - global $wgParser;
 9+ global $wgParser, $wgArchiveLinksConfig;
1010 $external_links = $wgParser->getOutput();
1111 $external_links = $external_links->mExternalLinks;
1212
@@ -15,29 +15,109 @@
1616
1717 $db_master->begin();
1818
 19+ if( !isset( $wgArchiveLinksConfig['global_rearchive_time'] ) ) {
 20+ //30 days or 2,592,000 seconds...
 21+ $wgArchiveLinksConfig['global_rearchive_time'] = 2592000;
 22+ }
 23+
 24+ if( !isset( $wgArchiveLinksConfig['page_rearchive_time'] ) ) {
 25+ //200 days or 17,280,000 seconds
 26+ $wgArchiveLinksConfig['page_rearchive_time'] = 1728000;
 27+ }
 28+
 29+ if( !isset( $wgArchiveLinksConfig['previous_archive_lockout_time'] ) ) {
 30+ //2 days or 172,800 seconds
 31+ $wgArchiveLinksConfig['previous_archive_lockout_time'] = 172800;
 32+ }
 33+
1934 foreach ( $external_links as $link => $unused_value ) {
20 - //$db_result['resource'] = $db_slave->select( 'el_archive_resource', '*', '`el_archive_resource`.`resource_url` = "' . $db_slave->strencode( $link ) . '"');
21 - $db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', '`el_archive_blacklist`.`bl_url` = "' . $db_slave->strencode( $link ) . '"');
22 - $db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', '`el_archive_queue`.`url` = "' . $db_slave->strencode( $link ) . '"' );
 35+ $link = $db_slave->strencode( $link );
 36+ $page_id = $article->getID();
 37+ $time = time();
2338
24 - if ( $db_result['blacklist']->numRows() === 0 ) {
25 - if ( $db_result['queue']->numRows() === 0 ) {
26 - // this probably a first time job
27 - // but we should check the logs and resource table
28 - // to make sure
29 - $db_master->insert( 'el_archive_queue', array (
30 - 'page_id' => $article->getID(),
 39+ if ( $wgArchiveLinksConfig['generate_feed'] === true ) {
 40+ $db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) );
 41+ $db_result['history'] = $db_slave->select( 'el_archive_link_history', '*', array( 'hist_url' => $link ), __METHOD__, array( 'LIMIT' => '1', 'ORDER BY' => 'hist_id DESC' ) );
 42+ $db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) );
 43+
 44+ $db_result['queue-numrows'] = $db_result['queue']->numRows();
 45+ $db_result['history-numrows'] = $db_result['history']->numRows();
 46+
 47+ $db_result['history-row'] = $db_result['history']->fetchRow();
 48+
 49+ if ( $db_result['history-numrows'] === 0 && $db_result['queue-numrows'] === 0 ) {
 50+ //this link is new to the wiki
 51+ $db_master->insert( 'el_archive_queue', array(
 52+ 'page_id' => $page_id,
3153 'url' => $link,
3254 'delay_time' => '0',
33 - 'insertion_time' => time(),
 55+ 'insertion_time' => $time,
3456 'in_progress' => '0',
3557 ));
36 - } else {
37 - //this job is already in the queue, why?
38 - // * most likely reason is it has already been inserted by another page
39 - // * or we are checking it later because the site was down at last archival
40 - // in either case we don't really need to do anything right now, so skip...
 58+
 59+ $db_master->insert( 'el_archive_link_history', array(
 60+ 'page_id' => $page_id,
 61+ 'url' => $link,
 62+ 'delay_time' => '0',
 63+ 'insertion_time' => $time,
 64+ 'in_progress' => '0',
 65+ ));
 66+ } elseif ( $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['global_rearchive_time'] ) {
 67+ $db_result['history_page'] = $db_slave->select( 'el_archive_link_history', '*', array( 'hist_url' => $link, 'page_id' => $page_id ), __METHOD__, array( 'LIMIT' => '1', 'ORDER BY' => 'hist_id DESC' ) );
 68+
 69+ $db_result['history_page-numrows'] = $db_result['history_page']->numRows();
 70+ $db_result['history_page-row'] = $db_result['history_page']->fetchRow();
 71+
 72+ if ( $db_result['history_page-numrows'] === 0 && $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['previous_archive_lockout_time'] ) {
 73+ //this link is new to this particular page but has been archived on another page less than the rearchive delay
 74+ //grab a new version of it in case the content has changed
 75+ $db_master->insert( 'el_archive_queue', array(
 76+ 'page_id' => $page_id,
 77+ 'url' => $link,
 78+ 'delay_time' => '0',
 79+ 'insertion_time' => $time,
 80+ 'in_progress' => '0',
 81+ ));
 82+
 83+ $db_master->insert( 'el_archive_link_history', array(
 84+ 'page_id' => $page_id,
 85+ 'url' => $link,
 86+ 'delay_time' => '0',
 87+ 'insertion_time' => $time,
 88+ 'in_progress' => '0',
 89+ ));
 90+
 91+ }
 92+
 93+ if ( $db_result['history_page-row']['insertion_time'] >= $time - $wgArchiveLinksConfig['page_rearchive_time']) {
 94+
 95+ }
4196 }
 97+
 98+ } else {
 99+ //$db_result['resource'] = $db_slave->select( 'el_archive_resource', '*', '`el_archive_resource`.`resource_url` = "' . $db_slave->strencode( $link ) . '"');
 100+ $db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__ );
 101+ $db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__ );
 102+
 103+ if ( $db_result['blacklist']->numRows() === 0 ) {
 104+ if ( $db_result['queue']->numRows() === 0 ) {
 105+ // this probably a first time job
 106+ // but we should check the logs and resource table
 107+ // to make sure
 108+ $db_master->insert( 'el_archive_queue', array (
 109+ 'page_id' => $page_id,
 110+ 'url' => $link,
 111+ 'delay_time' => '0',
 112+ 'insertion_time' => $time,
 113+ 'in_progress' => '0',
 114+ ));
 115+ } else {
 116+ //this job is already in the queue, why?
 117+ // * most likely reason is it has already been inserted by another page
 118+ // * or we are checking it later because the site was down at last archival
 119+ // in either case we don't really need to do anything right now, so skip...
 120+ }
 121+ }
42122 }
43123 }
44124

Comments

#Comment by Kevin Brown (talk | contribs)   16:09, 19 July 2011

Yeah I feel like an idiot, I forgot to escape $link in the insert statement. I caught this after I was looked at it again. It's now fixed....

#Comment by Jack Phoenix (talk | contribs)   17:10, 19 July 2011
+CREATE TABLE IF NOT EXISTS `el_archive_link_history` (
+  `hist_id` int(11) unsigned NOT NULL,
+  `hist_page_id` int(11) unsigned NOT NULL,
+  `hist_url` varchar(10000) NOT NULL,
+  `hist_insertion_time` int(11) unsigned NOT NULL,
+  PRIMARY KEY (`hist_id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;

This seems to be rather MySQL-specific and it doesn't even take MW things like $wgDBprefix or $wgDBTableOptions into account. We usually prefix column names with the table name (if it's short) or an abbreviation of it. Additionally, `hist_url` varchar(10000) NOT NULL looks rather scary to me, but I'm not a SQL expert. Still, it might be a good idea to make sure that your code supports $wgDBprefix, $wgDBTableOptions and maybe even the SQLite DBMS — AFAIK that can be done very easily by moving the PRIMARY KEY declaration.

#Comment by NeilK (talk | contribs)   21:45, 5 August 2011

Yeah, for the "right" way to do this see what Jeroen did for UploadWizard's new tables:

in UploadWizardHooks.php;

   $wgHooks['LoadExtensionSchemaUpdates'][] = 'UploadWizardHooks::onSchemaUpdate';

And check out what's in UploadWizardHooks::onSchemaUpdate().

The related SQL files are defined in that function, and they have the proper magic comments for table prefixes and database options.

Status & tagging log