Index: trunk/extensions/ArchiveLinks/setuptables.sql |
— | — | @@ -25,3 +25,11 @@ |
26 | 26 | `in_progress` varchar(50) NOT NULL, |
27 | 27 | PRIMARY KEY (`queue_id`) |
28 | 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=105 ; |
| 29 | + |
| 30 | +CREATE TABLE IF NOT EXISTS `el_archive_link_history` ( |
| 31 | + `hist_id` int(11) unsigned NOT NULL, |
| 32 | + `hist_page_id` int(11) unsigned NOT NULL, |
| 33 | + `hist_url` varchar(10000) NOT NULL, |
| 34 | + `hist_insertion_time` int(11) unsigned NOT NULL, |
| 35 | + PRIMARY KEY (`hist_id`) |
| 36 | +) ENGINE=InnoDB DEFAULT CHARSET=utf8; |
\ No newline at end of file |
Index: trunk/extensions/ArchiveLinks/ArchiveLinks.class.php |
— | — | @@ -5,7 +5,7 @@ |
6 | 6 | |
7 | 7 | class ArchiveLinks { |
8 | 8 | public static function queueExternalLinks ( &$article ) { |
9 | | - global $wgParser; |
| 9 | + global $wgParser, $wgArchiveLinksConfig; |
10 | 10 | $external_links = $wgParser->getOutput(); |
11 | 11 | $external_links = $external_links->mExternalLinks; |
12 | 12 | |
— | — | @@ -15,29 +15,109 @@ |
16 | 16 | |
17 | 17 | $db_master->begin(); |
18 | 18 | |
| 19 | + if( !isset( $wgArchiveLinksConfig['global_rearchive_time'] ) ) { |
| 20 | + //30 days or 2,592,000 seconds... |
| 21 | + $wgArchiveLinksConfig['global_rearchive_time'] = 2592000; |
| 22 | + } |
| 23 | + |
| 24 | + if( !isset( $wgArchiveLinksConfig['page_rearchive_time'] ) ) { |
| 25 | + //200 days or 17,280,000 seconds |
| 26 | + $wgArchiveLinksConfig['page_rearchive_time'] = 1728000; |
| 27 | + } |
| 28 | + |
| 29 | + if( !isset( $wgArchiveLinksConfig['previous_archive_lockout_time'] ) ) { |
| 30 | + //2 days or 172,800 seconds |
| 31 | + $wgArchiveLinksConfig['previous_archive_lockout_time'] = 172800; |
| 32 | + } |
| 33 | + |
19 | 34 | foreach ( $external_links as $link => $unused_value ) { |
20 | | - //$db_result['resource'] = $db_slave->select( 'el_archive_resource', '*', '`el_archive_resource`.`resource_url` = "' . $db_slave->strencode( $link ) . '"'); |
21 | | - $db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', '`el_archive_blacklist`.`bl_url` = "' . $db_slave->strencode( $link ) . '"'); |
22 | | - $db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', '`el_archive_queue`.`url` = "' . $db_slave->strencode( $link ) . '"' ); |
| 35 | + $link = $db_slave->strencode( $link ); |
| 36 | + $page_id = $article->getID(); |
| 37 | + $time = time(); |
23 | 38 | |
24 | | - if ( $db_result['blacklist']->numRows() === 0 ) { |
25 | | - if ( $db_result['queue']->numRows() === 0 ) { |
26 | | - // this probably a first time job |
27 | | - // but we should check the logs and resource table |
28 | | - // to make sure |
29 | | - $db_master->insert( 'el_archive_queue', array ( |
30 | | - 'page_id' => $article->getID(), |
| 39 | + if ( $wgArchiveLinksConfig['generate_feed'] === true ) { |
| 40 | + $db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) ); |
| 41 | + $db_result['history'] = $db_slave->select( 'el_archive_link_history', '*', array( 'hist_url' => $link ), __METHOD__, array( 'LIMIT' => '1', 'ORDER BY' => 'hist_id DESC' ) ); |
| 42 | + $db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) ); |
| 43 | + |
| 44 | + $db_result['queue-numrows'] = $db_result['queue']->numRows(); |
| 45 | + $db_result['history-numrows'] = $db_result['history']->numRows(); |
| 46 | + |
| 47 | + $db_result['history-row'] = $db_result['history']->fetchRow(); |
| 48 | + |
| 49 | + if ( $db_result['history-numrows'] === 0 && $db_result['queue-numrows'] === 0 ) { |
| 50 | + //this link is new to the wiki |
| 51 | + $db_master->insert( 'el_archive_queue', array( |
| 52 | + 'page_id' => $page_id, |
31 | 53 | 'url' => $link, |
32 | 54 | 'delay_time' => '0', |
33 | | - 'insertion_time' => time(), |
| 55 | + 'insertion_time' => $time, |
34 | 56 | 'in_progress' => '0', |
35 | 57 | )); |
36 | | - } else { |
37 | | - //this job is already in the queue, why? |
38 | | - // * most likely reason is it has already been inserted by another page |
39 | | - // * or we are checking it later because the site was down at last archival |
40 | | - // in either case we don't really need to do anything right now, so skip... |
| 58 | + |
| 59 | + $db_master->insert( 'el_archive_link_history', array( |
| 60 | + 'page_id' => $page_id, |
| 61 | + 'url' => $link, |
| 62 | + 'delay_time' => '0', |
| 63 | + 'insertion_time' => $time, |
| 64 | + 'in_progress' => '0', |
| 65 | + )); |
| 66 | + } elseif ( $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['global_rearchive_time'] ) { |
| 67 | + $db_result['history_page'] = $db_slave->select( 'el_archive_link_history', '*', array( 'hist_url' => $link, 'page_id' => $page_id ), __METHOD__, array( 'LIMIT' => '1', 'ORDER BY' => 'hist_id DESC' ) ); |
| 68 | + |
| 69 | + $db_result['history_page-numrows'] = $db_result['history_page']->numRows(); |
| 70 | + $db_result['history_page-row'] = $db_result['history_page']->fetchRow(); |
| 71 | + |
| 72 | + if ( $db_result['history_page-numrows'] === 0 && $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['previous_archive_lockout_time'] ) { |
| 73 | + //this link is new to this particular page but has been archived on another page less than the rearchive delay |
| 74 | + //grab a new version of it in case the content has changed |
| 75 | + $db_master->insert( 'el_archive_queue', array( |
| 76 | + 'page_id' => $page_id, |
| 77 | + 'url' => $link, |
| 78 | + 'delay_time' => '0', |
| 79 | + 'insertion_time' => $time, |
| 80 | + 'in_progress' => '0', |
| 81 | + )); |
| 82 | + |
| 83 | + $db_master->insert( 'el_archive_link_history', array( |
| 84 | + 'page_id' => $page_id, |
| 85 | + 'url' => $link, |
| 86 | + 'delay_time' => '0', |
| 87 | + 'insertion_time' => $time, |
| 88 | + 'in_progress' => '0', |
| 89 | + )); |
| 90 | + |
| 91 | + } |
| 92 | + |
| 93 | + if ( $db_result['history_page-row']['insertion_time'] >= $time - $wgArchiveLinksConfig['page_rearchive_time']) { |
| 94 | + |
| 95 | + } |
41 | 96 | } |
| 97 | + |
| 98 | + } else { |
| 99 | + //$db_result['resource'] = $db_slave->select( 'el_archive_resource', '*', '`el_archive_resource`.`resource_url` = "' . $db_slave->strencode( $link ) . '"'); |
| 100 | + $db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__ ); |
| 101 | + $db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__ ); |
| 102 | + |
| 103 | + if ( $db_result['blacklist']->numRows() === 0 ) { |
| 104 | + if ( $db_result['queue']->numRows() === 0 ) { |
| 105 | + // this probably a first time job |
| 106 | + // but we should check the logs and resource table |
| 107 | + // to make sure |
| 108 | + $db_master->insert( 'el_archive_queue', array ( |
| 109 | + 'page_id' => $page_id, |
| 110 | + 'url' => $link, |
| 111 | + 'delay_time' => '0', |
| 112 | + 'insertion_time' => $time, |
| 113 | + 'in_progress' => '0', |
| 114 | + )); |
| 115 | + } else { |
| 116 | + //this job is already in the queue, why? |
| 117 | + // * most likely reason is it has already been inserted by another page |
| 118 | + // * or we are checking it later because the site was down at last archival |
| 119 | + // in either case we don't really need to do anything right now, so skip... |
| 120 | + } |
| 121 | + } |
42 | 122 | } |
43 | 123 | } |
44 | 124 | |