r93277 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r93276 | r93277 | r93278 >
Date: 15:32, 27 July 2011
Author: kbrown
Status: deferred
Tags:
Comment: changes to spider.php for wget log parsing and stuff for feed in ArchiveLinks.class.php
Modified paths:
  • /trunk/extensions/ArchiveLinks/ArchiveLinks.class.php (modified)
  • /trunk/extensions/ArchiveLinks/ArchiveLinks.php (modified)

Diff

Index: trunk/extensions/ArchiveLinks/ArchiveLinks.php
@@ -28,4 +28,5 @@
 	'use_multiple_archives' => false,
 	'run_spider_in_loop' => false,
 	'in_progress_ignore_delay' => 7200,
+	'generate_feed' => true,
 );
\ No newline at end of file
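
The new 'generate_feed' entry extends the extension's default configuration array, and ArchiveLinks.class.php checks it before building the feed data. A minimal sketch of a per-wiki override from LocalSettings.php follows; the placement after the require_once and the false value are illustrative assumptions, not part of this revision:

	// LocalSettings.php -- hypothetical site override, assuming the defaults in
	// ArchiveLinks.php are loaded first and then adjusted per wiki
	require_once( "$IP/extensions/ArchiveLinks/ArchiveLinks.php" );
	$wgArchiveLinksConfig['generate_feed'] = false; // turn the archived-links feed off on this wiki
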
Index: trunk/extensions/ArchiveLinks/ArchiveLinks.class.php
@@ -4,6 +4,11 @@
  */
 
 class ArchiveLinks {
+	private $db_master;
+	private $db_slave;
+	private $db_result;
+
+
 	public static function queueExternalLinks ( &$article ) {
 		global $wgParser, $wgArchiveLinksConfig;
 		$external_links = $wgParser->getOutput();
@@ -15,12 +20,12 @@
 
 		$db_master->begin();
 
-		if( !isset( $wgArchiveLinksConfig['global_rearchive_time'] ) ) {
+		if ( !isset( $wgArchiveLinksConfig['global_rearchive_time'] ) ) {
 			//30 days or 2,592,000 seconds...
 			$wgArchiveLinksConfig['global_rearchive_time'] = 2592000;
 		}
 
-		if( !isset( $wgArchiveLinksConfig['page_rearchive_time'] ) ) {
+		if ( !isset( $wgArchiveLinksConfig['page_rearchive_time'] ) ) {
 			//200 days or 17,280,000 seconds
 			$wgArchiveLinksConfig['page_rearchive_time'] = 1728000;
 		}
@@ -30,47 +35,44 @@
 			$wgArchiveLinksConfig['previous_archive_lockout_time'] = 172800;
 		}
 
-		foreach ( $external_links as $link => $unused_value ) {
-			$link = $db_slave->strencode( $link );
-			$page_id = $article->getID();
-			$time = time();
+		$page_id = $article->getID();
+		$time = time();
+
+		if ( $wgArchiveLinksConfig['generate_feed'] === true ) {
+			$old_id = $article->getTitle();
+			$old_id = $old_id->getPreviousRevisionID( $page_id );
 
-			if ( $wgArchiveLinksConfig['generate_feed'] === true ) {
-				$db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) );
-				$db_result['history'] = $db_slave->select( 'el_archive_link_history', '*', array( 'hist_url' => $link ), __METHOD__, array( 'LIMIT' => '1', 'ORDER BY' => 'hist_id DESC' ) );
-				$db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) );
-
-				$db_result['queue-numrows'] = $db_result['queue']->numRows();
-				$db_result['history-numrows'] = $db_result['history']->numRows();
-
-				$db_result['history-row'] = $db_result['history']->fetchRow();
-
-				if ( $db_result['history-numrows'] === 0 && $db_result['queue-numrows'] === 0 ) {
-					//this link is new to the wiki
-					$db_master->insert( 'el_archive_queue', array(
-						'page_id' => $page_id,
-						'url' => $link,
-						'delay_time' => '0',
-						'insertion_time' => $time,
-						'in_progress' => '0',
-					));
+			$db_result['links_on_page'] = $db_master->select( 'el_archive_link_history', '*', array( 'hist_page_id' => $page_id ), __METHOD__ );
+
+			$old_external_links = array();
+			$new_external_links = array();
+
+			if ( $db_result['links_on_page']->numRows() > 0 ) {
+				while( $row = $db_result['links_on_page']->fetchRow() ) {
+					$old_external_links[] = $row['hist_url'];
+				}
+
+				$new_external_links = array_diff( $external_links, $old_external_links );
+				unset( $old_external_links );
+
+				die( var_dump( $old_external_links ) );
+			} elseif ( count( $external_links ) > 0 ) {
+				$new_external_links = $external_links;
+			}
+
+			if ( !isset( $wgArchiveLinksConfig['link_insert_max'] ) ) {
+				$wgArchiveLinksConfig['link_insert_max'] = 100;
+			}
+
+			if ( count( $new_external_links ) <= $wgArchiveLinksConfig['link_insert_max'] ) {
+				//insert the links into the queue now
+				foreach( $new_external_links as $link ) {
+					/*$db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) );
+
 
-					$db_master->insert( 'el_archive_link_history', array(
-						'page_id' => $page_id,
-						'url' => $link,
-						'delay_time' => '0',
-						'insertion_time' => $time,
-						'in_progress' => '0',
-					));
-				} elseif ( $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['global_rearchive_time'] ) {
-					$db_result['history_page'] = $db_slave->select( 'el_archive_link_history', '*', array( 'hist_url' => $link, 'page_id' => $page_id ), __METHOD__, array( 'LIMIT' => '1', 'ORDER BY' => 'hist_id DESC' ) );
-
-					$db_result['history_page-numrows'] = $db_result['history_page']->numRows();
-					$db_result['history_page-row'] = $db_result['history_page']->fetchRow();
-
-					if ( $db_result['history_page-numrows'] === 0 && $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['previous_archive_lockout_time'] ) {
-						//this link is new to this particular page but has been archived on another page less than the rearchive delay
-						//grab a new version of it in case the content has changed
+					/*
+					if ( $db_result['blacklist-numrows'] === 0 && $db_result['queue-numrows'] === 0 ) {
+						//this link is new to the wiki
 						$db_master->insert( 'el_archive_queue', array(
 							'page_id' => $page_id,
 							'url' => $link,
@@ -86,36 +88,88 @@
 							'insertion_time' => $time,
 							'in_progress' => '0',
 						));
-
-					}
-
-					if ( $db_result['history_page-row']['insertion_time'] >= $time - $wgArchiveLinksConfig['page_rearchive_time']) {
-
-					}
+					} elseif ( $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['global_rearchive_time'] ) {
+						$db_result['history_page'] = $db_slave->select( 'el_archive_link_history', '*', array( 'hist_url' => $link, 'page_id' => $page_id ), __METHOD__, array( 'LIMIT' => '1', 'ORDER BY' => 'hist_id DESC' ) );
+
+						$db_result['history_page-numrows'] = $db_result['history_page']->numRows();
+						$db_result['history_page-row'] = $db_result['history_page']->fetchRow();
+
+						if ( $db_result['history_page-numrows'] === 0 && $db_result['history-row']['hist_insertion_time'] >= $time - $wgArchiveLinksConfig['previous_archive_lockout_time'] ) {
+							//this link is new to this particular page but has been archived on another page less than the rearchive delay
+							//grab a new version of it in case the content has changed
+							$db_master->insert( 'el_archive_queue', array(
+								'page_id' => $page_id,
+								'url' => $link,
+								'delay_time' => '0',
+								'insertion_time' => $time,
+								'in_progress' => '0',
+							));
+
+							$db_master->insert( 'el_archive_link_history', array(
+								'page_id' => $page_id,
+								'url' => $link,
+								'delay_time' => '0',
+								'insertion_time' => $time,
+								'in_progress' => '0',
+							));
+
+						}
+
+						if ( $db_result['history_page-row']['insertion_time'] >= $time - $wgArchiveLinksConfig['page_rearchive_time']) {
+
+						}
+					}*/
 				}
-
 			} else {
-				//$db_result['resource'] = $db_slave->select( 'el_archive_resource', '*', '`el_archive_resource`.`resource_url` = "' . $db_slave->strencode( $link ) . '"');
-				$db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__ );
-				$db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__ );
+				//insert everything as a job and do the work later to avoid lagging page save
+			}
+
+		} else {
 
-				if ( $db_result['blacklist']->numRows() === 0 ) {
-					if ( $db_result['queue']->numRows() === 0 ) {
-						// this probably a first time job
-						// but we should check the logs and resource table
-						// to make sure
-						$db_master->insert( 'el_archive_queue', array (
-							'page_id' => $page_id,
-							'url' => $link,
-							'delay_time' => '0',
-							'insertion_time' => $time,
-							'in_progress' => '0',
-						));
-					} else {
-						//this job is already in the queue, why?
-						// * most likely reason is it has already been inserted by another page
-						// * or we are checking it later because the site was down at last archival
-						// in either case we don't really need to do anything right now, so skip...
+			foreach ( $external_links as $link => $unused_value ) {
+				$link = $db_slave->strencode( $link );
+
+				if ( $wgArchiveLinksConfig['generate_feed'] === true ) {
+
+
+
+					/*$diff_eng = new DifferenceEngine( null, $old_id, $page_id, null, false );
+
+					$diff = $diff_eng->getDiffBody();
+					die( var_dump($diff) );
+					*/
+
+					//file_put_contents('stf.txt', var_export( $diff, TRUE ) );
+
+					/*
+					 * Querying the db server with selects for every link on the page would potentially be a whole bunch of unnecessary load
+					 * Let's take the diff first then do it on a job instead...
+					 *
+*/
+
+				} else {
+					//$db_result['resource'] = $db_slave->select( 'el_archive_resource', '*', '`el_archive_resource`.`resource_url` = "' . $db_slave->strencode( $link ) . '"');
+					$db_result['blacklist'] = $db_slave->select( 'el_archive_blacklist', '*', array( 'bl_url' => $link ), __METHOD__ );
+					$db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__ );
+
+					if ( $db_result['blacklist']->numRows() === 0 ) {
+						if ( $db_result['queue']->numRows() === 0 ) {
+							// this probably a first time job
+							// but we should check the logs and resource table
+							// to make sure
+							$db_master->insert( 'el_archive_queue', array (
+								'page_id' => $page_id,
+								'url' => $link,
+								'delay_time' => '0',
+								'insertion_time' => $time,
+								'in_progress' => '0',
+							));
+						} else {
+							//this job is already in the queue, why?
+							// * most likely reason is it has already been inserted by another page
+							// * or we are checking it later because the site was down at last archival
+							// in either case we don't really need to do anything right now, so skip...
+						}
 					}
 				}
 			}
@@ -163,5 +217,28 @@
 		} else {
 			return true;
 		}
-	}
+	}
+
+	public function feed_insert_links ( $url, $escaped = false ) {
+		if ( !$escaped ) {
+			$url = $this->strencode( $url );
+		}
+
+		$db_result['queue'] = $db_slave->select( 'el_archive_queue', '*', array( 'url' => $link ), __METHOD__, array( 'LIMIT' => '1', ) );
+
+		$db_result['queue-numrows'] = $db_result['queue']->numRows();
+		$db_result['blacklist-numrows'] = $db_result['blacklist']->numRows();
+	}
+}
+
+class InsertURLsIntoQueue extends Job {
+	public function __construct( $title, $params ) {
+		// Replace synchroniseThreadArticleData with the an identifier for your job.
+		parent::__construct( 'insertURLsIntoQueue', $title, $params );
+	}
+
+
+	public function run() {
+
+	}
 }
\ No newline at end of file
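
The new InsertURLsIntoQueue stub picks up the plan hinted at in the inline comments above: diff the page's external links first and push the actual queue inserts onto the job queue, so a page save is not slowed down by per-link selects. A rough sketch of how such a job is normally wired up in a MediaWiki extension of this era follows; the $wgJobClasses registration, the parameter layout, and the enqueue call are illustrative assumptions, not part of this revision:

	// ArchiveLinks.php -- register the job type so the job runner can construct it (assumed)
	$wgJobClasses['insertURLsIntoQueue'] = 'InsertURLsIntoQueue';

	// ArchiveLinks.class.php -- hypothetical enqueue from queueExternalLinks():
	// hand the new links to the job queue instead of inserting rows during page save
	$job = new InsertURLsIntoQueue( $article->getTitle(), array(
		'page_id' => $article->getID(),
		'links' => $new_external_links,
	) );
	$job->insert(); // pre-JobQueueGroup API of MediaWiki 1.17/1.18
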

Status & tagging log