Index: trunk/extensions/SpamBlacklist/SpamBlacklist.php |
— | — | @@ -18,90 +18,34 @@ |
19 | 19 | $dir = dirname(__FILE__) . '/'; |
20 | 20 | $wgExtensionMessagesFiles['SpamBlackList'] = $dir . 'SpamBlacklist.i18n.php'; |
21 | 21 | |
22 | | -global $wgSpamBlacklistFiles; |
23 | | -global $wgSpamBlacklistSettings; |
24 | | - |
25 | | -$wgSpamBlacklistFiles = false; |
26 | | -$wgSpamBlacklistSettings = array(); |
27 | | - |
28 | | -$wgHooks['EditFilterMerged'][] = 'wfSpamBlacklistFilterMerged'; |
29 | | -$wgHooks['EditFilter'][] = 'wfSpamBlacklistValidate'; |
30 | | -$wgHooks['ArticleSaveComplete'][] = 'wfSpamBlacklistArticleSave'; |
31 | | -$wgHooks['APIEditBeforeSave'][] = 'wfSpamBlacklistFilterAPIEditBeforeSave'; |
32 | | - |
33 | 22 | /** |
34 | | - * Get an instance of SpamBlacklist and do some first-call initialisation. |
35 | | - * All actual functionality is implemented in that object |
| 23 | + * Array of settings for blacklist classes |
36 | 24 | */ |
37 | | -function wfSpamBlacklistObject() { |
38 | | - global $wgSpamBlacklistFiles, $wgSpamBlacklistSettings; |
39 | | - static $spamObj; |
40 | | - if ( !$spamObj ) { |
41 | | - require_once( "SpamBlacklist_body.php" ); |
42 | | - $spamObj = new SpamBlacklist( $wgSpamBlacklistSettings ); |
43 | | - if( $wgSpamBlacklistFiles ) { |
44 | | - $spamObj->files = $wgSpamBlacklistFiles; |
45 | | - } |
46 | | - } |
47 | | - return $spamObj; |
48 | | -} |
| 25 | +$wgBlacklistSettings = array( |
| 26 | + 'spam' => array( |
| 27 | + 'files' => array(), |
| 28 | + ), |
| 29 | +); |
49 | 30 | |
50 | 31 | /** |
51 | | - * Hook function for EditFilterMerged |
| 32 | + * @deprecated |
52 | 33 | */ |
53 | | -function wfSpamBlacklistFilterMerged( $editPage, $text, &$hookErr, $editSummary ) { |
54 | | - global $wgTitle; |
55 | | - if( is_null( $wgTitle ) ) { |
56 | | - # API mode |
57 | | - # wfSpamBlacklistFilterAPIEditBeforeSave already checked the blacklist |
58 | | - return true; |
59 | | - } |
| 34 | +$wgSpamBlacklistFiles =& $wgBlacklistSettings['spam']['files']; |
60 | 35 | |
61 | | - $spamObj = wfSpamBlacklistObject(); |
62 | | - $title = $editPage->mArticle->getTitle(); |
63 | | - $ret = $spamObj->filter( $title, $text, '', $editSummary, $editPage ); |
64 | | - if ( $ret !== false ) { |
65 | | - // spamPageWithContent() method was added in MW 1.17 |
66 | | - if ( method_exists( $editPage, 'spamPageWithContent' ) ) { |
67 | | - $editPage->spamPageWithContent( $ret ); |
68 | | - } else { |
69 | | - $editPage->spamPage( $ret ); |
70 | | - } |
71 | | - } |
72 | | - // Return convention for hooks is the inverse of $wgFilterCallback |
73 | | - return ( $ret === false ); |
74 | | -} |
75 | | - |
76 | 36 | /** |
77 | | - * Hook function for APIEditBeforeSave |
| 37 | + * @deprecated |
78 | 38 | */ |
79 | | -function wfSpamBlacklistFilterAPIEditBeforeSave( $editPage, $text, &$resultArr ) { |
80 | | - $spamObj = wfSpamBlacklistObject(); |
81 | | - $title = $editPage->mArticle->getTitle(); |
82 | | - $ret = $spamObj->filter( $title, $text, '', '', $editPage ); |
83 | | - if ( $ret!==false ) { |
84 | | - $resultArr['spamblacklist'] = $ret; |
85 | | - } |
86 | | - // Return convention for hooks is the inverse of $wgFilterCallback |
87 | | - return ( $ret === false ); |
88 | | -} |
| 39 | +$wgSpamBlacklistSettings =& $wgBlacklistSettings['spam']; |
89 | 40 | |
90 | | -/** |
91 | | - * Hook function for EditFilter |
92 | | - * Confirm that a local blacklist page being saved is valid, |
93 | | - * and toss back a warning to the user if it isn't. |
94 | | - */ |
95 | | -function wfSpamBlacklistValidate( $editPage, $text, $section, &$hookError ) { |
96 | | - $spamObj = wfSpamBlacklistObject(); |
97 | | - return $spamObj->validate( $editPage, $text, $section, $hookError ); |
98 | | -} |
| 41 | +$wgHooks['EditFilterMerged'][] = 'SpamBlacklistHooks::filterMerged'; |
| 42 | +$wgHooks['APIEditBeforeSave'][] = 'SpamBlacklistHooks::filterAPIEditBeforeSave'; |
| 43 | +$wgHooks['EditFilter'][] = 'SpamBlacklistHooks::validate'; |
| 44 | +$wgHooks['ArticleSaveComplete'][] = 'SpamBlacklistHooks::articleSave'; |
99 | 45 | |
100 | | -/** |
101 | | - * Hook function for ArticleSaveComplete |
102 | | - * Clear local spam blacklist caches on page save. |
103 | | - */ |
104 | | -function wfSpamBlacklistArticleSave( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) { |
105 | | - $spamObj = wfSpamBlacklistObject(); |
106 | | - return $spamObj->onArticleSave( $article, $user, $text, $summary, $isminor, $iswatch, $section ); |
107 | | -} |
| 46 | +$wgAutoloadClasses['BaseBlacklist'] = $dir . 'BaseBlacklist.php'; |
| 47 | +$wgAutoloadClasses['SpamBlacklistHooks'] = $dir . 'SpamBlacklistHooks.php'; |
| 48 | +$wgAutoloadClasses['SpamBlacklist'] = $dir . 'SpamBlacklist_body.php'; |
| 49 | +$wgAutoloadClasses['SpamRegexBatch'] = $dir . 'SpamRegexBatch.php'; |
108 | 50 | |
| 51 | + |
| 52 | + |
Index: trunk/extensions/SpamBlacklist/SpamBlacklistHooks.php |
— | — | @@ -0,0 +1,126 @@ |
| 2 | +<?php |
| 3 | + |
/**
 * Hooks for the spam blacklist extension.
 *
 * Every handler here is registered in $wgHooks by SpamBlacklist.php and
 * delegates the real work to BaseBlacklist / SpamRegexBatch.
 */
class SpamBlacklistHooks {

	/**
	 * Hook function for EditFilterMerged.
	 *
	 * Runs the merged page text and the edit summary through the 'spam'
	 * blacklist and shows the spam page when the filter reports a match.
	 *
	 * @param $editPage EditPage
	 * @param $text string Merged text about to be saved
	 * @param $hookErr string Unused here; part of the hook signature
	 * @param $editSummary string
	 * @return bool true when the edit may proceed, false when it was blocked
	 */
	public static function filterMerged( $editPage, $text, &$hookErr, $editSummary ) {
		global $wgTitle;
		if ( is_null( $wgTitle ) ) {
			# API mode
			# SpamBlacklistHooks::filterAPIEditBeforeSave already checked the blacklist
			return true;
		}

		$spamObj = BaseBlacklist::getInstance( 'spam' );
		$title = $editPage->mArticle->getTitle();
		$ret = $spamObj->filter( $title, $text, '', $editSummary, $editPage );
		if ( $ret !== false ) {
			// spamPageWithContent() method was added in MW 1.17
			if ( method_exists( $editPage, 'spamPageWithContent' ) ) {
				$editPage->spamPageWithContent( $ret );
			} else {
				$editPage->spamPage( $ret );
			}
		}
		// Return convention for hooks is the inverse of $wgFilterCallback
		return ( $ret === false );
	}

	/**
	 * Hook function for APIEditBeforeSave.
	 *
	 * Same check as filterMerged(), but without an edit summary; a match is
	 * reported to the API caller through $resultArr['spamblacklist'].
	 *
	 * @param $editPage EditPage
	 * @param $text string
	 * @param $resultArr array Receives the filter result under 'spamblacklist'
	 * @return bool true when the edit may proceed, false when it was blocked
	 */
	public static function filterAPIEditBeforeSave( $editPage, $text, &$resultArr ) {
		$spamObj = BaseBlacklist::getInstance( 'spam' );
		$title = $editPage->mArticle->getTitle();
		$ret = $spamObj->filter( $title, $text, '', '', $editPage );
		if ( $ret !== false ) {
			$resultArr['spamblacklist'] = $ret;
		}
		// Return convention for hooks is the inverse of $wgFilterCallback
		return ( $ret === false );
	}

	/**
	 * Hook function for EditFilter.
	 * Confirm that a local blacklist page being saved is valid,
	 * and toss back a warning to the user if it isn't.
	 *
	 * The warning is delivered through $hookError; the handler itself always
	 * returns true.
	 *
	 * @param $editPage EditPage
	 * @param $text string
	 * @param $section string
	 * @param $hookError string Receives the HTML warning when bad lines are found
	 * @return bool always true
	 */
	public static function validate( $editPage, $text, $section, &$hookError ) {
		$thisPageName = $editPage->mTitle->getPrefixedDBkey();

		if ( !BaseBlacklist::isLocalSource( $editPage->mTitle ) ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" );
			return true;
		}

		$lines = explode( "\n", $text );
		$badLines = SpamRegexBatch::getBadLines( $lines );

		if ( !$badLines ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" );
			return true;
		}

		wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " .
			implode( ', ', $badLines ) . "\n" );

		// One wikitext bullet per offending line, escaped so it renders literally.
		$badList = "*<tt>" .
			implode( "</tt>\n*<tt>",
				array_map( 'wfEscapeWikiText', $badLines ) ) .
			"</tt>\n";
		$hookError =
			"<div class='errorbox'>" .
			wfMsgExt( 'spam-invalid-lines', array( 'parsemag' ), count( $badLines ) ) . "<br />" .
			$badList .
			"</div>\n" .
			"<br clear='all' />\n";
		return true;
	}

	/**
	 * Hook function for ArticleSaveComplete.
	 * Clear local spam blacklist caches on page save.
	 *
	 * Always returns true: a false return would stop the remaining
	 * ArticleSaveComplete handlers from running, and a missing return value
	 * is treated as an error by the hook runner.
	 *
	 * @param $article Article
	 * @param $user User
	 * @param $text string
	 * @param $summary string
	 * @param $isminor
	 * @param $iswatch
	 * @param $section
	 * @return bool always true
	 */
	public static function articleSave( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) {
		global $wgMemc, $wgDBname;

		if ( !BaseBlacklist::isLocalSource( $article->getTitle() ) ) {
			// Not a blacklist source: nothing to invalidate.
			return true;
		}

		// This sucks because every Blacklist needs to be cleared
		foreach ( array_keys( BaseBlacklist::getBlacklistTypes() ) as $type ) {
			$wgMemc->delete( "$wgDBname:{$type}_blacklist_regexes" );
		}
		return true;
	}
}
Property changes on: trunk/extensions/SpamBlacklist/SpamBlacklistHooks.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 128 | + native |
Index: trunk/extensions/SpamBlacklist/SpamRegexBatch.php |
— | — | @@ -0,0 +1,171 @@ |
| 2 | +<?php |
| 3 | + |
/**
 * Static helpers that turn blacklist lines into batched URL-matching regexes
 */
class SpamRegexBatch {
	/**
	 * Combine regex fragments into as few URL-matching regexes as possible.
	 * An empty input list yields an empty result.
	 *
	 * @param array $lines list of fragments which will match in URLs
	 * @param int $batchSize largest allowed batch regex;
	 *                       if 0, will produce one regex per line
	 * @return array
	 */
	public static function buildRegexes( $lines, $batchSize=4096 ) {
		# It's faster using the S modifier even though it will usually only be run once
		$prefix = '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
		$suffix = ( $batchSize > 0 ) ? ')/Sim' : ')/im';

		$batches = array();
		$pending = false;
		foreach ( $lines as $fragment ) {
			if ( substr( $fragment, -1, 1 ) == "\\" ) {
				// A trailing backslash would silently swallow the '|' or ')'
				// that follows it in a batch, so such lines are left out here;
				// getBadLines() reports them when the page is edited.
				continue;
			}
			if ( $pending === false ) {
				// First usable fragment opens the first batch.
				$pending = $fragment;
				continue;
			}
			// FIXME: not very robust size check, but should work. :)
			if ( strlen( $pending ) + strlen( $fragment ) > $batchSize ) {
				$batches[] = self::wrapBatch( $prefix, $pending, $suffix );
				$pending = $fragment;
				continue;
			}
			$pending .= '|' . $fragment;
		}

		if ( $pending !== false ) {
			$batches[] = self::wrapBatch( $prefix, $pending, $suffix );
		}
		return $batches;
	}

	/**
	 * Wrap one '|'-joined group of fragments into a complete regex.
	 * Any run of backslashes before a slash is collapsed first, then every
	 * slash is escaped once so it cannot terminate the '/' delimiter.
	 *
	 * @param string $prefix
	 * @param string $alternatives
	 * @param string $suffix
	 * @return string
	 */
	private static function wrapBatch( $prefix, $alternatives, $suffix ) {
		$normalized = preg_replace( '|\\\*/|u', '/', $alternatives );
		return $prefix . str_replace( '/', '\/', $normalized ) . $suffix;
	}

	/**
	 * Check that every regex in the set compiles; an empty set is valid.
	 *
	 * @param $regexes array set of regexes
	 * @return bool true if ok, false if contains invalid lines
	 */
	public static function validateRegexes( $regexes ) {
		foreach ( $regexes as $pattern ) {
			// preg_match() warns on a broken pattern; only its return value matters here.
			wfSuppressWarnings();
			$result = preg_match( $pattern, '' );
			wfRestoreWarnings();

			if ( $result === false ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Drop '#' comments, trim whitespace, and discard lines left empty.
	 * Original array keys are preserved.
	 *
	 * @param $lines array
	 * @return array
	 */
	public static function stripLines( $lines ) {
		$withoutComments = preg_replace( '/#.*$/', '', $lines );
		$trimmed = array_map( 'trim', $withoutComments );
		return array_filter( $trimmed );
	}

	/**
	 * Build batched regexes and fall back to one regex per line when a
	 * batch fails to compile.
	 *
	 * @param $lines array unsanitized input lines
	 * @param $fileName bool|string optional for debug reporting
	 * @return array of regexes
	 */
	public static function buildSafeRegexes( $lines, $fileName=false ) {
		$lines = self::stripLines( $lines );
		$batched = self::buildRegexes( $lines );
		if ( self::validateRegexes( $batched ) ) {
			return $batched;
		}

		// _Something_ broke... rebuild line-by-line; it'll be
		// slower if there's a lot of blacklist lines, but one
		// broken line won't take out hundreds of its brothers.
		if ( $fileName ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
		}
		return self::buildRegexes( $lines, 0 );
	}

	/**
	 * Returns an array of invalid lines
	 *
	 * @param array $lines
	 * @return array of input lines which produce invalid input, or empty array if no problems
	 */
	public static function getBadLines( $lines ) {
		$lines = self::stripLines( $lines );

		$rejected = array();
		foreach ( $lines as $candidate ) {
			// Final \ will break silently on the batched regexes.
			if ( substr( $candidate, -1, 1 ) == "\\" ) {
				$rejected[] = $candidate;
			}
		}

		if ( self::validateRegexes( self::buildRegexes( $lines ) ) ) {
			// No other problems!
			return $rejected;
		}

		// Something failed in the batch, so check them one by one.
		foreach ( $lines as $candidate ) {
			$single = self::buildRegexes( array( $candidate ) );
			if ( !self::validateRegexes( $single ) ) {
				$rejected[] = $candidate;
			}
		}
		return $rejected;
	}

	/**
	 * Build a set of regular expressions from the given multiline input text,
	 * with empty lines and comments stripped.
	 *
	 * @param $source string
	 * @param $fileName bool|string optional, for reporting of bad files
	 * @return array of regular expressions, potentially empty
	 */
	public static function regexesFromText( $source, $fileName=false ) {
		return self::buildSafeRegexes( explode( "\n", $source ), $fileName );
	}

	/**
	 * Build a set of regular expressions from a MediaWiki message.
	 * Will be correctly empty if the message isn't present.
	 *
	 * @param $message string
	 * @return array of regular expressions, potentially empty
	 */
	public static function regexesFromMessage( $message ) {
		$source = wfMsgForContent( $message );
		if ( !$source || wfEmptyMsg( $message, $source ) ) {
			return array();
		}
		return self::regexesFromText( $source );
	}
}
| 172 | + |
Property changes on: trunk/extensions/SpamBlacklist/SpamRegexBatch.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 173 | + native |
Index: trunk/extensions/SpamBlacklist/BaseBlacklist.php |
— | — | @@ -0,0 +1,336 @@ |
| 2 | +<?php |
| 3 | + |
/**
 * Base class for different kinds of blacklists.
 *
 * Keeps a registry of blacklist types (type code => class name), hands out
 * one shared instance per type, and implements loading and caching of the
 * regex lists. Subclasses supply the type code via getBlacklistType().
 */
abstract class BaseBlacklist {

	/**
	 * Array of blacklist sources. Each entry is one of:
	 *  - "DB: <dbname> <page title>"  (see getArticleText())
	 *  - an http:// or https:// URL   (see getHttpText())
	 *  - anything else, passed to file_get_contents()
	 *
	 * @var array
	 */
	public $files = array();

	/**
	 * Array containing regexes to test against; false until first loaded
	 *
	 * @var bool|array
	 */
	protected $regexes = false;

	/**
	 * Upper bound passed to mt_rand() in getHttpText(): while the warning key
	 * is absent, a request refreshes the cached HTTP text when
	 * mt_rand( 0, $warningChance ) returns 0.
	 *
	 * @var int
	 */
	public $warningChance = 100;

	/**
	 * Expiry passed to $messageMemc->set() for the warning key in getHttpText()
	 *
	 * @var int
	 */
	public $warningTime = 600;

	/**
	 * Expiry passed to the cache for fetched HTTP text and for the built
	 * shared regexes
	 *
	 * @var int
	 */
	public $expiryTime = 900;

	/**
	 * Array containing blacklists that extend BaseBlacklist
	 * (type code => class name)
	 *
	 * @var array
	 */
	private static $blacklistTypes = array(
		'spam' => 'SpamBlacklist',
	);

	/**
	 * Array of blacklist instances, keyed by type code
	 *
	 * @var array
	 */
	private static $instances = array();

	/**
	 * Constructor. Every key of $settings is copied onto the property of
	 * the same name.
	 *
	 * @param array $settings
	 */
	function __construct( $settings = array() ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Adds a blacklist class to the registry
	 *
	 * @param $type string
	 * @param $class string
	 */
	public static function addBlacklistType( $type, $class ) {
		self::$blacklistTypes[$type] = $class;
	}

	/**
	 * Return the array of blacklist types currently defined
	 *
	 * @return array
	 */
	public static function getBlacklistTypes() {
		return self::$blacklistTypes;
	}

	/**
	 * Returns an instance of the given blacklist, creating it from
	 * $wgBlacklistSettings[$type] on first use.
	 *
	 * @param $type string Code for the blacklist
	 * @return BaseBlacklist
	 * @throws MWException when $type is not a registered blacklist type
	 */
	public static function getInstance( $type ) {
		if ( !isset( self::$blacklistTypes[$type] ) ) {
			throw new MWException( "Invalid blacklist type '$type' passed to " . __METHOD__ );
		}

		if ( !isset( self::$instances[$type] ) ) {
			global $wgBlacklistSettings;

			// Prevent notices
			if ( !isset( $wgBlacklistSettings[$type] ) ) {
				$wgBlacklistSettings[$type] = array();
			}

			// Resolve the class name first; this avoids relying on how
			// "new self::$array[$key]( ... )" is parsed.
			$class = self::$blacklistTypes[$type];
			self::$instances[$type] = new $class( $wgBlacklistSettings[$type] );
		}

		return self::$instances[$type];
	}

	/**
	 * Returns the code for the blacklist implementation
	 *
	 * @return string
	 */
	abstract protected function getBlacklistType();

	/**
	 * Check if the given local page title is a regex source for any
	 * registered blacklist type.
	 *
	 * A title qualifies when it is MediaWiki:<Type>-blacklist or
	 * MediaWiki:<Type>-whitelist, or when one of the configured 'files'
	 * entries refers to it through a "DB:" entry for this wiki or through
	 * its action=raw URL.
	 *
	 * @param Title $title
	 * @return bool
	 */
	public static function isLocalSource( $title ) {
		global $wgDBname, $wgBlacklistSettings;

		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			$sources = array();
			foreach ( array_keys( self::$blacklistTypes ) as $type ) {
				$type = ucfirst( $type );
				// Append rather than "+=": the array union operator keeps the
				// existing 0/1 keys and would drop every type after the first.
				$sources[] = "$type-blacklist";
				$sources[] = "$type-whitelist";
			}

			if ( in_array( $title->getDBkey(), $sources, true ) ) {
				return true;
			}
		}

		$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		$files = array();
		foreach ( array_keys( self::$blacklistTypes ) as $type ) {
			if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
				// array_merge() for the same reason as above: lists from
				// different types share numeric keys.
				$files = array_merge( $files, $wgBlacklistSettings[$type]['files'] );
			}
		}

		foreach ( $files as $fileName ) {
			$matches = array();
			// Same DB-name pattern as buildSharedBlacklists(), so names
			// containing '-' are recognised here too.
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				if ( $wgDBname == $matches[1] ) {
					if ( $matches[2] == $title->getPrefixedDbKey() ) {
						// Local DB fetch of this page...
						return true;
					}
				}
			} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 * Will be cached locally across multiple invocations.
	 * @return array set of regular expressions, potentially empty.
	 */
	function getBlacklists() {
		if ( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists() );
		}
		return $this->regexes;
	}

	/**
	 * Returns the local blacklist, read from the "<type>-blacklist" message
	 *
	 * @return array Regular expressions
	 */
	public function getLocalBlacklists() {
		return SpamRegexBatch::regexesFromMessage( "{$this->getBlacklistType()}-blacklist" );
	}

	/**
	 * Returns the (local) whitelist, read from the "<type>-whitelist" message
	 *
	 * @return array Regular expressions
	 */
	public function getWhitelists() {
		return SpamRegexBatch::regexesFromMessage( "{$this->getBlacklistType()}-whitelist" );
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 *
	 * The built regexes are stored in $wgMemc under a per-wiki, per-type key
	 * for $expiryTime seconds.
	 *
	 * @return array
	 */
	function getSharedBlacklists() {
		global $wgMemc, $wgDBname;
		$listType = $this->getBlacklistType();
		$fname = 'SpamBlacklist::getRegex';
		wfProfileIn( $fname );

		wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

		if ( count( $this->files ) == 0 ) {
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			wfProfileOut( $fname );
			return array();
		}

		// This used to be cached per-site, but that could be bad on a shared
		// server where not all wikis have the same configuration.
		$cacheKey = "$wgDBname:{$listType}_blacklist_regexes";
		$cachedRegexes = $wgMemc->get( $cacheKey );
		if ( is_array( $cachedRegexes ) ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
			wfProfileOut( $fname );
			return $cachedRegexes;
		}

		$regexes = $this->buildSharedBlacklists();
		$wgMemc->set( $cacheKey, $regexes, $this->expiryTime );

		// Close the profiling section on the cache-miss path as well.
		wfProfileOut( $fname );
		return $regexes;
	}

	/**
	 * Delete this blacklist type's cached shared regexes from $wgMemc.
	 */
	function clearCache() {
		global $wgMemc, $wgDBname;
		$listType = $this->getBlacklistType();

		$wgMemc->delete( "$wgDBname:{$listType}_blacklist_regexes" );
		wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
	}

	/**
	 * Load every source in $this->files and convert it to regexes.
	 *
	 * @return array Regular expressions, potentially empty
	 */
	function buildSharedBlacklists() {
		$regexes = array();
		$listType = $this->getBlacklistType();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
		foreach ( $this->files as $fileName ) {
			$matches = array();
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^https?:\/\//', $fileName ) ) {
				// https sources take the cached HTTP path too, instead of
				// falling through to an uncached file_get_contents().
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
			}

			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge( $regexes,
				SpamRegexBatch::regexesFromText( $text, $fileName ) );
		}

		return $regexes;
	}

	/**
	 * Fetch a blacklist over HTTP, going through $messageMemc first.
	 *
	 * @param $fileName string URL of the list
	 * @return string|bool List text, or false when the fetch failed
	 */
	function getHttpText( $fileName ) {
		global $wgDBname, $messageMemc;
		$listType = $this->getBlacklistType();

		# HTTP request
		# To keep requests to a minimum, we save results into $messageMemc, which is
		# similar to $wgMemc except almost certain to exist. By default, it is stored
		# in the database
		#
		# There are two keys, when the warning key expires, a random thread will refresh
		# the real key. This reduces the chance of multiple requests under high traffic
		# conditions.
		$key = "{$listType}_blacklist_file:$fileName";
		$warningKey = "$wgDBname:{$listType}filewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
			$httpText = Http::get( $fileName );
			if ( $httpText === false ) {
				wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
			}
			$messageMemc->set( $warningKey, 1, $this->warningTime );
			$messageMemc->set( $key, $httpText, $this->expiryTime );
		} else {
			wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
		}
		return $httpText;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 * This is probably *very* fragile, and shouldn't be used perhaps.
	 *
	 * The connection is switched to $db for the lookup and switched back to
	 * $wgDBname before returning. An unparsable title yields an empty string.
	 *
	 * @param string $db
	 * @param string $article
	 * @return string Article text, or '' when nothing could be loaded
	 */
	function getArticleText( $db, $article ) {
		global $wgDBname;
		wfDebugLog( 'SpamBlacklist', "Fetching {$this->getBlacklistType()} spam blacklist from '$article' on '$db'...\n" );
		$dbr = wfGetDB( DB_READ );
		$dbr->selectDB( $db );
		$text = false;
		$title = Title::newFromText( $article );
		if ( !$title ) {
			// Title::newFromText() gave nothing usable; skip the lookup
			// instead of calling methods on a non-object.
			wfDebugLog( 'SpamBlacklist', "Invalid title '$article' for blacklist source on '$db'\n" );
		} elseif ( $dbr->tableExists( 'page' ) ) {
			// 1.5 schema
			// NOTE(review): $dbw looks redundant with $dbr if wfGetDB() hands
			// back a shared handle; kept as-is, confirm before removing.
			$dbw = wfGetDB( DB_READ );
			$dbw->selectDB( $db );
			$revision = Revision::newFromTitle( $title );
			if ( $revision ) {
				$text = $revision->getText();
			}
			$dbw->selectDB( $wgDBname );
		} else {
			// 1.4 schema
			$text = $dbr->selectField( 'cur', 'cur_text', array( 'cur_namespace' => $title->getNamespace(),
				'cur_title' => $title->getDBkey() ), __METHOD__ );
		}
		$dbr->selectDB( $wgDBname );
		return strval( $text );
	}

}
Property changes on: trunk/extensions/SpamBlacklist/BaseBlacklist.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 338 | + native |
Index: trunk/extensions/SpamBlacklist/SpamBlacklist_body.php |
— | — | @@ -4,177 +4,20 @@ |
5 | 5 | exit; |
6 | 6 | } |
7 | 7 | |
8 | | -class SpamBlacklist { |
9 | | - var $regexes = false; |
| 8 | +class SpamBlacklist extends BaseBlacklist { |
10 | 9 | var $files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" ); |
11 | | - var $warningTime = 600; |
12 | | - var $expiryTime = 900; |
13 | | - var $warningChance = 100; |
14 | 10 | var $ignoreEditSummary = false; |
15 | 11 | |
16 | | - function __construct( $settings = array() ) { |
17 | | - foreach ( $settings as $name => $value ) { |
18 | | - $this->$name = $value; |
19 | | - } |
20 | | - } |
21 | | - |
22 | 12 | /** |
23 | | - * Check if the given local page title is a spam regex source. |
24 | | - * @param Title $title |
25 | | - * @return bool |
| 13 | + * Returns the code for the blacklist implementation |
| 14 | + * |
| 15 | + * @return string |
26 | 16 | */ |
27 | | - function isLocalSource( $title ) { |
28 | | - global $wgDBname; |
29 | | - |
30 | | - if( $title->getNamespace() == NS_MEDIAWIKI ) { |
31 | | - $sources = array( |
32 | | - "Spam-blacklist", |
33 | | - "Spam-whitelist" ); |
34 | | - if( in_array( $title->getDBkey(), $sources ) ) { |
35 | | - return true; |
36 | | - } |
37 | | - } |
38 | | - |
39 | | - $thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP ); |
40 | | - $thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/'; |
41 | | - |
42 | | - foreach( $this->files as $fileName ) { |
43 | | - $matches = array(); |
44 | | - if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches ) ) { |
45 | | - if ( $wgDBname == $matches[1] ) { |
46 | | - if( $matches[2] == $title->getPrefixedDbKey() ) { |
47 | | - // Local DB fetch of this page... |
48 | | - return true; |
49 | | - } |
50 | | - } |
51 | | - } elseif( preg_match( $thisHttpRegex, $fileName ) ) { |
52 | | - // Raw view of this page |
53 | | - return true; |
54 | | - } |
55 | | - } |
56 | | - |
57 | | - return false; |
| 17 | + protected function getBlacklistType() { |
| 18 | + return 'spam'; |
58 | 19 | } |
59 | 20 | |
60 | 21 | /** |
61 | | - * Fetch local and (possibly cached) remote blacklists. |
62 | | - * Will be cached locally across multiple invocations. |
63 | | - * @return array set of regular expressions, potentially empty. |
64 | | - */ |
65 | | - function getBlacklists() { |
66 | | - if( $this->regexes === false ) { |
67 | | - $this->regexes = array_merge( |
68 | | - $this->getLocalBlacklists(), |
69 | | - $this->getSharedBlacklists() ); |
70 | | - } |
71 | | - return $this->regexes; |
72 | | - } |
73 | | - |
74 | | - /** |
75 | | - * Fetch (possibly cached) remote blacklists. |
76 | | - * @return array |
77 | | - */ |
78 | | - function getSharedBlacklists() { |
79 | | - global $wgMemc, $wgDBname; |
80 | | - $fname = 'SpamBlacklist::getRegex'; |
81 | | - wfProfileIn( $fname ); |
82 | | - |
83 | | - wfDebugLog( 'SpamBlacklist', "Loading spam regex..." ); |
84 | | - |
85 | | - if ( count( $this->files ) == 0 ){ |
86 | | - # No lists |
87 | | - wfDebugLog( 'SpamBlacklist', "no files specified\n" ); |
88 | | - wfProfileOut( $fname ); |
89 | | - return array(); |
90 | | - } |
91 | | - |
92 | | - // This used to be cached per-site, but that could be bad on a shared |
93 | | - // server where not all wikis have the same configuration. |
94 | | - $cachedRegexes = $wgMemc->get( "$wgDBname:spam_blacklist_regexes" ); |
95 | | - if( is_array( $cachedRegexes ) ) { |
96 | | - wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" ); |
97 | | - wfProfileOut( $fname ); |
98 | | - return $cachedRegexes; |
99 | | - } |
100 | | - |
101 | | - $regexes = $this->buildSharedBlacklists(); |
102 | | - $wgMemc->set( "$wgDBname:spam_blacklist_regexes", $regexes, $this->expiryTime ); |
103 | | - |
104 | | - return $regexes; |
105 | | - } |
106 | | - |
107 | | - function clearCache() { |
108 | | - global $wgMemc, $wgDBname; |
109 | | - $wgMemc->delete( "$wgDBname:spam_blacklist_regexes" ); |
110 | | - wfDebugLog( 'SpamBlacklist', "Spam blacklist local cache cleared.\n" ); |
111 | | - } |
112 | | - |
113 | | - function buildSharedBlacklists() { |
114 | | - $regexes = array(); |
115 | | - # Load lists |
116 | | - wfDebugLog( 'SpamBlacklist', "Constructing spam blacklist\n" ); |
117 | | - foreach ( $this->files as $fileName ) { |
118 | | - $matches = array(); |
119 | | - if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) { |
120 | | - $text = $this->getArticleText( $matches[1], $matches[2] ); |
121 | | - } elseif ( preg_match( '/^http:\/\//', $fileName ) ) { |
122 | | - $text = $this->getHttpText( $fileName ); |
123 | | - } else { |
124 | | - $text = file_get_contents( $fileName ); |
125 | | - wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" ); |
126 | | - } |
127 | | - |
128 | | - // Build a separate batch of regexes from each source. |
129 | | - // While in theory we could squeeze a little efficiency |
130 | | - // out of combining multiple sources in one regex, if |
131 | | - // there's a bad line in one of them we'll gain more |
132 | | - // from only having to break that set into smaller pieces. |
133 | | - $regexes = array_merge( $regexes, |
134 | | - SpamRegexBatch::regexesFromText( $text, $fileName ) ); |
135 | | - } |
136 | | - |
137 | | - return $regexes; |
138 | | - } |
139 | | - |
140 | | - function getHttpText( $fileName ) { |
141 | | - global $wgDBname, $messageMemc; |
142 | | - |
143 | | - # HTTP request |
144 | | - # To keep requests to a minimum, we save results into $messageMemc, which is |
145 | | - # similar to $wgMemc except almost certain to exist. By default, it is stored |
146 | | - # in the database |
147 | | - # |
148 | | - # There are two keys, when the warning key expires, a random thread will refresh |
149 | | - # the real key. This reduces the chance of multiple requests under high traffic |
150 | | - # conditions. |
151 | | - $key = "spam_blacklist_file:$fileName"; |
152 | | - $warningKey = "$wgDBname:spamfilewarning:$fileName"; |
153 | | - $httpText = $messageMemc->get( $key ); |
154 | | - $warning = $messageMemc->get( $warningKey ); |
155 | | - |
156 | | - if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) { |
157 | | - wfDebugLog( 'SpamBlacklist', "Loading spam blacklist from $fileName\n" ); |
158 | | - $httpText = Http::get( $fileName ); |
159 | | - if( $httpText === false ) { |
160 | | - wfDebugLog( 'SpamBlacklist', "Error loading blacklist from $fileName\n" ); |
161 | | - } |
162 | | - $messageMemc->set( $warningKey, 1, $this->warningTime ); |
163 | | - $messageMemc->set( $key, $httpText, $this->expiryTime ); |
164 | | - } else { |
165 | | - wfDebugLog( 'SpamBlacklist', "Got spam blacklist from HTTP cache for $fileName\n" ); |
166 | | - } |
167 | | - return $httpText; |
168 | | - } |
169 | | - |
170 | | - static function getLocalBlacklists() { |
171 | | - return SpamRegexBatch::regexesFromMessage( 'spam-blacklist' ); |
172 | | - } |
173 | | - |
174 | | - static function getWhitelists() { |
175 | | - return SpamRegexBatch::regexesFromMessage( 'spam-whitelist' ); |
176 | | - } |
177 | | - |
178 | | - /** |
179 | 22 | * @param Title $title |
180 | 23 | * @param string $text Text of section, or entire text if $editPage!=false |
181 | 24 | * @param string $section Section number or name |
— | — | @@ -183,14 +26,18 @@ |
184 | 27 | * @return Matched text if the edit should not be allowed, false otherwise |
185 | 28 | */ |
186 | 29 | function filter( &$title, $text, $section, $editsummary = '', EditPage &$editPage = null ) { |
| 30 | + /** |
| 31 | + * @var $wgParser Parser |
| 32 | + */ |
187 | 33 | global $wgParser, $wgUser; |
188 | 34 | |
189 | 35 | $fname = 'wfSpamBlacklistFilter'; |
190 | 36 | wfProfileIn( $fname ); |
191 | 37 | |
192 | | - $this->title = $title; |
193 | | - $this->text = $text; |
194 | | - $this->section = $section; |
| 38 | + # These don't do anything, commenting out... |
| 39 | + #$this->title = $title; |
| 40 | + #$this->text = $text; |
| 41 | + #$this->section = $section; |
195 | 42 | $text = str_replace( '．', '.', $text ); //@bug 12896 |
196 | 43 | |
197 | 44 | $blacklists = $this->getBlacklists(); |
— | — | @@ -267,6 +114,8 @@ |
268 | 115 | * ignore them on a second run. |
269 | 116 | * |
270 | 117 | * WARNING: I can add more *of the same link* with no problem here. |
| 118 | + * @param $title Title |
| 119 | + * @return array |
271 | 120 | */ |
272 | 121 | function getCurrentLinks( $title ) { |
273 | 122 | $dbr = wfGetDB( DB_SLAVE ); |
— | — | @@ -279,250 +128,4 @@ |
280 | 129 | } |
281 | 130 | return $links; |
282 | 131 | } |
283 | | - |
284 | | - /** |
285 | | - * Fetch an article from this or another local MediaWiki database. |
286 | | - * This is probably *very* fragile, and shouldn't be used perhaps. |
287 | | - * @param string $db |
288 | | - * @param string $article |
289 | | - */ |
290 | | - function getArticleText( $db, $article ) { |
291 | | - wfDebugLog( 'SpamBlacklist', "Fetching local spam blacklist from '$article' on '$db'...\n" ); |
292 | | - global $wgDBname; |
293 | | - $dbr = wfGetDB( DB_READ ); |
294 | | - $dbr->selectDB( $db ); |
295 | | - $text = false; |
296 | | - if ( $dbr->tableExists( 'page' ) ) { |
297 | | - // 1.5 schema |
298 | | - $dbw = wfGetDB( DB_READ ); |
299 | | - $dbw->selectDB( $db ); |
300 | | - $revision = Revision::newFromTitle( Title::newFromText( $article ) ); |
301 | | - if ( $revision ) { |
302 | | - $text = $revision->getText(); |
303 | | - } |
304 | | - $dbw->selectDB( $wgDBname ); |
305 | | - } else { |
306 | | - // 1.4 schema |
307 | | - $title = Title::newFromText( $article ); |
308 | | - $text = $dbr->selectField( 'cur', 'cur_text', array( 'cur_namespace' => $title->getNamespace(), |
309 | | - 'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleText' ); |
310 | | - } |
311 | | - $dbr->selectDB( $wgDBname ); |
312 | | - return strval( $text ); |
313 | | - } |
314 | | - |
315 | | - /** |
316 | | - * Confirm that a local blacklist page being saved is valid, |
317 | | - * and toss back a warning to the user if it isn't. |
318 | | - * This is an EditFilter hook. |
319 | | - */ |
320 | | - function validate( $editPage, $text, $section, &$hookError ) { |
321 | | - $thisPageName = $editPage->mTitle->getPrefixedDBkey(); |
322 | | - |
323 | | - if( !$this->isLocalSource( $editPage->mTitle ) ) { |
324 | | - wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" ); |
325 | | - return true; |
326 | | - } |
327 | | - |
328 | | - $lines = explode( "\n", $text ); |
329 | | - |
330 | | - $badLines = SpamRegexBatch::getBadLines( $lines ); |
331 | | - if( $badLines ) { |
332 | | - wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " . |
333 | | - implode( ', ', $badLines ) . "\n" ); |
334 | | - |
335 | | - $badList = "*<tt>" . |
336 | | - implode( "</tt>\n*<tt>", |
337 | | - array_map( 'wfEscapeWikiText', $badLines ) ) . |
338 | | - "</tt>\n"; |
339 | | - $hookError = |
340 | | - "<div class='errorbox'>" . |
341 | | - wfMsgExt( 'spam-invalid-lines', array( 'parsemag' ), count( $badLines ) ) . "<br />" . |
342 | | - $badList . |
343 | | - "</div>\n" . |
344 | | - "<br clear='all' />\n"; |
345 | | - return true; |
346 | | - } else { |
347 | | - wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" ); |
348 | | - return true; |
349 | | - } |
350 | | - } |
351 | | - |
352 | | - function onArticleSave( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) { |
353 | | - if( $this->isLocalSource( $article->getTitle() ) ) { |
354 | | - $this->clearCache(); |
355 | | - } |
356 | | - return true; |
357 | | - } |
358 | | -} |
359 | | - |
360 | | - |
361 | | -class SpamRegexBatch { |
362 | | - /** |
363 | | - * Build a set of regular expressions matching URLs with the list of regex fragments. |
364 | | - * Returns an empty list if the input list is empty. |
365 | | - * |
366 | | - * @param array $lines list of fragments which will match in URLs |
367 | | - * @param int $batchSize largest allowed batch regex; |
368 | | - * if 0, will produce one regex per line |
369 | | - * @return array |
370 | | - * @private |
371 | | - * @static |
372 | | - */ |
373 | | - static function buildRegexes( $lines, $batchSize=4096 ) { |
374 | | - # Make regex |
375 | | - # It's faster using the S modifier even though it will usually only be run once |
376 | | - //$regex = 'https?://+[a-z0-9_\-.]*(' . implode( '|', $lines ) . ')'; |
377 | | - //return '/' . str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $regex) ) . '/Sim'; |
378 | | - $regexes = array(); |
379 | | - $regexStart = '/(?:https?:)?\/\/+[a-z0-9_\-.]*('; |
380 | | - $regexEnd = ($batchSize > 0 ) ? ')/Sim' : ')/im'; |
381 | | - $build = false; |
382 | | - foreach( $lines as $line ) { |
383 | | - if( substr( $line, -1, 1 ) == "\\" ) { |
384 | | - // Final \ will break silently on the batched regexes. |
385 | | - // Skip it here to avoid breaking the next line; |
386 | | - // warnings from getBadLines() will still trigger on |
387 | | - // edit to keep new ones from floating in. |
388 | | - continue; |
389 | | - } |
390 | | - // FIXME: not very robust size check, but should work. :) |
391 | | - if( $build === false ) { |
392 | | - $build = $line; |
393 | | - } elseif( strlen( $build ) + strlen( $line ) > $batchSize ) { |
394 | | - $regexes[] = $regexStart . |
395 | | - str_replace( '/', '\/', preg_replace('|\\\*/|u', '/', $build) ) . |
396 | | - $regexEnd; |
397 | | - $build = $line; |
398 | | - } else { |
399 | | - $build .= '|'; |
400 | | - $build .= $line; |
401 | | - } |
402 | | - } |
403 | | - if( $build !== false ) { |
404 | | - $regexes[] = $regexStart . |
405 | | - str_replace( '/', '\/', preg_replace('|\\\*/|u', '/', $build) ) . |
406 | | - $regexEnd; |
407 | | - } |
408 | | - return $regexes; |
409 | | - } |
410 | | - |
411 | | - /** |
412 | | - * Confirm that a set of regexes is either empty or valid. |
413 | | - * @param array $lines set of regexes |
414 | | - * @return bool true if ok, false if contains invalid lines |
415 | | - * @private |
416 | | - * @static |
417 | | - */ |
418 | | - static function validateRegexes( $regexes ) { |
419 | | - foreach( $regexes as $regex ) { |
420 | | - wfSuppressWarnings(); |
421 | | - $ok = preg_match( $regex, '' ); |
422 | | - wfRestoreWarnings(); |
423 | | - |
424 | | - if( $ok === false ) { |
425 | | - return false; |
426 | | - } |
427 | | - } |
428 | | - return true; |
429 | | - } |
430 | | - |
431 | | - /** |
432 | | - * Strip comments and whitespace, then remove blanks |
433 | | - * @private |
434 | | - * @static |
435 | | - */ |
436 | | - static function stripLines( $lines ) { |
437 | | - return array_filter( |
438 | | - array_map( 'trim', |
439 | | - preg_replace( '/#.*$/', '', |
440 | | - $lines ) ) ); |
441 | | - } |
442 | | - |
443 | | - /** |
444 | | - * Do a sanity check on the batch regex. |
445 | | - * @param lines unsanitized input lines |
446 | | - * @param string $fileName optional for debug reporting |
447 | | - * @return array of regexes |
448 | | - * @private |
449 | | - * @static |
450 | | - */ |
451 | | - static function buildSafeRegexes( $lines, $fileName=false ) { |
452 | | - $lines = SpamRegexBatch::stripLines( $lines ); |
453 | | - $regexes = SpamRegexBatch::buildRegexes( $lines ); |
454 | | - if( SpamRegexBatch::validateRegexes( $regexes ) ) { |
455 | | - return $regexes; |
456 | | - } else { |
457 | | - // _Something_ broke... rebuild line-by-line; it'll be |
458 | | - // slower if there's a lot of blacklist lines, but one |
459 | | - // broken line won't take out hundreds of its brothers. |
460 | | - if( $fileName ) { |
461 | | - wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" ); |
462 | | - } |
463 | | - return SpamRegexBatch::buildRegexes( $lines, 0 ); |
464 | | - } |
465 | | - } |
466 | | - |
467 | | - /** |
468 | | - * @param array $lines |
469 | | - * @return array of input lines which produce invalid input, or empty array if no problems |
470 | | - * @static |
471 | | - */ |
472 | | - static function getBadLines( $lines ) { |
473 | | - $lines = SpamRegexBatch::stripLines( $lines ); |
474 | | - |
475 | | - $badLines = array(); |
476 | | - foreach( $lines as $line ) { |
477 | | - if( substr( $line, -1, 1 ) == "\\" ) { |
478 | | - // Final \ will break silently on the batched regexes. |
479 | | - $badLines[] = $line; |
480 | | - } |
481 | | - } |
482 | | - |
483 | | - $regexes = SpamRegexBatch::buildRegexes( $lines ); |
484 | | - if( SpamRegexBatch::validateRegexes( $regexes ) ) { |
485 | | - // No other problems! |
486 | | - return $badLines; |
487 | | - } |
488 | | - |
489 | | - // Something failed in the batch, so check them one by one. |
490 | | - foreach( $lines as $line ) { |
491 | | - $regexes = SpamRegexBatch::buildRegexes( array( $line ) ); |
492 | | - if( !SpamRegexBatch::validateRegexes( $regexes ) ) { |
493 | | - $badLines[] = $line; |
494 | | - } |
495 | | - } |
496 | | - return $badLines; |
497 | | - } |
498 | | - |
499 | | - /** |
500 | | - * Build a set of regular expressions from the given multiline input text, |
501 | | - * with empty lines and comments stripped. |
502 | | - * |
503 | | - * @param string $source |
504 | | - * @param string $fileName optional, for reporting of bad files |
505 | | - * @return array of regular expressions, potentially empty |
506 | | - * @static |
507 | | - */ |
508 | | - static function regexesFromText( $source, $fileName=false ) { |
509 | | - $lines = explode( "\n", $source ); |
510 | | - return SpamRegexBatch::buildSafeRegexes( $lines, $fileName ); |
511 | | - } |
512 | | - |
513 | | - /** |
514 | | - * Build a set of regular expressions from a MediaWiki message. |
515 | | - * Will be correctly empty if the message isn't present. |
516 | | - * @param string $source |
517 | | - * @return array of regular expressions, potentially empty |
518 | | - * @static |
519 | | - */ |
520 | | - static function regexesFromMessage( $message ) { |
521 | | - $source = wfMsgForContent( $message ); |
522 | | - if( $source && !wfEmptyMsg( $message, $source ) ) { |
523 | | - return SpamRegexBatch::regexesFromText( $source ); |
524 | | - } else { |
525 | | - return array(); |
526 | | - } |
527 | | - } |
528 | | -} |
529 | | - |
| 132 | +} |
\ No newline at end of file |