Index: trunk/extensions/SpamBlacklist/SpamBlacklist.php |
— | — | @@ -28,6 +28,9 @@ |
29 | 29 | |
30 | 30 | $wgExtensionFunctions[] = 'wfSpamBlacklistMessageLoader'; |
31 | 31 | |
| 32 | +$wgHooks['EditFilter'][] = 'wfSpamBlacklistValidate'; |
| 33 | +$wgHooks['ArticleSaveComplete'][] = 'wfSpamBlacklistClearCache'; |
| 34 | + |
32 | 35 | function wfSpamBlacklistMessageLoader() { |
33 | 36 | global $wgMessageCache; |
34 | 37 | require_once( 'SpamBlacklist.i18n.php' ); |
— | — | @@ -37,21 +40,73 @@ |
38 | 41 | } |
39 | 42 | |
40 | 43 | function wfSpamBlacklistLoader( &$title, $text, $section ) { |
41 | | - require_once( "SpamBlacklist_body.php" ); |
42 | 44 | static $spamObj = false; |
43 | | - global $wgSpamBlacklistFiles, $wgSpamBlacklistSettings, $wgPreSpamFilterCallback; |
44 | 45 | |
45 | 46 | if ( $spamObj === false ) { |
46 | | - $spamObj = new SpamBlacklist( $wgSpamBlacklistSettings ); |
47 | | - if ( $wgSpamBlacklistFiles ) { |
48 | | - $spamObj->files = $wgSpamBlacklistFiles; |
49 | | - $spamObj->previousFilter = $wgPreSpamFilterCallback; |
50 | | - } |
| 47 | + $spamObj = wfSpamBlacklistObject(); |
51 | 48 | } |
52 | 49 | |
53 | 50 | return $spamObj->filter( $title, $text, $section ); |
54 | 51 | } |
55 | 52 | |
56 | | -} # End invocation guard |
| 53 | +function wfSpamBlacklistObject() { |
| 54 | + require_once( "SpamBlacklist_body.php" ); |
| 55 | + global $wgSpamBlacklistFiles, $wgSpamBlacklistSettings, $wgPreSpamFilterCallback; |
| 56 | + $spamObj = new SpamBlacklist( $wgSpamBlacklistSettings ); |
| 57 | + if( $wgSpamBlacklistFiles ) { |
| 58 | + $spamObj->files = $wgSpamBlacklistFiles; |
| 59 | + } |
| 60 | + $spamObj->previousFilter = $wgPreSpamFilterCallback; |
| 61 | + return $spamObj; |
| 62 | +} |
57 | 63 | |
| 64 | +/** |
| 65 | + * Confirm that a local blacklist page being saved is valid, |
| 66 | + * and toss back a warning to the user if it isn't. |
| 67 | + */ |
| 68 | +function wfSpamBlacklistValidate( $editPage, $text, $section, &$hookError ) { |
| 69 | + $thisPageName = $editPage->mTitle->getPrefixedDBkey(); |
| 70 | + |
| 71 | + $spamObj = wfSpamBlacklistObject(); |
| 72 | + if( !$spamObj->isLocalSource( $editPage->mTitle ) ) { |
| 73 | + wfDebug( "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" ); |
| 74 | + return true; |
| 75 | + } |
| 76 | + |
| 77 | + $lines = explode( "\n", $text ); |
| 78 | + |
| 79 | + $badLines = SpamRegexBatch::getBadLines( $lines ); |
| 80 | + if( $badLines ) { |
| 81 | + wfDebug( "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " . |
| 82 | + implode( ', ', $badLines ) . "\n" ); |
58 | 83 | |
| 84 | + $badList = "*<tt>" . |
| 85 | + implode( "</tt>\n*<tt>", |
| 86 | + array_map( 'wfEscapeWikiText', $badLines ) ) . |
| 87 | + "</tt>\n"; |
| 88 | + $hookError = |
| 89 | + "<div class='errorbox'>" . |
| 90 | + wfMsgExt( 'spam-invalid-lines', array( 'parsemag' ), count( $badList ) ) . |
| 91 | + $badList . |
| 92 | + "</div>\n" . |
| 93 | + "<br clear='all' />\n"; |
| 94 | + return true; |
| 95 | + } else { |
| 96 | + wfDebug( "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" ); |
| 97 | + return true; |
| 98 | + } |
| 99 | +} |
| 100 | + |
| 101 | +/** |
| 102 | + * Clear local spam blacklist caches on page save. |
| 103 | + */ |
| 104 | +function wfSpamBlacklistClearCache( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) { |
| 105 | + $spamObj = wfSpamBlacklistObject(); |
| 106 | + if( $spamObj->isLocalSource( $article->getTitle() ) ) { |
| 107 | + $spamObj->clearCache(); |
| 108 | + } |
| 109 | + return true; |
| 110 | +} |
| 111 | + |
| 112 | + |
| 113 | +} # End invocation guard |
Index: trunk/extensions/SpamBlacklist/SpamBlacklist_body.php |
— | — | @@ -5,14 +5,13 @@ |
6 | 6 | class SpamBlacklist { |
7 | 7 | var $regexes = false; |
8 | 8 | var $previousFilter = false; |
9 | | - var $files = array(); |
| 9 | + var $files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" ); |
10 | 10 | var $warningTime = 600; |
11 | 11 | var $expiryTime = 900; |
12 | 12 | var $warningChance = 100; |
13 | 13 | |
14 | 14 | function SpamBlacklist( $settings = array() ) { |
15 | 15 | global $IP; |
16 | | - $this->files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" ); |
17 | 16 | |
18 | 17 | foreach ( $settings as $name => $value ) { |
19 | 18 | $this->$name = $value; |
— | — | @@ -20,32 +19,75 @@ |
21 | 20 | } |
22 | 21 | |
23 | 22 | /** |
| 23 | + * Check if the given local page title is a spam regex source. |
| 24 | + * @param Title $title |
| 25 | + * @return bool |
| 26 | + */ |
| 27 | + function isLocalSource( $title ) { |
| 28 | + global $wgDBname; |
| 29 | + |
| 30 | + if( $title->getNamespace() == NS_MEDIAWIKI ) { |
| 31 | + $sources = array( |
| 32 | + "Spam-blacklist", |
| 33 | + "Spam-whitelist" ); |
| 34 | + if( in_array( $title->getDbKey(), $sources ) ) { |
| 35 | + return true; |
| 36 | + } |
| 37 | + } |
| 38 | + |
| 39 | + $thisHttp = $title->getFullUrl( 'action=raw' ); |
| 40 | + $thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/'; |
| 41 | + |
| 42 | + foreach( $this->files as $fileName ) { |
| 43 | + if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches ) ) { |
| 44 | + if ( $wgDBname == $matches[1] ) { |
| 45 | + $sources[] = $matches[2]; |
| 46 | + if( $matches[2] == $title->getPrefixedDbKey() ) { |
| 47 | + // Local DB fetch of this page... |
| 48 | + return true; |
| 49 | + } |
| 50 | + } |
| 51 | + } elseif( preg_match( $thisHttpRegex, $fileName ) ) { |
| 52 | + // Raw view of this page |
| 53 | + return true; |
| 54 | + } |
| 55 | + } |
| 56 | + |
| 57 | + return false; |
| 58 | + } |
| 59 | + |
| 60 | + /** |
24 | 61 | * @deprecated back-compat |
25 | 62 | */ |
26 | 63 | function getRegexes() { |
27 | 64 | return $this->getBlacklists(); |
28 | 65 | } |
29 | 66 | |
| 67 | + /** |
| 68 | + * Fetch local and (possibly cached) remote blacklists. |
| 69 | + * Will be cached locally across multiple invocations. |
| 70 | + * @return array set of regular expressions, potentially empty. |
| 71 | + */ |
30 | 72 | function getBlacklists() { |
31 | | - return array_merge( |
32 | | - $this->getLocalBlacklists(), |
33 | | - $this->getSharedBlacklists() ); |
| 73 | + if( $this->regexes === false ) { |
| 74 | + $this->regexes = array_merge( |
| 75 | + $this->getLocalBlacklists(), |
| 76 | + $this->getSharedBlacklists() ); |
| 77 | + } |
| 78 | + return $this->regexes; |
34 | 79 | } |
35 | 80 | |
| 81 | + /** |
| 82 | + * Fetch (possibly cached) remote blacklists. |
| 83 | + * @return array |
| 84 | + */ |
36 | 85 | function getSharedBlacklists() { |
37 | | - global $wgMemc, $wgDBname, $messageMemc; |
| 86 | + global $wgMemc, $wgDBname; |
38 | 87 | $fname = 'SpamBlacklist::getRegex'; |
39 | 88 | wfProfileIn( $fname ); |
40 | 89 | |
41 | | - if ( $this->regexes !== false ) { |
42 | | - return $this->pureArray( $this->regexes ); |
43 | | - } |
44 | | - |
45 | 90 | wfDebug( "Loading spam regex..." ); |
46 | 91 | |
47 | | - if ( !is_array( $this->files ) ) { |
48 | | - $this->files = array( $this->files ); |
49 | | - } |
50 | 92 | if ( count( $this->files ) == 0 ){ |
51 | 93 | # No lists |
52 | 94 | wfDebug( "no files specified\n" ); |
— | — | @@ -53,154 +95,91 @@ |
54 | 96 | return array(); |
55 | 97 | } |
56 | 98 | |
57 | | - # Refresh cache if we are saving the blacklist |
58 | | - $recache = false; |
| 99 | + // This used to be cached under one key shared by every wiki, which is bad on a |
| 100 | + // shared server where not all wikis have the same configuration; now keyed per wiki. |
| 101 | + $cachedRegexes = $wgMemc->get( "$wgDBname:spam_blacklist_regexes" ); |
| 102 | + if( is_array( $cachedRegexes ) ) { |
| 103 | + wfDebug( "Got shared spam regexes from cache\n" ); |
| 104 | + wfProfileOut( $fname ); |
| 105 | + return $cachedRegexes; |
| 106 | + } |
| 107 | + |
| 108 | + $regexes = $this->buildSharedBlacklists(); |
| 109 | + $wgMemc->set( "$wgDBname:spam_blacklist_regexes", $regexes, $this->expiryTime ); |
| 110 | + |
| 111 | + return $regexes; |
| 112 | + } |
| 113 | + |
| 114 | + function clearCache() { |
| 115 | + global $wgMemc, $wgDBname; |
| 116 | + $wgMemc->delete( "$wgDBname:spam_blacklist_regexes" ); |
| 117 | + wfDebug( "Spam blacklist local cache cleared.\n" ); |
| 118 | + } |
| 119 | + |
| 120 | + function buildSharedBlacklists() { |
| 121 | + $regexes = array(); |
| 122 | + # Load lists |
| 123 | + wfDebug( "Constructing spam blacklist\n" ); |
59 | 124 | foreach ( $this->files as $fileName ) { |
60 | | - if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches ) ) { |
61 | | - if ( $wgDBname == $matches[1] && $this->title && $this->title->getPrefixedDBkey() == $matches[2] ) { |
62 | | - $recache = true; |
63 | | - break; |
64 | | - } |
| 125 | + if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) { |
| 126 | + $text = $this->getArticleText( $matches[1], $matches[2] ); |
| 127 | + } elseif ( preg_match( '/^http:\/\//', $fileName ) ) { |
| 128 | + $text = $this->getHttpText( $fileName ); |
| 129 | + } else { |
| 130 | + $text = file_get_contents( $fileName ); |
| 131 | + wfDebug( "got from file $fileName\n" ); |
65 | 132 | } |
| 133 | + |
| 134 | + // Build a separate batch of regexes from each source. |
| 135 | + // While in theory we could squeeze a little efficiency |
| 136 | + // out of combining multiple sources in one regex, if |
| 137 | + // there's a bad line in one of them we'll gain more |
| 138 | + // from only having to break that set into smaller pieces. |
| 139 | + $regexes = array_merge( $regexes, |
| 140 | + SpamRegexBatch::regexesFromText( $text, $fileName ) ); |
66 | 141 | } |
| 142 | + |
| 143 | + return $regexes; |
| 144 | + } |
| 145 | + |
| 146 | + function getHttpText( $fileName ) { |
| 147 | + global $wgDBname, $messageMemc; |
| 148 | + |
| 149 | + # HTTP request |
| 150 | + # To keep requests to a minimum, we save results into $messageMemc, which is |
| 151 | + # similar to $wgMemc except almost certain to exist. By default, it is stored |
| 152 | + # in the database |
| 153 | + # |
| 154 | + # There are two keys: when the warning key expires, a random thread will refresh |
| 155 | + # the real key. This reduces the chance of multiple requests under high traffic |
| 156 | + # conditions. |
| 157 | + $key = "spam_blacklist_file:$fileName"; |
| 158 | + $warningKey = "$wgDBname:spamfilewarning:$fileName"; |
| 159 | + $httpText = $messageMemc->get( $key ); |
| 160 | + $warning = $messageMemc->get( $warningKey ); |
67 | 161 | |
68 | | - if ( $this->regexes === false || $recache ) { |
69 | | - if ( !$recache ) { |
70 | | - $this->regexes = $wgMemc->get( "spam_blacklist_regexes" ); |
| 162 | + if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) { |
| 163 | + wfDebug( "Loading spam blacklist from $fileName\n" ); |
| 164 | + $httpText = $this->getHTTP( $fileName ); |
| 165 | + if( $httpText === false ) { |
| 166 | + wfDebug( "Error loading blacklist from $fileName\n" ); |
71 | 167 | } |
72 | | - if ( $this->regexes === false || $this->regexes === null ) { |
73 | | - $lines = array(); |
74 | | - # Load lists |
75 | | - wfDebug( "Constructing spam blacklist\n" ); |
76 | | - foreach ( $this->files as $fileName ) { |
77 | | - if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) { |
78 | | - if ( $wgDBname == $matches[1] && $this->title && $this->title->getPrefixedDBkey() == $matches[2] ) { |
79 | | - wfDebug( "Fetching default local spam blacklist...\n" ); |
80 | | - $lines = array_merge( $lines, explode( "\n", $this->text ) ); |
81 | | - } else { |
82 | | - wfDebug( "Fetching local spam blackist from '{$matches[2]}' on '{$matches[1]}'...\n" ); |
83 | | - $lines = array_merge( $lines, $this->getArticleLines( $matches[1], $matches[2] ) ); |
84 | | - } |
85 | | - wfDebug( "got from DB\n" ); |
86 | | - } elseif ( preg_match( '/^http:\/\//', $fileName ) ) { |
87 | | - # HTTP request |
88 | | - # To keep requests to a minimum, we save results into $messageMemc, which is |
89 | | - # similar to $wgMemc except almost certain to exist. By default, it is stored |
90 | | - # in the database |
91 | | - # |
92 | | - # There are two keys, when the warning key expires, a random thread will refresh |
93 | | - # the real key. This reduces the chance of multiple requests under high traffic |
94 | | - # conditions. |
95 | | - $key = "spam_blacklist_file:$fileName"; |
96 | | - $warningKey = "$wgDBname:spamfilewarning:$fileName"; |
97 | | - $httpText = $messageMemc->get( $key ); |
98 | | - $warning = $messageMemc->get( $warningKey ); |
99 | | - |
100 | | - if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) { |
101 | | - wfDebug( "Loading spam blacklist from $fileName\n" ); |
102 | | - $httpText = $this->getHTTP( $fileName ); |
103 | | - $messageMemc->set( $warningKey, 1, $this->warningTime ); |
104 | | - $messageMemc->set( $key, $httpText, $this->expiryTime ); |
105 | | - } else { |
106 | | - wfDebug( "got from HTTP cache\n" ); |
107 | | - } |
108 | | - $lines = array_merge( $lines, explode( "\n", $httpText ) ); |
109 | | - } else { |
110 | | - $lines = array_merge( $lines, file( $fileName ) ); |
111 | | - wfDebug( "got from file\n" ); |
112 | | - } |
113 | | - } |
114 | | - |
115 | | - $this->regexes = $this->buildRegexes( $lines ); |
116 | | - $wgMemc->set( "spam_blacklist_regexes", $this->regexes, $this->expiryTime ); |
117 | | - } else { |
118 | | - wfDebug( "got from cache\n" ); |
119 | | - } |
| 168 | + $messageMemc->set( $warningKey, 1, $this->warningTime ); |
| 169 | + $messageMemc->set( $key, $httpText, $this->expiryTime ); |
| 170 | + } else { |
| 171 | + wfDebug( "Got spam blacklist from HTTP cache for $fileName\n" ); |
120 | 172 | } |
121 | | - if( $this->regexes !== true && !is_array( $this->regexes ) ) { |
122 | | - // Corrupt regex |
123 | | - wfDebug( "Corrupt regex\n" ); |
124 | | - $this->regexes = false; |
125 | | - } |
126 | | - wfProfileOut( $fname ); |
127 | | - return $this->pureArray( $this->regexes ); |
| 173 | + return $httpText; |
128 | 174 | } |
129 | 175 | |
130 | 176 | function getLocalBlacklists() { |
131 | | - return $this->regexesFromMessage( 'spam-blacklist' ); |
| 177 | + return SpamRegexBatch::regexesFromMessage( 'spam-blacklist' ); |
132 | 178 | } |
133 | 179 | |
134 | 180 | function getWhitelists() { |
135 | | - return $this->regexesFromMessage( 'spam-whitelist' ); |
| 181 | + return SpamRegexBatch::regexesFromMessage( 'spam-whitelist' ); |
136 | 182 | } |
137 | | - |
138 | | - function regexesFromMessage( $message ) { |
139 | | - $lines = $this->linesFromMessage( $message ); |
140 | | - $regexes = $this->buildRegexes( $lines ); |
141 | | - return $this->pureArray( $regexes ); |
142 | | - } |
143 | | - |
144 | | - /// There's some wackiness where booleans are used for |
145 | | - /// indicating empty or invalid input to aid in caching, |
146 | | - /// but when we're called all we want is a damn array. |
147 | | - function pureArray( $regexes ) { |
148 | | - if( is_array( $regexes ) ) { |
149 | | - return $regexes; |
150 | | - } else { |
151 | | - return array(); |
152 | | - } |
153 | | - } |
154 | 183 | |
155 | | - function linesFromMessage( $message ) { |
156 | | - $source = wfMsgForContent( $message ); |
157 | | - if( $source && !wfEmptyMsg( $message, $source ) ) { |
158 | | - return array_filter( explode( "\n", $source ) ); |
159 | | - } else { |
160 | | - return array(); |
161 | | - } |
162 | | - } |
163 | | - |
164 | | - function buildRegexes( $lines ) { |
165 | | - # Strip comments and whitespace, then remove blanks |
166 | | - $lines = array_filter( array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) ) ); |
167 | | - |
168 | | - # No lines, don't make a regex which will match everything |
169 | | - if ( count( $lines ) == 0 ) { |
170 | | - wfDebug( "No lines\n" ); |
171 | | - return true; |
172 | | - } else { |
173 | | - # Make regex |
174 | | - # It's faster using the S modifier even though it will usually only be run once |
175 | | - //$regex = 'http://+[a-z0-9_\-.]*(' . implode( '|', $lines ) . ')'; |
176 | | - //return '/' . str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $regex) ) . '/Si'; |
177 | | - $regexes = array(); |
178 | | - $regexStart = '/http:\/\/+[a-z0-9_\-.]*('; |
179 | | - $regexEnd = ')/Si'; |
180 | | - $regexMax = 4096; |
181 | | - $build = false; |
182 | | - foreach( $lines as $line ) { |
183 | | - // FIXME: not very robust size check, but should work. :) |
184 | | - if( $build === false ) { |
185 | | - $build = $line; |
186 | | - } elseif( strlen( $build ) + strlen( $line ) > $regexMax ) { |
187 | | - $regexes[] = $regexStart . |
188 | | - str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $build) ) . |
189 | | - $regexEnd; |
190 | | - $build = $line; |
191 | | - } else { |
192 | | - $build .= '|'; |
193 | | - $build .= $line; |
194 | | - } |
195 | | - } |
196 | | - if( $build !== false ) { |
197 | | - $regexes[] = $regexStart . |
198 | | - str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $build) ) . |
199 | | - $regexEnd; |
200 | | - } |
201 | | - return $regexes; |
202 | | - } |
203 | | - } |
204 | | - |
205 | 184 | function filter( &$title, $text, $section ) { |
206 | 185 | global $wgArticle, $wgVersion, $wgOut, $wgParser, $wgUser; |
207 | 186 | |
— | — | @@ -245,7 +224,10 @@ |
246 | 225 | " regexes: " . implode( ', ', $blacklists ) . "\n" ); |
247 | 226 | $retVal = false; |
248 | 227 | foreach( $blacklists as $regex ) { |
249 | | - if ( preg_match( $regex, $links, $matches ) ) { |
| 228 | + wfSuppressWarnings(); |
| 229 | + $check = preg_match( $regex, $links, $matches ); |
| 230 | + wfRestoreWarnings(); |
| 231 | + if( $check ) { |
250 | 232 | wfDebug( "Match!\n" ); |
251 | 233 | EditPage::spamPage( $matches[0] ); |
252 | 234 | $retVal = true; |
— | — | @@ -260,7 +242,14 @@ |
261 | 243 | return $retVal; |
262 | 244 | } |
263 | 245 | |
264 | | - function getArticleLines( $db, $article ) { |
| 246 | + /** |
| 247 | + * Fetch an article from this or another local MediaWiki database. |
| 248 | + * This is probably *very* fragile and perhaps shouldn't be used. |
| 249 | + * @param string $db |
| 250 | + * @param string $article |
| 251 | + */ |
| 252 | + function getArticleText( $db, $article ) { |
| 253 | + wfDebug( "Fetching local spam blacklist from '$article' on '$db'...\n" ); |
265 | 254 | global $wgDBname; |
266 | 255 | $dbr = wfGetDB( DB_READ ); |
267 | 256 | $dbr->selectDB( $db ); |
— | — | @@ -276,23 +265,19 @@ |
277 | 266 | $dbw->selectDB( $wgDBname ); |
278 | 267 | } else { |
279 | 268 | // 1.4 schema |
280 | | - $cur = $dbr->tableName( 'cur' ); |
281 | 269 | $title = Title::newFromText( $article ); |
282 | 270 | $text = $dbr->selectField( 'cur', 'cur_text', array( 'cur_namespace' => $title->getNamespace(), |
283 | | - 'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleLines' ); |
| 271 | + 'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleText' ); |
284 | 272 | } |
285 | 273 | $dbr->selectDB( $wgDBname ); |
286 | | - if ( $text !== false ) { |
287 | | - return explode( "\n", $text ); |
288 | | - } else { |
289 | | - return array(); |
290 | | - } |
| 274 | + return strval( $text ); |
291 | 275 | } |
292 | 276 | |
293 | 277 | function getHTTP( $url ) { |
294 | 278 | // Use wfGetHTTP from MW 1.5 if it is available |
295 | 279 | global $IP; |
296 | 280 | include_once( "$IP/includes/HttpFunctions.php" ); |
| 281 | + wfSuppressWarnings(); |
297 | 282 | if ( function_exists( 'wfGetHTTP' ) ) { |
298 | 283 | $text = wfGetHTTP( $url ); |
299 | 284 | } else { |
— | — | @@ -300,10 +285,164 @@ |
301 | 286 | $text = file_get_contents( $url ); |
302 | 287 | ini_set( 'allow_url_fopen', $url_fopen ); |
303 | 288 | } |
| 289 | + wfRestoreWarnings(); |
304 | 290 | return $text; |
305 | 291 | } |
306 | 292 | } |
307 | 293 | |
308 | 294 | |
| 295 | +class SpamRegexBatch { |
| 296 | + /** |
| 297 | + * Build a set of regular expressions matching URLs with the list of regex fragments. |
| 298 | + * Returns an empty list if the input list is empty. |
| 299 | + * |
| 300 | + * @param array $lines list of fragments which will match in URLs |
| 301 | + * @param int $batchSize largest allowed batch regex; |
| 302 | + * if 0, will produce one regex per line |
| 303 | + * @return array |
| 304 | + * @private |
| 305 | + * @static |
| 306 | + */ |
| 307 | + function buildRegexes( $lines, $batchSize=4096 ) { |
| 308 | + # Make regex |
| 309 | + # It's faster using the S modifier even though it will usually only be run once |
| 310 | + //$regex = 'http://+[a-z0-9_\-.]*(' . implode( '|', $lines ) . ')'; |
| 311 | + //return '/' . str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $regex) ) . '/Si'; |
| 312 | + $regexes = array(); |
| 313 | + $regexStart = '/http:\/\/+[a-z0-9_\-.]*('; |
| 314 | + $regexEnd = ($batchSize > 0 ) ? ')/Si' : ')/i'; |
| 315 | + $build = false; |
| 316 | + foreach( $lines as $line ) { |
| 317 | + // FIXME: not very robust size check, but should work. :) |
| 318 | + if( $build === false ) { |
| 319 | + $build = $line; |
| 320 | + } elseif( strlen( $build ) + strlen( $line ) > $batchSize ) { |
| 321 | + $regexes[] = $regexStart . |
| 322 | + str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $build) ) . |
| 323 | + $regexEnd; |
| 324 | + $build = $line; |
| 325 | + } else { |
| 326 | + $build .= '|'; |
| 327 | + $build .= $line; |
| 328 | + } |
| 329 | + } |
| 330 | + if( $build !== false ) { |
| 331 | + $regexes[] = $regexStart . |
| 332 | + str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $build) ) . |
| 333 | + $regexEnd; |
| 334 | + } |
| 335 | + return $regexes; |
| 336 | + } |
| 337 | + |
| 338 | + /** |
| 339 | + * Confirm that a set of regexes is either empty or valid. |
| 340 | + * @param array $regexes set of regexes |
| 341 | + * @return bool true if ok, false if contains invalid lines |
| 342 | + * @private |
| 343 | + * @static |
| 344 | + */ |
| 345 | + function validateRegexes( $regexes ) { |
| 346 | + foreach( $regexes as $regex ) { |
| 347 | + wfSuppressWarnings(); |
| 348 | + $ok = preg_match( $regex, '' ); |
| 349 | + wfRestoreWarnings(); |
| 350 | + |
| 351 | + if( $ok === false ) { |
| 352 | + return false; |
| 353 | + } |
| 354 | + } |
| 355 | + return true; |
| 356 | + } |
| 357 | + |
| 358 | + /** |
| 359 | + * Strip comments and whitespace, then remove blanks |
| 360 | + * @private |
| 361 | + * @static |
| 362 | + */ |
| 363 | + function stripLines( $lines ) { |
| 364 | + return array_filter( |
| 365 | + array_map( 'trim', |
| 366 | + preg_replace( '/#.*$/', '', |
| 367 | + $lines ) ) ); |
| 368 | + } |
| 369 | + |
| 370 | + /** |
| 371 | + * Do a sanity check on the batch regex. |
| 372 | + * @param array $lines unsanitized input lines |
| 373 | + * @param string $fileName optional for debug reporting |
| 374 | + * @return array of regexes |
| 375 | + * @private |
| 376 | + * @static |
| 377 | + */ |
| 378 | + function buildSafeRegexes( $lines, $fileName=false ) { |
| 379 | + $lines = SpamRegexBatch::stripLines( $lines ); |
| 380 | + $regexes = SpamRegexBatch::buildRegexes( $lines ); |
| 381 | + if( SpamRegexBatch::validateRegexes( $regexes ) ) { |
| 382 | + return $regexes; |
| 383 | + } else { |
| 384 | + // _Something_ broke... rebuild line-by-line; it'll be |
| 385 | + // slower if there's a lot of blacklist lines, but one |
| 386 | + // broken line won't take out hundreds of its brothers. |
| 387 | + if( $fileName ) { |
| 388 | + wfDebug( "Spam blacklist warning: bogus line in $fileName\n" ); |
| 389 | + } |
| 390 | + return SpamRegexBatch::buildRegexes( $lines, 0 ); |
| 391 | + } |
| 392 | + } |
| 393 | + |
| 394 | + /** |
| 395 | + * @param array $lines |
| 396 | + * @return array of input lines which produce invalid regexes, or empty array if no problems |
| 397 | + * @static |
| 398 | + */ |
| 399 | + function getBadLines( $lines ) { |
| 400 | + $lines = SpamRegexBatch::stripLines( $lines ); |
| 401 | + $regexes = SpamRegexBatch::buildRegexes( $lines ); |
| 402 | + if( SpamRegexBatch::validateRegexes( $regexes ) ) { |
| 403 | + // No problems! |
| 404 | + return array(); |
| 405 | + } |
| 406 | + |
| 407 | + $badLines = array(); |
| 408 | + foreach( $lines as $line ) { |
| 409 | + $regexes = SpamRegexBatch::buildRegexes( array( $line ) ); |
| 410 | + if( !SpamRegexBatch::validateRegexes( $regexes ) ) { |
| 411 | + $badLines[] = $line; |
| 412 | + } |
| 413 | + } |
| 414 | + return $badLines; |
| 415 | + } |
| 416 | + |
| 417 | + /** |
| 418 | + * Build a set of regular expressions from the given multiline input text, |
| 419 | + * with empty lines and comments stripped. |
| 420 | + * |
| 421 | + * @param string $source |
| 422 | + * @param string $fileName optional, for reporting of bad files |
| 423 | + * @return array of regular expressions, potentially empty |
| 424 | + * @static |
| 425 | + */ |
| 426 | + function regexesFromText( $source, $fileName=false ) { |
| 427 | + $lines = explode( "\n", $source ); |
| 428 | + return SpamRegexBatch::buildSafeRegexes( $lines, $fileName ); |
| 429 | + } |
| 430 | + |
| 431 | + /** |
| 432 | + * Build a set of regular expressions from a MediaWiki message. |
| 433 | + * Will be correctly empty if the message isn't present. |
| 434 | + * @param string $message name of the message to load |
| 435 | + * @return array of regular expressions, potentially empty |
| 436 | + * @static |
| 437 | + */ |
| 438 | + function regexesFromMessage( $message ) { |
| 439 | + $source = wfMsgForContent( $message ); |
| 440 | + if( $source && !wfEmptyMsg( $message, $source ) ) { |
| 441 | + return SpamRegexBatch::regexesFromText( $source ); |
| 442 | + } else { |
| 443 | + return array(); |
| 444 | + } |
| 445 | + } |
| 446 | +} |
| 447 | + |
309 | 448 | } # End invocation guard |
310 | 449 | |
Index: trunk/extensions/SpamBlacklist/README |
— | — | @@ -31,6 +31,9 @@ |
32 | 32 | "DB: wikidb My_spam_blacklist", |
33 | 33 | ); |
34 | 34 | |
| 35 | +The local pages [[MediaWiki:Spam-blacklist]] and [[MediaWiki:Spam-whitelist]] |
| 36 | +will always be used, whatever additional files are listed. |
| 37 | + |
35 | 38 | Compatibility |
36 | 39 | ----------- |
37 | 40 | |
— | — | @@ -78,6 +81,22 @@ |
79 | 82 | caching solution. The SpamBlacklist extension will cache the constructed regex |
80 | 83 | if such a system is present. |
81 | 84 | |
| 85 | +Caching behavior |
| 86 | +---------------- |
| 87 | + |
| 88 | +Blacklist files loaded from remote web sites are cached locally, in the cache |
| 89 | +subsystem used for MediaWiki's localization. (This usually means the objectcache |
| 90 | +table on a default install.) |
| 91 | + |
| 92 | +By default, the list is cached for 15 minutes (if successfully fetched) or |
| 93 | +10 minutes (if the network fetch failed), after which point it will be fetched |
| 94 | +again when next requested. This should strike a decent balance between staying |
| 95 | +up to date and avoiding too-frequent fetches on a busy site. |
| 96 | + |
| 97 | +Fully-processed blacklist data may be cached in memcached or another shared |
| 98 | +memory cache if it's been configured in MediaWiki. |
| 99 | + |
| 100 | + |
82 | 101 | Stability |
83 | 102 | --------- |
84 | 103 | |
Index: trunk/extensions/SpamBlacklist/SpamBlacklist.i18n.php |
— | — | @@ -35,6 +35,10 @@ |
36 | 36 | # * Every non-blank line is a regex fragment which will only match hosts inside URLs |
37 | 37 | |
38 | 38 | #</pre> <!-- leave this line exactly as it is -->', |
| 39 | + 'spam-invalid-lines' => |
| 40 | + "The following spam blacklist {{PLURAL:$1|line is an|lines are}} " . |
| 41 | + " invalid regular {{PLURAL:$1|expression|expressions}} " . |
| 42 | + " and {{PLURAL:$1|needs|need}} to be corrected before saving the page:\n", |
39 | 43 | ), |
40 | 44 | |
41 | 45 | 'id' => array( |