r24288 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r24287 | r24288 | r24289 >
Date:	21:13, 20 July 2007
Author:	brion
Status:	old
Tags:
Comment:	Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
  - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
  - Suppress warnings and be more verbose in the debug log.
  - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
  - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
  - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.
There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile. Test this a bit more before syncing. :)
Modified paths:	/trunk/extensions/SpamBlacklist/README (modified) (history) /trunk/extensions/SpamBlacklist/SpamBlacklist.i18n.php (modified) (history) /trunk/extensions/SpamBlacklist/SpamBlacklist.php (modified) (history) /trunk/extensions/SpamBlacklist/SpamBlacklist_body.php (modified) (history)

Diff [purge]

Index: trunk/extensions/SpamBlacklist/SpamBlacklist.php
—	—	@@ -28,6 +28,9 @@
29	29
30	30	$wgExtensionFunctions[] = 'wfSpamBlacklistMessageLoader';
31	31
	32	+$wgHooks['EditFilter'][] = 'wfSpamBlacklistValidate';
	33	+$wgHooks['ArticleSaveComplete'][] = 'wfSpamBlacklistClearCache';
	34	+
32	35	function wfSpamBlacklistMessageLoader() {
33	36	global $wgMessageCache;
34	37	require_once( 'SpamBlacklist.i18n.php' );
—	—	@@ -37,21 +40,73 @@
38	41	}
39	42
40	43	function wfSpamBlacklistLoader( &$title, $text, $section ) {
41		~~- require_once( "SpamBlacklist_body.php" );~~
42	44	static $spamObj = false;
43		~~- global $wgSpamBlacklistFiles, $wgSpamBlacklistSettings, $wgPreSpamFilterCallback;~~
44	45
45	46	if ( $spamObj === false ) {
46		~~- $spamObj = new SpamBlacklist( $wgSpamBlacklistSettings );~~
47		~~- if ( $wgSpamBlacklistFiles ) {~~
48		~~- $spamObj->files = $wgSpamBlacklistFiles;~~
49		~~- $spamObj->previousFilter = $wgPreSpamFilterCallback;~~
50		~~- }~~
	47	+ $spamObj = wfSpamBlacklistObject();
51	48	}
52	49
53	50	return $spamObj->filter( $title, $text, $section );
54	51	}
55	52
56		~~-} # End invocation guard~~
	53	+function wfSpamBlacklistObject() {
	54	+ require_once( "SpamBlacklist_body.php" );
	55	+ global $wgSpamBlacklistFiles, $wgSpamBlacklistSettings, $wgPreSpamFilterCallback;
	56	+ $spamObj = new SpamBlacklist( $wgSpamBlacklistSettings );
	57	+ if( $wgSpamBlacklistFiles ) {
	58	+ $spamObj->files = $wgSpamBlacklistFiles;
	59	+ }
	60	+ $spamObj->previousFilter = $wgPreSpamFilterCallback;
	61	+ return $spamObj;
	62	+}
57	63
	64	+/**
	65	+ * Confirm that a local blacklist page being saved is valid,
	66	+ * and toss back a warning to the user if it isn't.
	67	+ */
	68	+function wfSpamBlacklistValidate( $editPage, $text, $section, &$hookError ) {
	69	+ $thisPageName = $editPage->mTitle->getPrefixedDBkey();
	70	+
	71	+ $spamObj = wfSpamBlacklistObject();
	72	+ if( !$spamObj->isLocalSource( $editPage->mTitle ) ) {
	73	+ wfDebug( "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" );
	74	+ return true;
	75	+ }
	76	+
	77	+ $lines = explode( "\n", $text );
	78	+
	79	+ $badLines = SpamRegexBatch::getBadLines( $lines );
	80	+ if( $badLines ) {
	81	+ wfDebug( "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " .
	82	+ implode( ', ', $badLines ) . "\n" );
58	83
	84	+ $badList = "*<tt>" .
	85	+ implode( "</tt>\n*<tt>",
	86	+ array_map( 'wfEscapeWikiText', $badLines ) ) .
	87	+ "</tt>\n";
	88	+ $hookError =
	89	+ "<div class='errorbox'>" .
	90	+ wfMsgExt( 'spam-invalid-lines', array( 'parsemag' ), count( $badList ) ) .
	91	+ $badList .
	92	+ "</div>\n" .
	93	+ "<br clear='all' />\n";
	94	+ return true;
	95	+ } else {
	96	+ wfDebug( "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" );
	97	+ return true;
	98	+ }
	99	+}
	100	+
	101	+/**
	102	+ * Clear local spam blacklist caches on page save.
	103	+ */
	104	+function wfSpamBlacklistClearCache( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) {
	105	+ $spamObj = wfSpamBlacklistObject();
	106	+ if( $spamObj->isLocalSource( $article->getTitle() ) ) {
	107	+ $spamObj->clearCache();
	108	+ }
	109	+ return true;
	110	+}
	111	+
	112	+
	113	+} # End invocation guard
Index: trunk/extensions/SpamBlacklist/SpamBlacklist_body.php
—	—	@@ -5,14 +5,13 @@
6	6	class SpamBlacklist {
7	7	var $regexes = false;
8	8	var $previousFilter = false;
9		~~- var $files = array();~~
	9	+ var $files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );
10	10	var $warningTime = 600;
11	11	var $expiryTime = 900;
12	12	var $warningChance = 100;
13	13
14	14	function SpamBlacklist( $settings = array() ) {
15	15	global $IP;
16		~~- $this->files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );~~
17	16
18	17	foreach ( $settings as $name => $value ) {
19	18	$this->$name = $value;
—	—	@@ -20,32 +19,75 @@
21	20	}
22	21
23	22	/**
	23	+ * Check if the given local page title is a spam regex source.
	24	+ * @param Title $title
	25	+ * @return bool
	26	+ */
	27	+ function isLocalSource( $title ) {
	28	+ global $wgDBname;
	29	+
	30	+ if( $title->getNamespace() == NS_MEDIAWIKI ) {
	31	+ $sources = array(
	32	+ "Spam-blacklist",
	33	+ "Spam-whitelist" );
	34	+ if( in_array( $title->getDbKey(), $sources ) ) {
	35	+ return true;
	36	+ }
	37	+ }
	38	+
	39	+ $thisHttp = $title->getFullUrl( 'action=raw' );
	40	+ $thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';
	41	+
	42	+ foreach( $this->files as $fileName ) {
	43	+ if ( preg_match( '/^DB: (\w) (.)$/', $fileName, $matches ) ) {
	44	+ if ( $wgDBname == $matches[1] ) {
	45	+ $sources[] = $matches[2];
	46	+ if( $matches[2] == $title->getPrefixedDbKey() ) {
	47	+ // Local DB fetch of this page...
	48	+ return true;
	49	+ }
	50	+ }
	51	+ } elseif( preg_match( $thisHttpRegex, $fileName ) ) {
	52	+ // Raw view of this page
	53	+ return true;
	54	+ }
	55	+ }
	56	+
	57	+ return false;
	58	+ }
	59	+
	60	+ /**
24	61	* @deprecated back-compat
25	62	*/
26	63	function getRegexes() {
27	64	return $this->getBlacklists();
28	65	}
29	66
	67	+ /**
	68	+ * Fetch local and (possibly cached) remote blacklists.
	69	+ * Will be cached locally across multiple invocations.
	70	+ * @return array set of regular expressions, potentially empty.
	71	+ */
30	72	function getBlacklists() {
31		~~- return array_merge(~~
32		~~- $this->getLocalBlacklists(),~~
33		~~- $this->getSharedBlacklists() );~~
	73	+ if( $this->regexes === false ) {
	74	+ $this->regexes = array_merge(
	75	+ $this->getLocalBlacklists(),
	76	+ $this->getSharedBlacklists() );
	77	+ }
	78	+ return $this->regexes;
34	79	}
35	80
	81	+ /**
	82	+ * Fetch (possibly cached) remote blacklists.
	83	+ * @return array
	84	+ */
36	85	function getSharedBlacklists() {
37		~~- global $wgMemc, $wgDBname, $messageMemc;~~
	86	+ global $wgMemc, $wgDBname;
38	87	$fname = 'SpamBlacklist::getRegex';
39	88	wfProfileIn( $fname );
40	89
41		~~- if ( $this->regexes !== false ) {~~
42		~~- return $this->pureArray( $this->regexes );~~
43		~~- }~~
44		-
45	90	wfDebug( "Loading spam regex..." );
46	91
47		~~- if ( !is_array( $this->files ) ) {~~
48		~~- $this->files = array( $this->files );~~
49		~~- }~~
50	92	if ( count( $this->files ) == 0 ){
51	93	# No lists
52	94	wfDebug( "no files specified\n" );
—	—	@@ -53,154 +95,91 @@
54	96	return array();
55	97	}
56	98
57		~~- # Refresh cache if we are saving the blacklist~~
58		~~- $recache = false;~~
	99	+ // This used to be cached per-site, but that could be bad on a shared
	100	+ // server where not all wikis have the same configuration.
	101	+ $cachedRegexes = $wgMemc->get( "$wgDBname:spam_blacklist_regexes" );
	102	+ if( is_array( $cachedRegexes ) ) {
	103	+ wfDebug( "Got shared spam regexes from cache\n" );
	104	+ wfProfileOut( $fname );
	105	+ return $cachedRegexes;
	106	+ }
	107	+
	108	+ $regexes = $this->buildSharedBlacklists();
	109	+ $wgMemc->set( "$wgDBname:spam_blacklist_regexes", $regexes, $this->expiryTime );
	110	+
	111	+ return $regexes;
	112	+ }
	113	+
	114	+ function clearCache() {
	115	+ global $wgMemc, $wgDBname;
	116	+ $wgMemc->delete( "$wgDBname:spam_blacklist_regexes" );
	117	+ wfDebug( "Spam blacklist local cache cleared.\n" );
	118	+ }
	119	+
	120	+ function buildSharedBlacklists() {
	121	+ $regexes = array();
	122	+ # Load lists
	123	+ wfDebug( "Constructing spam blacklist\n" );
59	124	foreach ( $this->files as $fileName ) {
60		~~- if ( preg_match( '/^DB: (\w) (.)$/', $fileName, $matches ) ) {~~
61		~~- if ( $wgDBname == $matches[1] && $this->title && $this->title->getPrefixedDBkey() == $matches[2] ) {~~
62		~~- $recache = true;~~
63		~~- break;~~
64		~~- }~~
	125	+ if ( preg_match( '/^DB: ([\w-]) (.)$/', $fileName, $matches ) ) {
	126	+ $text = $this->getArticleText( $matches[1], $matches[2] );
	127	+ } elseif ( preg_match( '/^http:\/\//', $fileName ) ) {
	128	+ $text = $this->getHttpText( $fileName );
	129	+ } else {
	130	+ $text = file_get_contents( $fileName );
	131	+ wfDebug( "got from file $fileName\n" );
65	132	}
	133	+
	134	+ // Build a separate batch of regexes from each source.
	135	+ // While in theory we could squeeze a little efficiency
	136	+ // out of combining multiple sources in one regex, if
	137	+ // there's a bad line in one of them we'll gain more
	138	+ // from only having to break that set into smaller pieces.
	139	+ $regexes = array_merge( $regexes,
	140	+ SpamRegexBatch::regexesFromText( $text, $fileName ) );
66	141	}
	142	+
	143	+ return $regexes;
	144	+ }
	145	+
	146	+ function getHttpText( $fileName ) {
	147	+ global $wgDBname, $messageMemc;
	148	+
	149	+ # HTTP request
	150	+ # To keep requests to a minimum, we save results into $messageMemc, which is
	151	+ # similar to $wgMemc except almost certain to exist. By default, it is stored
	152	+ # in the database
	153	+ #
	154	+ # There are two keys, when the warning key expires, a random thread will refresh
	155	+ # the real key. This reduces the chance of multiple requests under high traffic
	156	+ # conditions.
	157	+ $key = "spam_blacklist_file:$fileName";
	158	+ $warningKey = "$wgDBname:spamfilewarning:$fileName";
	159	+ $httpText = $messageMemc->get( $key );
	160	+ $warning = $messageMemc->get( $warningKey );
67	161
68		~~- if ( $this->regexes === false \|\| $recache ) {~~
69		~~- if ( !$recache ) {~~
70		~~- $this->regexes = $wgMemc->get( "spam_blacklist_regexes" );~~
	162	+ if ( !is_string( $httpText ) \|\| ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
	163	+ wfDebug( "Loading spam blacklist from $fileName\n" );
	164	+ $httpText = $this->getHTTP( $fileName );
	165	+ if( $httpText === false ) {
	166	+ wfDebug( "Error loading blacklist from $fileName\n" );
71	167	}
72		~~- if ( $this->regexes === false \|\| $this->regexes === null ) {~~
73		~~- $lines = array();~~
74		~~- # Load lists~~
75		~~- wfDebug( "Constructing spam blacklist\n" );~~
76		~~- foreach ( $this->files as $fileName ) {~~
77		~~- if ( preg_match( '/^DB: ([\w-]) (.)$/', $fileName, $matches ) ) {~~
78		~~- if ( $wgDBname == $matches[1] && $this->title && $this->title->getPrefixedDBkey() == $matches[2] ) {~~
79		~~- wfDebug( "Fetching default local spam blacklist...\n" );~~
80		~~- $lines = array_merge( $lines, explode( "\n", $this->text ) );~~
81		~~- } else {~~
82		~~- wfDebug( "Fetching local spam blackist from '{$matches[2]}' on '{$matches[1]}'...\n" );~~
83		~~- $lines = array_merge( $lines, $this->getArticleLines( $matches[1], $matches[2] ) );~~
84		~~- }~~
85		~~- wfDebug( "got from DB\n" );~~
86		~~- } elseif ( preg_match( '/^http:\/\//', $fileName ) ) {~~
87		~~- # HTTP request~~
88		~~- # To keep requests to a minimum, we save results into $messageMemc, which is~~
89		~~- # similar to $wgMemc except almost certain to exist. By default, it is stored~~
90		~~- # in the database~~
91		~~- #~~
92		~~- # There are two keys, when the warning key expires, a random thread will refresh~~
93		~~- # the real key. This reduces the chance of multiple requests under high traffic~~
94		~~- # conditions.~~
95		~~- $key = "spam_blacklist_file:$fileName";~~
96		~~- $warningKey = "$wgDBname:spamfilewarning:$fileName";~~
97		~~- $httpText = $messageMemc->get( $key );~~
98		~~- $warning = $messageMemc->get( $warningKey );~~
99		-
100		~~- if ( !is_string( $httpText ) \|\| ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {~~
101		~~- wfDebug( "Loading spam blacklist from $fileName\n" );~~
102		~~- $httpText = $this->getHTTP( $fileName );~~
103		~~- $messageMemc->set( $warningKey, 1, $this->warningTime );~~
104		~~- $messageMemc->set( $key, $httpText, $this->expiryTime );~~
105		~~- } else {~~
106		~~- wfDebug( "got from HTTP cache\n" );~~
107		~~- }~~
108		~~- $lines = array_merge( $lines, explode( "\n", $httpText ) );~~
109		~~- } else {~~
110		~~- $lines = array_merge( $lines, file( $fileName ) );~~
111		~~- wfDebug( "got from file\n" );~~
112		~~- }~~
113		~~- }~~
114		-
115		~~- $this->regexes = $this->buildRegexes( $lines );~~
116		~~- $wgMemc->set( "spam_blacklist_regexes", $this->regexes, $this->expiryTime );~~
117		~~- } else {~~
118		~~- wfDebug( "got from cache\n" );~~
119		~~- }~~
	168	+ $messageMemc->set( $warningKey, 1, $this->warningTime );
	169	+ $messageMemc->set( $key, $httpText, $this->expiryTime );
	170	+ } else {
	171	+ wfDebug( "Got spam blacklist from HTTP cache for $fileName\n" );
120	172	}
121		~~- if( $this->regexes !== true && !is_array( $this->regexes ) ) {~~
122		~~- // Corrupt regex~~
123		~~- wfDebug( "Corrupt regex\n" );~~
124		~~- $this->regexes = false;~~
125		~~- }~~
126		~~- wfProfileOut( $fname );~~
127		~~- return $this->pureArray( $this->regexes );~~
	173	+ return $httpText;
128	174	}
129	175
130	176	function getLocalBlacklists() {
131		~~- return $this->regexesFromMessage( 'spam-blacklist' );~~
	177	+ return SpamRegexBatch::regexesFromMessage( 'spam-blacklist' );
132	178	}
133	179
134	180	function getWhitelists() {
135		~~- return $this->regexesFromMessage( 'spam-whitelist' );~~
	181	+ return SpamRegexBatch::regexesFromMessage( 'spam-whitelist' );
136	182	}
137		-
138		~~- function regexesFromMessage( $message ) {~~
139		~~- $lines = $this->linesFromMessage( $message );~~
140		~~- $regexes = $this->buildRegexes( $lines );~~
141		~~- return $this->pureArray( $regexes );~~
142		~~- }~~
143		-
144		~~- /// There's some wackiness where booleans are used for~~
145		~~- /// indicating empty or invalid input to aid in caching,~~
146		~~- /// but when we're called all we want is a damn array.~~
147		~~- function pureArray( $regexes ) {~~
148		~~- if( is_array( $regexes ) ) {~~
149		~~- return $regexes;~~
150		~~- } else {~~
151		~~- return array();~~
152		~~- }~~
153		~~- }~~
154	183
155		~~- function linesFromMessage( $message ) {~~
156		~~- $source = wfMsgForContent( $message );~~
157		~~- if( $source && !wfEmptyMsg( $message, $source ) ) {~~
158		~~- return array_filter( explode( "\n", $source ) );~~
159		~~- } else {~~
160		~~- return array();~~
161		~~- }~~
162		~~- }~~
163		-
164		~~- function buildRegexes( $lines ) {~~
165		~~- # Strip comments and whitespace, then remove blanks~~
166		~~- $lines = array_filter( array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) ) );~~
167		-
168		~~- # No lines, don't make a regex which will match everything~~
169		~~- if ( count( $lines ) == 0 ) {~~
170		~~- wfDebug( "No lines\n" );~~
171		~~- return true;~~
172		~~- } else {~~
173		~~- # Make regex~~
174		~~- # It's faster using the S modifier even though it will usually only be run once~~
175		~~- //$regex = 'http://+[a-z0-9_\-.]*(' . implode( '\|', $lines ) . ')';~~
176		~~- //return '/' . str_replace( '/', '\/', preg_replace('\|\\\*/\|', '/', $regex) ) . '/Si';~~
177		~~- $regexes = array();~~
178		~~- $regexStart = '/http:\/\/+[a-z0-9_\-.]*(';~~
179		~~- $regexEnd = ')/Si';~~
180		~~- $regexMax = 4096;~~
181		~~- $build = false;~~
182		~~- foreach( $lines as $line ) {~~
183		~~- // FIXME: not very robust size check, but should work. :)~~
184		~~- if( $build === false ) {~~
185		~~- $build = $line;~~
186		~~- } elseif( strlen( $build ) + strlen( $line ) > $regexMax ) {~~
187		~~- $regexes[] = $regexStart .~~
188		~~- str_replace( '/', '\/', preg_replace('\|\\\*/\|', '/', $build) ) .~~
189		~~- $regexEnd;~~
190		~~- $build = $line;~~
191		~~- } else {~~
192		~~- $build .= '\|';~~
193		~~- $build .= $line;~~
194		~~- }~~
195		~~- }~~
196		~~- if( $build !== false ) {~~
197		~~- $regexes[] = $regexStart .~~
198		~~- str_replace( '/', '\/', preg_replace('\|\\\*/\|', '/', $build) ) .~~
199		~~- $regexEnd;~~
200		~~- }~~
201		~~- return $regexes;~~
202		~~- }~~
203		~~- }~~
204		-
205	184	function filter( &$title, $text, $section ) {
206	185	global $wgArticle, $wgVersion, $wgOut, $wgParser, $wgUser;
207	186
—	—	@@ -245,7 +224,10 @@
246	225	" regexes: " . implode( ', ', $blacklists ) . "\n" );
247	226	$retVal = false;
248	227	foreach( $blacklists as $regex ) {
249		~~- if ( preg_match( $regex, $links, $matches ) ) {~~
	228	+ wfSuppressWarnings();
	229	+ $check = preg_match( $regex, $links, $matches );
	230	+ wfRestoreWarnings();
	231	+ if( $check ) {
250	232	wfDebug( "Match!\n" );
251	233	EditPage::spamPage( $matches[0] );
252	234	$retVal = true;
—	—	@@ -260,7 +242,14 @@
261	243	return $retVal;
262	244	}
263	245
264		~~- function getArticleLines( $db, $article ) {~~
	246	+ /**
	247	+ * Fetch an article from this or another local MediaWiki database.
	248	+ * This is probably very fragile, and shouldn't be used perhaps.
	249	+ * @param string $db
	250	+ * @param string $article
	251	+ */
	252	+ function getArticleText( $db, $article ) {
	253	+ wfDebug( "Fetching local spam blacklist from '$article' on '$db'...\n" );
265	254	global $wgDBname;
266	255	$dbr = wfGetDB( DB_READ );
267	256	$dbr->selectDB( $db );
—	—	@@ -276,23 +265,19 @@
277	266	$dbw->selectDB( $wgDBname );
278	267	} else {
279	268	// 1.4 schema
280		~~- $cur = $dbr->tableName( 'cur' );~~
281	269	$title = Title::newFromText( $article );
282	270	$text = $dbr->selectField( 'cur', 'cur_text', array( 'cur_namespace' => $title->getNamespace(),
283		~~- 'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleLines' );~~
	271	+ 'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleText' );
284	272	}
285	273	$dbr->selectDB( $wgDBname );
286		~~- if ( $text !== false ) {~~
287		~~- return explode( "\n", $text );~~
288		~~- } else {~~
289		~~- return array();~~
290		~~- }~~
	274	+ return strval( $text );
291	275	}
292	276
293	277	function getHTTP( $url ) {
294	278	// Use wfGetHTTP from MW 1.5 if it is available
295	279	global $IP;
296	280	include_once( "$IP/includes/HttpFunctions.php" );
	281	+ wfSuppressWarnings();
297	282	if ( function_exists( 'wfGetHTTP' ) ) {
298	283	$text = wfGetHTTP( $url );
299	284	} else {
—	—	@@ -300,10 +285,164 @@
301	286	$text = file_get_contents( $url );
302	287	ini_set( 'allow_url_fopen', $url_fopen );
303	288	}
	289	+ wfRestoreWarnings();
304	290	return $text;
305	291	}
306	292	}
307	293
308	294
	295	+class SpamRegexBatch {
	296	+ /**
	297	+ * Build a set of regular expressions matching URLs with the list of regex fragments.
	298	+ * Returns an empty list if the input list is empty.
	299	+ *
	300	+ * @param array $lines list of fragments which will match in URLs
	301	+ * @param int $batchSize largest allowed batch regex;
	302	+ * if 0, will produce one regex per line
	303	+ * @return array
	304	+ * @private
	305	+ * @static
	306	+ */
	307	+ function buildRegexes( $lines, $batchSize=4096 ) {
	308	+ # Make regex
	309	+ # It's faster using the S modifier even though it will usually only be run once
	310	+ //$regex = 'http://+[a-z0-9_\-.]*(' . implode( '\|', $lines ) . ')';
	311	+ //return '/' . str_replace( '/', '\/', preg_replace('\|\\\*/\|', '/', $regex) ) . '/Si';
	312	+ $regexes = array();
	313	+ $regexStart = '/http:\/\/+[a-z0-9_\-.]*(';
	314	+ $regexEnd = ($batchSize > 0 ) ? ')/Si' : ')/i';
	315	+ $build = false;
	316	+ foreach( $lines as $line ) {
	317	+ // FIXME: not very robust size check, but should work. :)
	318	+ if( $build === false ) {
	319	+ $build = $line;
	320	+ } elseif( strlen( $build ) + strlen( $line ) > $batchSize ) {
	321	+ $regexes[] = $regexStart .
	322	+ str_replace( '/', '\/', preg_replace('\|\\\*/\|', '/', $build) ) .
	323	+ $regexEnd;
	324	+ $build = $line;
	325	+ } else {
	326	+ $build .= '\|';
	327	+ $build .= $line;
	328	+ }
	329	+ }
	330	+ if( $build !== false ) {
	331	+ $regexes[] = $regexStart .
	332	+ str_replace( '/', '\/', preg_replace('\|\\\*/\|', '/', $build) ) .
	333	+ $regexEnd;
	334	+ }
	335	+ return $regexes;
	336	+ }
	337	+
	338	+ /**
	339	+ * Confirm that a set of regexes is either empty or valid.
	340	+ * @param array $lines set of regexes
	341	+ * @return bool true if ok, false if contains invalid lines
	342	+ * @private
	343	+ * @static
	344	+ */
	345	+ function validateRegexes( $regexes ) {
	346	+ foreach( $regexes as $regex ) {
	347	+ wfSuppressWarnings();
	348	+ $ok = preg_match( $regex, '' );
	349	+ wfRestoreWarnings();
	350	+
	351	+ if( $ok === false ) {
	352	+ return false;
	353	+ }
	354	+ }
	355	+ return true;
	356	+ }
	357	+
	358	+ /**
	359	+ * Strip comments and whitespace, then remove blanks
	360	+ * @private
	361	+ * @static
	362	+ */
	363	+ function stripLines( $lines ) {
	364	+ return array_filter(
	365	+ array_map( 'trim',
	366	+ preg_replace( '/#.*$/', '',
	367	+ $lines ) ) );
	368	+ }
	369	+
	370	+ /**
	371	+ * Do a sanity check on the batch regex.
	372	+ * @param lines unsanitized input lines
	373	+ * @param string $fileName optional for debug reporting
	374	+ * @return array of regexes
	375	+ * @private
	376	+ * @static
	377	+ */
	378	+ function buildSafeRegexes( $lines, $fileName=false ) {
	379	+ $lines = SpamRegexBatch::stripLines( $lines );
	380	+ $regexes = SpamRegexBatch::buildRegexes( $lines );
	381	+ if( SpamRegexBatch::validateRegexes( $regexes ) ) {
	382	+ return $regexes;
	383	+ } else {
	384	+ // _Something_ broke... rebuild line-by-line; it'll be
	385	+ // slower if there's a lot of blacklist lines, but one
	386	+ // broken line won't take out hundreds of its brothers.
	387	+ if( $fileName ) {
	388	+ wfDebug( "Spam blacklist warning: bogus line in $fileName\n" );
	389	+ }
	390	+ return SpamRegexBatch::buildRegexes( $lines, 0 );
	391	+ }
	392	+ }
	393	+
	394	+ /**
	395	+ * @param array $lines
	396	+ * @return array of input lines which produce invalid input, or empty array if no problems
	397	+ * @static
	398	+ */
	399	+ function getBadLines( $lines ) {
	400	+ $lines = SpamRegexBatch::stripLines( $lines );
	401	+ $regexes = SpamRegexBatch::buildRegexes( $lines );
	402	+ if( SpamRegexBatch::validateRegexes( $regexes ) ) {
	403	+ // No problems!
	404	+ return array();
	405	+ }
	406	+
	407	+ $badLines = array();
	408	+ foreach( $lines as $line ) {
	409	+ $regexes = SpamRegexBatch::buildRegexes( array( $line ) );
	410	+ if( !SpamRegexBatch::validateRegexes( $regexes ) ) {
	411	+ $badLines[] = $line;
	412	+ }
	413	+ }
	414	+ return $badLines;
	415	+ }
	416	+
	417	+ /**
	418	+ * Build a set of regular expressions from the given multiline input text,
	419	+ * with empty lines and comments stripped.
	420	+ *
	421	+ * @param string $source
	422	+ * @param string $fileName optional, for reporting of bad files
	423	+ * @return array of regular expressions, potentially empty
	424	+ * @static
	425	+ */
	426	+ function regexesFromText( $source, $fileName=false ) {
	427	+ $lines = explode( "\n", $source );
	428	+ return SpamRegexBatch::buildSafeRegexes( $lines, $fileName );
	429	+ }
	430	+
	431	+ /**
	432	+ * Build a set of regular expressions from a MediaWiki message.
	433	+ * Will be correctly empty if the message isn't present.
	434	+ * @param string $source
	435	+ * @return array of regular expressions, potentially empty
	436	+ * @static
	437	+ */
	438	+ function regexesFromMessage( $message ) {
	439	+ $source = wfMsgForContent( $message );
	440	+ if( $source && !wfEmptyMsg( $message, $source ) ) {
	441	+ return SpamRegexBatch::regexesFromText( $source );
	442	+ } else {
	443	+ return array();
	444	+ }
	445	+ }
	446	+}
	447	+
309	448	} # End invocation guard
310	449
Index: trunk/extensions/SpamBlacklist/README
—	—	@@ -31,6 +31,9 @@
32	32	"DB: wikidb My_spam_blacklist",
33	33	);
34	34
	35	+The local pages [[MediaWiki:Spam-blacklist]] and [[MediaWiki:Spam-whitelist]]
	36	+will always be used, whatever additional files are listed.
	37	+
35	38	Compatibility
36	39	-----------
37	40
—	—	@@ -78,6 +81,22 @@
79	82	caching solution. The SpamBlacklist extension will cache the constructed regex
80	83	if such a system is present.
81	84
	85	+Caching behavior
	86	+----------------
	87	+
	88	+Blacklist files loaded from remote web sites are cached locally, in the cache
	89	+subsystem used for MediaWiki's localization. (This usually means the objectcache
	90	+table on a default install.)
	91	+
	92	+By default, the list is cached for 15 minutes (if successfully fetched) or
	93	+10 minutes (if the network fetch failed), after which point it will be fetched
	94	+again when next requested. This should be a decent balance between avoiding
	95	+too-frequent fetches if your site is frequently used and staying up to date.
	96	+
	97	+Fully-processed blacklist data may be cached in memcached or another shared
	98	+memory cache if it's been configured in MediaWiki.
	99	+
	100	+
82	101	Stability
83	102	---------
84	103
Index: trunk/extensions/SpamBlacklist/SpamBlacklist.i18n.php
—	—	@@ -35,6 +35,10 @@
36	36	# * Every non-blank line is a regex fragment which will only match hosts inside URLs
37	37
38	38	#</pre> <!-- leave this line exactly as it is -->',
	39	+ 'spam-invalid-lines' =>
	40	+ "The following spam blacklist {{PLURAL:$1\|line is an\|lines are}} " .
	41	+ " invalid regular {{PLURAL:$1\|expression\|expressions}} " .
	42	+ " and {{PLURAL:$1\|needs\|need}} to be corrected before saving the page:\n",
39	43	),
40	44
41	45	'id' => array(

Status & tagging log

15:20, 12 September 2011 Meno25 (talk | contribs) changed the status of r24288 [removed: ok added: old]