Index: trunk/extensions/RegexFun/RELEASE-NOTES |
— | — | @@ -2,6 +2,9 @@ |
3 | 3 | ========== |
4 | 4 | |
5 | 5 | * (trunk) -- Version 1.0.2 alpha |
| 6 | + - Limit won't be exceeded prematurely when the 'e' flag is used extensively with many backrefs in the replacement. |
| 7 | + - It's possible to use the 'Regex Fun' regex system with advanced flags within other extensions. |
| 8 | + - Performance increased for executing huge numbers of the same regex on different strings. |
6 | 9 | - Internal representative functions for parser functions now have a 'pf_' prefix. |
7 | 10 | |
8 | 11 | * November 6, 2011 -- Version 1.0.1 |
Index: trunk/extensions/RegexFun/RegexFun.php |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | * |
59 | 59 | * @var string |
60 | 60 | */ |
61 | | - const VERSION = '1.0.2 alpha'; |
| 61 | + const VERSION = '1.0.2'; |
62 | 62 | |
63 | 63 | /** |
64 | 64 | * Sets up parser functions |
— | — | @@ -238,6 +238,7 @@ |
239 | 239 | return false; |
240 | 240 | } |
241 | 241 | if( $resetLastRegex ) { |
| 242 | + // store infos for this regex for '#regex_var' |
242 | 243 | self::initLastRegex( $parser, $pattern, $subject ); |
243 | 244 | } |
244 | 245 | return true; |
— | — | @@ -249,25 +250,26 @@ |
250 | 251 | * @param $parser Parser instance of running Parse |
251 | 252 | * @param $subject String input string to evaluate |
252 | 253 | * @param $pattern String regular expression pattern - must use /, | or % delimiter |
253 | | - * @param $replace String regular expression replacement |
| 254 | + * @param $replacement String regular expression replacement |
254 | 255 | * |
255 | 256 | * @return String Result of replacing pattern with replacement in string, or matching text if replacement was omitted |
256 | 257 | */ |
257 | | - public static function pf_regex( Parser &$parser, $subject = '', $pattern = '', $replace = null, $limit = -1 ) { |
| 258 | + public static function pf_regex( Parser &$parser, $subject = '', $pattern = '', $replacement = null, $limit = -1 ) { |
258 | 259 | // check whether limit exceeded: |
259 | 260 | if( self::limitExceeded( $parser ) ) { |
260 | 261 | return self::msgLimitExceeded(); |
261 | 262 | } |
262 | 263 | self::increaseRegexCount( $parser ); |
263 | 264 | |
264 | | - // validate, initialise and check for wrong input: |
265 | | - $continue = self::validateRegexCall( $parser, $subject, $pattern, $specialFlags, true ); |
266 | | - if( ! $continue ) { |
267 | | - return self::msgInvalidRegex( $pattern ); |
268 | | - } |
269 | | - |
270 | | - if( $replace === null ) { |
| 265 | + if( $replacement === null ) { |
271 | 266 | // search mode: |
| 267 | + |
| 268 | + // validate, initialise and check for wrong input: |
| 269 | + $continue = self::validateRegexCall( $parser, $subject, $pattern, $specialFlags, true ); |
| 270 | + if( ! $continue ) { |
| 271 | + return self::msgInvalidRegex( $pattern ); |
| 272 | + } |
| 273 | + |
272 | 274 | $lastMatches = self::getLastMatches( $parser ); |
273 | 275 | $output = ( preg_match( $pattern, $subject, $lastMatches ) ? $lastMatches[0] : '' ); |
274 | 276 | self::setLastMatches( $parser, $lastMatches ); |
— | — | @@ -275,49 +277,132 @@ |
276 | 278 | // replace mode: |
277 | 279 | $limit = (int)$limit; |
278 | 280 | |
279 | | - // set last matches to 'false' and get them on demand instead since preg_replace won't communicate them |
280 | | - self::setLastMatches( $parser , false ); |
281 | | - |
282 | | - // FLAG 'e' (parse replace after match) handling: |
283 | | - if( ! empty( $specialFlags[ self::FLAG_REPLACEMENT_PARSE ] ) ) { |
284 | | - // if 'e' flag is set, each replacement has to be parsed after matches are inserted but before replacing! |
285 | | - self::$tmpRegexCB = array( |
286 | | - 'replacement' => $replace, |
287 | | - 'parser' => &$parser, |
288 | | - ); |
289 | | - $output = preg_replace_callback( $pattern, array( __CLASS__, 'regex_eFlag_callback' ), $subject, $limit, $count ); |
| 281 | + // set last matches to 'false' and get them on demand instead since preg_replace won't communicate them |
| 282 | + self::setLastMatches( $parser, false ); |
| 283 | + |
| 284 | + // do the regex plus all handling of special flags and validation |
| 285 | + $output = self::doPregReplace( $pattern, $replacement, $subject, $limit, $parser ); |
| 286 | + |
| 287 | + if( $output === false ) { |
| 288 | + // invalid regex, don't store any info for '#regex_var' |
| 289 | + self::setLastMatches( $parser , null ); |
| 290 | + return self::msgInvalidRegex( $pattern ); |
290 | 291 | } |
291 | | - else { |
292 | | - $output = preg_replace( $pattern, $replace, $subject, $limit, $count ); |
293 | | - } |
294 | 292 | |
295 | | - // FLAG 'r' (no replacement - no output) handling: |
296 | | - if( ! empty( $specialFlags[ self::FLAG_NO_REPLACE_NO_OUT ] ) ) { |
297 | | - /* |
298 | | - * only output replacement result if there actually was a match and therewith a replacement happened |
299 | | - * (otherwise the input string would be returned) |
300 | | - */ |
301 | | - if( $count < 1 ) { |
302 | | - return ''; |
303 | | - } |
304 | | - } |
| 293 | + // set these infos after pattern validation/correction |
| 294 | + self::setLastPattern( $parser, $pattern ); |
| 295 | + self::setLastSubject( $parser, $subject ); |
305 | 296 | } |
| 297 | + |
306 | 298 | return $output; |
307 | 299 | } |
308 | 300 | |
309 | | - private static function regex_eFlag_callback( $matches ) { |
| 301 | + /** |
| 302 | + * 'preg_replace' like function but can handle special modifiers 'e' and 'r'. |
| 303 | + * |
| 304 | + * @param string &$pattern |
| 305 | + * @param string $replacement |
| 306 | + * @param string $subject |
| 307 | + * @param int $limit |
| 308 | + * @param Parser &$parser if the 'e' flag should be allowed, a Parser object for parsing is required. |
| 309 | + * @param array $allowedSpecialFlags all special flags that should be handled, by default 'e' and 'r'. |
| 310 | + */ |
| 311 | + public static function doPregReplace( |
| 312 | + &$pattern, |
| 313 | + $replacement, |
| 314 | + $subject, |
| 315 | + $limit = -1, |
| 316 | + &$parser = null, |
| 317 | + array $allowedSpecialFlags = array( |
| 318 | + self::FLAG_REPLACEMENT_PARSE, |
| 319 | + self::FLAG_NO_REPLACE_NO_OUT, |
| 320 | + ) |
| 321 | + ) { |
| 322 | + static $lastPattern = null; |
| 323 | + static $lastFlags = null; |
| 324 | + static $specialFlags = null; |
310 | 325 | |
| 326 | + /* |
| 327 | + * cache validated pattern and use it as long as nothing has changed, this makes things |
| 328 | + * faster in case we do a lot of stuff with the same regex. |
| 329 | + */ |
| 330 | + if( $lastPattern === null || $lastPattern !== $pattern |
| 331 | + || $lastFlags !== implode( ',', $allowedSpecialFlags ) |
| 332 | + ) { |
| 333 | + // if allowed special flags change, we have to validate again^^ |
| 334 | + $lastFlags = implode( ',', $allowedSpecialFlags ); |
| 335 | + |
| 336 | + // validate regex and get special flags 'e' and 'r' if given: |
| 337 | + if( ! self::validateRegex( $pattern, $specialFlags ) ) { |
| 338 | + // invalid regex! |
| 339 | + return false; |
| 340 | + } |
| 341 | + |
| 342 | + // filter unwanted special flags: |
| 343 | + $allowedSpecialFlags = array_flip( $allowedSpecialFlags ); |
| 344 | + $specialFlags = array_intersect_key( $specialFlags, $allowedSpecialFlags ); |
| 345 | + |
| 346 | + $lastPattern = $pattern; |
| 347 | + } |
| 348 | + |
| 349 | + |
| 350 | + // FLAG 'e' (parse replace after match) handling: |
| 351 | + if( ! empty( $specialFlags[ self::FLAG_REPLACEMENT_PARSE ] ) ) { |
| 352 | + |
| 353 | + // 'e' requires a Parser for parsing! |
| 354 | + if( ! ( $parser instanceof Parser ) ) { |
| 355 | + // no valid Parser object, without, we can't parse anything! |
| 356 | + throw new MWException( "Regex Fun 'e' flag discovered but no Parser object given!" ); |
| 357 | + } |
| 358 | + |
| 359 | + // if 'e' flag is set, each replacement has to be parsed after matches are inserted but before replacing! |
| 360 | + self::$tmpRegexCB = array( |
| 361 | + 'replacement' => $replacement, |
| 362 | + 'parser' => &$parser, |
| 363 | + 'internal' => isset( $parser->mExtRegexFun['lastMatches'] ) && $parser->mExtRegexFun['lastMatches'] === false |
| 364 | + ); |
| 365 | + |
| 366 | + $output = preg_replace_callback( $pattern, array( __CLASS__, 'doPregReplace_eFlag_callback' ), $subject, $limit, $count ); |
| 367 | + } |
| 368 | + else { |
| 369 | + // no 'e' flag, we can perform the standard function |
| 370 | + $output = preg_replace( $pattern, $replacement, $subject, $limit, $count ); |
| 371 | + } |
| 372 | + |
| 373 | + |
| 374 | + // FLAG 'r' (no replacement - no output) handling: |
| 375 | + if( ! empty( $specialFlags[ self::FLAG_NO_REPLACE_NO_OUT ] ) ) { |
| 376 | + /* |
| 377 | + * only output replacement result if there actually was a match and therewith a replacement happened |
| 378 | + * (otherwise the input string would be returned) |
| 379 | + */ |
| 380 | + if( $count < 1 ) { |
| 381 | + return ''; |
| 382 | + } |
| 383 | + } |
| 384 | + |
| 385 | + return $output; |
| 386 | + } |
| 387 | + |
| 388 | + private static function doPregReplace_eFlag_callback( $matches ) { |
| 389 | + |
311 | 390 | /** Don't cache this since it could contain dynamic content like #var which should be parsed */ |
312 | 391 | |
313 | | - $replace = self::$tmpRegexCB['replacement']; |
314 | | - $parser = self::$tmpRegexCB['parser']; |
| 392 | + $replace = self::$tmpRegexCB['replacement']; |
| 393 | + $parser = self::$tmpRegexCB['parser']; |
| 394 | + $internal = self::$tmpRegexCB['internal']; // whether doPregReplace() is called as part of a parser function |
315 | 395 | |
316 | | - // last matches in #regex replace mode were set to false before, set them now: |
317 | | - self::setLastMatches( $parser, $matches ); |
| 396 | + /* |
| 397 | + * only do this if set to false before, internally, so we won't destroy things if |
| 398 | + * doPregReplace() was called from outside 'Regex Fun' |
| 399 | + */ |
| 400 | + if( $internal ) { |
| 401 | + // last matches in #regex replace mode were set to false before, set them now: |
| 402 | + self::setLastMatches( $parser, $matches ); |
| 403 | + } |
| 404 | + // replace backrefs with their actual values: |
| 405 | + $replace = self::regexVarReplace( $replace, $matches ); |
318 | 406 | |
319 | | - // use #regex_var for transforming replacement string with matches: |
320 | | - $replace = self::pf_regex_var( $parser, $replace ); |
321 | | - |
322 | 407 | // parse the replacement after matches are inserted |
323 | 408 | // use a new frame, no need for SFH_OBJECT_ARGS style parser functions |
324 | 409 | $frame = $parser->getPreprocessor()->newCustomFrame( $parser ); |
— | — | @@ -410,33 +495,42 @@ |
411 | 496 | } |
412 | 497 | self::increaseRegexCount( $parser ); |
413 | 498 | |
414 | | - /* |
415 | | - * replace all back-references with their number increased by 1! |
416 | | - * this way we can also handle $0 in the right way! |
417 | | - */ |
418 | | - $index = preg_replace_callback( |
419 | | - '%(?<!\\\)(?:\$(?:(\d+)|\{(\d+)\})|\\\(\d+))%', |
420 | | - array( __CLASS__, 'regexVarIncreaseBackref' ), |
421 | | - $index |
422 | | - ); |
423 | | - /* |
424 | | - * build a helper regex matching all the last matches to use preg_replace |
425 | | - * which will handle all the replace-escaping handling correct |
426 | | - */ |
427 | | - $regEx = ''; |
428 | | - foreach( $lastMatches as $match ) { |
429 | | - $regEx .= '(' . preg_quote( $match, '/' ) . ')'; |
430 | | - } |
431 | | - $regEx = "/^{$regEx}$/"; |
432 | | - $output = preg_replace( $regEx, $index, implode( '', $lastMatches ) ); |
433 | | - |
434 | | - return $output; |
| 499 | + // do the actual transformation: |
| 500 | + return self::regexVarReplace( $index, $lastMatches ); |
435 | 501 | } |
436 | 502 | } |
| 503 | + |
437 | 504 | /** |
438 | | - * only used by 'preg_replace_callback' in 'regex_var' |
| 505 | + * Replaces all backref variables within a replacement string with the backrefs' actual |
| 506 | + * values just like preg_replace would do it. |
439 | 507 | */ |
440 | | - private static function regexVarIncreaseBackref( $matches ) { |
| 508 | + private static function regexVarReplace( $replacement, $matches ) { |
| 509 | + /* |
| 510 | + * replace all back-references with their number increased by 1! |
| 511 | + * this way we can also handle $0 in the right way! |
| 512 | + */ |
| 513 | + $replacement = preg_replace_callback( |
| 514 | + '%(?<!\\\)(?:\$(?:(\d+)|\{(\d+)\})|\\\(\d+))%', |
| 515 | + array( __CLASS__, 'regexVarReplace_increaseBackrefs_callback' ), |
| 516 | + $replacement |
| 517 | + ); |
| 518 | + /* |
| 519 | + * build a helper regex matching all the last matches to use preg_replace |
| 520 | + * which will handle all the replace-escaping correctly |
| 521 | + */ |
| 522 | + $regEx = ''; |
| 523 | + foreach( $matches as $match ) { |
| 524 | + $regEx .= '(' . preg_quote( $match, '/' ) . ')'; |
| 525 | + } |
| 526 | + $regEx = "/^{$regEx}$/"; |
| 527 | + |
| 528 | + return preg_replace( $regEx, $replacement, implode( '', $matches ) ); |
| 529 | + } |
| 530 | + |
| 531 | + /** |
| 532 | + * only used by 'preg_replace_callback' in 'regexVarReplace' |
| 533 | + */ |
| 534 | + private static function regexVarReplace_increaseBackrefs_callback( $matches ) { |
441 | 535 | // find index: |
442 | 536 | $index = false; |
443 | 537 | $full = $matches[0]; |
— | — | @@ -565,7 +659,11 @@ |
566 | 660 | |
567 | 661 | // last matches are set to false in case last regex was in replace mode! Get them on demand: |
568 | 662 | if( $parser->mExtRegexFun['lastMatches'] === false ) { |
569 | | - preg_match( self::getLastPattern( $parser ), self::getLastSubject( $parser ), $parser->mExtRegexFun['lastMatches'] ); |
| 663 | + preg_match( |
| 664 | + self::getLastPattern( $parser ), |
| 665 | + self::getLastSubject( $parser ), |
| 666 | + $parser->mExtRegexFun['lastMatches'] |
| 667 | + ); |
570 | 668 | } |
571 | 669 | return $parser->mExtRegexFun['lastMatches']; |
572 | 670 | } |