Index: trunk/phase3/tests/phpunit/includes/parser/PreprocessorTest.php |
— | — | @@ -14,7 +14,7 @@ |
15 | 15 | } |
16 | 16 | |
17 | 17 | function getStripList() { |
18 | | - return array( 'gallery' ); |
| 18 | + return array( 'gallery', 'display map' /* Used by Maps, see r80025 CR */, '/foo' ); |
19 | 19 | } |
20 | 20 | |
21 | 21 | function provideCases() { |
— | — | @@ -66,8 +66,13 @@ |
67 | 67 | array( "{{{{{{Foo}}}}}}", "<root><tplarg><title><tplarg><title>Foo</title></tplarg></title></tplarg></root>" ), |
68 | 68 | array( "{{{{{{Foo}}}}}", "<root>{<template><title><tplarg><title>Foo</title></tplarg></title></template></root>" ), |
69 | 69 | array( "[[[Foo]]", "<root>[[[Foo]]</root>" ), |
70 | | - array( "{{Foo|[[[[bar]]|baz]]}}", "<root><template><title>Foo</title><part><name index=\"1\" /><value>[[[[bar]]|baz]]</value></part></template></root>" ), /* This test is important, since it means the difference between having the [[ rule stacked or not */ |
| 70 | + array( "{{Foo|[[[[bar]]|baz]]}}", "<root><template><title>Foo</title><part><name index=\"1\" /><value>[[[[bar]]|baz]]</value></part></template></root>" ), // This test is important, since it means the difference between having the [[ rule stacked or not |
71 | 71 | array( "{{Foo|[[[[bar]|baz]]}}", "<root>{{Foo|[[[[bar]|baz]]}}</root>" ), |
| 72 | + array( "{{Foo|Foo [[[[bar]|baz]]}}", "<root>{{Foo|Foo [[[[bar]|baz]]}}</root>" ), |
| 73 | + array( "Foo <display map>Bar</display map >Baz", "<root>Foo <ext><name>display map</name><attr></attr><inner>Bar</inner><close></display map ></close></ext>Baz</root>" ), |
| 74 | + array( "Foo <display map foo>Bar</display map >Baz", "<root>Foo <ext><name>display map</name><attr> foo</attr><inner>Bar</inner><close></display map ></close></ext>Baz</root>" ), |
| 75 | + array( "Foo <gallery bar=\"baz\" />", "<root>Foo <ext><name>gallery</name><attr> bar="baz" </attr></ext></root>" ), |
| 76 | + array( "</foo>Foo<//foo>", "<root><ext><name>/foo</name><attr></attr><inner>Foo</inner><close><//foo></close></ext></root>" ), # Worth blacklisting IMHO |
72 | 77 | /* array( file_get_contents( dirname( __FILE__ ) . '/QuoteQuran.txt' ), file_get_contents( dirname( __FILE__ ) . '/QuoteQuranExpanded.txt' ) ), */ |
73 | 78 | ); |
74 | 79 | } |
Index: trunk/extensions/NativePreprocessor/Preprocessor_Native.php |
— | — | @@ -21,7 +21,7 @@ |
22 | 22 | |
23 | 23 | function preprocessToObjInternal( $text, $flags = 0 ) { |
24 | 24 | $nativePP = new MediaWikiPreprocessor(); |
25 | | - $ntobj = $nativePP->preprocessToObjInternal( $text, $flags, $this->parser->getStripList() ); |
| 25 | + $ntobj = $nativePP->preprocessToObj( $text, $flags, $this->parser->getStripList() ); |
26 | 26 | |
27 | 27 | return $ntobj; |
28 | 28 | } |
— | — | @@ -41,7 +41,7 @@ |
42 | 42 | $childrenLen = hexdec( substr( $node, 2, 6 ) ); |
43 | 43 | $textLen = hexdec( substr( $node, 8, 8 ) ); |
44 | 44 | $result = htmlspecialchars( substr( $text, 0, $textLen ) ); |
45 | | - if ( strlen( $text ) < $textLen ) throw new MWException( 'Bad length in node' ); |
| 45 | + if ( strlen( $text ) < $textLen ) throw new MWException( 'Bad length in node of type ' . $node[0] . ". Expected $textLen bytes, but only " . strlen( $text ) . " available." ); |
46 | 46 | $text = substr( $text, $textLen ); |
47 | 47 | if ( strpos( '<et|p', $node[0] ) !== false ) |
48 | 48 | $result = ''; // Not present in Preprocessor_DOM |
Index: trunk/extensions/NativePreprocessor/config.m4 |
— | — | @@ -5,6 +5,6 @@ |
6 | 6 | |
7 | 7 | if test "$PHP_MEDIAWIKIPREPROCESSOR" != "no"; then |
8 | 8 | dnl Enable the extension |
9 | | - PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c in_array.c preprocesstoobj.c, $ext_shared) |
| 9 | + PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c tag_util.c preprocesstoobj.c, $ext_shared) |
10 | 10 | PHP_SUBST(MEDIAWIKI_PREPROCESSOR_SHARED_LIBADD) |
11 | 11 | fi |
Index: trunk/extensions/NativePreprocessor/preprocesstoobj.c |
— | — | @@ -7,7 +7,7 @@ |
8 | 8 | #undef NDEBUG |
9 | 9 | #include <assert.h> |
10 | 10 | |
11 | | -#include "in_array.h" |
| 11 | +#include "tag_util.h" |
12 | 12 | #include "nodes.h" |
13 | 13 | |
14 | 14 | #define PTD_FOR_INCLUSION 1 /* Matches Parser::PTD_FOR_INCLUSION */ |
— | — | @@ -23,14 +23,6 @@ |
24 | 24 | #define strsize(x) (sizeof(x)-1) |
25 | 25 | #define min(x,y) (((x) < (y)) ? (x) : (y)) |
26 | 26 | |
27 | | -enum internalTags { |
28 | | - None, |
29 | | - includeonly, |
30 | | - onlyinclude, |
31 | | - noinclude |
32 | | -}; |
33 | | -const char* internalTagNames[] = { NULL, "includeonly", "onlyinclude", "noinclude" }; |
34 | | - |
35 | 27 | enum internalTags getInternalTag(const char* name, int name_len) { |
36 | 28 | #define CHECK_INTERNAL_TAG(x) if ((sizeof(#x)-1 == name_len) && !strncasecmp(name, #x, sizeof(#x)-1)) return x; |
37 | 29 | if (name[0] == '/') { |
— | — | @@ -72,7 +64,7 @@ |
73 | 65 | size_t mwpp_strcspn(const char* text, int text_len, const char* search, int offset) { |
74 | 66 | /* Optimize me */ |
75 | 67 | //printf(" mwpp_strcspn(%s, %d, %s, %d)\n", text, text_len, search, offset); |
76 | | - return php_strcspn( text + offset, search, text + text_len, search + strlen(search) ); |
| 68 | + return php_strcspn( (char*)text + offset, (char*)search, (char*)text + text_len, (char*)search + strlen(search) ); |
77 | 69 | } |
78 | 70 | |
79 | 71 | /** |
— | — | @@ -173,11 +165,18 @@ |
174 | 166 | bool enableOnlyinclude = false; |
175 | 167 | enum internalTags ignoredElement; /* Act as this tag isn't there */ |
176 | 168 | |
177 | | - HashTable* xmlishElements = parserStripList; |
178 | 169 | /* Instead of $xmlishRegex, we use directly the stripList. |
179 | 170 | * As it is shared with Parser, includeonly/onlyinclude/noinclude are handled separately. |
180 | 171 | * Per Parser::set{FunctionTag,}Hook(), the items are all strings and lowercase. |
181 | 172 | */ |
| 173 | + int longestTagLen = array_max_strlen( parserStripList ); |
| 174 | + if ( longestTagLen == -1 ) { |
| 175 | + *preprocessed_len = 1; |
| 176 | + return NULL; |
| 177 | + } |
| 178 | + if ( longestTagLen < strsize( "onlyinclude" ) ) { |
| 179 | + longestTagLen = strsize( "onlyinclude" ); |
| 180 | + } |
182 | 181 | |
183 | 182 | if ( forInclusion ) { |
184 | 183 | /* $ignoredTags = array( 'includeonly', '/includeonly' ); */ |
— | — | @@ -192,6 +191,7 @@ |
193 | 192 | #define isIgnoredTag(internalTag) (forInclusion ? ((internalTag) == includeonly) : ((internalTag) > includeonly) ) |
194 | 193 | |
195 | 194 | int i = 0; |
| 195 | + char * lowername = NULL; |
196 | 196 | bool findEquals = false; // True to find equals signs in arguments |
197 | 197 | bool findPipe = false; // True to take notice of pipe characters |
198 | 198 | int headingIndex = 1; |
— | — | @@ -225,7 +225,7 @@ |
226 | 226 | findOnlyinclude = false; |
227 | 227 | } |
228 | 228 | |
229 | | - enum foundTypes found; |
| 229 | + enum foundTypes found = -1; |
230 | 230 | if ( fakeLineStart ) { |
231 | 231 | found = lineStart; |
232 | 232 | } else if ( fakePipeFound ) { |
— | — | @@ -260,7 +260,7 @@ |
261 | 261 | // Output literal section, advance input counter |
262 | 262 | size_t literalLength = mwpp_strcspn( text, text_len, search, i ); |
263 | 263 | if ( literalLength > 0 ) { |
264 | | - addLiteral( text, i, literalLength ); |
| 264 | + addLiteral( text, i, (int)literalLength ); |
265 | 265 | i += literalLength; |
266 | 266 | } |
267 | 267 | if ( i >= text_len ) { |
— | — | @@ -393,30 +393,32 @@ |
394 | 394 | } |
395 | 395 | |
396 | 396 | /** |
397 | | - * We differ here from the $xmlishRegex approach |
398 | | - * The regex ends the tag name with a \s character, /> or > |
399 | | - * so we start seeking for them, then look which name is it. |
| 397 | + * The identifyTag() function performs everything the $xmlishRegex would have done. |
400 | 398 | */ |
| 399 | + if ( !lowername ) { |
| 400 | + lowername = emalloc( longestTagLen + 2 ); |
| 401 | + } |
401 | 402 | assert(text[i] == '<'); |
| 403 | + enum internalTags internalTag; |
402 | 404 | const char* name = text + i + 1; |
403 | 405 | int name_len; |
404 | 406 | /* TODO: optimize this search by not going further than |
405 | 407 | * max( strlen( getParserStripList() + internalTags() ) ) |
406 | 408 | * while not setting noMoreGT in such case. |
407 | 409 | */ |
408 | | - name_len = findSpaceOrAngle(name, text_len - i - 1); |
409 | | - if ( name_len > 0 && name[name_len] == '>' && name[name_len - 1] == '/' ) { |
410 | | - name_len--; |
| 410 | + name_len = identifyTag(name, text_len - i - 1, parserStripList, &internalTag, lowername); |
| 411 | + if ( name_len == -1 ) { /* Does it make sense to allow 0-length tags? */ |
| 412 | + addLiteral( text, i, 1 ); |
| 413 | + i++; |
| 414 | + continue; |
411 | 415 | } |
| 416 | + |
412 | 417 | int attrStart = i + name_len + 1; |
413 | 418 | |
414 | | - int tagEndPos = -1; |
415 | | - if ( name_len != -1 ) { |
416 | | - // Find end of tag |
417 | | - char* end = memchr(name + name_len, '>', text_len - i - 1); |
418 | | - |
419 | | - tagEndPos = end ? end - text : -1; |
420 | | - } |
| 419 | + // Find end of tag |
| 420 | + char* end = memchr(name + name_len, '>', text_len - i - 1); |
| 421 | + int tagEndPos = end ? end - text : -1; |
| 422 | + |
421 | 423 | if ( tagEndPos == -1 ) { |
422 | 424 | // Infinite backtrack |
423 | 425 | // Disable tag search to prevent worst-case O(N^2) performance |
— | — | @@ -427,9 +429,6 @@ |
428 | 430 | } |
429 | 431 | assert(text[tagEndPos] == '>'); |
430 | 432 | |
431 | | - enum internalTags internalTag; |
432 | | - internalTag = getInternalTag(name, name_len); |
433 | | - |
434 | 433 | // Handle ignored tags |
435 | 434 | if ( isIgnoredTag( internalTag ) ) { |
436 | 435 | addNodeWithText( ignore_node, text, i, tagEndPos - i + 1 ); |
— | — | @@ -437,28 +436,11 @@ |
438 | 437 | continue; |
439 | 438 | } |
440 | 439 | |
441 | | - char * lowername; |
442 | | - if ( internalTag == None ) { |
443 | | - int j; |
444 | | - // Verify that it's not just tag-looking text |
445 | | - lowername = alloca( name_len + 1 ); /* FIXME */ |
446 | | - for (j = 0; j < name_len; j++) { |
447 | | - lowername[j] = tolower(name[j]); |
448 | | - } |
449 | | - lowername[j] = '\0'; |
450 | | - if ( !str_in_array(lowername, name_len, xmlishElements, true) ) { |
451 | | - addLiteral( text, i, 1 ); |
452 | | - ++i; |
453 | | - continue; |
454 | | - } |
455 | | - } else { |
456 | | - lowername = (char*)internalTagNames[internalTag]; |
457 | | - } |
458 | | - |
459 | 440 | int tagStartPos, attrEnd, endTagBegin, endTagLen; |
460 | 441 | int innerTextBegin, innerTextLen; |
461 | 442 | tagStartPos = i; endTagLen = 0; |
462 | 443 | innerTextBegin = -1; innerTextLen = -1; |
| 444 | + endTagBegin = 42; /* Disable warning. This variable is only used when endTagLen != 0 */ |
463 | 445 | |
464 | 446 | if ( text[tagEndPos-1] == '/' ) { |
465 | 447 | attrEnd = tagEndPos - 1; |
— | — | @@ -824,6 +806,10 @@ |
825 | 807 | } |
826 | 808 | } |
827 | 809 | |
| 810 | + if ( lowername ) { // No reason to TSRMLS_FETCH() if we didn't allocate anything |
| 811 | + efree( lowername ); |
| 812 | + } |
| 813 | + |
828 | 814 | nodeString[nodeStringLen] = '\0'; |
829 | 815 | *preprocessed_len = nodeStringLen; |
830 | 816 | return nodeString; |
Index: trunk/extensions/NativePreprocessor/FORMAT |
— | — | @@ -0,0 +1,30 @@ |
| 2 | +The serialized format of the Native Preprocessor is a list of node types with |
| 3 | +indexes to the original text included in that node. It is returned in a php |
| 4 | +string type. For easier debugging, the characters of that string are restricted |
| 5 | +to printable characters. |
| 6 | +This string is then returned joined in an array with a refcounted copy of the |
| 7 | +original text. |
| 8 | + |
| 9 | +The full string is formed by two or more nodes. |
| 10 | + |
| 11 | +The node format is as follows: |
| 12 | + +------+-------+------+------+------+------+------+------+------+ |
| 13 | + | Type | Flags | Next sibling | Content length | |
| 14 | + +------+-------+------+------+------+------+------+------+------+ |
| 15 | + |
| 16 | +Type: |
| 17 | + A character which identifies the kind of node. |
| 18 | + |
| 19 | +Flags: |
| 20 | + A value whose meaning depends on the type. The default value is '0'. |
| 21 | + |
| 22 | +Next sibling: |
| 23 | + 6 hexadecimal characters which specify the length in bytes of all its child nodes. |
| 24 | + |
| 25 | +Content length: |
| 26 | + 8 hexadecimal characters which specify the length in bytes of the text content of this |
| 27 | +node in the wikitext. The beginning of the node in the text is implied by all the previous |
| 28 | +lengths. |
| 29 | + |
| 30 | +Total node length: 16 bytes. |
| 31 | + |
Index: trunk/extensions/NativePreprocessor/tag_util.c |
— | — | @@ -0,0 +1,124 @@ |
| 2 | +#include <ctype.h> |
| 3 | +#include <stdbool.h> |
| 4 | +#include "php.h" |
| 5 | +#define const |
| 6 | +#include "tag_util.h" |
| 7 | + |
| 8 | +const zvalue_value internalTagZvalues[] = { /* Name and length of each internal tag, indexed by enum internalTags */ |
| 9 | + { .str = { NULL, 0 } }, /* None: placeholder, identifyTag() only indexes this table for non-None values */ |
| 10 | + { .str = { "includeonly", 11 } }, /* includeonly */ |
| 11 | + { .str = { "onlyinclude", 11 } }, /* onlyinclude */ |
| 12 | + { .str = { "noinclude", 9 } } /* noinclude */ |
| 13 | +}; |
| 14 | + |
| 15 | +/** |
| 16 | + * Given an array of strings, this function returns the length of the |
| 17 | + * longest one (0 for an empty array). It returns -1 if any item is not a string. |
| 18 | + */ |
| 19 | +int array_max_strlen( const HashTable* array ) { |
| 20 | +	HashPosition cursor; /* hash iterator */ |
| 21 | +	zval **item; /* pointer to the array entry under the cursor */ |
| 22 | +	int longest = 0; |
| 23 | + |
| 24 | +	for ( zend_hash_internal_pointer_reset_ex(array, &cursor); |
| 25 | +		zend_hash_get_current_data_ex(array, (void **)&item, &cursor) == SUCCESS; |
| 26 | +		zend_hash_move_forward_ex(array, &cursor) ) { |
| 27 | +		if (Z_TYPE_PP(item) != IS_STRING) { |
| 28 | +			return -1; |
| 29 | +		} |
| 30 | +		if (longest < Z_STRLEN_PP(item)) { |
| 31 | +			longest = Z_STRLEN_PP(item); |
| 32 | +		} |
| 33 | +	} |
| 34 | +	return longest; |
| 35 | +} |
| 36 | + |
| 37 | +/** |
| 38 | + * Tells whether the given character would match the regex class "[\s>]". |
| 39 | + * Remember that for PERL compatibility, \s doesn't |
| 40 | + * include the Vertical Tab (0x0B), so it is rejected here. |
| 41 | + */ |
| 42 | +static inline bool isRegexSpaceOrAngle(int character) { |
| 43 | +	/* The five characters accepted as \s */ |
| 44 | +	bool isSpace = character == ' ' |
| 45 | +		|| character == '\t' |
| 46 | +		|| character == '\n' |
| 47 | +		|| character == '\f' |
| 48 | +		|| character == '\r'; |
| 49 | +	if ( isSpace ) { |
| 50 | +		return true; |
| 51 | +	} |
| 52 | +	return character == '>'; |
| 53 | +} |
| 54 | + |
| 55 | +static inline int min( int a, int b ) { |
| 56 | +	/* Pick the smaller operand (either one on a tie) */ |
| 57 | +	return ( a < b ) |
| 58 | +		? a |
| 59 | +		: b; |
| 60 | +} |
| 61 | + |
| 62 | +/** |
| 63 | + * Identifies the tag that 'string' (the text right after a '<') begins with. |
| 64 | + * Candidates are the lowercase names in 'array', then the internal tags (a |
| 65 | + * leading '/' is skipped for those). A name matches case-insensitively when it |
| 66 | + * is followed by a \s character, '>' or '/>'. Returns the matched length and |
| 67 | + * stores the lowercase name in the caller-allocated 'lowername', or returns -1. |
| 68 | + * *internalTag reports the internal tag matched (None for 'array' entries). */ |
| 69 | +int identifyTag(const char* __restrict string, int string_len, const HashTable* __restrict array, enum internalTags *__restrict internalTag, char* __restrict lowername) { |
| 70 | +	zval **entryp, *entry; |
| 71 | +	HashPosition pos; |
| 72 | +	int len, i = 0; /* Invariant: lowername[0..i-1] holds tolower() of string[0..i-1] */ |
| 73 | +	*internalTag = None; |
| 74 | + |
| 75 | +	zend_hash_internal_pointer_reset_ex(array, &pos); |
| 76 | +	while ( 1 ) { |
| 77 | +		if ( *internalTag == None ) { |
| 78 | +			if ( zend_hash_get_current_data_ex(array, (void **)&entryp, &pos) == FAILURE ) { |
| 79 | +				/* 'array' is exhausted: go on with the internal tags */ |
| 80 | +				(*internalTag)++; |
| 81 | +				if ( string[0] == '/' ) { |
| 82 | +					string++; |
| 83 | +					string_len--; |
| 84 | +					i = 0; /* The cached prefix described the unshifted string */ |
| 85 | +				} |
| 86 | +				continue; |
| 87 | +			} |
| 88 | +			assert( Z_TYPE_PP(entryp) == IS_STRING ); /* Already checked in array_max_strlen */ |
| 89 | +			entry = *entryp; |
| 90 | +		} else if ( *internalTag == EndInternalTags ) { |
| 91 | +			return -1; |
| 92 | +		} else { |
| 93 | +			entry = (zval*)&internalTagZvalues[*internalTag]; |
| 94 | +		} |
| 95 | + |
| 96 | +		len = Z_STRLEN_P(entry); /* Empty names are skipped, they can't be a tag */ |
| 97 | +		if ( len > 0 && len < string_len && ( isRegexSpaceOrAngle( string[len] ) |
| 98 | +			|| ( string[len] == '/' && len + 2 <= string_len && string[len+1] == '>' ) ) ) |
| 99 | +		{ |
| 100 | +			/* Verify the already lowercased name */ |
| 101 | +			if ( !memcmp( lowername, Z_STRVAL_P(entry), min( i, len ) ) ) { |
| 102 | +				/* Lowercase only as much of the input as this candidate needs */ |
| 103 | +				while ( i < len ) { |
| 104 | +					lowername[i] = tolower( (unsigned char)string[i] ); /* Locale dependent, just as strtolower() and the original code */ |
| 105 | +					i++; |
| 106 | +					if ( lowername[i-1] != Z_STRVAL_P(entry)[i-1] ) { |
| 107 | +						break; |
| 108 | +					} |
| 109 | +				} |
| 110 | +				/* i can already be >= len if a longer candidate filled the cache; memcmp() above then compared the whole name */ |
| 111 | +				if ( i >= len && lowername[len-1] == Z_STRVAL_P(entry)[len-1] ) { |
| 112 | +					lowername[len] = '\0'; |
| 113 | +					return len; |
| 114 | +				} |
| 115 | +			} |
| 116 | +		} |
| 117 | +		if ( *internalTag == None ) { |
| 118 | +			zend_hash_move_forward_ex(array, &pos); |
| 119 | +		} else { |
| 120 | +			(*internalTag)++; |
| 121 | +		} |
| 122 | +	} |
| 123 | + |
| 124 | +	return -1; |
| 125 | +} |
Property changes on: trunk/extensions/NativePreprocessor/tag_util.c |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 126 | + native |
Index: trunk/extensions/NativePreprocessor/tag_util.h |
— | — | @@ -0,0 +1,11 @@ |
| 2 | +enum internalTags { /* Values are used as indexes into internalTagZvalues[] (tag_util.c) */ |
| 3 | + None, /* Not an internal tag; identifyTag() reports it for names found in the strip list */ |
| 4 | + includeonly, /* First internal tag; isIgnoredTag() in preprocesstoobj.c compares against this value */ |
| 5 | + onlyinclude, |
| 6 | + noinclude, |
| 7 | + EndInternalTags /* Sentinel: identifyTag() returns -1 when its iteration reaches it */ |
| 8 | +}; |
| 9 | + |
| 10 | +int array_max_strlen( const HashTable* array ); |
| 11 | +int identifyTag(const char* string, int string_len, const HashTable* array, enum internalTags *internalTag, char* lowername); |
| 12 | + |
Property changes on: trunk/extensions/NativePreprocessor/tag_util.h |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 13 | + native |