r80461 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80460‎ | r80461 | r80462 >
Date:19:54, 17 January 2011
Author:platonides
Status:deferred
Tags:
Comment:
Follow up r80376. Added missing file FORMAT.
Fixed method call in Preprocessor_Native.php.
Added support for tags containing spaces (r80025), following the same odd order dependant behavior as the php preprocessors.
Extensions shouldn't rely on it. See http://thread.gmane.org/gmane.science.linguistics.wikipedia.technical/51496
As a result of these changes, there is much less worst-case lookahead now.
in_array.{c,h} are now unused.
Modified paths:
  • /trunk/extensions/NativePreprocessor/FORMAT (added) (history)
  • /trunk/extensions/NativePreprocessor/Preprocessor_Native.php (modified) (history)
  • /trunk/extensions/NativePreprocessor/config.m4 (modified) (history)
  • /trunk/extensions/NativePreprocessor/preprocesstoobj.c (modified) (history)
  • /trunk/extensions/NativePreprocessor/tag_util.c (added) (history)
  • /trunk/extensions/NativePreprocessor/tag_util.h (added) (history)
  • /trunk/phase3/tests/phpunit/includes/parser/PreprocessorTest.php (modified) (history)

Diff [purge]

Index: trunk/phase3/tests/phpunit/includes/parser/PreprocessorTest.php
@@ -14,7 +14,7 @@
1515 }
1616
1717 function getStripList() {
18 - return array( 'gallery' );
 18+ return array( 'gallery', 'display map' /* Used by Maps, see r80025 CR */, '/foo' );
1919 }
2020
2121 function provideCases() {
@@ -66,8 +66,13 @@
6767 array( "{{{{{{Foo}}}}}}", "<root><tplarg><title><tplarg><title>Foo</title></tplarg></title></tplarg></root>" ),
6868 array( "{{{{{{Foo}}}}}", "<root>{<template><title><tplarg><title>Foo</title></tplarg></title></template></root>" ),
6969 array( "[[[Foo]]", "<root>[[[Foo]]</root>" ),
70 - array( "{{Foo|[[[[bar]]|baz]]}}", "<root><template><title>Foo</title><part><name index=\"1\" /><value>[[[[bar]]|baz]]</value></part></template></root>" ), /* This test is important, since it means the difference between having the [[ rule stacked or not */
 70+ array( "{{Foo|[[[[bar]]|baz]]}}", "<root><template><title>Foo</title><part><name index=\"1\" /><value>[[[[bar]]|baz]]</value></part></template></root>" ), // This test is important, since it means the difference between having the [[ rule stacked or not
7171 array( "{{Foo|[[[[bar]|baz]]}}", "<root>{{Foo|[[[[bar]|baz]]}}</root>" ),
 72+ array( "{{Foo|Foo [[[[bar]|baz]]}}", "<root>{{Foo|Foo [[[[bar]|baz]]}}</root>" ),
 73+ array( "Foo <display map>Bar</display map >Baz", "<root>Foo <ext><name>display map</name><attr></attr><inner>Bar</inner><close>&lt;/display map &gt;</close></ext>Baz</root>" ),
 74+ array( "Foo <display map foo>Bar</display map >Baz", "<root>Foo <ext><name>display map</name><attr> foo</attr><inner>Bar</inner><close>&lt;/display map &gt;</close></ext>Baz</root>" ),
 75+ array( "Foo <gallery bar=\"baz\" />", "<root>Foo <ext><name>gallery</name><attr> bar=&quot;baz&quot; </attr></ext></root>" ),
 76+ array( "</foo>Foo<//foo>", "<root><ext><name>/foo</name><attr></attr><inner>Foo</inner><close>&lt;//foo&gt;</close></ext></root>" ), # Worth blacklisting IMHO
7277 /* array( file_get_contents( dirname( __FILE__ ) . '/QuoteQuran.txt' ), file_get_contents( dirname( __FILE__ ) . '/QuoteQuranExpanded.txt' ) ), */
7378 );
7479 }
Index: trunk/extensions/NativePreprocessor/Preprocessor_Native.php
@@ -21,7 +21,7 @@
2222
2323 function preprocessToObjInternal( $text, $flags = 0 ) {
2424 $nativePP = new MediaWikiPreprocessor();
25 - $ntobj = $nativePP->preprocessToObjInternal( $text, $flags, $this->parser->getStripList() );
 25+ $ntobj = $nativePP->preprocessToObj( $text, $flags, $this->parser->getStripList() );
2626
2727 return $ntobj;
2828 }
@@ -41,7 +41,7 @@
4242 $childrenLen = hexdec( substr( $node, 2, 6 ) );
4343 $textLen = hexdec( substr( $node, 8, 8 ) );
4444 $result = htmlspecialchars( substr( $text, 0, $textLen ) );
45 - if ( strlen( $text ) < $textLen ) throw new MWException( 'Bad length in node' );
 45+ if ( strlen( $text ) < $textLen ) throw new MWException( 'Bad length in node of type ' . $node[0] . ". Expected $textLen bytes, but only " . strlen( $text ) . " available." );
4646 $text = substr( $text, $textLen );
4747 if ( strpos( '<et|p', $node[0] ) !== false )
4848 $result = ''; // Not present in Preprocessor_DOM
Index: trunk/extensions/NativePreprocessor/config.m4
@@ -5,6 +5,6 @@
66
77 if test "$PHP_MEDIAWIKIPREPROCESSOR" != "no"; then
88 dnl Enable the extension
9 - PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c in_array.c preprocesstoobj.c, $ext_shared)
 9+ PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c tag_util.c preprocesstoobj.c, $ext_shared)
1010 PHP_SUBST(MEDIAWIKI_PREPROCESSOR_SHARED_LIBADD)
1111 fi
Index: trunk/extensions/NativePreprocessor/preprocesstoobj.c
@@ -7,7 +7,7 @@
88 #undef NDEBUG
99 #include <assert.h>
1010
11 -#include "in_array.h"
 11+#include "tag_util.h"
1212 #include "nodes.h"
1313
1414 #define PTD_FOR_INCLUSION 1 /* Matches Parser::PTD_FOR_INCLUSION */
@@ -23,14 +23,6 @@
2424 #define strsize(x) (sizeof(x)-1)
2525 #define min(x,y) (((x) < (y)) ? (x) : (y))
2626
27 -enum internalTags {
28 - None,
29 - includeonly,
30 - onlyinclude,
31 - noinclude
32 -};
33 -const char* internalTagNames[] = { NULL, "includeonly", "onlyinclude", "noinclude" };
34 -
3527 enum internalTags getInternalTag(const char* name, int name_len) {
3628 #define CHECK_INTERNAL_TAG(x) if ((sizeof(#x)-1 == name_len) && !strncasecmp(name, #x, sizeof(#x)-1)) return x;
3729 if (name[0] == '/') {
@@ -72,7 +64,7 @@
7365 size_t mwpp_strcspn(const char* text, int text_len, const char* search, int offset) {
7466 /* Optimize me */
7567 //printf(" mwpp_strcspn(%s, %d, %s, %d)\n", text, text_len, search, offset);
76 - return php_strcspn( text + offset, search, text + text_len, search + strlen(search) );
 68+ return php_strcspn( (char*)text + offset, (char*)search, (char*)text + text_len, (char*)search + strlen(search) );
7769 }
7870
7971 /**
@@ -173,11 +165,18 @@
174166 bool enableOnlyinclude = false;
175167 enum internalTags ignoredElement; /* Act as this tag isn't there */
176168
177 - HashTable* xmlishElements = parserStripList;
178169 /* Instead of $xmlishRegex, we use directly the stripList.
179170 * As it is shared with Parser, includeonly/onlyinclude/noinclude are handled separatedly.
180171 * Per Parser::set{FunctionTag,}Hook(), the items are all strings and lowercase.
181172 */
 173+ int longestTagLen = array_max_strlen( parserStripList );
 174+ if ( longestTagLen == -1 ) {
 175+ *preprocessed_len = 1;
 176+ return NULL;
 177+ }
 178+ if ( longestTagLen < strsize( "onlyinclude" ) ) {
 179+ longestTagLen = strsize( "onlyinclude" );
 180+ }
182181
183182 if ( forInclusion ) {
184183 /* $ignoredTags = array( 'includeonly', '/includeonly' ); */
@@ -192,6 +191,7 @@
193192 #define isIgnoredTag(internalTag) (forInclusion ? ((internalTag) == includeonly) : ((internalTag) > includeonly) )
194193
195194 int i = 0;
 195+ char * lowername = NULL;
196196 bool findEquals = false; // True to find equals signs in arguments
197197 bool findPipe = false; // True to take notice of pipe characters
198198 int headingIndex = 1;
@@ -225,7 +225,7 @@
226226 findOnlyinclude = false;
227227 }
228228
229 - enum foundTypes found;
 229+ enum foundTypes found = -1;
230230 if ( fakeLineStart ) {
231231 found = lineStart;
232232 } else if ( fakePipeFound ) {
@@ -260,7 +260,7 @@
261261 // Output literal section, advance input counter
262262 size_t literalLength = mwpp_strcspn( text, text_len, search, i );
263263 if ( literalLength > 0 ) {
264 - addLiteral( text, i, literalLength );
 264+ addLiteral( text, i, (int)literalLength );
265265 i += literalLength;
266266 }
267267 if ( i >= text_len ) {
@@ -393,30 +393,32 @@
394394 }
395395
396396 /**
397 - * We differ here from the $xmlishRegex approach
398 - * The regex ends the tag name with a \s character, /> or >
399 - * so we start seeking for them, then look which name is it.
 397+ * The identifyTag() function performs everything the $xmlishRegex would have done.
400398 */
 399+ if ( !lowername ) {
 400+ lowername = emalloc( longestTagLen + 2 );
 401+ }
401402 assert(text[i] == '<');
 403+ enum internalTags internalTag;
402404 const char* name = text + i + 1;
403405 int name_len;
404406 /* TODO: optimize this search by not going further than
405407 * max( strlen( getParserStripList() + internalTags() ) )
406408 * while not setting noMoreGT in such case.
407409 */
408 - name_len = findSpaceOrAngle(name, text_len - i - 1);
409 - if ( name_len > 0 && name[name_len] == '>' && name[name_len - 1] == '/' ) {
410 - name_len--;
 410+ name_len = identifyTag(name, text_len - i - 1, parserStripList, &internalTag, lowername);
 411+ if ( name_len == -1 ) { /* Does it make sense to allow 0-length tags? */
 412+ addLiteral( text, i, 1 );
 413+ i++;
 414+ continue;
411415 }
 416+
412417 int attrStart = i + name_len + 1;
413418
414 - int tagEndPos = -1;
415 - if ( name_len != -1 ) {
416 - // Find end of tag
417 - char* end = memchr(name + name_len, '>', text_len - i - 1);
418 -
419 - tagEndPos = end ? end - text : -1;
420 - }
 419+ // Find end of tag
 420+ char* end = memchr(name + name_len, '>', text_len - i - 1);
 421+ int tagEndPos = end ? end - text : -1;
 422+
421423 if ( tagEndPos == -1 ) {
422424 // Infinite backtrack
423425 // Disable tag search to prevent worst-case O(N^2) performance
@@ -427,9 +429,6 @@
428430 }
429431 assert(text[tagEndPos] == '>');
430432
431 - enum internalTags internalTag;
432 - internalTag = getInternalTag(name, name_len);
433 -
434433 // Handle ignored tags
435434 if ( isIgnoredTag( internalTag ) ) {
436435 addNodeWithText( ignore_node, text, i, tagEndPos - i + 1 );
@@ -437,28 +436,11 @@
438437 continue;
439438 }
440439
441 - char * lowername;
442 - if ( internalTag == None ) {
443 - int j;
444 - // Verify that it's not just tag-looking text
445 - lowername = alloca( name_len + 1 ); /* FIXME */
446 - for (j = 0; j < name_len; j++) {
447 - lowername[j] = tolower(name[j]);
448 - }
449 - lowername[j] = '\0';
450 - if ( !str_in_array(lowername, name_len, xmlishElements, true) ) {
451 - addLiteral( text, i, 1 );
452 - ++i;
453 - continue;
454 - }
455 - } else {
456 - lowername = (char*)internalTagNames[internalTag];
457 - }
458 -
459440 int tagStartPos, attrEnd, endTagBegin, endTagLen;
460441 int innerTextBegin, innerTextLen;
461442 tagStartPos = i; endTagLen = 0;
462443 innerTextBegin = -1; innerTextLen = -1;
 444+ endTagBegin = 42; /* Disable warning. This variable is only used when endTagLen != 0 */
463445
464446 if ( text[tagEndPos-1] == '/' ) {
465447 attrEnd = tagEndPos - 1;
@@ -824,6 +806,10 @@
825807 }
826808 }
827809
 810+ if ( lowername ) { // No reason to TSRMLS_FETCH() if we didn't allocate anything
 811+ efree( lowername );
 812+ }
 813+
828814 nodeString[nodeStringLen] = '\0';
829815 *preprocessed_len = nodeStringLen;
830816 return nodeString;
Index: trunk/extensions/NativePreprocessor/FORMAT
@@ -0,0 +1,30 @@
 2+The serialized format of the Native Preprocessor is a list of node types with
 3+indexes to the original text included in that node. It is returned in a php
 4+string type. For easier debugging, the characters of those string are restricted
 5+to printable characters.
 6+This string is then returned joined in an array with a refcounted copy of the
 7+original text.
 8+
 9+The full string is formed by two or more nodes.
 10+
 11+The node format is as follows:
 12+ +------+-------+------+------+------+------+------+------+------+
 13+ | Type | Flags | Next sibling | Content length |
 14+ +------+-------+------+------+------+------+------+------+------+
 15+
 16+Type:
 17+ A character which identifies the kind of node.
 18+
 19+Flags:
 20+ A value whose meaning depends on the type. The default value is '0'.
 21+
 22+Next sibling:
 23+ 6 hexadecimal characters which specify the length in bytes of all its child nodes.
 24+
 25+Content lenght:
 26+ 8 hexadecimal characters which specify the length in bytes of the text content of this
 27+node in the wikitext. The beginning of the node in the text is implied by all the previous
 28+lengths.
 29+
 30+Total node length: 16 bytes.
 31+
Index: trunk/extensions/NativePreprocessor/tag_util.c
@@ -0,0 +1,124 @@
 2+#include <ctype.h>
 3+#include <stdbool.h>
 4+#include "php.h"
 5+#define const
 6+#include "tag_util.h"
 7+
 8+const zvalue_value internalTagZvalues[] = {
 9+ { .str = { NULL, 0 } },
 10+ { .str = { "includeonly", 11 } },
 11+ { .str = { "onlyinclude", 11 } },
 12+ { .str = { "noinclude", 9 } }
 13+};
 14+
 15+/**
 16+ * This functions is given an array of strings and returns the length of
 17+ * the longest one. If there are other kind of items, it returns -1
 18+ */
 19+int array_max_strlen( const HashTable* array ) {
 20+ zval **entry; /* pointer to array entry */
 21+ HashPosition pos; /* hash iterator */
 22+ int max_length = 0;
 23+
 24+ zend_hash_internal_pointer_reset_ex(array, &pos);
 25+ while (zend_hash_get_current_data_ex(array, (void **)&entry, &pos) == SUCCESS) {
 26+ if (Z_TYPE_PP(entry) != IS_STRING) {
 27+ return -1;
 28+ }
 29+ if (Z_STRLEN_PP(entry) > max_length) {
 30+ max_length = Z_STRLEN_PP(entry);
 31+ }
 32+ zend_hash_move_forward_ex(array, &pos);
 33+ }
 34+ return max_length;
 35+}
 36+
 37+/**
 38+ * Returns if a given character matches a "\s>".
 39+ * Remember that for PERL compatibility, \s doesn't
 40+ * include the Vertical Tab (0x11)
 41+ */
 42+static inline bool isRegexSpaceOrAngle(int character) {
 43+ switch ( character ) {
 44+ case '\t':
 45+ case '\n':
 46+ case '\f':
 47+ case '\r':
 48+ case ' ':
 49+ case '>':
 50+ return true;
 51+ }
 52+ return false;
 53+}
 54+
 55+static inline int min( int a, int b ) {
 56+ if ( a < b ) {
 57+ return a;
 58+ }
 59+ return b;
 60+}
 61+
 62+/**
 63+ * Returns the length of the first tag case-insensitive present in the
 64+ * lowercase array or internal, and followed by a space character, '/>' or '>'.
 65+ * The matched tag is stored in lowercase in the lowername parameter,
 66+ * which is allocated by the caller.
 67+ * It also calculates and returns the internalTag parameter
 68+ */
 69+int identifyTag(const char* __restrict string, int string_len, const HashTable* __restrict array, enum internalTags *__restrict internalTag, char* __restrict lowername) {
 70+ zval **entryp, *entry;
 71+ HashPosition pos;
 72+ *internalTag = None;
 73+ int i = 0;
 74+
 75+ zend_hash_internal_pointer_reset_ex(array, &pos);
 76+ while ( 1 ) {
 77+ if ( *internalTag == None ) {
 78+ if ( zend_hash_get_current_data_ex(array, (void **)&entryp, &pos) == FAILURE ) {
 79+ (*internalTag)++;
 80+ if ( string[0] == '/' ) {
 81+ string++;
 82+ string_len--;
 83+ }
 84+ continue;
 85+ }
 86+ assert( Z_TYPE_PP(entryp) == IS_STRING ); /* Already checked in array_max_strlen */
 87+ entry = *entryp;
 88+ } else if ( *internalTag == EndInternalTags ) {
 89+ return -1;
 90+ } else {
 91+ entry = (zval*)&internalTagZvalues[*internalTag];
 92+ }
 93+
 94+ if (Z_STRLEN_P(entry) < string_len) {
 95+ if ( isRegexSpaceOrAngle( string[Z_STRLEN_P(entry)] )
 96+ || ( string[Z_STRLEN_P(entry)] == '/' && Z_STRLEN_P(entry) + 2 <= string_len && string[Z_STRLEN_P(entry)+1] == '>') )
 97+ {
 98+ /* Verify the already lowercased name */
 99+ if ( !memcmp( lowername, Z_STRVAL_P(entry), min( i, Z_STRLEN_P(entry) ) ) ) {
 100+ for ( ; ; ) {
 101+ lowername[i] = tolower( string[i] ); /* This is locale dependant, just as strtolower() and the original code */
 102+ if ( lowername[i] != Z_STRVAL_P(entry)[i] ) {
 103+ i++;
 104+ break;
 105+ }
 106+
 107+ i++;
 108+
 109+ if ( i == Z_STRLEN_P(entry) ) {
 110+ lowername[i] = '\0';
 111+ return i;
 112+ }
 113+ }
 114+ }
 115+ }
 116+ }
 117+ if ( *internalTag == None ) {
 118+ zend_hash_move_forward_ex(array, &pos);
 119+ } else {
 120+ (*internalTag)++;
 121+ }
 122+ }
 123+
 124+ return -1;
 125+}
Property changes on: trunk/extensions/NativePreprocessor/tag_util.c
___________________________________________________________________
Added: svn:eol-style
1126 + native
Index: trunk/extensions/NativePreprocessor/tag_util.h
@@ -0,0 +1,11 @@
 2+enum internalTags {
 3+ None,
 4+ includeonly,
 5+ onlyinclude,
 6+ noinclude,
 7+ EndInternalTags
 8+};
 9+
 10+int array_max_strlen( const HashTable* array );
 11+int identifyTag(const char* string, int string_len, const HashTable* array, enum internalTags *internalTag, char* lowername);
 12+
Property changes on: trunk/extensions/NativePreprocessor/tag_util.h
___________________________________________________________________
Added: svn:eol-style
113 + native

Follow-up revisions

RevisionCommit summaryAuthorDate
r80463TODO comment fixed with r80461.platonides20:47, 17 January 2011
r80465Make a bunch of functions static....platonides21:14, 17 January 2011
r80473Follow up r80461. In some cases the closing tags for internal tags were not r...platonides23:12, 17 January 2011
r80573Since r80461 preprocessToObj() may return NULL with the error code in the len....platonides16:27, 19 January 2011

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r80025Forbid '<', '>', ' ', '\n', '\r' in parser hook names....platonides18:38, 11 January 2011
r80376First half of the Native Preprocessor....platonides08:45, 15 January 2011

Status & tagging log