r80461 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r80460‎ \| r80461 \| r80462 >
Date:	19:54, 17 January 2011
Author:	platonides
Status:	deferred
Tags:
Comment:	Follow up r80376. Added missing file FORMAT. Fixed method call in Preprocessor_Native.php. Added support for tags containing spaces (r80025), following the same odd order dependant behavior as the php preprocessors. Extensions shouldn't rely on it. See http://thread.gmane.org/gmane.science.linguistics.wikipedia.technical/51496 As a result of these changes, there is much less worst-case lookahead now. in_array.{c,h} are now unused.
Modified paths:	/trunk/extensions/NativePreprocessor/FORMAT (added) (history) /trunk/extensions/NativePreprocessor/Preprocessor_Native.php (modified) (history) /trunk/extensions/NativePreprocessor/config.m4 (modified) (history) /trunk/extensions/NativePreprocessor/preprocesstoobj.c (modified) (history) /trunk/extensions/NativePreprocessor/tag_util.c (added) (history) /trunk/extensions/NativePreprocessor/tag_util.h (added) (history) /trunk/phase3/tests/phpunit/includes/parser/PreprocessorTest.php (modified) (history)

Diff [purge]

Index: trunk/phase3/tests/phpunit/includes/parser/PreprocessorTest.php
—	—	@@ -14,7 +14,7 @@
15	15	}
16	16
17	17	function getStripList() {
18		~~- return array( 'gallery' );~~
	18	+ return array( 'gallery', 'display map' /* Used by Maps, see r80025 CR */, '/foo' );
19	19	}
20	20
21	21	function provideCases() {
—	—	@@ -66,8 +66,13 @@
67	67	array( "{{{{{{Foo}}}}}}", "<root><tplarg><title><tplarg><title>Foo</title></tplarg></title></tplarg></root>" ),
68	68	array( "{{{{{{Foo}}}}}", "<root>{<template><title><tplarg><title>Foo</title></tplarg></title></template></root>" ),
69	69	array( "[[[Foo]]", "<root>[[[Foo]]</root>" ),
70		- array( "{{Foo\|[[[[bar]]\|baz]]}}", "<root><template><title>Foo</title><part><name index=\"1\" /><value>[[[[bar]]\|baz]]</value></part></template></root>" ), /* This test is important, since it means the difference between having the [[ rule stacked or not */
	70	+ array( "{{Foo\|[[[[bar]]\|baz]]}}", "<root><template><title>Foo</title><part><name index=\"1\" /><value>[[[[bar]]\|baz]]</value></part></template></root>" ), // This test is important, since it means the difference between having the [[ rule stacked or not
71	71	array( "{{Foo\|[[[[bar]\|baz]]}}", "<root>{{Foo\|[[[[bar]\|baz]]}}</root>" ),
	72	+ array( "{{Foo\|Foo [[[[bar]\|baz]]}}", "<root>{{Foo\|Foo [[[[bar]\|baz]]}}</root>" ),
	73	+ array( "Foo <display map>Bar</display map >Baz", "<root>Foo <ext><name>display map</name><attr></attr><inner>Bar</inner><close></display map ></close></ext>Baz</root>" ),
	74	+ array( "Foo <display map foo>Bar</display map >Baz", "<root>Foo <ext><name>display map</name><attr> foo</attr><inner>Bar</inner><close></display map ></close></ext>Baz</root>" ),
	75	+ array( "Foo <gallery bar=\"baz\" />", "<root>Foo <ext><name>gallery</name><attr> bar="baz" </attr></ext></root>" ),
	76	+ array( "</foo>Foo<//foo>", "<root><ext><name>/foo</name><attr></attr><inner>Foo</inner><close><//foo></close></ext></root>" ), # Worth blacklisting IMHO
72	77	/* array( file_get_contents( dirname( __FILE__ ) . '/QuoteQuran.txt' ), file_get_contents( dirname( __FILE__ ) . '/QuoteQuranExpanded.txt' ) ), */
73	78	);
74	79	}
Index: trunk/extensions/NativePreprocessor/Preprocessor_Native.php
—	—	@@ -21,7 +21,7 @@
22	22
23	23	function preprocessToObjInternal( $text, $flags = 0 ) {
24	24	$nativePP = new MediaWikiPreprocessor();
25		~~- $ntobj = $nativePP->preprocessToObjInternal( $text, $flags, $this->parser->getStripList() );~~
	25	+ $ntobj = $nativePP->preprocessToObj( $text, $flags, $this->parser->getStripList() );
26	26
27	27	return $ntobj;
28	28	}
—	—	@@ -41,7 +41,7 @@
42	42	$childrenLen = hexdec( substr( $node, 2, 6 ) );
43	43	$textLen = hexdec( substr( $node, 8, 8 ) );
44	44	$result = htmlspecialchars( substr( $text, 0, $textLen ) );
45		~~- if ( strlen( $text ) < $textLen ) throw new MWException( 'Bad length in node' );~~
	45	+ if ( strlen( $text ) < $textLen ) throw new MWException( 'Bad length in node of type ' . $node[0] . ". Expected $textLen bytes, but only " . strlen( $text ) . " available." );
46	46	$text = substr( $text, $textLen );
47	47	if ( strpos( '<et\|p', $node[0] ) !== false )
48	48	$result = ''; // Not present in Preprocessor_DOM
Index: trunk/extensions/NativePreprocessor/config.m4
—	—	@@ -5,6 +5,6 @@
6	6
7	7	if test "$PHP_MEDIAWIKIPREPROCESSOR" != "no"; then
8	8	dnl Enable the extension
9		~~- PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c in_array.c preprocesstoobj.c, $ext_shared)~~
	9	+ PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c tag_util.c preprocesstoobj.c, $ext_shared)
10	10	PHP_SUBST(MEDIAWIKI_PREPROCESSOR_SHARED_LIBADD)
11	11	fi
Index: trunk/extensions/NativePreprocessor/preprocesstoobj.c
—	—	@@ -7,7 +7,7 @@
8	8	#undef NDEBUG
9	9	#include <assert.h>
10	10
11		~~-#include "in_array.h"~~
	11	+#include "tag_util.h"
12	12	#include "nodes.h"
13	13
14	14	#define PTD_FOR_INCLUSION 1 /* Matches Parser::PTD_FOR_INCLUSION */
—	—	@@ -23,14 +23,6 @@
24	24	#define strsize(x) (sizeof(x)-1)
25	25	#define min(x,y) (((x) < (y)) ? (x) : (y))
26	26
27		~~-enum internalTags {~~
28		~~- None,~~
29		~~- includeonly,~~
30		~~- onlyinclude,~~
31		~~- noinclude~~
32		~~-};~~
33		~~-const char* internalTagNames[] = { NULL, "includeonly", "onlyinclude", "noinclude" };~~
34		-
35	27	enum internalTags getInternalTag(const char* name, int name_len) {
36	28	#define CHECK_INTERNAL_TAG(x) if ((sizeof(#x)-1 == name_len) && !strncasecmp(name, #x, sizeof(#x)-1)) return x;
37	29	if (name[0] == '/') {
—	—	@@ -72,7 +64,7 @@
73	65	size_t mwpp_strcspn(const char* text, int text_len, const char* search, int offset) {
74	66	/* Optimize me */
75	67	//printf(" mwpp_strcspn(%s, %d, %s, %d)\n", text, text_len, search, offset);
76		~~- return php_strcspn( text + offset, search, text + text_len, search + strlen(search) );~~
	68	+ return php_strcspn( (char)text + offset, (char)search, (char)text + text_len, (char)search + strlen(search) );
77	69	}
78	70
79	71	/**
—	—	@@ -173,11 +165,18 @@
174	166	bool enableOnlyinclude = false;
175	167	enum internalTags ignoredElement; /* Act as this tag isn't there */
176	168
177		~~- HashTable* xmlishElements = parserStripList;~~
178	169	/* Instead of $xmlishRegex, we use directly the stripList.
179	170	* As it is shared with Parser, includeonly/onlyinclude/noinclude are handled separatedly.
180	171	* Per Parser::set{FunctionTag,}Hook(), the items are all strings and lowercase.
181	172	*/
	173	+ int longestTagLen = array_max_strlen( parserStripList );
	174	+ if ( longestTagLen == -1 ) {
	175	+ *preprocessed_len = 1;
	176	+ return NULL;
	177	+ }
	178	+ if ( longestTagLen < strsize( "onlyinclude" ) ) {
	179	+ longestTagLen = strsize( "onlyinclude" );
	180	+ }
182	181
183	182	if ( forInclusion ) {
184	183	/* $ignoredTags = array( 'includeonly', '/includeonly' ); */
—	—	@@ -192,6 +191,7 @@
193	192	#define isIgnoredTag(internalTag) (forInclusion ? ((internalTag) == includeonly) : ((internalTag) > includeonly) )
194	193
195	194	int i = 0;
	195	+ char * lowername = NULL;
196	196	bool findEquals = false; // True to find equals signs in arguments
197	197	bool findPipe = false; // True to take notice of pipe characters
198	198	int headingIndex = 1;
—	—	@@ -225,7 +225,7 @@
226	226	findOnlyinclude = false;
227	227	}
228	228
229		~~- enum foundTypes found;~~
	229	+ enum foundTypes found = -1;
230	230	if ( fakeLineStart ) {
231	231	found = lineStart;
232	232	} else if ( fakePipeFound ) {
—	—	@@ -260,7 +260,7 @@
261	261	// Output literal section, advance input counter
262	262	size_t literalLength = mwpp_strcspn( text, text_len, search, i );
263	263	if ( literalLength > 0 ) {
264		~~- addLiteral( text, i, literalLength );~~
	264	+ addLiteral( text, i, (int)literalLength );
265	265	i += literalLength;
266	266	}
267	267	if ( i >= text_len ) {
—	—	@@ -393,30 +393,32 @@
394	394	}
395	395
396	396	/**
397		~~- * We differ here from the $xmlishRegex approach~~
398		~~- * The regex ends the tag name with a \s character, /> or >~~
399		~~- * so we start seeking for them, then look which name is it.~~
	397	+ * The identifyTag() function performs everything the $xmlishRegex would have done.
400	398	*/
	399	+ if ( !lowername ) {
	400	+ lowername = emalloc( longestTagLen + 2 );
	401	+ }
401	402	assert(text[i] == '<');
	403	+ enum internalTags internalTag;
402	404	const char* name = text + i + 1;
403	405	int name_len;
404	406	/* TODO: optimize this search by not going further than
405	407	* max( strlen( getParserStripList() + internalTags() ) )
406	408	* while not setting noMoreGT in such case.
407	409	*/
408		~~- name_len = findSpaceOrAngle(name, text_len - i - 1);~~
409		~~- if ( name_len > 0 && name[name_len] == '>' && name[name_len - 1] == '/' ) {~~
410		~~- name_len--;~~
	410	+ name_len = identifyTag(name, text_len - i - 1, parserStripList, &internalTag, lowername);
	411	+ if ( name_len == -1 ) { /* Does it make sense to allow 0-length tags? */
	412	+ addLiteral( text, i, 1 );
	413	+ i++;
	414	+ continue;
411	415	}
	416	+
412	417	int attrStart = i + name_len + 1;
413	418
414		~~- int tagEndPos = -1;~~
415		~~- if ( name_len != -1 ) {~~
416		~~- // Find end of tag~~
417		~~- char* end = memchr(name + name_len, '>', text_len - i - 1);~~
418		-
419		~~- tagEndPos = end ? end - text : -1;~~
420		~~- }~~
	419	+ // Find end of tag
	420	+ char* end = memchr(name + name_len, '>', text_len - i - 1);
	421	+ int tagEndPos = end ? end - text : -1;
	422	+
421	423	if ( tagEndPos == -1 ) {
422	424	// Infinite backtrack
423	425	// Disable tag search to prevent worst-case O(N^2) performance
—	—	@@ -427,9 +429,6 @@
428	430	}
429	431	assert(text[tagEndPos] == '>');
430	432
431		~~- enum internalTags internalTag;~~
432		~~- internalTag = getInternalTag(name, name_len);~~
433		-
434	433	// Handle ignored tags
435	434	if ( isIgnoredTag( internalTag ) ) {
436	435	addNodeWithText( ignore_node, text, i, tagEndPos - i + 1 );
—	—	@@ -437,28 +436,11 @@
438	437	continue;
439	438	}
440	439
441		~~- char * lowername;~~
442		~~- if ( internalTag == None ) {~~
443		~~- int j;~~
444		~~- // Verify that it's not just tag-looking text~~
445		~~- lowername = alloca( name_len + 1 ); /* FIXME */~~
446		~~- for (j = 0; j < name_len; j++) {~~
447		~~- lowername[j] = tolower(name[j]);~~
448		~~- }~~
449		~~- lowername[j] = '\0';~~
450		~~- if ( !str_in_array(lowername, name_len, xmlishElements, true) ) {~~
451		~~- addLiteral( text, i, 1 );~~
452		~~- ++i;~~
453		~~- continue;~~
454		~~- }~~
455		~~- } else {~~
456		~~- lowername = (char*)internalTagNames[internalTag];~~
457		~~- }~~
458		-
459	440	int tagStartPos, attrEnd, endTagBegin, endTagLen;
460	441	int innerTextBegin, innerTextLen;
461	442	tagStartPos = i; endTagLen = 0;
462	443	innerTextBegin = -1; innerTextLen = -1;
	444	+ endTagBegin = 42; /* Disable warning. This variable is only used when endTagLen != 0 */
463	445
464	446	if ( text[tagEndPos-1] == '/' ) {
465	447	attrEnd = tagEndPos - 1;
—	—	@@ -824,6 +806,10 @@
825	807	}
826	808	}
827	809
	810	+ if ( lowername ) { // No reason to TSRMLS_FETCH() if we didn't allocate anything
	811	+ efree( lowername );
	812	+ }
	813	+
828	814	nodeString[nodeStringLen] = '\0';
829	815	*preprocessed_len = nodeStringLen;
830	816	return nodeString;
Index: trunk/extensions/NativePreprocessor/FORMAT
—	—	@@ -0,0 +1,30 @@
	2	+The serialized format of the Native Preprocessor is a list of node types with
	3	+indexes to the original text included in that node. It is returned in a php
	4	+string type. For easier debugging, the characters of those string are restricted
	5	+to printable characters.
	6	+This string is then returned joined in an array with a refcounted copy of the
	7	+original text.
	8	+
	9	+The full string is formed by two or more nodes.
	10	+
	11	+The node format is as follows:
	12	+ +------+-------+------+------+------+------+------+------+------+
	13	+ \| Type \| Flags \| Next sibling \| Content length \|
	14	+ +------+-------+------+------+------+------+------+------+------+
	15	+
	16	+Type:
	17	+ A character which identifies the kind of node.
	18	+
	19	+Flags:
	20	+ A value whose meaning depends on the type. The default value is '0'.
	21	+
	22	+Next sibling:
	23	+ 6 hexadecimal characters which specify the length in bytes of all its child nodes.
	24	+
	25	+Content lenght:
	26	+ 8 hexadecimal characters which specify the length in bytes of the text content of this
	27	+node in the wikitext. The beginning of the node in the text is implied by all the previous
	28	+lengths.
	29	+
	30	+Total node length: 16 bytes.
	31	+
Index: trunk/extensions/NativePreprocessor/tag_util.c
—	—	@@ -0,0 +1,124 @@
	2	+#include <ctype.h>
	3	+#include <stdbool.h>
	4	+#include "php.h"
	5	+#define const
	6	+#include "tag_util.h"
	7	+
	8	+const zvalue_value internalTagZvalues[] = {
	9	+ { .str = { NULL, 0 } },
	10	+ { .str = { "includeonly", 11 } },
	11	+ { .str = { "onlyinclude", 11 } },
	12	+ { .str = { "noinclude", 9 } }
	13	+};
	14	+
	15	+/**
	16	+ * This functions is given an array of strings and returns the length of
	17	+ * the longest one. If there are other kind of items, it returns -1
	18	+ */
	19	+int array_max_strlen( const HashTable* array ) {
	20	+ zval *entry; / pointer to array entry */
	21	+ HashPosition pos; /* hash iterator */
	22	+ int max_length = 0;
	23	+
	24	+ zend_hash_internal_pointer_reset_ex(array, &pos);
	25	+ while (zend_hash_get_current_data_ex(array, (void **)&entry, &pos) == SUCCESS) {
	26	+ if (Z_TYPE_PP(entry) != IS_STRING) {
	27	+ return -1;
	28	+ }
	29	+ if (Z_STRLEN_PP(entry) > max_length) {
	30	+ max_length = Z_STRLEN_PP(entry);
	31	+ }
	32	+ zend_hash_move_forward_ex(array, &pos);
	33	+ }
	34	+ return max_length;
	35	+}
	36	+
	37	+/**
	38	+ * Returns if a given character matches a "\s>".
	39	+ * Remember that for PERL compatibility, \s doesn't
	40	+ * include the Vertical Tab (0x11)
	41	+ */
	42	+static inline bool isRegexSpaceOrAngle(int character) {
	43	+ switch ( character ) {
	44	+ case '\t':
	45	+ case '\n':
	46	+ case '\f':
	47	+ case '\r':
	48	+ case ' ':
	49	+ case '>':
	50	+ return true;
	51	+ }
	52	+ return false;
	53	+}
	54	+
	55	+static inline int min( int a, int b ) {
	56	+ if ( a < b ) {
	57	+ return a;
	58	+ }
	59	+ return b;
	60	+}
	61	+
	62	+/**
	63	+ * Returns the length of the first tag case-insensitive present in the
	64	+ * lowercase array or internal, and followed by a space character, '/>' or '>'.
	65	+ * The matched tag is stored in lowercase in the lowername parameter,
	66	+ * which is allocated by the caller.
	67	+ * It also calculates and returns the internalTag parameter
	68	+ */
	69	+int identifyTag(const char* __restrict string, int string_len, const HashTable* __restrict array, enum internalTags __restrict internalTag, char __restrict lowername) {
	70	+ zval *entryp, entry;
	71	+ HashPosition pos;
	72	+ *internalTag = None;
	73	+ int i = 0;
	74	+
	75	+ zend_hash_internal_pointer_reset_ex(array, &pos);
	76	+ while ( 1 ) {
	77	+ if ( *internalTag == None ) {
	78	+ if ( zend_hash_get_current_data_ex(array, (void **)&entryp, &pos) == FAILURE ) {
	79	+ (*internalTag)++;
	80	+ if ( string[0] == '/' ) {
	81	+ string++;
	82	+ string_len--;
	83	+ }
	84	+ continue;
	85	+ }
	86	+ assert( Z_TYPE_PP(entryp) == IS_STRING ); /* Already checked in array_max_strlen */
	87	+ entry = *entryp;
	88	+ } else if ( *internalTag == EndInternalTags ) {
	89	+ return -1;
	90	+ } else {
	91	+ entry = (zval)&internalTagZvalues[internalTag];
	92	+ }
	93	+
	94	+ if (Z_STRLEN_P(entry) < string_len) {
	95	+ if ( isRegexSpaceOrAngle( string[Z_STRLEN_P(entry)] )
	96	+ \|\| ( string[Z_STRLEN_P(entry)] == '/' && Z_STRLEN_P(entry) + 2 <= string_len && string[Z_STRLEN_P(entry)+1] == '>') )
	97	+ {
	98	+ /* Verify the already lowercased name */
	99	+ if ( !memcmp( lowername, Z_STRVAL_P(entry), min( i, Z_STRLEN_P(entry) ) ) ) {
	100	+ for ( ; ; ) {
	101	+ lowername[i] = tolower( string[i] ); /* This is locale dependant, just as strtolower() and the original code */
	102	+ if ( lowername[i] != Z_STRVAL_P(entry)[i] ) {
	103	+ i++;
	104	+ break;
	105	+ }
	106	+
	107	+ i++;
	108	+
	109	+ if ( i == Z_STRLEN_P(entry) ) {
	110	+ lowername[i] = '\0';
	111	+ return i;
	112	+ }
	113	+ }
	114	+ }
	115	+ }
	116	+ }
	117	+ if ( *internalTag == None ) {
	118	+ zend_hash_move_forward_ex(array, &pos);
	119	+ } else {
	120	+ (*internalTag)++;
	121	+ }
	122	+ }
	123	+
	124	+ return -1;
	125	+}
Property changes on: trunk/extensions/NativePreprocessor/tag_util.c
___________________________________________________________________
Added: svn:eol-style
1	126	+ native
Index: trunk/extensions/NativePreprocessor/tag_util.h
—	—	@@ -0,0 +1,11 @@
	2	+enum internalTags {
	3	+ None,
	4	+ includeonly,
	5	+ onlyinclude,
	6	+ noinclude,
	7	+ EndInternalTags
	8	+};
	9	+
	10	+int array_max_strlen( const HashTable* array );
	11	+int identifyTag(const char* string, int string_len, const HashTable* array, enum internalTags internalTag, char lowername);
	12	+
Property changes on: trunk/extensions/NativePreprocessor/tag_util.h
___________________________________________________________________
Added: svn:eol-style
1	13	+ native

Follow-up revisions

Revision	Commit summary	Author	Date
r80463	TODO comment fixed with r80461.	platonides	20:47, 17 January 2011
r80465	Make a bunch of functions static....	platonides	21:14, 17 January 2011
r80473	Follow up r80461. In some cases the closing tags for internal tags were not r...	platonides	23:12, 17 January 2011
r80573	Since r80461 preprocessToObj() may return NULL with the error code in the len....	platonides	16:27, 19 January 2011

Past revisions this follows-up on

Revision	Commit summary	Author	Date
r80025	Forbid '<', '>', ' ', '\n', '\r' in parser hook names....	platonides	18:38, 11 January 2011
r80376	First half of the Native Preprocessor....	platonides	08:45, 15 January 2011

Status & tagging log

18:01, 6 March 2011 Hashar (talk | contribs) changed the status of r80461 [removed: new added: deferred]