r80376 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r80375‎ \| r80376 \| r80377 >
Date:	08:45, 15 January 2011
Author:	platonides
Status:	deferred
Tags:
Comment:	First half of the Native Preprocessor. Not a bad birthday present, Wikipedia :)
Modified paths:	/trunk/extensions/NativePreprocessor (added) (history) /trunk/extensions/NativePreprocessor/Preprocessor_Native.php (added) (history) /trunk/extensions/NativePreprocessor/config.h (added) (history) /trunk/extensions/NativePreprocessor/config.m4 (added) (history) /trunk/extensions/NativePreprocessor/in_array.c (added) (history) /trunk/extensions/NativePreprocessor/in_array.h (added) (history) /trunk/extensions/NativePreprocessor/mediawiki_preprocessor.c (added) (history) /trunk/extensions/NativePreprocessor/nodes.h (added) (history) /trunk/extensions/NativePreprocessor/php_mediawiki_preprocessor.h (added) (history) /trunk/extensions/NativePreprocessor/preprocesstoobj.c (added) (history)

Diff [purge]

Index: trunk/extensions/NativePreprocessor/Preprocessor_Native.php
—	—	@@ -0,0 +1,115 @@
	2	+<?php
	3	+// Select Preprocessor_Native as the parser's preprocessor class, but only when the MediaWikiPreprocessor class is available.
	4	+if ( class_exists( 'MediaWikiPreprocessor' ) ) {
	5	+ global $wgParserConf;
	6	+ $wgParserConf['preprocessorClass'] = 'Preprocessor_Native';
	7	+}
	8	+/** Preprocessor backed by the MediaWikiPreprocessor class. newFrame(), newCustomFrame() and newPartNodeArray() are not implemented and always throw. */
	9	+class Preprocessor_Native implements Preprocessor {
	10	+ var $parser;
	11	+ /** @param $parser Object whose getStripList() result is handed to MediaWikiPreprocessor. */
	12	+ function __construct( $parser ) {
	13	+ $this->parser = $parser;
	14	+ }
	15	+
	16	+ /** @return array The original text under 'text' and the result of preprocessToObjInternal() under 'nodes'. */
	17	+ function preprocessToObj( $text, $flags = 0 ) {
	18	+ $ntobj = $this->preprocessToObjInternal( $text, $flags );
	19	+
	20	+ return array( 'text' => $text, 'nodes' => $ntobj );
	21	+ }
	22	+ /** Calls MediaWikiPreprocessor::preprocessToObjInternal() with the parser's strip list and returns its result unchanged. */
	23	+ function preprocessToObjInternal( $text, $flags = 0 ) {
	24	+ $nativePP = new MediaWikiPreprocessor();
	25	+ $ntobj = $nativePP->preprocessToObjInternal( $text, $flags, $this->parser->getStripList() );
	26	+
	27	+ return $ntobj;
	28	+ }
	29	+
	30	+ /**
	31	+ * Completely inefficient function to transform into the xml serialization.
	32	+ */
	33	+ function preprocessToXml( $text, $flags = 0 ) {
	34	+ $ser = $this->preprocessToObjInternal( $text, $flags );
	35	+ // The leading NODE_LEN bytes are the top node's header; everything after them is passed as its children.
	36	+ return $this->unserializeNode( substr( $ser, 0, self::NODE_LEN ), substr( $ser, self::NODE_LEN ), $text );
	37	+ }
	38	+ /** Bytes per serialized node header: type char, flags char ('0' + value), 6 hex digits of children length, 8 hex digits of text length. */
	39	+ const NODE_LEN = 16;
	40	+ function unserializeNode( $node, $children, &$text ) { // Renders one node and, recursively, its children as XML, consuming the node's text from the front of $text
	41	+ $flags = ord( $node[1] ) - 48;
	42	+ $childrenLen = hexdec( substr( $node, 2, 6 ) );
	43	+ $textLen = hexdec( substr( $node, 8, 8 ) );
	44	+ if ( strlen( $text ) < $textLen ) throw new MWException( 'Bad length in node' ); // validate before slicing $text
	45	+ $result = htmlspecialchars( substr( $text, 0, $textLen ) );
	46	+ $text = substr( $text, $textLen );
	47	+ if ( strpos( '<et|p', $node[0] ) !== false )
	48	+ $result = ''; // Not present in Preprocessor_DOM
	49	+ // Children are stored back to back: each is a header followed by as many bytes of descendants as its own children-length field says.
	50	+ while ( $childrenLen > 0 ) {
	51	+ $result .= $this->unserializeNode( substr( $children, 0, self::NODE_LEN ), substr( $children, self::NODE_LEN ), $text );
	52	+ $n = self::NODE_LEN + hexdec( substr( $children, 2, 6 ) );
	53	+ $children = substr( $children, $n );
	54	+ $childrenLen -= $n;
	55	+ }
	56	+ switch ( $node[0] ) {
	57	+ case '/':
	58	+ return "<root>$result</root>";
	59	+ case 'L':
	60	+ return $result;
	61	+ case 'I':
	62	+ return "<ignore>$result</ignore>";
	63	+ case '-':
	64	+ return "<comment>$result</comment>";
	65	+ case '<':
	66	+ return "<ext>$result</ext>";
	67	+ case 'N':
	68	+ if ($flags)
	69	+ return "<name index=\"$flags\" />";
	70	+ else
	71	+ return "<name>$result</name>";
	72	+ case 'a':
	73	+ return "<attr>$result</attr>";
	74	+ case 'e':
	75	+ return $result;
	76	+ case '.':
	77	+ return "<inner>$result</inner>";
	78	+ case '>':
	79	+ return "<close>$result</close>";
	80	+ case 'i':
	81	+ case 'j':
	82	+ case 'k':
	83	+ case 'l':
	84	+ case 'm':
	85	+ case 'n':
	86	+ return "<h level=\"" . ( ord( $node[0] ) - ord( 'h' ) ) . "\" i=\"" . ( ord( $node[1] ) - ord( '0' ) ) . "\">$result</h>";
	87	+ case 't':
	88	+ $lineStart = $flags ? " lineStart=\"1\"" : "";
	89	+ return "<template$lineStart>$result</template>";
	90	+ case 'p':
	91	+ return "<tplarg>$result</tplarg>";
	92	+ case 'T':
	93	+ return "<title>$result</title>";
	94	+ case '|':
	95	+ return "<part>$result</part>";
	96	+ case 'v':
	97	+ return "<value>$result</value>";
	98	+ case '}':
	99	+ return '';
	100	+ default:
	101	+ throw new Exception( "Unknown node of type '" . $node[0] . "'");
	102	+ }
	103	+ }
	104	+ /** Not implemented: always throws. */
	105	+ function newFrame() {
	106	+ throw new Exception( __METHOD__ . ' unimplemented' );
	107	+ }
	108	+ /** Not implemented: always throws. */
	109	+ function newCustomFrame( $args ) {
	110	+ throw new Exception( __METHOD__ . ' unimplemented' );
	111	+ }
	112	+ /** Not implemented: always throws. */
	113	+ function newPartNodeArray( $values ) {
	114	+ throw new Exception( __METHOD__ . ' unimplemented' );
	115	+ }
	116	+}
Property changes on: trunk/extensions/NativePreprocessor/Preprocessor_Native.php
___________________________________________________________________
Added: svn:eol-style
1	117	+ native
Index: trunk/extensions/NativePreprocessor/config.m4
—	—	@@ -0,0 +1,10 @@
	2	+dnl Default is 'no' (4th argument): the extension is built only when --enable-mediawiki-preprocessor is given
	3	+
	4	+PHP_ARG_ENABLE(mediawiki-preprocessor, mediawiki preprocessor support,
	5	+[ --enable-mediawiki-preprocessor Include MediaWiki preprocessor extension], no, yes)
	6	+
	7	+if test "$PHP_MEDIAWIKI_PREPROCESSOR" != "no"; then
	8	+ dnl Enable the extension (PHP_ARG_ENABLE maps '-' in the argument name to '_' in the variable name)
	9	+ PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c in_array.c preprocesstoobj.c, $ext_shared)
	10	+ PHP_SUBST(MEDIAWIKI_PREPROCESSOR_SHARED_LIBADD)
	11	+fi
Index: trunk/extensions/NativePreprocessor/preprocesstoobj.c
—	—	@@ -0,0 +1,830 @@
	2	+#include <string.h>
	3	+#include <stdbool.h>
	4	+
	5	+#include "php.h"
	6	+#include "ext/standard/php_string.h"
	7	+
	8	+#undef NDEBUG
	9	+#include <assert.h>
	10	+
	11	+#include "in_array.h"
	12	+#include "nodes.h"
	13	+
	14	+#define PTD_FOR_INCLUSION 1 /* Matches Parser::PTD_FOR_INCLUSION */
	15	+
	16	+// Returns the byte offset of the first needle at or after offset, or -1 when absent; a match at index 0 returns 0, so test the result against -1 rather than for truthiness. FIXME: Do not rely on the terminating \0
	17	+#define STRSTR(haystack, needle) strpos(haystack, needle, 0)
	18	+int strpos(const char* haystack, const char* needle, int offset) {
	19	+ char* s = strstr(haystack+offset, needle);
	20	+ if (!s) return -1;
	21	+ return s - haystack; /* relative to haystack, not to haystack+offset */
	22	+}
	23	+
	24	+#define strsize(x) (sizeof(x)-1) /* sizeof minus one: the length of a string literal without its terminating NUL */
	25	+#define min(x,y) (((x) < (y)) ? (x) : (y)) /* note: the smaller argument is evaluated twice */
	26	+
	27	+enum internalTags { /* Result of getInternalTag(); None means the name is none of the three tags below */
	28	+ None,
	29	+ includeonly,
	30	+ onlyinclude,
	31	+ noinclude
	32	+};
	33	+const char* internalTagNames[] = { NULL, "includeonly", "onlyinclude", "noinclude" }; /* indexed by enum internalTags; None has no name */
	34	+/* Maps a tag name of name_len bytes to its enum internalTags value, comparing case-insensitively; returns None for any other name. */
	35	+enum internalTags getInternalTag(const char* name, int name_len) {
	36	+ #define CHECK_INTERNAL_TAG(x) if ((sizeof(#x)-1 == name_len) && !strncasecmp(name, #x, sizeof(#x)-1)) return x;
	37	+ if (name[0] == '/') { /* a leading '/' is skipped, so "/noinclude" maps like "noinclude" */
	38	+ name++;
	39	+ name_len--;
	40	+ }
	41	+ CHECK_INTERNAL_TAG(includeonly);
	42	+ CHECK_INTERNAL_TAG(onlyinclude);
	43	+ CHECK_INTERNAL_TAG(noinclude);
	44	+ return None;
	45	+}
	46	+
	47	+#define pipe foundPipe /* Avoid conflicts with pipe(2) */
	48	+
	49	+enum foundTypes { /* lineStart and lineEnd are pseudo-values; every other member equals the character it stands for */
	50	+ lineStart,
	51	+ lineEnd,
	52	+ pipe = '\|', /* NOTE(review): '\|' is presumably '|' with an escaping artifact from the page rendering; confirm against the repository */
	53	+ equals = '=',
	54	+ angle = '<',
	55	+ closeBrace = '}',
	56	+ closeBracket = ']',
	57	+ openBrace = '{',
	58	+ openBracket = '[',
	59	+};
	60	+/* Helpers maintaining a NUL-terminated buffer named `search` in the scope where the macros are used. */
	61	+#define searchReset() strcpy(search, "[{<\n") // $search = $searchBase;
	62	+#define addSearch(x) addToSearch(search, sizeof(search), x) // $search .= 'x';
	63	+#define MAX_SEARCH_CHARS "[{<\n\|=}]"
	64	+void addToSearch(char* search, int search_len, char x) {
	65	+ int e;
	66	+ assert(strchr(MAX_SEARCH_CHARS, x)); /* x must be one of the allowed characters; x == '\0' also passes because strchr() matches the terminator */
	67	+ e = strlen(search);
	68	+ assert(e < search_len - 2); /* guarantees room for x and the terminator */
	69	+ search[e] = x; /* appends x; when x is '\0' the string is left unchanged */
	70	+ search[e+1] = '\0';
	71	+}
	72	+/* Thin wrapper passing [text+offset, text+text_len) and the NUL-terminated search set to php_strcspn(); presumably returns the length of the leading run containing no search character (confirm against php_strcspn). */
	73	+size_t mwpp_strcspn(const char* text, int text_len, const char* search, int offset) {
	74	+ /* Optimize me */
	75	+ //printf(" mwpp_strcspn(%s, %d, %s, %d)\n", text, text_len, search, offset);
	76	+ return php_strcspn( text + offset, search, text + text_len, search + strlen(search) );
	77	+}
	78	+
	79	+/**
	80	+ * Returns the length of the run of consecutive c characters beginning at text[start], capped at length. The end of text is not checked; only length bounds the scan.
	81	+ */
	82	+int chrspn( const char* text, int c, int start, int length ) {
	83	+ int i;
	84	+ for (i=0; i < length; i++) {
	85	+ if ( text[start+i] != c ) {
	86	+ break;
	87	+ }
	88	+ }
	89	+ return i;
	90	+}
	91	+
	92	+/**
	93	+ * Return the first index in text that either matches a PCRE \s or is a '>'
	94	+ * Returns -1 if not found. Remember that for PERL compatibility, \s doesn't
	95	+ * include the Vertical Tab (0x0B)
	96	+ */
	97	+int findSpaceOrAngle(const char* text, int text_len) {
	98	+ int i;
	99	+ for (i = 0; i < text_len; i++) {
	100	+ switch ( text[i] ) {
	101	+ case '\t':
	102	+ case '\n':
	103	+ case '\f':
	104	+ case '\r':
	105	+ case ' ':
	106	+ case '>':
	107	+ return i;
	108	+ }
	109	+ }
	110	+ return -1;
	111	+}
	112	+
	113	+/**
	114	+ * Locates an end tag for the given tag name.
	115	+ * Matches the regex "/<\/$name\s*>/i" (\s being space, \t, \n, \f or \r)
	116	+ * Doesn't (completely) support tag names which contain '<'
	117	+ *
	118	+ * @param text String: Text in which to find the tag
	119	+ * @param text_len int: Length of text
	120	+ * @param from int: Offset from which to begin the search
	121	+ * @param name String: lowercase name of the tag to close
	122	+ * @param name_len int: length of name
	123	+ * @param endTagLen int*: length of the found tag (output value), only written when a tag is found
	124	+ * @return int: The position from text where the end tag begins or -1 if not found
	125	+ */
	126	+static int findEndTag( const char* text, int text_len, int from, const char* name, int name_len, int* endTagLen ) {
	127	+ int i, j;
	128	+ for (i = from; i < text_len - 2 - name_len; i++) {
	129	+ if ( text[i] == '<' && text[i+1] == '/' ) {
	130	+ for (j = 0; j < name_len; j++) {
	131	+ if ( (unsigned char) name[j] != tolower( (unsigned char) text[i+2+j] ) ) { /* compare as unsigned bytes: tolower() is undefined for negative char values */
	132	+ i += j;
	133	+ break;
	134	+ }
	135	+ }
	136	+ if ( j == name_len ) {
	137	+ while ( i+2+j < text_len && ( text[i+2+j] == ' ' || text[i+2+j] == '\t' || text[i+2+j] == '\n' || text[i+2+j] == '\f' || text[i+2+j] == '\r' ) ) j++; /* the \s* of the regex, bounded by text_len */
	138	+ if ( i+2+j < text_len && text[i+2+j] == '>' ) {
	139	+ *endTagLen = j + strsize("</>");
	140	+ return i;
	141	+ }
	142	+ i += j;
	143	+ }
	144	+ }
	145	+ }
	146	+ return -1;
	147	+}
	148	+
	149	+/**
	150	+ * Returns the length of the run of consecutive c characters that ends at text[start], scanning backwards and stopping at the beginning of text
	151	+ */
	152	+int chrrspn( const char* text, int c, int start ) {
	153	+ int i = 0;
	154	+ while ( ( start-i >= 0 ) && text[start-i] == c ) {
	155	+ i++;
	156	+ }
	157	+ return i;
	158	+}
	159	+
	160	+char* preprocessToObj( const char* text, int text_len, int flags, HashTable* parserStripList, int* preprocessed_len ) {
	161	+ DEFINE_NODE_STRING()
	162	+
	163	+ /* The php preprocessors have an array of rules to use,
	164	+ * Those are hardcoded here. Places relying on it are
	165	+ * marked with a 'Known rules' comment.
	166	+ */
	167	+ #define BraceRuleMin 2
	168	+ #define BraceRuleMax 3
	169	+ #define BracketRuleMin 2
	170	+ #define BracketRuleMax 2
	171	+
	172	+ bool forInclusion = flags & PTD_FOR_INCLUSION;
	173	+
	174	+ bool enableOnlyinclude = false;
	175	+ enum internalTags ignoredElement; /* Act as this tag isn't there */
	176	+
	177	+ HashTable* xmlishElements = parserStripList;
	178	+ /* Instead of $xmlishRegex, we use directly the stripList.
	179	+ * As it is shared with Parser, includeonly/onlyinclude/noinclude are handled separatedly.
	180	+ * Per Parser::set{FunctionTag,}Hook(), the items are all strings and lowercase.
	181	+ */
	182	+
	183	+ if ( forInclusion ) {
	184	+ /* $ignoredTags = array( 'includeonly', '/includeonly' ); */
	185	+ ignoredElement = noinclude;
	186	+ if ( STRSTR( text, "<onlyinclude>" ) && STRSTR( text, "</onlyinclude>" ) ) {
	187	+ enableOnlyinclude = true;
	188	+ }
	189	+ } else {
	190	+ /* $ignoredTags = array( 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ); */
	191	+ ignoredElement = includeonly;
	192	+ }
	193	+ #define isIgnoredTag(internalTag) (forInclusion ? ((internalTag) == includeonly) : ((internalTag) > includeonly) )
	194	+
	195	+ int i = 0;
	196	+ bool findEquals = false; // True to find equals signs in arguments
	197	+ bool findPipe = false; // True to take notice of pipe characters
	198	+ int headingIndex = 1;
	199	+ bool inHeading = false; // True if $i is inside a possible heading
	200	+ bool noMoreGT = false; // True if there are no more greater-than (>) signs right of $i
	201	+ bool findOnlyinclude = enableOnlyinclude; // True to ignore all input up to the next <onlyinclude>
	202	+ bool fakeLineStart = true; // Do a line-start run without outputting an LF character
	203	+ bool fakePipeFound = false;
	204	+ char currentClosing = '\0';
	205	+ int lineStartPos = -1;
	206	+ char search[sizeof(MAX_SEARCH_CHARS)];
	207	+
	208	+ #define getFlags() \
	209	+ inHeading = (parentNode->type == heading_node); \
	210	+ findPipe = (parentNode->type != heading_node) && (parentNode->type != bracket_node); \
	211	+ findEquals = findPipe && ( parentNode->nextSibling > 0 ) && ( parentNode->type != value_node );
	212	+
	213	+ while ( true ) {
	214	+
	215	+ if ( findOnlyinclude ) {
	216	+ // Ignore all input up to the next <onlyinclude>
	217	+ int startPos = strpos( text, "<onlyinclude>", i );
	218	+ if ( startPos == -1 ) {
	219	+ // Ignored section runs to the end
	220	+ addNodeWithText(ignore_node, text, i, -1);
	221	+ break;
	222	+ }
	223	+ int tagEndPos = startPos + strsize( "<onlyinclude>" ); // past-the-end
	224	+ addNodeWithText(ignore_node, text, i, tagEndPos - i);
	225	+ i = tagEndPos;
	226	+ findOnlyinclude = false;
	227	+ }
	228	+
	229	+ enum foundTypes found;
	230	+ if ( fakeLineStart ) {
	231	+ found = lineStart;
	232	+ } else if ( fakePipeFound ) {
	233	+ found = pipe;
	234	+ } else {
	235	+ // Find next opening brace, closing brace or pipe
	236	+ searchReset();
	237	+ if ( parentNode->type == root_node ) {
	238	+ currentClosing = 0;
	239	+ } else {
	240	+ /* This is too ugly */
	241	+ if ( parentNode->type == heading_node ) {
	242	+ currentClosing = '\n';
	243	+ } else if ( parentNode->type == '[' ) {
	244	+ currentClosing = ']'; /* Known rules */
	245	+ } else if ( parentNode->parent && ( parentNode->parent->type == '{'
	246	+ \|\| ( parentNode->parent->parent && parentNode->parent->parent->type == '{' ) ) ) {
	247	+ currentClosing = '}'; /* Known rules */
	248	+ } else {
	249	+ currentClosing = 0;
	250	+ }
	251	+ addSearch( currentClosing );
	252	+ }
	253	+ if ( findPipe ) {
	254	+ addSearch( '\|' );
	255	+ }
	256	+ if ( findEquals ) {
	257	+ // First equals will be for the template
	258	+ addSearch( '=' );
	259	+ }
	260	+
	261	+ // Output literal section, advance input counter
	262	+ size_t literalLength = mwpp_strcspn( text, text_len, search, i );
	263	+ if ( literalLength > 0 ) {
	264	+ addLiteral( text, i, literalLength );
	265	+ i += literalLength;
	266	+ }
	267	+ if ( i >= text_len ) {
	268	+ if ( currentClosing == '\n' ) {
	269	+ // Do a past-the-end run to finish off the heading
	270	+ found = lineEnd;
	271	+ } else if ( parentNode->type == name_node && parentNode->parent && parentNode->parent->type == part_node && findEquals ) {
	272	+ // Convert this part\name into a value and add the name
	273	+ fakePipeFound = true;
	274	+ found = pipe;
	275	+ } else {
	276	+ // All done
	277	+ break;
	278	+ }
	279	+ } else {
	280	+ switch ( text[i] ) {
	281	+ case '\|':
	282	+ case '=':
	283	+ case '<':
	284	+ found = text[i];
	285	+ break;
	286	+ case '\n':
	287	+ if ( inHeading ) {
	288	+ found = lineEnd;
	289	+ } else {
	290	+ found = lineStart;
	291	+ }
	292	+ break;
	293	+ case '}': /* Known rules */
	294	+ case ']':
	295	+ if ( text[i] == currentClosing ) {
	296	+ found = currentClosing;
	297	+ }
	298	+ break;
	299	+ case '{': /* Known rules */
	300	+ case '[':
	301	+ found = text[i];
	302	+ break;
	303	+
	304	+ default:
	305	+ // Some versions of PHP have a strcspn which stops on null characters {{refneeded}}
	306	+ // Ignore and continue
	307	+ ++i;
	308	+ continue;
	309	+ }
	310	+ }
	311	+ }
	312	+
	313	+ if ( found == angle ) {
	314	+ // Determine which tag is this
	315	+ if ( enableOnlyinclude && strncasecmp( text + i, "</onlyinclude>", strsize( "</onlyinclude>" ) ) ) {
	316	+ findOnlyinclude = true;
	317	+ continue;
	318	+ }
	319	+
	320	+ // Handle comments
	321	+ if ( !strncmp( text + i, "<!--", 4 ) ) {
	322	+ // To avoid leaving blank lines, when a comment is both preceded
	323	+ // and followed by a newline (ignoring spaces), trim leading and
	324	+ // trailing spaces and one of the newlines.
	325	+
	326	+ // Find the end
	327	+ int endPos = strpos( text, "-->", i + 4 );
	328	+ if ( endPos == -1 ) {
	329	+ // Unclosed comment in input, runs to end
	330	+ addNodeWithText(comment_node, text, i, -1);
	331	+ i = text_len;
	332	+ } else {
	333	+ // Search backwards for leading whitespace
	334	+
	335	+ int wsStart;
	336	+ for (wsStart = i - 1; wsStart > 0; wsStart--) {
	337	+ if ( text[wsStart] != ' ') { /* It can't go over wikitext_len because the php string has a \0 terminator, too */
	338	+ wsStart++;
	339	+ break;
	340	+ }
	341	+ }
	342	+
	343	+ // Search forwards for trailing whitespace
	344	+ // wsEnd will be the position of the last space (or the > if there's none)
	345	+ int startPos, wsEnd = endPos + 3;
	346	+ while (text[wsEnd] == ' ') { wsEnd++; }
	347	+ wsEnd--; // A bit silly since we will be using wsEnd+1 everywhere, but we want to keep this the same as $wsEnd
	348	+
	349	+ // Eat the line if possible
	350	+ // This could theoretically be done if $wsStart == 0, i.e. for comments at
	351	+ // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
	352	+ // it's a possible beneficial b/c break.
	353	+ if ( wsStart > 0 && text[wsStart - 1] == '\n' && text[wsEnd + 1] == '\n' )
	354	+ {
	355	+ startPos = wsStart;
	356	+ endPos = wsEnd + 1;
	357	+ // Remove leading whitespace from the end of the accumulator
	358	+ // Sanity check first though
	359	+ int wsLength = i - wsStart;
	360	+ if ( wsLength > 0 && currentLiteral.len >= wsLength ) {
	361	+ if ( strspn( text + currentLiteral.from + currentLiteral.len - wsLength, " " ) != wsLength ) {
	362	+ // Can this ever be false?
	363	+ assert(0);
	364	+ }
	365	+ currentLiteral.len -= wsLength;
	366	+ }
	367	+ // Do a line-start run next time to look for headings after the comment
	368	+ fakeLineStart = true;
	369	+ } else {
	370	+ // No line to eat, just take the comment itself
	371	+ startPos = i;
	372	+ endPos += 2;
	373	+ }
	374	+
	375	+ if ( parentNode ) {
	376	+ if ( parentNode->commentEnd != -1 && parentNode->commentEnd == wsStart - 1 ) {
	377	+ // Comments abutting, no change in visual end
	378	+ parentNode->commentEnd = wsEnd;
	379	+ } else {
	380	+ parentNode->visualEnd = wsStart - 1;
	381	+ parentNode->commentEnd = endPos;
	382	+ }
	383	+ }
	384	+ i = endPos + 1;
	385	+ addNodeWithText(comment_node, text, startPos, endPos - startPos + 1);
	386	+ }
	387	+ continue;
	388	+ }
	389	+
	390	+ if ( noMoreGT ) {
	391	+ addLiteral( text, i, 1 );
	392	+ ++i;
	393	+ continue;
	394	+ }
	395	+
	396	+ /**
	397	+ * We differ here from the $xmlishRegex approach
	398	+ * The regex ends the tag name with a \s character, /> or >
	399	+ * so we start seeking for them, then look which name is it.
	400	+ */
	401	+ assert(text[i] == '<');
	402	+ const char* name = text + i + 1;
	403	+ int name_len;
	404	+ /* TODO: optimize this search by not going further than
	405	+ * max( strlen( getParserStripList() + internalTags() ) )
	406	+ * while not setting noMoreGT in such case.
	407	+ */
	408	+ name_len = findSpaceOrAngle(name, text_len - i - 1);
	409	+ if ( name_len > 0 && name[name_len] == '>' && name[name_len - 1] == '/' ) {
	410	+ name_len--;
	411	+ }
	412	+ int attrStart = i + name_len + 1;
	413	+
	414	+ int tagEndPos = -1;
	415	+ if ( name_len != -1 ) {
	416	+ // Find end of tag
	417	+ char* end = memchr(name + name_len, '>', text_len - i - 1);
	418	+
	419	+ tagEndPos = end ? end - text : -1;
	420	+ }
	421	+ if ( tagEndPos == -1 ) {
	422	+ // Infinite backtrack
	423	+ // Disable tag search to prevent worst-case O(N^2) performance
	424	+ noMoreGT = true;
	425	+ addLiteral( text, i, 1 );
	426	+ ++i;
	427	+ continue;
	428	+ }
	429	+ assert(text[tagEndPos] == '>');
	430	+
	431	+ enum internalTags internalTag;
	432	+ internalTag = getInternalTag(name, name_len);
	433	+
	434	+ // Handle ignored tags
	435	+ if ( isIgnoredTag( internalTag ) ) {
	436	+ addNodeWithText( ignore_node, text, i, tagEndPos - i + 1 );
	437	+ i = tagEndPos + 1;
	438	+ continue;
	439	+ }
	440	+
	441	+ char * lowername;
	442	+ if ( internalTag == None ) {
	443	+ int j;
	444	+ // Verify that it's not just tag-looking text
	445	+ lowername = alloca( name_len + 1 ); /* FIXME */
	446	+ for (j = 0; j < name_len; j++) {
	447	+ lowername[j] = tolower(name[j]);
	448	+ }
	449	+ lowername[j] = '\0';
	450	+ if ( !str_in_array(lowername, name_len, xmlishElements, true) ) {
	451	+ addLiteral( text, i, 1 );
	452	+ ++i;
	453	+ continue;
	454	+ }
	455	+ } else {
	456	+ lowername = (char*)internalTagNames[internalTag];
	457	+ }
	458	+
	459	+ int tagStartPos, attrEnd, endTagBegin, endTagLen;
	460	+ int innerTextBegin, innerTextLen;
	461	+ tagStartPos = i; endTagLen = 0;
	462	+ innerTextBegin = -1; innerTextLen = -1;
	463	+
	464	+ if ( text[tagEndPos-1] == '/' ) {
	465	+ attrEnd = tagEndPos - 1;
	466	+ i = tagEndPos + 1;
	467	+ } else {
	468	+ attrEnd = tagEndPos;
	469	+ // Find closing tag
	470	+
	471	+ endTagBegin = findEndTag( text, text_len, tagEndPos + 1, lowername, name_len, &endTagLen );
	472	+
	473	+ if ( endTagBegin != -1 )
	474	+ {
	475	+ innerTextBegin = tagEndPos + 1;
	476	+ innerTextLen = endTagBegin - tagEndPos - 1;
	477	+ i = endTagBegin + endTagLen;
	478	+ } else {
	479	+ // No end tag -- let it run out to the end of the text.
	480	+ innerTextBegin = tagEndPos + 1;
	481	+ i = text_len;
	482	+ }
	483	+ }
	484	+
	485	+ if ( isIgnoredTag( internalTag ) ) {
	486	+ addNodeWithText(ignore_node, text, tagStartPos, i - tagStartPos );
	487	+ continue;
	488	+ }
	489	+
	490	+ addNodeWithTags( ext_node, 1 ); /* The '<' is implicit in Preprocessor_DOM */
	491	+ addNodeWithText( name_node, text, tagStartPos + 1, name_len );
	492	+
	493	+ // Note that the attr element contains the whitespace between name and attribute,
	494	+ // this is necessary for precise reconstruction during pre-save transform.
	495	+ assert(attrEnd >= attrStart);
	496	+ addNodeWithText( attr_node, text, attrStart, attrEnd - attrStart );
	497	+ addNodeWithText( end_name_node, text, attrEnd, tagEndPos - attrEnd + 1 );
	498	+
	499	+ if ( innerTextBegin != -1 ) {
	500	+ addNodeWithText( inner_node, text, innerTextBegin, innerTextLen );
	501	+ }
	502	+ if ( endTagLen ) {
	503	+ addNodeWithText( close_node, text, endTagBegin, endTagLen );
	504	+ }
	505	+ closeNode( ext_node );
	506	+ }
	507	+ else if ( found == lineStart ) {
	508	+ // Is this the start of a heading?
	509	+ // Line break belongs before the heading element in any case
	510	+ if ( fakeLineStart ) {
	511	+ fakeLineStart = false;
	512	+ } else {
	513	+ addLiteral( text, i, 1 );
	514	+ i++;
	515	+ }
	516	+
	517	+ int count = chrspn( text, '=', i, 6 );
	518	+ if ( count == 1 && findEquals ) {
	519	+ // DWIM: This looks kind of like a name/value separator
	520	+ // Let's let the equals handler have it and break the potential heading
	521	+ // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
	522	+ } else if ( count > 0 ) {
	523	+ /*
	524	+ piece = array(
	525	+ 'open' => "\n",
	526	+ 'close' => "\n",
	527	+ 'parts' => array( new PPDPart( str_repeat( '=', $count ) ) ),
	528	+ 'startPos' => $i,
	529	+ 'count' => $count );
	530	+ */
	531	+ lineStartPos = i; /* This lived in the stack in php, but there can't be two open header pieces */
	532	+ addNodeWithTags(heading_node, count);
	533	+ currentClosing = '\n';
	534	+ /* extract( $stack->getFlags(); ) */
	535	+ getFlags()
	536	+ i += count;
	537	+ }
	538	+ } else if ( found == lineEnd ) {
	539	+
	540	+ // A heading must be open, otherwise \n wouldn't have been in the search list
	541	+ assert( parentNode->type == heading_node );
	542	+ assert( lineStartPos != -1 );
	543	+
	544	+ // Search back through the input to see if it has a proper close
	545	+ // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
	546	+ int searchStart;
	547	+ for (searchStart = i - 1; searchStart > 0; --searchStart) {
	548	+ if ( ( text[searchStart] != ' ' ) && ( text[searchStart] != '\t' ) ) {
	549	+ break;
	550	+ }
	551	+ }
	552	+
	553	+ if ( parentNode->commentEnd != -1 && searchStart == parentNode->commentEnd ) {
	554	+ // Comment found at line end
	555	+ // Search for equals signs before the comment
	556	+ for (searchStart = parentNode->visualEnd; searchStart > 0; --searchStart) {
	557	+ if (text[i] != ' ' && text[i] != '\t')
	558	+ break;
	559	+ }
	560	+ }
	561	+ searchStart++;
	562	+
	563	+ int count = parentNode->contentLength;
	564	+ int equalsLength = chrrspn( text, '=', searchStart - 1 );
	565	+
	566	+ if ( equalsLength > 0 ) {
	567	+ if ( searchStart - equalsLength == lineStartPos ) {
	568	+ // This is just a single string of equals signs on its own line
	569	+ // Replicate the doHeadings behaviour /={count}(.+)={count}/
	570	+ // First find out how many equals signs there really are (don't stop at 6)
	571	+ count = equalsLength;
	572	+ if ( count < 3 ) {
	573	+ count = 0;
	574	+ } else {
	575	+ count = min( 6, ( count - 1 ) / 2 );
	576	+ }
	577	+ } else {
	578	+ count = min( equalsLength, count );
	579	+ }
	580	+ if ( count > 0 ) {
	581	+ // Normal match, output <h>
	582	+ assert( count < 7 );
	583	+ parentNode->type = heading_node + count;
	584	+ parentNode->flags = headingIndex;
	585	+ headingIndex++;
	586	+ } else {
	587	+ // Single equals sign on its own line, count=0
	588	+ parentNode->type = literal_node;
	589	+ }
	590	+ } else {
	591	+ // No match, no <h>, just pass down the inner text
	592	+ parentNode->type = literal_node;
	593	+ }
	594	+ // Unwind the stack
	595	+ closeNode( parentNode->type );
	596	+ /* extract( getFlags() ); */
	597	+ getFlags();
	598	+
	599	+ // Note that we do NOT increment the input pointer.
	600	+ // This is because the closing linebreak could be the opening linebreak of
	601	+ // another heading. Infinite loops are avoided because the next iteration MUST
	602	+ // hit the heading open case above, which unconditionally increments the
	603	+ // input pointer.
	604	+ assert( inHeading == false );
	605	+ } else if ( found == openBrace \|\| found == openBracket ) {
	606	+ // count opening brace characters
	607	+ int count = chrspn( text, text[i], i, text_len - i );
	608	+
	609	+ // we need to add to stack only if opening brace count is enough for one of the rules
	610	+ int rulemin = 2; /* Known rules */
	611	+
	612	+ if ( count >= rulemin ) {
	613	+ // Add it to the stack
	614	+ addNodeWithTags( found, count );
	615	+ parentNode->flags = (i > 0 && text[i-1] == '\n') /* lineStart boolean */;
	616	+ /* close char does not need to be stored per Known rules */
	617	+ parentNode->count = count;
	618	+ parentNode->argIndex = 0;
	619	+ if ( found == openBrace ) {
	620	+ addNodeWithTags( title_node, 0 );
	621	+ }
	622	+ getFlags();
	623	+ } else {
	624	+ // Add literal brace(s)
	625	+ addLiteral( text, i, count );
	626	+ }
	627	+ i += count;
	628	+ } else if ( found == closeBrace \|\| found == closeBracket ) {
	629	+ // lets check if there are enough characters for closing brace
	630	+
	631	+ if ( parentNode->type == name_node ) {
	632	+ /* Go to close it */
	633	+ fakePipeFound = true;
	634	+ continue;
	635	+ }
	636	+ if ( parentNode->type == value_node ) {
	637	+ closeNode( parentNode->type );
	638	+ assert( parentNode->type == part_node );
	639	+ }
	640	+ assert( ( parentNode->type == found - 2 ) \|\| ( parentNode->parent && ( parentNode->parent->type == found - 2 ) ) );
	641	+
	642	+ int maxCount = found == closeBracket ? parentNode->count : parentNode->parent->count;
	643	+ int count = chrspn( text, found, i, maxCount );
	644	+
	645	+ // check for maximum matching characters (if there are 5 closing
	646	+ // characters, we will probably need only 3 - depending on the rules)
	647	+ int ruleMax = ( found == closeBrace ) ? 3 : 2; /* Known rules */
	648	+ int matchingCount = 0;
	649	+ if ( count > ruleMax ) {
	650	+ // The specified maximum exists in the callback array, unless the caller
	651	+ // has made an error
	652	+ matchingCount = ruleMax;
	653	+ } else {
	654	+ // Count is less than the maximum
	655	+ // Skip any gaps in the callback array to find the true largest match
	656	+ matchingCount = count;
	657	+ /* Known rules: If we have three opening braces but only two closing ones, we want the two.
	658	+ * With less than the minimum, matchingCount = 0.
	659	+ */
	660	+ if ( count >= 2 /* min / ) { / Known rules */
	661	+ matchingCount = count;
	662	+ }
	663	+ }
	664	+
	665	+ if ( matchingCount <= 0 ) {
	666	+ // No matching element found in callback array
	667	+ // Output a literal closing brace and continue
	668	+ assert( count == 1 );
	669	+ addLiteral( text, i, count );
	670	+ i += count;
	671	+ continue;
	672	+ }
	673	+
	674	+
	675	+ if ( found == closeBracket ) { /* Known rules */
	676	+ // No element, just literal text
	677	+ parentNode->count -= matchingCount;
	678	+
	679	+ /* The preprocessor DOM adds a new literal here, then goes
	680	+ * backwards and readds another node before if there are
	681	+ * brackets left.
	682	+ * We leave the same bracket node open (with decreasing counts)
	683	+ * until closing time, since we know that all brackets
	684	+ * will end up being literals.
	685	+ */
	686	+
	687	+ if ( parentNode->count < 2 ) { /* Known rules */
	688	+ parentNode = breakSyntax( parentNode, nodeString, &nodeStringLen );
	689	+ }
	690	+
	691	+ addLiteral( text, i, matchingCount );
	692	+ i += matchingCount;
	693	+ continue;
	694	+ }
	695	+ assert( ( parentNode->parent && ( parentNode->parent->type == brace_node ) ) );
	696	+
	697	+ assert( parentNode->type == title_node \|\| parentNode->type == part_node );
	698	+ closeNode( parentNode->type );
	699	+
	700	+ addNodeWithText( closebrace_node, text, i, matchingCount ); // should be on next line?
	701	+ // Advance input pointer
	702	+ i += matchingCount;
	703	+
	704	+ parentNode->count -= matchingCount;
	705	+
	706	+ if ( matchingCount == 2 ) {
	707	+ parentNode->type = template_node;
	708	+ } else if ( matchingCount == 3 ) {
	709	+ parentNode->type = tplarg_node;
	710	+ } else {
	711	+ assert( 0 );
	712	+ }
	713	+ parentNode->contentLength = matchingCount;
	714	+
	715	+ // Re-add the old stack element if it still has unmatched opening characters remaining
	716	+ if ( parentNode->count > 0 ) {
	717	+ int oldindex = parentNode->index;
	718	+
	719	+ // do we still qualify for any callback with remaining count?
	720	+ if ( parentNode->count >= 2 ) { /* Known rules */
	721	+ /* Prepend a { and a title node */
	722	+ int oldcount = parentNode->count;
	723	+ int oldflags = parentNode->flags;
	724	+
	725	+ parentNode->flags = 0; /* We don't begin a line since there is markup before us */
	726	+
	727	+ closeNode( parentNode->type );
	728	+ storedLength -= oldcount;
	729	+
	730	+ addNodeWithTags( brace_node, oldcount );
	731	+ addNodeWithTags( title_node, 0 );
	732	+
	733	+ /* But they must be placed before the tag we just closed: */
	734	+
	735	+ /* Move all our childs two positions right */
	736	+ memmove( nodeString + oldindex + NODE_LEN * 2, nodeString + oldindex, nodeStringLen - oldindex - 2 * NODE_LEN );
	737	+
	738	+ /* And the new tags into the positions left */
	739	+ parentNode->index = oldindex + NODE_LEN;
	740	+ parentNode->parent->index = oldindex;
	741	+ parentNode->parent->flags = oldflags;
	742	+ } else {
	743	+ /* Prepend a literal node with the skipped braces */
	744	+ int skippedBraces = 1 /* = parentNode->count */;
	745	+ closeNode( parentNode->type );
	746	+
	747	+ struct node tmpnode;
	748	+ tmpnode.type = literal_node;
	749	+ tmpnode.flags = 0;
	750	+ tmpnode.nextSibling = 0;
	751	+ tmpnode.contentLength = skippedBraces;
	752	+
	753	+ ALLOC_NODESTRING();
	754	+ memmove( nodeString + oldindex + NODE_LEN, nodeString + oldindex, nodeStringLen - oldindex );
	755	+ nodeStringLen += NODE_LEN;
	756	+
	757	+ serializeNode(nodeString + oldindex, &tmpnode);
	758	+ }
	759	+ } else {
	760	+ closeNode( parentNode->type );
	761	+ }
	762	+
	763	+ getFlags();
	764	+ } else if ( found == pipe ) {
	765	+ findEquals = true; // shortcut for getFlags()
	766	+ if ( parentNode->type == title_node ) {
	767	+ closeNode( title_node );
	768	+ } else if ( parentNode->type == name_node ) {
	769	+ assert( ( parentNode->parent && ( parentNode->parent->type == part_node ) ) );
	770	+ assert( ( parentNode->parent->parent && ( parentNode->parent->parent->type == brace_node ) ) );
	771	+
	772	+ /* This was a value node, the name is empty */
	773	+ parentNode->type = value_node;
	774	+ int len = parentNode->contentLength;
	775	+ parentNode->contentLength = 0;
	776	+ int oldindex = parentNode->index;
	777	+
	778	+ /* Relocate the children one position right */
	779	+ ALLOC_NODESTRING();
	780	+ memmove( nodeString + oldindex + NODE_LEN * 2, nodeString + oldindex + NODE_LEN, nodeStringLen - oldindex - NODE_LEN ); /* (nodeStringLen - oldindex) will often be 0 */
	781	+ nodeStringLen += NODE_LEN;
	782	+ /* And the father, too */
	783	+ parentNode->index += NODE_LEN;
	784	+ closeNode( value_node );
	785	+
	786	+ /* Place the name */
	787	+ struct node tmpnode;
	788	+ tmpnode.type = name_node;
	789	+ tmpnode.flags = parentNode->flags = ++parentNode->parent->argIndex;
	790	+ tmpnode.nextSibling = 0;
	791	+ tmpnode.contentLength = len;
	792	+ assert( len == 0 );
	793	+
	794	+ serializeNode(nodeString + oldindex, &tmpnode);
	795	+ if ( !fakePipeFound ) closeNode( part_node );
	796	+ } else {
	797	+ closeNode( value_node );
	798	+ closeNode( part_node );
	799	+ }
	800	+ if ( fakePipeFound ) {
	801	+ fakePipeFound = false;
	802	+ continue;
	803	+ }
	804	+ addNodeWithTags( part_node, 1 );
	805	+ addNodeWithTags( name_node, 0 );
	806	+ ++i;
	807	+ } else if ( found == equals ) {
	808	+ findEquals = false; // shortcut for getFlags()
	809	+ assert( parentNode->type == name_node ); /* If we are searching for an equal we are inside parts\name */
	810	+ closeNode( name_node );
	811	+ addLiteral( text, i, 1 );
	812	+ addNodeWithTags( value_node, 0 ); /* We could piggyback some literals on value_nodes */
	813	+
	814	+ //parentNode->eqpos = i; // we could remove eqpost member
	815	+ ++i;
	816	+ } else {
	817	+ assert( 2 + 2 == 5 );
	818	+ }
	819	+ }
	820	+ while ( parentNode ) {
	821	+ if ( parentNode->type == brace_node \|\| parentNode->type == bracket_node ) {
	822	+ parentNode = breakSyntax( parentNode, nodeString, &nodeStringLen );
	823	+ } else {
	824	+ closeNode( parentNode->type );
	825	+ }
	826	+ }
	827	+
	828	+ nodeString[nodeStringLen] = '\0';
	829	+ *preprocessed_len = nodeStringLen;
	830	+ return nodeString;
	831	+}
Property changes on: trunk/extensions/NativePreprocessor/preprocesstoobj.c
___________________________________________________________________
Added: svn:eol-style
1	832	+ native
Index: trunk/extensions/NativePreprocessor/in_array.c
—	—	@@ -0,0 +1,41 @@
	2	+
	3	+#include <stdbool.h>
	4	+#include "php.h"
	5	+#define const
	6	+#include "in_array.h"
	7	+
	8	+/**
	9	+ * This defines an interface for internally performing in_array()
	10	+ * You will notice that the similarity with php_search_array() is not casual.
	11	+ */
	12	+bool zval_in_array(const zval* value, const HashTable* array, bool strict) {
	13	+ zval *entry; / pointer to array entry */
	14	+ zval res; /* comparison result */
	15	+ HashPosition pos; /* hash iterator */
	16	+ int (is_equal_func)(zval , zval , zval TSRMLS_DC);
	17	+
	18	+ TSRMLS_FETCH(); /* Useless for simple arrays, since it's only needed when comparing array values */
	19	+
	20	+ is_equal_func = strict ? is_identical_function : is_equal_function;
	21	+
	22	+ zend_hash_internal_pointer_reset_ex(array, &pos);
	23	+ while (zend_hash_get_current_data_ex(array, (void **)&entry, &pos) == SUCCESS) {
	24	+ is_equal_func(&res, value, *entry TSRMLS_CC);
	25	+ if (Z_LVAL(res)) { /* if ( (long)res ), ie. if ( res == true ) */
	26	+ return true;
	27	+ }
	28	+ zend_hash_move_forward_ex(array, &pos);
	29	+ }
	30	+ return false;
	31	+}
	32	+
	33	+bool str_in_array(const char* string, int string_len, const HashTable* array, bool strict) {
	34	+ zval zstring;
	35	+ INIT_ZVAL(zstring);
	36	+ zstring.type = IS_STRING;
	37	+ zstring.value.str.val = string;
	38	+ zstring.value.str.len = string_len;
	39	+
	40	+ return zval_in_array(&zstring, array, strict);
	41	+}
	42	+
Property changes on: trunk/extensions/NativePreprocessor/in_array.c
___________________________________________________________________
Added: svn:eol-style
1	43	+ native
Index: trunk/extensions/NativePreprocessor/php_mediawiki_preprocessor.h
—	—	@@ -0,0 +1,28 @@
	2	+#ifndef PHP_MEDIAWIKI_PREPROCESSOR_H
	3	+#define PHP_MEDIAWIKI_PREPROCESSOR_H 1
	4	+
	5	+#ifdef ZTS
	6	+#include "TSRM.h"
	7	+#endif
	8	+
	9	+ZEND_BEGIN_MODULE_GLOBALS(mediawiki_preprocessor)
	10	+
	11	+ZEND_END_MODULE_GLOBALS(mediawiki_preprocessor)
	12	+
	13	+#ifdef ZTS
	14	+#define MWPP_G(v) TSRMG(mediawiki_preprocessor_globals_id, zend_notas_globals *, v)
	15	+#else
	16	+#define MWPP_G(v) (mediawiki_preprocessor_globals.v)
	17	+#endif
	18	+
	19	+#define PHP_MEDIAWIKI_PREPROCESSOR_VERSION "0.1"
	20	+#define PHP_MEDIAWIKI_PREPROCESSOR_EXTNAME "MediaWiki Preprocessor"
	21	+
	22	+PHP_MINIT_FUNCTION(mediawiki_preprocessor);
	23	+PHP_MSHUTDOWN_FUNCTION(mediawiki_preprocessor);
	24	+PHP_RINIT_FUNCTION(mediawiki_preprocessor);
	25	+
	26	+extern zend_module_entry mediawiki_preprocessor_module_entry;
	27	+#define phpext_mediawiki_preprocessor_ptr &mediawiki_preprocessor_module_entry
	28	+
	29	+#endif
Property changes on: trunk/extensions/NativePreprocessor/php_mediawiki_preprocessor.h
___________________________________________________________________
Added: svn:keywords
1	30	+ Author Date Id Rev URL
Added: svn:eol-style
2	31	+ native
Index: trunk/extensions/NativePreprocessor/in_array.h
—	—	@@ -0,0 +1,5 @@
	2	+#include <stdbool.h>
	3	+
	4	+bool zval_in_array(const zval* value, const HashTable* array, bool strict);
	5	+bool str_in_array(const char* string, int string_len, const HashTable* array, bool strict);
	6	+
Property changes on: trunk/extensions/NativePreprocessor/in_array.h
___________________________________________________________________
Added: svn:keywords
1	7	+ Author Date Id Rev URL
Added: svn:eol-style
2	8	+ native
Index: trunk/extensions/NativePreprocessor/mediawiki_preprocessor.c
—	—	@@ -0,0 +1,141 @@
	2	+#ifdef HAVE_CONFIG_H
	3	+#include "config.h"
	4	+#endif
	5	+
	6	+
	7	+#include "php.h"
	8	+#include "php_ini.h"
	9	+#include "php_mediawiki_preprocessor.h"
	10	+
	11	+
	12	+#if ZEND_DEBUG \|\| 1
	13	+#define DEBUG(x,...) php_printf("[MWPP] "x"\n", __VA_ARGS__)
	14	+#else
	15	+#define DEBUG(x,...)
	16	+#endif
	17	+
	18	+typedef struct _mediawiki_preprocessor {
	19	+ zend_object std; /* Inherit from a standard php object */
	20	+
	21	+} mwppobj;
	22	+
	23	+ZEND_DECLARE_MODULE_GLOBALS(mediawiki_preprocessor)
	24	+
	25	+zend_module_entry mediawiki_preprocessor_module_entry = {
	26	+#if ZEND_MODULE_API_NO >= 20010901
	27	+ STANDARD_MODULE_HEADER,
	28	+#endif
	29	+ PHP_MEDIAWIKI_PREPROCESSOR_EXTNAME,
	30	+ NULL, /* No procedures */
	31	+ PHP_MINIT(mediawiki_preprocessor), /* module_startup_func */
	32	+ PHP_MSHUTDOWN(mediawiki_preprocessor), /* module_shutdown_func */
	33	+ PHP_RINIT(mediawiki_preprocessor), /* request_startup_func */
	34	+ NULL, /* request_shutdown_func */
	35	+ NULL, /* info_func */
	36	+#if ZEND_MODULE_API_NO >= 20010901
	37	+ PHP_MEDIAWIKI_PREPROCESSOR_VERSION,
	38	+#endif
	39	+ STANDARD_MODULE_PROPERTIES
	40	+};
	41	+
	42	+#ifdef COMPILE_DL_MEDIAWIKI_PREPROCESSOR
	43	+ZEND_GET_MODULE(mediawiki_preprocessor)
	44	+#endif
	45	+
	46	+PHP_RINIT_FUNCTION(mediawiki_preprocessor)
	47	+{
	48	+ /* Request init */
	49	+
	50	+ return SUCCESS;
	51	+}
	52	+
	53	+PHP_MSHUTDOWN_FUNCTION(mediawiki_preprocessor)
	54	+{
	55	+ /* Module shutdown */
	56	+
	57	+ return SUCCESS;
	58	+}
	59	+
	60	+PHP_METHOD(WikiTextPreprocessor,preprocessToObj);
	61	+/* {{{ arginfo__construct */
	62	+ZEND_BEGIN_ARG_INFO_EX(/name/ arginfopreprocessToObj, /pass_rest_by_reference/ 0, /return_reference/ 0, /required_num_args/ 3)
	63	+ ZEND_ARG_INFO(/pass_by_ref/ 0, /name/ "WikiText")
	64	+ZEND_END_ARG_INFO()
	65	+/* }}} */
	66	+
	67	+static const zend_function_entry mwpp_methods[] = {
	68	+ PHP_ME(WikiTextPreprocessor, preprocessToObj, arginfopreprocessToObj, ZEND_ACC_PUBLIC)
	69	+ {NULL, NULL, NULL}
	70	+};
	71	+
	72	+static void free_mwppobj(void *object TSRMLS_DC);
	73	+static zend_object_value create_mwppobj (zend_class_entry *class_type TSRMLS_DC);
	74	+
	75	+static void php_mwpp_init_globals(zend_mediawiki_preprocessor_globals *mwpp_globals)
	76	+{
	77	+ /* No globals to init */
	78	+}
	79	+
	80	+PHP_MINIT_FUNCTION(mediawiki_preprocessor)
	81	+{
	82	+ /* Module init */
	83	+ zend_class_entry ce;
	84	+ zend_class_entry* registered_class;
	85	+
	86	+ ZEND_INIT_MODULE_GLOBALS(mediawiki_preprocessor, php_mwpp_init_globals, NULL);
	87	+
	88	+ INIT_CLASS_ENTRY(ce, "MediaWikiPreprocessor", mwpp_methods); /* Define class MediaWikiPreprocessor */
	89	+
	90	+ ce.create_object = create_mwppobj;
	91	+ registered_class = zend_register_internal_class(&ce TSRMLS_CC); /* Bring it to existence */
	92	+
	93	+ return SUCCESS;
	94	+}
	95	+
	96	+static zend_object_value create_mwppobj (zend_class_entry *class_type TSRMLS_DC)
	97	+{
	98	+ zend_object_value retval;
	99	+ mwppobj *intern;
	100	+ zval *tmp;
	101	+
	102	+ intern = emalloc(sizeof(mwppobj));
	103	+
	104	+ DEBUG("Creating MediaWikiPreprocessor %p", intern);
	105	+
	106	+ zend_object_std_init(&intern->std, class_type TSRMLS_CC);
	107	+ zend_hash_copy(intern->std.properties, &class_type->default_properties, (copy_ctor_func_t) zval_add_ref, (void ) &tmp, sizeof(zval ));
	108	+
	109	+ retval.handle = zend_objects_store_put(intern, (zend_objects_store_dtor_t)NULL, (zend_objects_free_object_storage_t) free_mwppobj, NULL TSRMLS_CC);
	110	+ retval.handlers = zend_get_std_object_handlers(); /* Default handlers */
	111	+
	112	+ return retval;
	113	+}
	114	+
	115	+static void free_mwppobj(void *object TSRMLS_DC)
	116	+{
	117	+ mwppobj intern = (mwppobj )object;
	118	+
	119	+ zend_object_std_dtor(&intern->std TSRMLS_CC);
	120	+ efree(object);
	121	+
	122	+ DEBUG("MediaWikiPreprocessor %p destroyed", object);
	123	+}
	124	+
	125	+char* preprocessToObj( const char* text, int text_len, int flags, HashTable* parserStripList, int* preprocessed_len );
	126	+PHP_METHOD(WikiTextPreprocessor, preprocessToObj)
	127	+{
	128	+ zend_class_entry *class_entry;
	129	+ char wikitext, preprocessed;
	130	+ int wikitext_len, flags;
	131	+ int preprocessed_len;
	132	+ zval array, result;
	133	+
	134	+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sda", &wikitext, &wikitext_len, &flags, &array) == FAILURE) {
	135	+ return;
	136	+ }
	137	+ wikitext_len = strlen(wikitext);
	138	+ DEBUG("Constructed with text «%s» of length %d, flags %d", wikitext, wikitext_len, flags );
	139	+ preprocessed = preprocessToObj( wikitext, wikitext_len, flags, Z_ARRVAL_P(array), &preprocessed_len );
	140	+
	141	+ RETURN_STRINGL( preprocessed, preprocessed_len, 0 );
	142	+}
Property changes on: trunk/extensions/NativePreprocessor/mediawiki_preprocessor.c
___________________________________________________________________
Added: svn:eol-style
1	143	+ native
Index: trunk/extensions/NativePreprocessor/config.h
—	—	@@ -0,0 +1,59 @@
	2	+/* config.h. Generated from config.h.in by configure. */
	3	+/* config.h.in. Generated from configure.in by autoheader. */
	4	+
	5	+/* Whether to build mediawiki_preprocessor as dynamic module */
	6	+#define COMPILE_DL_MEDIAWIKI_PREPROCESSOR 1
	7	+
	8	+/* Define to 1 if you have the <dlfcn.h> header file. */
	9	+#define HAVE_DLFCN_H 1
	10	+
	11	+/* Define to 1 if you have the <inttypes.h> header file. */
	12	+#define HAVE_INTTYPES_H 1
	13	+
	14	+/* Define to 1 if you have the <memory.h> header file. */
	15	+#define HAVE_MEMORY_H 1
	16	+
	17	+/* Define to 1 if you have the <stdint.h> header file. */
	18	+#define HAVE_STDINT_H 1
	19	+
	20	+/* Define to 1 if you have the <stdlib.h> header file. */
	21	+#define HAVE_STDLIB_H 1
	22	+
	23	+/* Define to 1 if you have the <strings.h> header file. */
	24	+#define HAVE_STRINGS_H 1
	25	+
	26	+/* Define to 1 if you have the <string.h> header file. */
	27	+#define HAVE_STRING_H 1
	28	+
	29	+/* Define to 1 if you have the <sys/stat.h> header file. */
	30	+#define HAVE_SYS_STAT_H 1
	31	+
	32	+/* Define to 1 if you have the <sys/types.h> header file. */
	33	+#define HAVE_SYS_TYPES_H 1
	34	+
	35	+/* Define to 1 if you have the <unistd.h> header file. */
	36	+#define HAVE_UNISTD_H 1
	37	+
	38	+/* Define to 1 if your C compiler doesn't accept -c and -o together. */
	39	+/* #undef NO_MINUS_C_MINUS_O */
	40	+
	41	+/* Define to the address where bug reports for this package should be sent. */
	42	+#define PACKAGE_BUGREPORT ""
	43	+
	44	+/* Define to the full name of this package. */
	45	+#define PACKAGE_NAME ""
	46	+
	47	+/* Define to the full name and version of this package. */
	48	+#define PACKAGE_STRING ""
	49	+
	50	+/* Define to the one symbol short name of this package. */
	51	+#define PACKAGE_TARNAME ""
	52	+
	53	+/* Define to the home page for this package. */
	54	+#define PACKAGE_URL ""
	55	+
	56	+/* Define to the version of this package. */
	57	+#define PACKAGE_VERSION ""
	58	+
	59	+/* Define to 1 if you have the ANSI C header files. */
	60	+#define STDC_HEADERS 1
Property changes on: trunk/extensions/NativePreprocessor/config.h
___________________________________________________________________
Added: svn:keywords
1	61	+ Author Date Id Rev URL
Added: svn:eol-style
2	62	+ native
Index: trunk/extensions/NativePreprocessor/nodes.h
—	—	@@ -0,0 +1,287 @@
	2	+
	3	+enum nodeTypes {
	4	+ root_node = '/',
	5	+ literal_node = 'L',
	6	+ ignore_node = 'I',
	7	+ comment_node = '-',
	8	+
	9	+ ext_node = '<', /* Encloses an extension tag */
	10	+ name_node = 'N', /* Tag name or part name */
	11	+ attr_node = 'a', /* Tag attributes */
	12	+ inner_node = '.', /* Tag contents, optional */
	13	+ end_name_node = 'e', /* > or /> closing a name node. Missing in Preprocessor_DOM */
	14	+ close_node = '>', /* Closing tag, optional */
	15	+
	16	+ heading_node = 'h', /* Used when working with a heading candidate */
	17	+ h1_node = 'i',
	18	+ h2_node = 'j',
	19	+ h3_node = 'k',
	20	+ h4_node = 'l',
	21	+ h5_node = 'm',
	22	+ h6_node = 'n',
	23	+
	24	+ brace_node = '{', /* Used when we still don't know its identity (template/tplarg) */
	25	+ bracket_node = '[',
	26	+ template_node = 't',
	27	+ tplarg_node = 'p',
	28	+ title_node = 'T',
	29	+ part_node = '\|',
	30	+ value_node = 'v',
	31	+ closebrace_node = '}',
	32	+};
	33	+
	34	+struct node {
	35	+ enum nodeTypes type;
	36	+ char flags;
	37	+ int nextSibling;
	38	+ int contentLength;
	39	+
	40	+ /* Relevant only for nodes with childs */
	41	+ int index; /* index inside nodeString */
	42	+ struct node* parent;
	43	+
	44	+ /* Used for headings */
	45	+ int commentEnd;
	46	+ int visualEnd; /* Point where the last text ends (ie. without spaces, comments...) */
	47	+
	48	+ /* Used for brace and bracket nodes */
	49	+ int count;
	50	+
	51	+ /* Used for template parts */
	52	+ int eqpos; /* Name nodes */
	53	+ int argIndex; /* Brace nodes */
	54	+ /* Compact me: Move the last three blocks into an union */
	55	+};
	56	+
	57	+struct literalNode {
	58	+ int from;
	59	+ int len;
	60	+};
	61	+
	62	+#define UNKNOWN_NODE_LEN -1
	63	+
	64	+#define DEFINE_NODE_STRING() char* nodeString = NULL; \
	65	+ int nodeStringLen = 0; /* Length used of nodeString. Initialised to 1 for a \0 terminator */ \
	66	+ struct literalNode currentLiteral = { 0, 0 }; \
	67	+ int storedLength = 0; /* Length of text already stored in the nodes */ \
	68	+ struct node* parentNode = NULL; \
	69	+ addNodeWithTags(root_node, 0);
	70	+
	71	+#define NODE_LEN 16 /* Length of a serialized node */
	72	+
	73	+/**
	74	+ * Adds a node of the specified type to the nodeString
	75	+ * @param nodeType enum nodeTypes: Type of the node to add.
	76	+ * @param txt char*: Text pointer. Must be 'text'
	77	+ * @param offset int: Offset from txt to copy from
	78	+ * @param length int: Length to copy from 'from'. -1 to copy until the end of the string.
	79	+ */
	80	+#define addNodeWithText(nodeType,txt,offset,length) \
	81	+ do { \
	82	+ int mylen = length; \
	83	+ assert( txt == text ); \
	84	+ if ( currentLiteral.len && nodeType != literal_node ) { \
	85	+ storeNodeWithText(literal_node, currentLiteral.from, currentLiteral.len); \
	86	+ currentLiteral.len = 0; \
	87	+ } \
	88	+ if ( -1 == mylen ) { \
	89	+ mylen = text_len - offset; \
	90	+ } \
	91	+ \
	92	+ storeNodeWithText(nodeType,offset,mylen); \
	93	+ } while (0)
	94	+
	95	+/**
	96	+ * Like addNodeWithText() but doesn't commit the literals
	97	+ */
	98	+#define storeNodeWithText(nodeType,offset,length) \
	99	+ do { \
	100	+ assert(storedLength == offset); \
	101	+ struct node tmpnode; \
	102	+ tmpnode.type = nodeType; \
	103	+ tmpnode.flags = 0; \
	104	+ tmpnode.nextSibling = 0; \
	105	+ tmpnode.contentLength = length; \
	106	+ \
	107	+ ALLOC_NODESTRING(); \
	108	+ serializeNode(nodeString + nodeStringLen, &tmpnode); \
	109	+ nodeStringLen += NODE_LEN; \
	110	+ storedLength += length; printf("storedLength: %d @%d\n", storedLength, __LINE__);\
	111	+ } while (0);
	112	+
	113	+/**
	114	+ * Records the passed literal inside currentLiteral
	115	+ * Adjacent literal nodes are stored inside of a single node.
	116	+ */
	117	+#define addLiteral(literalText,offset,length) \
	118	+ do { printf("Addliteral '%.*s'\n", length, literalText+offset);\
	119	+ int my_len = length; \
	120	+ assert( literalText == text ); \
	121	+ if ( my_len == -1 ) { \
	122	+ my_len = text_len - offset; \
	123	+ } \
	124	+ if ( currentLiteral.len ) { \
	125	+ assert( currentLiteral.from + currentLiteral.len == (offset) ); \
	126	+ } else { \
	127	+ currentLiteral.from = (offset); \
	128	+ } \
	129	+ currentLiteral.len += my_len; \
	130	+ assert( (length) >= 0 ); \
	131	+ } while (0)
	132	+
	133	+/**
	134	+ * Adds a node which contains other tags
	135	+ * @param nodeType enum nodeTypes: Type of the node.
	136	+ * @param charsToSkip int: Number of characters that 'belong' to this node. Used to skip characters
	137	+ */
	138	+#define addNodeWithTags(nodeType, charsToSkip) \
	139	+ do { \
	140	+ struct node* tmpnode; \
	141	+ if ( currentLiteral.len ) { \
	142	+ storeNodeWithText(literal_node, currentLiteral.from, currentLiteral.len); \
	143	+ currentLiteral.len = 0; printf("currentLiteral committed\n"); \
	144	+ } \
	145	+ \
	146	+ tmpnode = alloc_node(); \
	147	+ tmpnode->type = nodeType; \
	148	+ tmpnode->flags = 0; \
	149	+ tmpnode->nextSibling = UNKNOWN_NODE_LEN; \
	150	+ tmpnode->contentLength = charsToSkip; \
	151	+ tmpnode->index = nodeStringLen; \
	152	+ tmpnode->parent = parentNode; \
	153	+ tmpnode->commentEnd = -1; \
	154	+ tmpnode->eqpos = -1; \
	155	+ parentNode = tmpnode; \
	156	+ \
	157	+ ALLOC_NODESTRING(); \
	158	+ serializeNode(nodeString + nodeStringLen, tmpnode); \
	159	+ nodeStringLen += NODE_LEN; \
	160	+ storedLength += charsToSkip; printf("storedLength: %d @%d\n", storedLength, __LINE__);\
	161	+ } while(0)
	162	+
	163	+#define closeNode(nodeType) \
	164	+ do { \
	165	+ struct node* tmpnode = parentNode; \
	166	+ assert( nodeType == tmpnode->type ); \
	167	+ if ( currentLiteral.len ) { \
	168	+ storeNodeWithText(literal_node, currentLiteral.from, currentLiteral.len); \
	169	+ printf("adding literal of %d with parent %c\n", currentLiteral.len, tmpnode->type); \
	170	+ currentLiteral.len = 0; \
	171	+ } printf("closing node %c at %d with len of %d\n", tmpnode->type, tmpnode->index, tmpnode->contentLength); \
	172	+ tmpnode->nextSibling = nodeStringLen - tmpnode->index - NODE_LEN; \
	173	+ serializeNode( nodeString + tmpnode->index, tmpnode ); \
	174	+ parentNode = parentNode->parent; \
	175	+ free_node( tmpnode ); \
	176	+ } while (0)
	177	+
	178	+#define alloc_node() emalloc( sizeof(struct node) )
	179	+#define free_node(x) efree(x)
	180	+
	181	+#define ALLOC_NODESTRING() \
	182	+ do { \
	183	+ nodeString = erealloc(nodeString, nodeStringLen + NODE_LEN + 1); \
	184	+ assert( nodeString ); \
	185	+ } while(0)
	186	+
	187	+/**
	188	+ * Serializes a node into string.
	189	+ * The caller must ensure that there are at least NODE_LEN bytes
	190	+ * available from pointer, and NODE_LEN + 1 writable.
	191	+ */
	192	+static void serializeNode(char* pointer, struct node* node) {
	193	+ int c;
	194	+ pointer[0] = node->type;
	195	+ pointer[1] = '0' + node->flags;
	196	+ assert( node->nextSibling < (1 << 24) );
	197	+ if ( node->nextSibling == UNKNOWN_NODE_LEN ) {
	198	+ pointer[2] = pointer[3] = pointer[4] = pointer[5] = pointer[6] = pointer[7] = '?';
	199	+ } else {
	200	+ sprintf(&pointer[2], "%06x", node->nextSibling);
	201	+ }
	202	+ c = pointer[16];
	203	+ snprintf(&pointer[8], 9, "%08x", node->contentLength);
	204	+ pointer[16] = c;
	205	+}
	206	+
	207	+static inline int hex2dec(char val) {
	208	+ switch (val) {
	209	+ case '0'...'9':
	210	+ return val - '0';
	211	+ case 'a'...'f':
	212	+ return val - 'a' + 10;
	213	+ }
	214	+ assert(0);
	215	+}
	216	+
	217	+/**
	218	+ * Get the nextSibling value from a node serialized at pointer.
	219	+ */
	220	+static inline int getNextSibling(const char* pointer) {
	221	+ assert( pointer[2] != '?' );
	222	+ return ( ( ( ( ( hex2dec(pointer[2]) << 4 ) \| hex2dec(pointer[3]) << 4 ) \| hex2dec(pointer[4]) << 4 ) \| hex2dec(pointer[5]) << 4 ) \| hex2dec(pointer[6]) << 4 ) \| hex2dec(pointer[7]);
	223	+}
	224	+
	225	+/**
	226	+ * Get the output string that would result if the close is not found.
	227	+ *
	228	+ * TODO: Reduce space by collapsing nodes here.
	229	+ */
	230	+static struct node* breakSyntax( struct node* node, char * const nodeString, int *nodeStringLen ) {
	231	+ struct node* parent;
	232	+
	233	+ /* Note we cannot coalesce with a previous literal node since it
	234	+ * may be our nephew, instead of our sister (we could ask our
	235	+ * parent, though)
	236	+ */
	237	+printf("breakSyntax(%.s)\n", nodeStringLen, nodeString);
	238	+ if ( node->type == bracket_node ) {
	239	+ node->type = literal_node;
	240	+ node->nextSibling = 0;
	241	+ serializeNode( nodeString + node->index, node );
	242	+ parent = node->parent;
	243	+ free_node( node );
	244	+ return parent;
	245	+ } else if ( node->type == brace_node ) {
	246	+ /* Literalize this node and its children (title, part, part\name, part\value) */
	247	+ node->type = literal_node;
	248	+ node->nextSibling = 0;
	249	+ serializeNode( nodeString + node->index, node );
	250	+ int writepos = node->index + NODE_LEN;
	251	+ int readpos = node->index + NODE_LEN;
	252	+ int nextSibling = getNextSibling( nodeString + readpos );
	253	+ readpos += NODE_LEN;
	254	+
	255	+ /* Move up the title contents */
	256	+ if ( nextSibling ) {
	257	+ memmove( nodeString + writepos, nodeString + readpos, nextSibling );
	258	+ readpos += nextSibling;
	259	+ writepos += nextSibling;
	260	+ }
	261	+
	262	+ /* Go for part nodes */
	263	+ while ( readpos < *nodeStringLen ) {
	264	+ readpos += NODE_LEN; /* <part> */
	265	+ assert( readpos < nodeStringLen ); / All part nodes contain one name node inside */
	266	+ int nameChildren = getNextSibling( nodeString + readpos ); /* <name> */
	267	+ readpos += NODE_LEN;
	268	+ if ( nameChildren ) {
	269	+ memmove( nodeString + writepos, nodeString + readpos, nameChildren );
	270	+ readpos += nameChildren;
	271	+ writepos += nameChildren;
	272	+ }
	273	+ if (readpos >= nodeStringLen) break; / It may be the case for eg. {{Foo\|Bar */
	274	+ int valueChildren = getNextSibling( nodeString + readpos ); /* <value> */
	275	+ if ( valueChildren ) {
	276	+ memmove( nodeString + writepos, nodeString + readpos, nameChildren );
	277	+ readpos += nameChildren;
	278	+ writepos += nameChildren;
	279	+ }
	280	+ }
	281	+ *nodeStringLen = writepos;
	282	+ parent = node->parent;
	283	+ free_node( node );
	284	+ return parent;
	285	+ } else {
	286	+ assert( 0 );
	287	+ }
	288	+}
Property changes on: trunk/extensions/NativePreprocessor/nodes.h
___________________________________________________________________
Added: svn:keywords
1	289	+ Author Date Id Rev URL
Added: svn:eol-style
2	290	+ native

Follow-up revisions

Revision	Commit summary	Author	Date
r80461	Follow up r80376. Added missing file FORMAT....	platonides	19:54, 17 January 2011
r80470	Those printf should have been removed before committing r80376.	platonides	21:54, 17 January 2011

Status & tagging log

18:02, 6 March 2011 Hashar (talk | contribs) changed the status of r80376 [removed: new added: deferred]