r80376 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80375‎ | r80376 | r80377 >
Date:08:45, 15 January 2011
Author:platonides
Status:deferred
Tags:
Comment:
First half of the Native Preprocessor.

Not a bad birthday present, Wikipedia :)
Modified paths:
  • /trunk/extensions/NativePreprocessor (added) (history)
  • /trunk/extensions/NativePreprocessor/Preprocessor_Native.php (added) (history)
  • /trunk/extensions/NativePreprocessor/config.h (added) (history)
  • /trunk/extensions/NativePreprocessor/config.m4 (added) (history)
  • /trunk/extensions/NativePreprocessor/in_array.c (added) (history)
  • /trunk/extensions/NativePreprocessor/in_array.h (added) (history)
  • /trunk/extensions/NativePreprocessor/mediawiki_preprocessor.c (added) (history)
  • /trunk/extensions/NativePreprocessor/nodes.h (added) (history)
  • /trunk/extensions/NativePreprocessor/php_mediawiki_preprocessor.h (added) (history)
  • /trunk/extensions/NativePreprocessor/preprocesstoobj.c (added) (history)

Diff [purge]

Index: trunk/extensions/NativePreprocessor/Preprocessor_Native.php
@@ -0,0 +1,115 @@
 2+<?php
 3+
// Only switch the parser over to the native preprocessor when the PHP
// extension that provides the MediaWikiPreprocessor class is loaded.
if ( class_exists( 'MediaWikiPreprocessor' ) ) {
	$GLOBALS['wgParserConf']['preprocessorClass'] = 'Preprocessor_Native';
}
 8+
/**
 * Preprocessor that delegates the parsing work to the MediaWikiPreprocessor
 * class and converts its serialized node string to other representations.
 *
 * The serialization is a flat string of fixed-size node headers
 * (NODE_LEN bytes each), as decoded by unserializeNode():
 *   byte  0     node type character
 *   byte  1     flags, stored as chr( 48 + flags )
 *   bytes 2-7   hexadecimal byte length of the serialized children
 *   bytes 8-15  hexadecimal length of the source text owned by the node
 */
class Preprocessor_Native implements Preprocessor {
	/** Size in bytes of one serialized node header */
	const NODE_LEN = 16;

	public $parser;

	public function __construct( $parser ) {
		$this->parser = $parser;
	}

	/**
	 * @param $text String: wikitext to preprocess
	 * @param $flags Integer: bitfield passed through to the native preprocessor
	 * @return Array with the original text ('text') and the serialized nodes ('nodes')
	 */
	public function preprocessToObj( $text, $flags = 0 ) {
		$ntobj = $this->preprocessToObjInternal( $text, $flags );

		return array( 'text' => $text, 'nodes' => $ntobj );
	}

	/**
	 * Run the native preprocessor over $text.
	 *
	 * @param $text String: wikitext to preprocess
	 * @param $flags Integer: bitfield passed through to the native preprocessor
	 * @return String: serialized node string
	 */
	public function preprocessToObjInternal( $text, $flags = 0 ) {
		$nativePP = new MediaWikiPreprocessor();

		return $nativePP->preprocessToObjInternal( $text, $flags, $this->parser->getStripList() );
	}

	/**
	 * Completely inefficient function to transform into the xml serialization.
	 *
	 * @param $text String: wikitext to preprocess
	 * @param $flags Integer: bitfield passed through to the native preprocessor
	 * @return String: XML serialization of the node tree
	 */
	public function preprocessToXml( $text, $flags = 0 ) {
		$ser = $this->preprocessToObjInternal( $text, $flags );

		return $this->unserializeNode(
			substr( $ser, 0, self::NODE_LEN ),
			substr( $ser, self::NODE_LEN ),
			$text
		);
	}

	/**
	 * Convert one serialized node, and recursively its children, to XML.
	 *
	 * @param $node String: the NODE_LEN byte header of the node
	 * @param $children String: serialized data following the header; the
	 *   children of this node are at its beginning
	 * @param $text String: remaining source text. The text owned by each
	 *   visited node is consumed from its beginning (passed by reference).
	 * @return String: XML fragment
	 * @throws MWException on truncated nodes, bad lengths or unknown types
	 */
	public function unserializeNode( $node, $children, &$text ) {
		// A short header means the serialization is truncated. Without this
		// check the loop below would never terminate on such input.
		if ( strlen( $node ) < self::NODE_LEN ) {
			throw new MWException( 'Truncated node' );
		}

		$type = $node[0];
		$flags = ord( $node[1] ) - 48;
		$childrenLen = hexdec( substr( $node, 2, 6 ) );
		$textLen = hexdec( substr( $node, 8, 8 ) );

		// Validate before slicing the text
		if ( strlen( $text ) < $textLen ) {
			throw new MWException( 'Bad length in node' );
		}
		$result = htmlspecialchars( substr( $text, 0, $textLen ) );
		$text = substr( $text, $textLen );

		if ( strpos( '<et|p', $type ) !== false ) {
			// The text of these nodes is not present in Preprocessor_DOM
			$result = '';
		}

		while ( $childrenLen > 0 ) {
			$result .= $this->unserializeNode(
				substr( $children, 0, self::NODE_LEN ),
				substr( $children, self::NODE_LEN ),
				$text
			);
			// Skip the child header plus all of its own descendants
			$n = self::NODE_LEN + hexdec( substr( $children, 2, 6 ) );
			$children = substr( $children, $n );
			$childrenLen -= $n;
		}

		switch ( $type ) {
			case '/':
				return "<root>$result</root>";
			case 'L':
				return $result;
			case 'I':
				return "<ignore>$result</ignore>";
			case '-':
				return "<comment>$result</comment>";
			case '<':
				return "<ext>$result</ext>";
			case 'N':
				if ( $flags ) {
					return "<name index=\"$flags\" />";
				}
				return "<name>$result</name>";
			case 'a':
				return "<attr>$result</attr>";
			case 'e':
				return $result;
			case '.':
				return "<inner>$result</inner>";
			case '>':
				return "<close>$result</close>";
			case 'i':
			case 'j':
			case 'k':
			case 'l':
			case 'm':
			case 'n':
				// 'i'..'n' encode heading levels 1..6; the flags byte is the heading index
				$level = ord( $type ) - ord( 'h' );
				return "<h level=\"" . $level . "\" i=\"" . $flags . "\">$result</h>";
			case 't':
				$lineStart = $flags ? " lineStart=\"1\"" : "";
				return "<template$lineStart>$result</template>";
			case 'p':
				return "<tplarg>$result</tplarg>";
			case 'T':
				return "<title>$result</title>";
			case '|':
				return "<part>$result</part>";
			case 'v':
				return "<value>$result</value>";
			case '}':
				return '';
			default:
				throw new MWException( "Unknown node of type '" . $type . "'" );
		}
	}

	public function newFrame() {
		throw new MWException( __METHOD__ . ' unimplemented' );
	}

	public function newCustomFrame( $args ) {
		throw new MWException( __METHOD__ . ' unimplemented' );
	}

	public function newPartNodeArray( $values ) {
		throw new MWException( __METHOD__ . ' unimplemented' );
	}
}
Property changes on: trunk/extensions/NativePreprocessor/Preprocessor_Native.php
___________________________________________________________________
Added: svn:eol-style
1117 + native
Index: trunk/extensions/NativePreprocessor/config.m4
@@ -0,0 +1,10 @@
dnl config.m4 for the mediawiki_preprocessor extension.
dnl
dnl PHP_ARG_ENABLE stores the result in a shell variable named after the
dnl argument, upper-cased and with dashes replaced by underscores, i.e.
dnl $PHP_MEDIAWIKI_PREPROCESSOR. The check below must use that exact name;
dnl a misspelt (unset) variable always compares as != "no" and would enable
dnl the extension unconditionally.
dnl The two trailing arguments are the default value and whether this is an
dnl extension.

PHP_ARG_ENABLE(mediawiki-preprocessor, mediawiki preprocessor support,
[  --enable-mediawiki-preprocessor  Include MediaWiki preprocessor extension], no, yes)

if test "$PHP_MEDIAWIKI_PREPROCESSOR" != "no"; then
  dnl Enable the extension
  PHP_NEW_EXTENSION(mediawiki_preprocessor, mediawiki_preprocessor.c in_array.c preprocesstoobj.c, $ext_shared)
  PHP_SUBST(MEDIAWIKI_PREPROCESSOR_SHARED_LIBADD)
fi
Index: trunk/extensions/NativePreprocessor/preprocesstoobj.c
@@ -0,0 +1,830 @@
 2+#include <string.h>
 3+#include <stdbool.h>
 4+
 5+#include "php.h"
 6+#include "ext/standard/php_string.h"
 7+
 8+#undef NDEBUG
 9+#include <assert.h>
 10+
 11+#include "in_array.h"
 12+#include "nodes.h"
 13+
 14+#define PTD_FOR_INCLUSION 1 /* Matches Parser::PTD_FOR_INCLUSION */
 15+
 16+// FIXME: Do not rely on the terminating \0
 17+#define STRSTR(haystack, needle) strpos(haystack, needle, 0)
/**
 * Offset, relative to the beginning of haystack, of the first occurrence of
 * needle found at or after offset. Returns -1 when there is none.
 * Both strings must be NUL-terminated.
 */
int strpos(const char* haystack, const char* needle, int offset) {
	const char* match = strstr(haystack + offset, needle);
	return match ? (int)(match - haystack) : -1;
}
 23+
 24+#define strsize(x) (sizeof(x)-1)
 25+#define min(x,y) (((x) < (y)) ? (x) : (y))
 26+
/* Tags handled by the preprocessor itself instead of through the strip list */
enum internalTags {
	None,
	includeonly,
	onlyinclude,
	noinclude
};
/* Lowercase tag names, indexed by enum internalTags */
const char* internalTagNames[] = { NULL, "includeonly", "onlyinclude", "noinclude" };

/**
 * Identify an internal tag from its name. The comparison is
 * case-insensitive and a leading '/' (closing tag) is skipped.
 *
 * @param name Start of the tag name (not necessarily NUL-terminated at name_len)
 * @param name_len Length of the name
 * @return The matching internal tag, or None
 */
enum internalTags getInternalTag(const char* name, int name_len) {
	int tag;

	if (name[0] == '/') {
		++name;
		--name_len;
	}
	for (tag = includeonly; tag <= noinclude; tag++) {
		const char* candidate = internalTagNames[tag];
		if (strlen(candidate) == (size_t)name_len && !strncasecmp(name, candidate, strlen(candidate))) {
			return (enum internalTags)tag;
		}
	}
	return None;
}
 46+
 47+#define pipe foundPipe /* Avoid conflicts with pipe(2) */
 48+
 49+enum foundTypes {
 50+ lineStart,
 51+ lineEnd,
 52+ pipe = '|',
 53+ equals = '=',
 54+ angle = '<',
 55+ closeBrace = '}',
 56+ closeBracket = ']',
 57+ openBrace = '{',
 58+ openBracket = '[',
 59+};
 60+
#define searchReset() strcpy(search, "[{<\n") // $search = $searchBase;
#define addSearch(x) addToSearch(search, sizeof(search), x) // $search .= 'x';
#define MAX_SEARCH_CHARS "[{<\n|=}]"
/**
 * Append the character x to the NUL-terminated string held in search.
 *
 * @param search Buffer holding the current search characters
 * @param search_len Total size of the buffer
 * @param x Character to append; must be one of MAX_SEARCH_CHARS
 *   (or '\0', which leaves the string unchanged)
 */
void addToSearch(char* search, int search_len, char x) {
	int used;
	char* tail;

	assert(strchr(MAX_SEARCH_CHARS, x));
	used = strlen(search);
	assert(used < search_len - 2);

	tail = search + used;
	tail[0] = x;
	tail[1] = '\0';
}
 72+
/**
 * Length of the run starting at text + offset which contains none of the
 * characters of search. The scan is limited to text_len bytes of text;
 * search must be NUL-terminated.
 */
size_t mwpp_strcspn(const char* text, int text_len, const char* search, int offset) {
	/* Optimize me */
	const char* from = text + offset;
	const char* text_end = text + text_len;
	const char* search_end = search + strlen(search);

	return php_strcspn( from, search, text_end, search_end );
}
 78+
/**
 * Length of the run of consecutive characters equal to c which begins at
 * text[start]. At most length characters are examined.
 */
int chrspn( const char* text, int c, int start, int length ) {
	int run = 0;
	while ( run < length && text[start + run] == c ) {
		run++;
	}
	return run;
}
 91+
/**
 * Return the first index in text holding either a whitespace character
 * (tab, line feed, form feed, carriage return or space) or a closing
 * angle bracket '>'. Returns -1 if there is none in the first text_len bytes.
 * The Vertical Tab (0x0B) is deliberately not treated as whitespace, for
 * compatibility with the Perl-style \s of the PHP implementation.
 */
int findSpaceOrAngle(const char* text, int text_len) {
	int pos;
	for (pos = 0; pos < text_len; pos++) {
		char c = text[pos];
		if ( c == '\t' || c == '\n' || c == '\f' || c == '\r' || c == ' ' || c == '>' ) {
			return pos;
		}
	}
	return -1;
}
 112+
/**
 * Whitespace as understood by the \s of the regexes being replaced:
 * tab, line feed, form feed, carriage return and space (no Vertical Tab).
 */
static int isEndTagSpace( char c ) {
	return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
}

/**
 * Locates an end tag for the given tag name.
 * Matches the regex "/<\/$name\s*>/i"
 * Doesn't (completely) support tag names which contain '<'
 *
 * All reads are kept inside text[0 .. text_len-1]; no terminating \0 is
 * required.
 *
 * @param text String: Text in which to find the tag
 * @param text_len int: Length of text
 * @param from int: Offset from which to begin the search
 * @param name String: lowercase name of the tag to close
 * @param name_len int: length of name
 * @param endTagLen int*: length of the found tag (output value), only
 *   written when a tag is found
 * @return int: The position from text where the end tag begins or -1 if not found
 */
static int findEndTag( const char* text, int text_len, int from, const char* name, int name_len, int* endTagLen ) {
	int i, j;
	/* The shortest end tag, "</name>", needs name_len + 3 characters */
	for (i = from; i < text_len - 2 - name_len; i++) {
		if ( text[i] != '<' || text[i+1] != '/' ) {
			continue;
		}
		for (j = 0; j < name_len; j++) {
			/* tolower() requires a value representable as unsigned char */
			if ( name[j] != tolower( (unsigned char)text[i+2+j] ) ) {
				break;
			}
		}
		if ( j == name_len ) {
			/* \s* between the name and the closing '>' */
			while ( i + 2 + j < text_len && isEndTagSpace( text[i+2+j] ) ) {
				j++;
			}
			if ( i + 2 + j < text_len && text[i+2+j] == '>' ) {
				*endTagLen = j + (int)( sizeof("</>") - 1 );
				return i;
			}
		}
		/* None of the characters just compared can begin another end tag */
		i += j;
	}
	return -1;
}
 148+
/**
 * Length of the run of consecutive characters equal to c which ends at
 * text[start], scanning backwards towards the beginning of text.
 * Returns 0 when start is negative.
 */
int chrrspn( const char* text, int c, int start ) {
	int pos;
	for ( pos = start; pos >= 0 && text[pos] == c; pos-- ) {
		/* keep walking backwards */
	}
	return start - pos;
}
 159+
 160+char* preprocessToObj( const char* text, int text_len, int flags, HashTable* parserStripList, int* preprocessed_len ) {
 161+ DEFINE_NODE_STRING()
 162+
 163+ /* The php preprocessors have an array of rules to use,
 164+ * Those are hardcoded here. Places relying on it are
 165+ * marked with a 'Known rules' comment.
 166+ */
 167+ #define BraceRuleMin 2
 168+ #define BraceRuleMax 3
 169+ #define BracketRuleMin 2
 170+ #define BracketRuleMax 2
 171+
 172+ bool forInclusion = flags & PTD_FOR_INCLUSION;
 173+
 174+ bool enableOnlyinclude = false;
 175+ enum internalTags ignoredElement; /* Act as this tag isn't there */
 176+
 177+ HashTable* xmlishElements = parserStripList;
 178+ /* Instead of $xmlishRegex, we use directly the stripList.
 179+ * As it is shared with Parser, includeonly/onlyinclude/noinclude are handled separatedly.
 180+ * Per Parser::set{FunctionTag,}Hook(), the items are all strings and lowercase.
 181+ */
 182+
 183+ if ( forInclusion ) {
 184+ /* $ignoredTags = array( 'includeonly', '/includeonly' ); */
 185+ ignoredElement = noinclude;
 186+ if ( STRSTR( text, "<onlyinclude>" ) && STRSTR( text, "</onlyinclude>" ) ) {
 187+ enableOnlyinclude = true;
 188+ }
 189+ } else {
 190+ /* $ignoredTags = array( 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ); */
 191+ ignoredElement = includeonly;
 192+ }
 193+ #define isIgnoredTag(internalTag) (forInclusion ? ((internalTag) == includeonly) : ((internalTag) > includeonly) )
 194+
 195+ int i = 0;
 196+ bool findEquals = false; // True to find equals signs in arguments
 197+ bool findPipe = false; // True to take notice of pipe characters
 198+ int headingIndex = 1;
 199+ bool inHeading = false; // True if $i is inside a possible heading
 200+ bool noMoreGT = false; // True if there are no more greater-than (>) signs right of $i
 201+ bool findOnlyinclude = enableOnlyinclude; // True to ignore all input up to the next <onlyinclude>
 202+ bool fakeLineStart = true; // Do a line-start run without outputting an LF character
 203+ bool fakePipeFound = false;
 204+ char currentClosing = '\0';
 205+ int lineStartPos = -1;
 206+ char search[sizeof(MAX_SEARCH_CHARS)];
 207+
 208+ #define getFlags() \
 209+ inHeading = (parentNode->type == heading_node); \
 210+ findPipe = (parentNode->type != heading_node) && (parentNode->type != bracket_node); \
 211+ findEquals = findPipe && ( parentNode->nextSibling > 0 ) && ( parentNode->type != value_node );
 212+
 213+ while ( true ) {
 214+
 215+ if ( findOnlyinclude ) {
 216+ // Ignore all input up to the next <onlyinclude>
 217+ int startPos = strpos( text, "<onlyinclude>", i );
 218+ if ( startPos == -1 ) {
 219+ // Ignored section runs to the end
 220+ addNodeWithText(ignore_node, text, i, -1);
 221+ break;
 222+ }
 223+ int tagEndPos = startPos + strsize( "<onlyinclude>" ); // past-the-end
 224+ addNodeWithText(ignore_node, text, i, tagEndPos - i);
 225+ i = tagEndPos;
 226+ findOnlyinclude = false;
 227+ }
 228+
 229+ enum foundTypes found;
 230+ if ( fakeLineStart ) {
 231+ found = lineStart;
 232+ } else if ( fakePipeFound ) {
 233+ found = pipe;
 234+ } else {
 235+ // Find next opening brace, closing brace or pipe
 236+ searchReset();
 237+ if ( parentNode->type == root_node ) {
 238+ currentClosing = 0;
 239+ } else {
 240+ /* This is too ugly */
 241+ if ( parentNode->type == heading_node ) {
 242+ currentClosing = '\n';
 243+ } else if ( parentNode->type == '[' ) {
 244+ currentClosing = ']'; /* Known rules */
 245+ } else if ( parentNode->parent && ( parentNode->parent->type == '{'
 246+ || ( parentNode->parent->parent && parentNode->parent->parent->type == '{' ) ) ) {
 247+ currentClosing = '}'; /* Known rules */
 248+ } else {
 249+ currentClosing = 0;
 250+ }
 251+ addSearch( currentClosing );
 252+ }
 253+ if ( findPipe ) {
 254+ addSearch( '|' );
 255+ }
 256+ if ( findEquals ) {
 257+ // First equals will be for the template
 258+ addSearch( '=' );
 259+ }
 260+
 261+ // Output literal section, advance input counter
 262+ size_t literalLength = mwpp_strcspn( text, text_len, search, i );
 263+ if ( literalLength > 0 ) {
 264+ addLiteral( text, i, literalLength );
 265+ i += literalLength;
 266+ }
 267+ if ( i >= text_len ) {
 268+ if ( currentClosing == '\n' ) {
 269+ // Do a past-the-end run to finish off the heading
 270+ found = lineEnd;
 271+ } else if ( parentNode->type == name_node && parentNode->parent && parentNode->parent->type == part_node && findEquals ) {
 272+ // Convert this part\name into a value and add the name
 273+ fakePipeFound = true;
 274+ found = pipe;
 275+ } else {
 276+ // All done
 277+ break;
 278+ }
 279+ } else {
 280+ switch ( text[i] ) {
 281+ case '|':
 282+ case '=':
 283+ case '<':
 284+ found = text[i];
 285+ break;
 286+ case '\n':
 287+ if ( inHeading ) {
 288+ found = lineEnd;
 289+ } else {
 290+ found = lineStart;
 291+ }
 292+ break;
 293+ case '}': /* Known rules */
 294+ case ']':
 295+ if ( text[i] == currentClosing ) {
 296+ found = currentClosing;
 297+ }
 298+ break;
 299+ case '{': /* Known rules */
 300+ case '[':
 301+ found = text[i];
 302+ break;
 303+
 304+ default:
 305+ // Some versions of PHP have a strcspn which stops on null characters {{refneeded}}
 306+ // Ignore and continue
 307+ ++i;
 308+ continue;
 309+ }
 310+ }
 311+ }
 312+
 313+ if ( found == angle ) {
 314+ // Determine which tag is this
 315+ if ( enableOnlyinclude && strncasecmp( text + i, "</onlyinclude>", strsize( "</onlyinclude>" ) ) ) {
 316+ findOnlyinclude = true;
 317+ continue;
 318+ }
 319+
 320+ // Handle comments
 321+ if ( !strncmp( text + i, "<!--", 4 ) ) {
 322+ // To avoid leaving blank lines, when a comment is both preceded
 323+ // and followed by a newline (ignoring spaces), trim leading and
 324+ // trailing spaces and one of the newlines.
 325+
 326+ // Find the end
 327+ int endPos = strpos( text, "-->", i + 4 );
 328+ if ( endPos == -1 ) {
 329+ // Unclosed comment in input, runs to end
 330+ addNodeWithText(comment_node, text, i, -1);
 331+ i = text_len;
 332+ } else {
 333+ // Search backwards for leading whitespace
 334+
 335+ int wsStart;
 336+ for (wsStart = i - 1; wsStart > 0; wsStart--) {
 337+ if ( text[wsStart] != ' ') { /* It can't go over wikitext_len because the php string has a \0 terminator, too */
 338+ wsStart++;
 339+ break;
 340+ }
 341+ }
 342+
 343+ // Search forwards for trailing whitespace
 344+ // wsEnd will be the position of the last space (or the > if there's none)
 345+ int startPos, wsEnd = endPos + 3;
 346+ while (text[wsEnd] == ' ') { wsEnd++; }
 347+ wsEnd--; // A bit silly since we will be using wsEnd+1 everywhere, but we want to keep this the same as $wsEnd
 348+
 349+ // Eat the line if possible
 350+ // This could theoretically be done if $wsStart == 0, i.e. for comments at
 351+ // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
 352+ // it's a possible beneficial b/c break.
 353+ if ( wsStart > 0 && text[wsStart - 1] == '\n' && text[wsEnd + 1] == '\n' )
 354+ {
 355+ startPos = wsStart;
 356+ endPos = wsEnd + 1;
 357+ // Remove leading whitespace from the end of the accumulator
 358+ // Sanity check first though
 359+ int wsLength = i - wsStart;
 360+ if ( wsLength > 0 && currentLiteral.len >= wsLength ) {
 361+ if ( strspn( text + currentLiteral.from + currentLiteral.len - wsLength, " " ) != wsLength ) {
 362+ // Can this ever be false?
 363+ assert(0);
 364+ }
 365+ currentLiteral.len -= wsLength;
 366+ }
 367+ // Do a line-start run next time to look for headings after the comment
 368+ fakeLineStart = true;
 369+ } else {
 370+ // No line to eat, just take the comment itself
 371+ startPos = i;
 372+ endPos += 2;
 373+ }
 374+
 375+ if ( parentNode ) {
 376+ if ( parentNode->commentEnd != -1 && parentNode->commentEnd == wsStart - 1 ) {
 377+ // Comments abutting, no change in visual end
 378+ parentNode->commentEnd = wsEnd;
 379+ } else {
 380+ parentNode->visualEnd = wsStart - 1;
 381+ parentNode->commentEnd = endPos;
 382+ }
 383+ }
 384+ i = endPos + 1;
 385+ addNodeWithText(comment_node, text, startPos, endPos - startPos + 1);
 386+ }
 387+ continue;
 388+ }
 389+
 390+ if ( noMoreGT ) {
 391+ addLiteral( text, i, 1 );
 392+ ++i;
 393+ continue;
 394+ }
 395+
 396+ /**
 397+ * We differ here from the $xmlishRegex approach
 398+ * The regex ends the tag name with a \s character, /> or >
 399+ * so we start seeking for them, then look which name is it.
 400+ */
 401+ assert(text[i] == '<');
 402+ const char* name = text + i + 1;
 403+ int name_len;
 404+ /* TODO: optimize this search by not going further than
 405+ * max( strlen( getParserStripList() + internalTags() ) )
 406+ * while not setting noMoreGT in such case.
 407+ */
 408+ name_len = findSpaceOrAngle(name, text_len - i - 1);
 409+ if ( name_len > 0 && name[name_len] == '>' && name[name_len - 1] == '/' ) {
 410+ name_len--;
 411+ }
 412+ int attrStart = i + name_len + 1;
 413+
 414+ int tagEndPos = -1;
 415+ if ( name_len != -1 ) {
 416+ // Find end of tag
 417+ char* end = memchr(name + name_len, '>', text_len - i - 1);
 418+
 419+ tagEndPos = end ? end - text : -1;
 420+ }
 421+ if ( tagEndPos == -1 ) {
 422+ // Infinite backtrack
 423+ // Disable tag search to prevent worst-case O(N^2) performance
 424+ noMoreGT = true;
 425+ addLiteral( text, i, 1 );
 426+ ++i;
 427+ continue;
 428+ }
 429+ assert(text[tagEndPos] == '>');
 430+
 431+ enum internalTags internalTag;
 432+ internalTag = getInternalTag(name, name_len);
 433+
 434+ // Handle ignored tags
 435+ if ( isIgnoredTag( internalTag ) ) {
 436+ addNodeWithText( ignore_node, text, i, tagEndPos - i + 1 );
 437+ i = tagEndPos + 1;
 438+ continue;
 439+ }
 440+
 441+ char * lowername;
 442+ if ( internalTag == None ) {
 443+ int j;
 444+ // Verify that it's not just tag-looking text
 445+ lowername = alloca( name_len + 1 ); /* FIXME */
 446+ for (j = 0; j < name_len; j++) {
 447+ lowername[j] = tolower(name[j]);
 448+ }
 449+ lowername[j] = '\0';
 450+ if ( !str_in_array(lowername, name_len, xmlishElements, true) ) {
 451+ addLiteral( text, i, 1 );
 452+ ++i;
 453+ continue;
 454+ }
 455+ } else {
 456+ lowername = (char*)internalTagNames[internalTag];
 457+ }
 458+
 459+ int tagStartPos, attrEnd, endTagBegin, endTagLen;
 460+ int innerTextBegin, innerTextLen;
 461+ tagStartPos = i; endTagLen = 0;
 462+ innerTextBegin = -1; innerTextLen = -1;
 463+
 464+ if ( text[tagEndPos-1] == '/' ) {
 465+ attrEnd = tagEndPos - 1;
 466+ i = tagEndPos + 1;
 467+ } else {
 468+ attrEnd = tagEndPos;
 469+ // Find closing tag
 470+
 471+ endTagBegin = findEndTag( text, text_len, tagEndPos + 1, lowername, name_len, &endTagLen );
 472+
 473+ if ( endTagBegin != -1 )
 474+ {
 475+ innerTextBegin = tagEndPos + 1;
 476+ innerTextLen = endTagBegin - tagEndPos - 1;
 477+ i = endTagBegin + endTagLen;
 478+ } else {
 479+ // No end tag -- let it run out to the end of the text.
 480+ innerTextBegin = tagEndPos + 1;
 481+ i = text_len;
 482+ }
 483+ }
 484+
 485+ if ( isIgnoredTag( internalTag ) ) {
 486+ addNodeWithText(ignore_node, text, tagStartPos, i - tagStartPos );
 487+ continue;
 488+ }
 489+
 490+ addNodeWithTags( ext_node, 1 ); /* The '<' is implicit in Preprocessor_DOM */
 491+ addNodeWithText( name_node, text, tagStartPos + 1, name_len );
 492+
 493+ // Note that the attr element contains the whitespace between name and attribute,
 494+ // this is necessary for precise reconstruction during pre-save transform.
 495+ assert(attrEnd >= attrStart);
 496+ addNodeWithText( attr_node, text, attrStart, attrEnd - attrStart );
 497+ addNodeWithText( end_name_node, text, attrEnd, tagEndPos - attrEnd + 1 );
 498+
 499+ if ( innerTextBegin != -1 ) {
 500+ addNodeWithText( inner_node, text, innerTextBegin, innerTextLen );
 501+ }
 502+ if ( endTagLen ) {
 503+ addNodeWithText( close_node, text, endTagBegin, endTagLen );
 504+ }
 505+ closeNode( ext_node );
 506+ }
 507+ else if ( found == lineStart ) {
 508+ // Is this the start of a heading?
 509+ // Line break belongs before the heading element in any case
 510+ if ( fakeLineStart ) {
 511+ fakeLineStart = false;
 512+ } else {
 513+ addLiteral( text, i, 1 );
 514+ i++;
 515+ }
 516+
 517+ int count = chrspn( text, '=', i, 6 );
 518+ if ( count == 1 && findEquals ) {
 519+ // DWIM: This looks kind of like a name/value separator
 520+ // Let's let the equals handler have it and break the potential heading
 521+ // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
 522+ } else if ( count > 0 ) {
 523+ /*
 524+ piece = array(
 525+ 'open' => "\n",
 526+ 'close' => "\n",
 527+ 'parts' => array( new PPDPart( str_repeat( '=', $count ) ) ),
 528+ 'startPos' => $i,
 529+ 'count' => $count );
 530+ */
 531+ lineStartPos = i; /* This lived in the stack in php, but there can't be two open header pieces */
 532+ addNodeWithTags(heading_node, count);
 533+ currentClosing = '\n';
 534+ /* extract( $stack->getFlags(); ) */
 535+ getFlags()
 536+ i += count;
 537+ }
 538+ } else if ( found == lineEnd ) {
 539+
 540+ // A heading must be open, otherwise \n wouldn't have been in the search list
 541+ assert( parentNode->type == heading_node );
 542+ assert( lineStartPos != -1 );
 543+
 544+ // Search back through the input to see if it has a proper close
 545+ // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
 546+ int searchStart;
 547+ for (searchStart = i - 1; searchStart > 0; --searchStart) {
 548+ if ( ( text[searchStart] != ' ' ) && ( text[searchStart] != '\t' ) ) {
 549+ break;
 550+ }
 551+ }
 552+
 553+ if ( parentNode->commentEnd != -1 && searchStart == parentNode->commentEnd ) {
 554+ // Comment found at line end
 555+ // Search for equals signs before the comment
 556+ for (searchStart = parentNode->visualEnd; searchStart > 0; --searchStart) {
 557+ if (text[i] != ' ' && text[i] != '\t')
 558+ break;
 559+ }
 560+ }
 561+ searchStart++;
 562+
 563+ int count = parentNode->contentLength;
 564+ int equalsLength = chrrspn( text, '=', searchStart - 1 );
 565+
 566+ if ( equalsLength > 0 ) {
 567+ if ( searchStart - equalsLength == lineStartPos ) {
 568+ // This is just a single string of equals signs on its own line
 569+ // Replicate the doHeadings behaviour /={count}(.+)={count}/
 570+ // First find out how many equals signs there really are (don't stop at 6)
 571+ count = equalsLength;
 572+ if ( count < 3 ) {
 573+ count = 0;
 574+ } else {
 575+ count = min( 6, ( count - 1 ) / 2 );
 576+ }
 577+ } else {
 578+ count = min( equalsLength, count );
 579+ }
 580+ if ( count > 0 ) {
 581+ // Normal match, output <h>
 582+ assert( count < 7 );
 583+ parentNode->type = heading_node + count;
 584+ parentNode->flags = headingIndex;
 585+ headingIndex++;
 586+ } else {
 587+ // Single equals sign on its own line, count=0
 588+ parentNode->type = literal_node;
 589+ }
 590+ } else {
 591+ // No match, no <h>, just pass down the inner text
 592+ parentNode->type = literal_node;
 593+ }
 594+ // Unwind the stack
 595+ closeNode( parentNode->type );
 596+ /* extract( getFlags() ); */
 597+ getFlags();
 598+
 599+ // Note that we do NOT increment the input pointer.
 600+ // This is because the closing linebreak could be the opening linebreak of
 601+ // another heading. Infinite loops are avoided because the next iteration MUST
 602+ // hit the heading open case above, which unconditionally increments the
 603+ // input pointer.
 604+ assert( inHeading == false );
 605+ } else if ( found == openBrace || found == openBracket ) {
 606+ // count opening brace characters
 607+ int count = chrspn( text, text[i], i, text_len - i );
 608+
 609+ // we need to add to stack only if opening brace count is enough for one of the rules
 610+ int rulemin = 2; /* Known rules */
 611+
 612+ if ( count >= rulemin ) {
 613+ // Add it to the stack
 614+ addNodeWithTags( found, count );
 615+ parentNode->flags = (i > 0 && text[i-1] == '\n') /* lineStart boolean */;
 616+ /* close char does not need to be stored per Known rules */
 617+ parentNode->count = count;
 618+ parentNode->argIndex = 0;
 619+ if ( found == openBrace ) {
 620+ addNodeWithTags( title_node, 0 );
 621+ }
 622+ getFlags();
 623+ } else {
 624+ // Add literal brace(s)
 625+ addLiteral( text, i, count );
 626+ }
 627+ i += count;
 628+ } else if ( found == closeBrace || found == closeBracket ) {
 629+ // lets check if there are enough characters for closing brace
 630+
 631+ if ( parentNode->type == name_node ) {
 632+ /* Go to close it */
 633+ fakePipeFound = true;
 634+ continue;
 635+ }
 636+ if ( parentNode->type == value_node ) {
 637+ closeNode( parentNode->type );
 638+ assert( parentNode->type == part_node );
 639+ }
 640+ assert( ( parentNode->type == found - 2 ) || ( parentNode->parent && ( parentNode->parent->type == found - 2 ) ) );
 641+
 642+ int maxCount = found == closeBracket ? parentNode->count : parentNode->parent->count;
 643+ int count = chrspn( text, found, i, maxCount );
 644+
 645+ // check for maximum matching characters (if there are 5 closing
 646+ // characters, we will probably need only 3 - depending on the rules)
 647+ int ruleMax = ( found == closeBrace ) ? 3 : 2; /* Known rules */
 648+ int matchingCount = 0;
 649+ if ( count > ruleMax ) {
 650+ // The specified maximum exists in the callback array, unless the caller
 651+ // has made an error
 652+ matchingCount = ruleMax;
 653+ } else {
 654+ // Count is less than the maximum
 655+ // Skip any gaps in the callback array to find the true largest match
 656+ matchingCount = count;
 657+ /* Known rules: If we have three opening braces but only two closing ones, we want the two.
 658+ * With less than the minimum, matchingCount = 0.
 659+ */
 660+ if ( count >= 2 /* min */ ) { /* Known rules */
 661+ matchingCount = count;
 662+ }
 663+ }
 664+
 665+ if ( matchingCount <= 0 ) {
 666+ // No matching element found in callback array
 667+ // Output a literal closing brace and continue
 668+ assert( count == 1 );
 669+ addLiteral( text, i, count );
 670+ i += count;
 671+ continue;
 672+ }
 673+
 674+
 675+ if ( found == closeBracket ) { /* Known rules */
 676+ // No element, just literal text
 677+ parentNode->count -= matchingCount;
 678+
 679+ /* The preprocessor DOM adds a new literal here, then goes
 680+ * backwards and readds another node before if there are
 681+ * brackets left.
 682+ * We leave the same bracket node open (with decreasing counts)
 683+ * until closing time, since we know that all brackets
 684+ * will end up being literals.
 685+ */
 686+
 687+ if ( parentNode->count < 2 ) { /* Known rules */
 688+ parentNode = breakSyntax( parentNode, nodeString, &nodeStringLen );
 689+ }
 690+
 691+ addLiteral( text, i, matchingCount );
 692+ i += matchingCount;
 693+ continue;
 694+ }
 695+ assert( ( parentNode->parent && ( parentNode->parent->type == brace_node ) ) );
 696+
 697+ assert( parentNode->type == title_node || parentNode->type == part_node );
 698+ closeNode( parentNode->type );
 699+
 700+ addNodeWithText( closebrace_node, text, i, matchingCount ); // should be on next line?
 701+ // Advance input pointer
 702+ i += matchingCount;
 703+
 704+ parentNode->count -= matchingCount;
 705+
 706+ if ( matchingCount == 2 ) {
 707+ parentNode->type = template_node;
 708+ } else if ( matchingCount == 3 ) {
 709+ parentNode->type = tplarg_node;
 710+ } else {
 711+ assert( 0 );
 712+ }
 713+ parentNode->contentLength = matchingCount;
 714+
 715+ // Re-add the old stack element if it still has unmatched opening characters remaining
 716+ if ( parentNode->count > 0 ) {
 717+ int oldindex = parentNode->index;
 718+
 719+ // do we still qualify for any callback with remaining count?
 720+ if ( parentNode->count >= 2 ) { /* Known rules */
 721+ /* Prepend a { and a title node */
 722+ int oldcount = parentNode->count;
 723+ int oldflags = parentNode->flags;
 724+
 725+ parentNode->flags = 0; /* We don't begin a line since there is markup before us */
 726+
 727+ closeNode( parentNode->type );
 728+ storedLength -= oldcount;
 729+
 730+ addNodeWithTags( brace_node, oldcount );
 731+ addNodeWithTags( title_node, 0 );
 732+
 733+ /* But they must be placed *before* the tag we just closed: */
 734+
 735+ /* Move all our childs two positions right */
 736+ memmove( nodeString + oldindex + NODE_LEN * 2, nodeString + oldindex, nodeStringLen - oldindex - 2 * NODE_LEN );
 737+
 738+ /* And the new tags into the positions left */
 739+ parentNode->index = oldindex + NODE_LEN;
 740+ parentNode->parent->index = oldindex;
 741+ parentNode->parent->flags = oldflags;
 742+ } else {
 743+ /* Prepend a literal node with the skipped braces */
 744+ int skippedBraces = 1 /* = parentNode->count */;
 745+ closeNode( parentNode->type );
 746+
 747+ struct node tmpnode;
 748+ tmpnode.type = literal_node;
 749+ tmpnode.flags = 0;
 750+ tmpnode.nextSibling = 0;
 751+ tmpnode.contentLength = skippedBraces;
 752+
 753+ ALLOC_NODESTRING();
 754+ memmove( nodeString + oldindex + NODE_LEN, nodeString + oldindex, nodeStringLen - oldindex );
 755+ nodeStringLen += NODE_LEN;
 756+
 757+ serializeNode(nodeString + oldindex, &tmpnode);
 758+ }
 759+ } else {
 760+ closeNode( parentNode->type );
 761+ }
 762+
 763+ getFlags();
 764+ } else if ( found == pipe ) {
 765+ findEquals = true; // shortcut for getFlags()
 766+ if ( parentNode->type == title_node ) {
 767+ closeNode( title_node );
 768+ } else if ( parentNode->type == name_node ) {
 769+ assert( ( parentNode->parent && ( parentNode->parent->type == part_node ) ) );
 770+ assert( ( parentNode->parent->parent && ( parentNode->parent->parent->type == brace_node ) ) );
 771+
 772+ /* This was a value node, the name is empty */
 773+ parentNode->type = value_node;
 774+ int len = parentNode->contentLength;
 775+ parentNode->contentLength = 0;
 776+ int oldindex = parentNode->index;
 777+
 778+ /* Relocate the children one position right */
 779+ ALLOC_NODESTRING();
 780+ memmove( nodeString + oldindex + NODE_LEN * 2, nodeString + oldindex + NODE_LEN, nodeStringLen - oldindex - NODE_LEN ); /* (nodeStringLen - oldindex) will often be 0 */
 781+ nodeStringLen += NODE_LEN;
 782+ /* And the father, too */
 783+ parentNode->index += NODE_LEN;
 784+ closeNode( value_node );
 785+
 786+ /* Place the name */
 787+ struct node tmpnode;
 788+ tmpnode.type = name_node;
 789+ tmpnode.flags = parentNode->flags = ++parentNode->parent->argIndex;
 790+ tmpnode.nextSibling = 0;
 791+ tmpnode.contentLength = len;
 792+ assert( len == 0 );
 793+
 794+ serializeNode(nodeString + oldindex, &tmpnode);
 795+ if ( !fakePipeFound ) closeNode( part_node );
 796+ } else {
 797+ closeNode( value_node );
 798+ closeNode( part_node );
 799+ }
 800+ if ( fakePipeFound ) {
 801+ fakePipeFound = false;
 802+ continue;
 803+ }
 804+ addNodeWithTags( part_node, 1 );
 805+ addNodeWithTags( name_node, 0 );
 806+ ++i;
 807+ } else if ( found == equals ) {
 808+ findEquals = false; // shortcut for getFlags()
 809+ assert( parentNode->type == name_node ); /* If we are searching for an equal we are inside parts\name */
 810+ closeNode( name_node );
 811+ addLiteral( text, i, 1 );
 812+ addNodeWithTags( value_node, 0 ); /* We could piggyback some literals on value_nodes */
 813+
 814+ //parentNode->eqpos = i; // we could remove eqpost member
 815+ ++i;
 816+ } else {
 817+ assert( 2 + 2 == 5 );
 818+ }
 819+ }
 820+ while ( parentNode ) {
 821+ if ( parentNode->type == brace_node || parentNode->type == bracket_node ) {
 822+ parentNode = breakSyntax( parentNode, nodeString, &nodeStringLen );
 823+ } else {
 824+ closeNode( parentNode->type );
 825+ }
 826+ }
 827+
 828+ nodeString[nodeStringLen] = '\0';
 829+ *preprocessed_len = nodeStringLen;
 830+ return nodeString;
 831+}
Property changes on: trunk/extensions/NativePreprocessor/preprocesstoobj.c
___________________________________________________________________
Added: svn:eol-style
1832 + native
Index: trunk/extensions/NativePreprocessor/in_array.c
@@ -0,0 +1,41 @@
 2+
 3+#include <stdbool.h>
 4+#include "php.h"
 5+#define const
 6+#include "in_array.h"
 7+
 8+/**
 9+ * This defines an interface for internally performing in_array()
 10+ * You will notice that the similarity with php_search_array() is not casual.
 11+ */
 12+bool zval_in_array(const zval* value, const HashTable* array, bool strict) {
 13+ zval **entry; /* pointer to array entry */
 14+ zval res; /* comparison result */
 15+ HashPosition pos; /* hash iterator */
 16+ int (*is_equal_func)(zval *, zval *, zval * TSRMLS_DC);
 17+
 18+ TSRMLS_FETCH(); /* Useless for simple arrays, since it's only needed when comparing array values */
 19+
 20+ is_equal_func = strict ? is_identical_function : is_equal_function;
 21+
 22+ zend_hash_internal_pointer_reset_ex(array, &pos);
 23+ while (zend_hash_get_current_data_ex(array, (void **)&entry, &pos) == SUCCESS) {
 24+ is_equal_func(&res, value, *entry TSRMLS_CC);
 25+ if (Z_LVAL(res)) { /* if ( (long)res ), ie. if ( res == true ) */
 26+ return true;
 27+ }
 28+ zend_hash_move_forward_ex(array, &pos);
 29+ }
 30+ return false;
 31+}
 32+
 33+bool str_in_array(const char* string, int string_len, const HashTable* array, bool strict) {
 34+ zval zstring;
 35+ INIT_ZVAL(zstring);
 36+ zstring.type = IS_STRING;
 37+ zstring.value.str.val = string;
 38+ zstring.value.str.len = string_len;
 39+
 40+ return zval_in_array(&zstring, array, strict);
 41+}
 42+
Property changes on: trunk/extensions/NativePreprocessor/in_array.c
___________________________________________________________________
Added: svn:eol-style
143 + native
Index: trunk/extensions/NativePreprocessor/php_mediawiki_preprocessor.h
@@ -0,0 +1,28 @@
 2+#ifndef PHP_MEDIAWIKI_PREPROCESSOR_H
 3+#define PHP_MEDIAWIKI_PREPROCESSOR_H 1
 4+
 5+#ifdef ZTS
 6+#include "TSRM.h"
 7+#endif
 8+
 9+ZEND_BEGIN_MODULE_GLOBALS(mediawiki_preprocessor)
 10+
 11+ZEND_END_MODULE_GLOBALS(mediawiki_preprocessor)
 12+
 13+#ifdef ZTS
 14+#define MWPP_G(v) TSRMG(mediawiki_preprocessor_globals_id, zend_notas_globals *, v)
 15+#else
 16+#define MWPP_G(v) (mediawiki_preprocessor_globals.v)
 17+#endif
 18+
 19+#define PHP_MEDIAWIKI_PREPROCESSOR_VERSION "0.1"
 20+#define PHP_MEDIAWIKI_PREPROCESSOR_EXTNAME "MediaWiki Preprocessor"
 21+
 22+PHP_MINIT_FUNCTION(mediawiki_preprocessor);
 23+PHP_MSHUTDOWN_FUNCTION(mediawiki_preprocessor);
 24+PHP_RINIT_FUNCTION(mediawiki_preprocessor);
 25+
 26+extern zend_module_entry mediawiki_preprocessor_module_entry;
 27+#define phpext_mediawiki_preprocessor_ptr &mediawiki_preprocessor_module_entry
 28+
 29+#endif
Property changes on: trunk/extensions/NativePreprocessor/php_mediawiki_preprocessor.h
___________________________________________________________________
Added: svn:keywords
130 + Author Date Id Rev URL
Added: svn:eol-style
231 + native
Index: trunk/extensions/NativePreprocessor/in_array.h
@@ -0,0 +1,5 @@
 2+#include <stdbool.h>
 3+
 4+bool zval_in_array(const zval* value, const HashTable* array, bool strict);
 5+bool str_in_array(const char* string, int string_len, const HashTable* array, bool strict);
 6+
Property changes on: trunk/extensions/NativePreprocessor/in_array.h
___________________________________________________________________
Added: svn:keywords
17 + Author Date Id Rev URL
Added: svn:eol-style
28 + native
Index: trunk/extensions/NativePreprocessor/mediawiki_preprocessor.c
@@ -0,0 +1,141 @@
 2+#ifdef HAVE_CONFIG_H
 3+#include "config.h"
 4+#endif
 5+
 6+
 7+#include "php.h"
 8+#include "php_ini.h"
 9+#include "php_mediawiki_preprocessor.h"
 10+
 11+
 12+#if ZEND_DEBUG || 1
 13+#define DEBUG(x,...) php_printf("[MWPP] "x"\n", __VA_ARGS__)
 14+#else
 15+#define DEBUG(x,...)
 16+#endif
 17+
 /*
  * Storage behind every MediaWikiPreprocessor instance.
  * Allocated in create_mwppobj() and released in free_mwppobj().
  * It carries no extension-specific members yet.
  */
 typedef struct _mediawiki_preprocessor {
     zend_object std; /* Inherit from a standard php object */

 } mwppobj;
 22+
 23+ZEND_DECLARE_MODULE_GLOBALS(mediawiki_preprocessor)
 24+
 /*
  * Module descriptor handed to the Zend engine.
  * Only the startup, shutdown and request-startup hooks are provided;
  * the extension exports no plain functions (its class is registered in MINIT).
  */
 zend_module_entry mediawiki_preprocessor_module_entry = {
 #if ZEND_MODULE_API_NO >= 20010901
     STANDARD_MODULE_HEADER,
 #endif
     PHP_MEDIAWIKI_PREPROCESSOR_EXTNAME,
     NULL, /* No procedures */
     PHP_MINIT(mediawiki_preprocessor), /* module_startup_func */
     PHP_MSHUTDOWN(mediawiki_preprocessor), /* module_shutdown_func */
     PHP_RINIT(mediawiki_preprocessor), /* request_startup_func */
     NULL, /* request_shutdown_func */
     NULL, /* info_func */
 #if ZEND_MODULE_API_NO >= 20010901
     PHP_MEDIAWIKI_PREPROCESSOR_VERSION,
 #endif
     STANDARD_MODULE_PROPERTIES
 };
 41+
 42+#ifdef COMPILE_DL_MEDIAWIKI_PREPROCESSOR
 43+ZEND_GET_MODULE(mediawiki_preprocessor)
 44+#endif
 45+
 46+PHP_RINIT_FUNCTION(mediawiki_preprocessor)
 47+{
 48+ /* Request init */
 49+
 50+ return SUCCESS;
 51+}
 52+
 53+PHP_MSHUTDOWN_FUNCTION(mediawiki_preprocessor)
 54+{
 55+ /* Module shutdown */
 56+
 57+ return SUCCESS;
 58+}
 59+
 60+PHP_METHOD(WikiTextPreprocessor,preprocessToObj);
 61+/* {{{ arginfo__construct */
 62+ZEND_BEGIN_ARG_INFO_EX(/*name*/ arginfopreprocessToObj, /*pass_rest_by_reference*/ 0, /*return_reference*/ 0, /*required_num_args*/ 3)
 63+ ZEND_ARG_INFO(/*pass_by_ref*/ 0, /*name*/ "WikiText")
 64+ZEND_END_ARG_INFO()
 65+/* }}} */
 66+
 67+static const zend_function_entry mwpp_methods[] = {
 68+ PHP_ME(WikiTextPreprocessor, preprocessToObj, arginfopreprocessToObj, ZEND_ACC_PUBLIC)
 69+ {NULL, NULL, NULL}
 70+};
 71+
 72+static void free_mwppobj(void *object TSRMLS_DC);
 73+static zend_object_value create_mwppobj (zend_class_entry *class_type TSRMLS_DC);
 74+
 75+static void php_mwpp_init_globals(zend_mediawiki_preprocessor_globals *mwpp_globals)
 76+{
 77+ /* No globals to init */
 78+}
 79+
 80+PHP_MINIT_FUNCTION(mediawiki_preprocessor)
 81+{
 82+ /* Module init */
 83+ zend_class_entry ce;
 84+ zend_class_entry* registered_class;
 85+
 86+ ZEND_INIT_MODULE_GLOBALS(mediawiki_preprocessor, php_mwpp_init_globals, NULL);
 87+
 88+ INIT_CLASS_ENTRY(ce, "MediaWikiPreprocessor", mwpp_methods); /* Define class MediaWikiPreprocessor */
 89+
 90+ ce.create_object = create_mwppobj;
 91+ registered_class = zend_register_internal_class(&ce TSRMLS_CC); /* Bring it to existence */
 92+
 93+ return SUCCESS;
 94+}
 95+
 96+static zend_object_value create_mwppobj (zend_class_entry *class_type TSRMLS_DC)
 97+{
 98+ zend_object_value retval;
 99+ mwppobj *intern;
 100+ zval *tmp;
 101+
 102+ intern = emalloc(sizeof(mwppobj));
 103+
 104+ DEBUG("Creating MediaWikiPreprocessor %p", intern);
 105+
 106+ zend_object_std_init(&intern->std, class_type TSRMLS_CC);
 107+ zend_hash_copy(intern->std.properties, &class_type->default_properties, (copy_ctor_func_t) zval_add_ref, (void *) &tmp, sizeof(zval *));
 108+
 109+ retval.handle = zend_objects_store_put(intern, (zend_objects_store_dtor_t)NULL, (zend_objects_free_object_storage_t) free_mwppobj, NULL TSRMLS_CC);
 110+ retval.handlers = zend_get_std_object_handlers(); /* Default handlers */
 111+
 112+ return retval;
 113+}
 114+
 115+static void free_mwppobj(void *object TSRMLS_DC)
 116+{
 117+ mwppobj *intern = (mwppobj *)object;
 118+
 119+ zend_object_std_dtor(&intern->std TSRMLS_CC);
 120+ efree(object);
 121+
 122+ DEBUG("MediaWikiPreprocessor %p destroyed", object);
 123+}
 124+
 125+char* preprocessToObj( const char* text, int text_len, int flags, HashTable* parserStripList, int* preprocessed_len );
 126+PHP_METHOD(WikiTextPreprocessor, preprocessToObj)
 127+{
 128+ zend_class_entry *class_entry;
 129+ char *wikitext, *preprocessed;
 130+ int wikitext_len, flags;
 131+ int preprocessed_len;
 132+ zval *array, *result;
 133+
 134+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sda", &wikitext, &wikitext_len, &flags, &array) == FAILURE) {
 135+ return;
 136+ }
 137+ wikitext_len = strlen(wikitext);
 138+ DEBUG("Constructed with text «%s» of length %d, flags %d", wikitext, wikitext_len, flags );
 139+ preprocessed = preprocessToObj( wikitext, wikitext_len, flags, Z_ARRVAL_P(array), &preprocessed_len );
 140+
 141+ RETURN_STRINGL( preprocessed, preprocessed_len, 0 );
 142+}
Property changes on: trunk/extensions/NativePreprocessor/mediawiki_preprocessor.c
___________________________________________________________________
Added: svn:eol-style
1143 + native
Index: trunk/extensions/NativePreprocessor/config.h
@@ -0,0 +1,59 @@
 2+/* config.h. Generated from config.h.in by configure. */
 3+/* config.h.in. Generated from configure.in by autoheader. */
 4+
 5+/* Whether to build mediawiki_preprocessor as dynamic module */
 6+#define COMPILE_DL_MEDIAWIKI_PREPROCESSOR 1
 7+
 8+/* Define to 1 if you have the <dlfcn.h> header file. */
 9+#define HAVE_DLFCN_H 1
 10+
 11+/* Define to 1 if you have the <inttypes.h> header file. */
 12+#define HAVE_INTTYPES_H 1
 13+
 14+/* Define to 1 if you have the <memory.h> header file. */
 15+#define HAVE_MEMORY_H 1
 16+
 17+/* Define to 1 if you have the <stdint.h> header file. */
 18+#define HAVE_STDINT_H 1
 19+
 20+/* Define to 1 if you have the <stdlib.h> header file. */
 21+#define HAVE_STDLIB_H 1
 22+
 23+/* Define to 1 if you have the <strings.h> header file. */
 24+#define HAVE_STRINGS_H 1
 25+
 26+/* Define to 1 if you have the <string.h> header file. */
 27+#define HAVE_STRING_H 1
 28+
 29+/* Define to 1 if you have the <sys/stat.h> header file. */
 30+#define HAVE_SYS_STAT_H 1
 31+
 32+/* Define to 1 if you have the <sys/types.h> header file. */
 33+#define HAVE_SYS_TYPES_H 1
 34+
 35+/* Define to 1 if you have the <unistd.h> header file. */
 36+#define HAVE_UNISTD_H 1
 37+
 38+/* Define to 1 if your C compiler doesn't accept -c and -o together. */
 39+/* #undef NO_MINUS_C_MINUS_O */
 40+
 41+/* Define to the address where bug reports for this package should be sent. */
 42+#define PACKAGE_BUGREPORT ""
 43+
 44+/* Define to the full name of this package. */
 45+#define PACKAGE_NAME ""
 46+
 47+/* Define to the full name and version of this package. */
 48+#define PACKAGE_STRING ""
 49+
 50+/* Define to the one symbol short name of this package. */
 51+#define PACKAGE_TARNAME ""
 52+
 53+/* Define to the home page for this package. */
 54+#define PACKAGE_URL ""
 55+
 56+/* Define to the version of this package. */
 57+#define PACKAGE_VERSION ""
 58+
 59+/* Define to 1 if you have the ANSI C header files. */
 60+#define STDC_HEADERS 1
Property changes on: trunk/extensions/NativePreprocessor/config.h
___________________________________________________________________
Added: svn:keywords
161 + Author Date Id Rev URL
Added: svn:eol-style
262 + native
Index: trunk/extensions/NativePreprocessor/nodes.h
@@ -0,0 +1,287 @@
 2+
 /*
  * Node types. Each value is a printable character because serializeNode()
  * writes it verbatim as the first byte of a serialized node.
  */
 enum nodeTypes {
     root_node = '/',
     literal_node = 'L',
     ignore_node = 'I',
     comment_node = '-',

     ext_node = '<', /* Encloses an extension tag */
     name_node = 'N', /* Tag name or part name */
     attr_node = 'a', /* Tag attributes */
     inner_node = '.', /* Tag contents, optional */
     end_name_node = 'e', /* > or /> closing a name node. Missing in Preprocessor_DOM */
     close_node = '>', /* Closing tag, optional */

     heading_node = 'h', /* Used when working with a heading candidate */
     h1_node = 'i',
     h2_node = 'j',
     h3_node = 'k',
     h4_node = 'l',
     h5_node = 'm',
     h6_node = 'n',

     brace_node = '{', /* Used when we still don't know its identity (template/tplarg) */
     bracket_node = '[',
     template_node = 't',
     tplarg_node = 'p',
     title_node = 'T',
     part_node = '|',
     value_node = 'v',
     closebrace_node = '}',
 };
 33+
 /*
  * In-memory description of a node.
  * Only type, flags, nextSibling and contentLength are written out by
  * serializeNode(); the remaining members are bookkeeping for nodes that are
  * still open while parsing.
  */
 struct node {
     enum nodeTypes type;
     char flags; /* serialized as the single character '0' + flags */
     int nextSibling; /* bytes of serialized children following this node; UNKNOWN_NODE_LEN while open */
     int contentLength; /* serialized as 8 hexadecimal digits */

     /* Relevant only for nodes with children */
     int index; /* index inside nodeString */
     struct node* parent;

     /* Used for headings */
     int commentEnd;
     int visualEnd; /* Point where the last text ends (ie. without spaces, comments...) */

     /* Used for brace and bracket nodes */
     int count;

     /* Used for template parts */
     int eqpos; /* Name nodes */
     int argIndex; /* Brace nodes */
     /* Compact me: Move the last three blocks into a union */
 };
 56+
 /*
  * Pending run of literal text, accumulated by addLiteral() and written out
  * as a single literal_node once another kind of node is added or closed.
  */
 struct literalNode {
     int from; /* offset into the source text where the run starts */
     int len; /* length of the run; 0 means nothing is pending */
 };
 61+
 /* nextSibling value of a node that has not been closed yet */
 #define UNKNOWN_NODE_LEN -1

 /*
  * Declares the local state that the node macros below operate on and opens
  * the root node. Must be expanded inside a function body, where statements
  * are allowed.
  */
 #define DEFINE_NODE_STRING() char* nodeString = NULL; \
     int nodeStringLen = 0; /* Length used of nodeString. The \0 terminator is not counted; ALLOC_NODESTRING() reserves its byte */ \
     struct literalNode currentLiteral = { 0, 0 }; \
     int storedLength = 0; /* Length of text already stored in the nodes */ \
     struct node* parentNode = NULL; \
     addNodeWithTags(root_node, 0);

 #define NODE_LEN 16 /* Length of a serialized node */
 72+
 73+/**
 74+ * Adds a node of the specified type to the nodeString
 75+ * @param nodeType enum nodeTypes: Type of the node to add.
 76+ * @param txt char*: Text pointer. Must be 'text'
 77+ * @param offset int: Offset from txt to copy from
 78+ * @param length int: Length to copy from 'from'. -1 to copy until the end of the string.
 79+ */
 80+#define addNodeWithText(nodeType,txt,offset,length) \
 81+ do { \
 82+ int mylen = length; \
 83+ assert( txt == text ); \
 84+ if ( currentLiteral.len && nodeType != literal_node ) { \
 85+ storeNodeWithText(literal_node, currentLiteral.from, currentLiteral.len); \
 86+ currentLiteral.len = 0; \
 87+ } \
 88+ if ( -1 == mylen ) { \
 89+ mylen = text_len - offset; \
 90+ } \
 91+ \
 92+ storeNodeWithText(nodeType,offset,mylen); \
 93+ } while (0)
 94+
 95+/**
 96+ * Like addNodeWithText() but doesn't commit the literals
 97+ */
 98+#define storeNodeWithText(nodeType,offset,length) \
 99+ do { \
 100+ assert(storedLength == offset); \
 101+ struct node tmpnode; \
 102+ tmpnode.type = nodeType; \
 103+ tmpnode.flags = 0; \
 104+ tmpnode.nextSibling = 0; \
 105+ tmpnode.contentLength = length; \
 106+ \
 107+ ALLOC_NODESTRING(); \
 108+ serializeNode(nodeString + nodeStringLen, &tmpnode); \
 109+ nodeStringLen += NODE_LEN; \
 110+ storedLength += length; printf("storedLength: %d @%d\n", storedLength, __LINE__);\
 111+ } while (0);
 112+
 113+/**
 114+ * Records the passed literal inside currentLiteral
 115+ * Adjacent literal nodes are stored inside of a single node.
 116+ */
 117+#define addLiteral(literalText,offset,length) \
 118+ do { printf("Addliteral '%.*s'\n", length, literalText+offset);\
 119+ int my_len = length; \
 120+ assert( literalText == text ); \
 121+ if ( my_len == -1 ) { \
 122+ my_len = text_len - offset; \
 123+ } \
 124+ if ( currentLiteral.len ) { \
 125+ assert( currentLiteral.from + currentLiteral.len == (offset) ); \
 126+ } else { \
 127+ currentLiteral.from = (offset); \
 128+ } \
 129+ currentLiteral.len += my_len; \
 130+ assert( (length) >= 0 ); \
 131+ } while (0)
 132+
 133+/**
 134+ * Adds a node which contains other tags
 135+ * @param nodeType enum nodeTypes: Type of the node.
 136+ * @param charsToSkip int: Number of characters that 'belong' to this node. Used to skip characters
 137+ */
 138+#define addNodeWithTags(nodeType, charsToSkip) \
 139+ do { \
 140+ struct node* tmpnode; \
 141+ if ( currentLiteral.len ) { \
 142+ storeNodeWithText(literal_node, currentLiteral.from, currentLiteral.len); \
 143+ currentLiteral.len = 0; printf("currentLiteral committed\n"); \
 144+ } \
 145+ \
 146+ tmpnode = alloc_node(); \
 147+ tmpnode->type = nodeType; \
 148+ tmpnode->flags = 0; \
 149+ tmpnode->nextSibling = UNKNOWN_NODE_LEN; \
 150+ tmpnode->contentLength = charsToSkip; \
 151+ tmpnode->index = nodeStringLen; \
 152+ tmpnode->parent = parentNode; \
 153+ tmpnode->commentEnd = -1; \
 154+ tmpnode->eqpos = -1; \
 155+ parentNode = tmpnode; \
 156+ \
 157+ ALLOC_NODESTRING(); \
 158+ serializeNode(nodeString + nodeStringLen, tmpnode); \
 159+ nodeStringLen += NODE_LEN; \
 160+ storedLength += charsToSkip; printf("storedLength: %d @%d\n", storedLength, __LINE__);\
 161+ } while(0)
 162+
 163+#define closeNode(nodeType) \
 164+ do { \
 165+ struct node* tmpnode = parentNode; \
 166+ assert( nodeType == tmpnode->type ); \
 167+ if ( currentLiteral.len ) { \
 168+ storeNodeWithText(literal_node, currentLiteral.from, currentLiteral.len); \
 169+ printf("adding literal of %d with parent %c\n", currentLiteral.len, tmpnode->type); \
 170+ currentLiteral.len = 0; \
 171+ } printf("closing node %c at %d with len of %d\n", tmpnode->type, tmpnode->index, tmpnode->contentLength); \
 172+ tmpnode->nextSibling = nodeStringLen - tmpnode->index - NODE_LEN; \
 173+ serializeNode( nodeString + tmpnode->index, tmpnode ); \
 174+ parentNode = parentNode->parent; \
 175+ free_node( tmpnode ); \
 176+ } while (0)
 177+
 /* struct node allocation, routed through the Zend allocator */
 #define alloc_node() emalloc( sizeof(struct node) )
 #define free_node(x) efree(x)

 /*
  * Grows nodeString so that one more serialized node (NODE_LEN bytes) plus
  * one extra byte fit after the nodeStringLen bytes already used.
  */
 #define ALLOC_NODESTRING() \
     do { \
         nodeString = erealloc(nodeString, nodeStringLen + NODE_LEN + 1); \
         assert( nodeString ); \
     } while(0)
 186+
 187+/**
 188+ * Serializes a node into string.
 189+ * The caller must ensure that there are at least NODE_LEN bytes
 190+ * available from pointer, and NODE_LEN + 1 writable.
 191+ */
 192+static void serializeNode(char* pointer, struct node* node) {
 193+ int c;
 194+ pointer[0] = node->type;
 195+ pointer[1] = '0' + node->flags;
 196+ assert( node->nextSibling < (1 << 24) );
 197+ if ( node->nextSibling == UNKNOWN_NODE_LEN ) {
 198+ pointer[2] = pointer[3] = pointer[4] = pointer[5] = pointer[6] = pointer[7] = '?';
 199+ } else {
 200+ sprintf(&pointer[2], "%06x", node->nextSibling);
 201+ }
 202+ c = pointer[16];
 203+ snprintf(&pointer[8], 9, "%08x", node->contentLength);
 204+ pointer[16] = c;
 205+}
 206+
 207+static inline int hex2dec(char val) {
 208+ switch (val) {
 209+ case '0'...'9':
 210+ return val - '0';
 211+ case 'a'...'f':
 212+ return val - 'a' + 10;
 213+ }
 214+ assert(0);
 215+}
 216+
 217+/**
 218+ * Get the nextSibling value from a node serialized at pointer.
 219+ */
 220+static inline int getNextSibling(const char* pointer) {
 221+ assert( pointer[2] != '?' );
 222+ return ( ( ( ( ( hex2dec(pointer[2]) << 4 ) | hex2dec(pointer[3]) << 4 ) | hex2dec(pointer[4]) << 4 ) | hex2dec(pointer[5]) << 4 ) | hex2dec(pointer[6]) << 4 ) | hex2dec(pointer[7]);
 223+}
 224+
 225+/**
 226+ * Get the output string that would result if the close is not found.
 227+ *
 228+ * TODO: Reduce space by collapsing nodes here.
 229+ */
 230+static struct node* breakSyntax( struct node* node, char * const nodeString, int *nodeStringLen ) {
 231+ struct node* parent;
 232+
 233+ /* Note we cannot coalesce with a previous literal node since it
 234+ * may be our nephew, instead of our sister (we could ask our
 235+ * parent, though)
 236+ */
 237+printf("breakSyntax(%.*s)\n", *nodeStringLen, nodeString);
 238+ if ( node->type == bracket_node ) {
 239+ node->type = literal_node;
 240+ node->nextSibling = 0;
 241+ serializeNode( nodeString + node->index, node );
 242+ parent = node->parent;
 243+ free_node( node );
 244+ return parent;
 245+ } else if ( node->type == brace_node ) {
 246+ /* Literalize this node and its children (title, part, part\name, part\value) */
 247+ node->type = literal_node;
 248+ node->nextSibling = 0;
 249+ serializeNode( nodeString + node->index, node );
 250+ int writepos = node->index + NODE_LEN;
 251+ int readpos = node->index + NODE_LEN;
 252+ int nextSibling = getNextSibling( nodeString + readpos );
 253+ readpos += NODE_LEN;
 254+
 255+ /* Move up the title contents */
 256+ if ( nextSibling ) {
 257+ memmove( nodeString + writepos, nodeString + readpos, nextSibling );
 258+ readpos += nextSibling;
 259+ writepos += nextSibling;
 260+ }
 261+
 262+ /* Go for part nodes */
 263+ while ( readpos < *nodeStringLen ) {
 264+ readpos += NODE_LEN; /* <part> */
 265+ assert( readpos < *nodeStringLen ); /* All part nodes contain one name node inside */
 266+ int nameChildren = getNextSibling( nodeString + readpos ); /* <name> */
 267+ readpos += NODE_LEN;
 268+ if ( nameChildren ) {
 269+ memmove( nodeString + writepos, nodeString + readpos, nameChildren );
 270+ readpos += nameChildren;
 271+ writepos += nameChildren;
 272+ }
 273+ if (readpos >= *nodeStringLen) break; /* It may be the case for eg. {{Foo|Bar */
 274+ int valueChildren = getNextSibling( nodeString + readpos ); /* <value> */
 275+ if ( valueChildren ) {
 276+ memmove( nodeString + writepos, nodeString + readpos, nameChildren );
 277+ readpos += nameChildren;
 278+ writepos += nameChildren;
 279+ }
 280+ }
 281+ *nodeStringLen = writepos;
 282+ parent = node->parent;
 283+ free_node( node );
 284+ return parent;
 285+ } else {
 286+ assert( 0 );
 287+ }
 288+}
Property changes on: trunk/extensions/NativePreprocessor/nodes.h
___________________________________________________________________
Added: svn:keywords
1289 + Author Date Id Rev URL
Added: svn:eol-style
2290 + native

Follow-up revisions

Revision | Commit summary | Author | Date
r80461 | Follow up r80376. Added missing file FORMAT.... | platonides | 19:54, 17 January 2011
r80470 | Those printf should have been removed before committing r80376. | platonides | 21:54, 17 January 2011

Status & tagging log