r64369 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r64368‎ | r64369 | r64370 >
Date:00:02, 30 March 2010
Author:than4213
Status:deferred
Tags:
Comment:
This change is slightly broken. Created unparse functionality for the parser. This change had a domino effect on other things I needed to change.
Modified paths:
  • /branches/parser-work/phase3/includes/AutoLoader.php (modified) (history)
  • /branches/parser-work/phase3/includes/parser/ParseEngine.php (modified) (history)
  • /branches/parser-work/phase3/includes/parser/Parser.php (modified) (history)
  • /branches/parser-work/phase3/includes/parser/Preprocessor.php (modified) (history)
  • /branches/parser-work/phase3/includes/parser/WikiTextGrammar.xml (added) (history)

Diff [purge]

Index: branches/parser-work/phase3/includes/parser/Parser.php
@@ -76,7 +76,7 @@
7777 const COLON_STATE_COMMENTDASHDASH = 7;
7878
7979 // Flags for preprocessToDom
80 - const PTD_FOR_INCLUSION = 1;
 80+ const PTD_FOR_INCLUSION = 1024;
8181
8282 // Allowed values for $this->mOutputType
8383 // Parameter to startExternalParse().
@@ -95,7 +95,7 @@
9696 # Persistent:
9797 var $mTagHooks, $mTransparentTagHooks, $mFunctionHooks, $mFunctionSynonyms, $mVariables,
9898 $mSubsts, $mImageParams, $mImageParamsMagicArray, $mStripList, $mMarkerIndex,
99 - $mPreprocessor, $mExtLinkBracketedRegex, $mUrlProtocols, $mDefaultStripList,
 99+ $mParseEngine, $mPreprocessor, $mExtLinkBracketedRegex, $mUrlProtocols, $mDefaultStripList,
100100 $mVarCache, $mConf, $mFunctionTagHooks;
101101
102102
@@ -137,19 +137,10 @@
138138 $this->mExtLinkBracketedRegex = '/\[(\b(' . wfUrlProtocols() . ')'.
139139 '[^][<>"\\x00-\\x20\\x7F]+) *([^\]\\x0a\\x0d]*?)\]/S';
140140 $this->mVarCache = array();
141 - if ( isset( $conf['preprocessorClass'] ) ) {
142 - $this->mPreprocessorClass = $conf['preprocessorClass'];
143 - } elseif ( extension_loaded( 'domxml' ) ) {
144 - // PECL extension that conflicts with the core DOM extension (bug 13770)
145 - wfDebug( "Warning: you have the obsolete domxml extension for PHP. Please remove it!\n" );
146 - $this->mPreprocessorClass = 'Preprocessor_Hash';
147 - } elseif ( extension_loaded( 'dom' ) ) {
148 - $this->mPreprocessorClass = 'Preprocessor_DOM';
149 - } else {
150 - $this->mPreprocessorClass = 'Preprocessor_Hash';
151 - }
152141 $this->mMarkerIndex = 0;
153142 $this->mFirstCall = true;
 143+
 144+ $this->mParseEngine = new ParseEngine("includes/parser/WikiTextGrammar.xml");
154145 }
155146
156147 /**
@@ -512,7 +503,8 @@
513504
514505 list( $text, $title ) = $this->getTemplateDom( $title );
515506 $flags = PPFrame::NO_ARGS | PPFrame::NO_TEMPLATES;
516 - return $this->getPreprocessor()->newFrame()->expand( $text, $flags );
 507+ $frame = new PPFrame($this);
 508+ return $frame->expand( $text, $flags );
517509 }
518510
519511 /**
@@ -547,8 +539,7 @@
548540 */
549541 function getPreprocessor() {
550542 if ( !isset( $this->mPreprocessor ) ) {
551 - $class = $this->mPreprocessorClass;
552 - $this->mPreprocessor = new $class( $this );
 543+ $this->mPreprocessor = new Preprocessor( $this->mParseEngine );
553544 }
554545 return $this->mPreprocessor;
555546 }
@@ -922,7 +913,7 @@
923914 else
924915 $flag = Parser::PTD_FOR_INCLUSION;
925916 $dom = $this->preprocessToDom( $text, $flag );
926 - $text = $frame->expand( $dom );
 917+ $text = $frame->expand( $dom, $flag );
927918 }
928919 // if $frame is not provided, then use old-style replaceVariables
929920 else {
@@ -2792,7 +2783,7 @@
27932784 wfProfileIn( __METHOD__ );
27942785
27952786 if ( $frame === false ) {
2796 - $frame = $this->getPreprocessor()->newFrame();
 2787+ $frame = new PPFrame($this);
27972788 } elseif ( !( $frame instanceof PPFrame ) ) {
27982789 wfDebug( __METHOD__." called using plain parameters instead of a PPFrame instance. Creating custom frame.\n" );
27992790 $frame = $this->getPreprocessor()->newCustomFrame($frame);
@@ -2859,7 +2850,7 @@
28602851 * @return string the text of the template
28612852 * @private
28622853 */
2863 - function braceSubstitution( $piece, $frame ) {
 2854+ function braceSubstitution( $template, $frame ) {
28642855 global $wgContLang, $wgNonincludableNamespaces;
28652856 wfProfileIn( __METHOD__ );
28662857 wfProfileIn( __METHOD__.'-setup' );
@@ -2873,11 +2864,12 @@
28742865 $isLocalObj = false; # $text is a DOM node needing expansion in the current frame
28752866
28762867 # Title object, where $text came from
 2868+ $xpath = new DOMXPath($template->ownerDocument);
28772869 $title = null;
28782870
28792871 # $part1 is the bit before the first |, and must contain only title characters.
28802872 # Various prefixes will be stripped from it later.
2881 - $titleWithSpaces = $frame->expand( $piece['title'] );
 2873+ $titleWithSpaces = $xpath->query("title", $template)->item(0)->textContent;
28822874 $part1 = trim( $titleWithSpaces );
28832875 $titleText = false;
28842876
@@ -2885,7 +2877,10 @@
28862878 $originalTitle = $part1;
28872879
28882880 # $args is a list of argument nodes, starting from index 0, not including $part1
2889 - $args = (null == $piece['parts']) ? array() : $piece['parts'];
 2881+ $args = array();
 2882+ foreach ($xpath->query("part", $template) as $part) {
 2883+ $args[] = $part;
 2884+ }
28902885 wfProfileOut( __METHOD__.'-setup' );
28912886
28922887 # SUBST
@@ -2900,14 +2895,15 @@
29012896 # safesubst || (subst && PST) || (false && !PST) => transclude (skip the if)
29022897 # (false && PST) || (subst && !PST) => return input (handled by if)
29032898 if ( $substMatch != 'safesubst' && ($substMatch == 'subst' xor $this->ot['wiki']) ) {
2904 - $text = $frame->virtualBracketedImplode( '{{', '|', '}}', $titleWithSpaces, $args );
 2899+ $outText = ParseEngine::unparse($template);
 2900+ $template->parentNode->replaceChild($template->ownerDocument->createTextNode($outText), $template);
29052901 $isLocalObj = true;
29062902 $found = true;
29072903 }
29082904 }
29092905
29102906 # Variables
2911 - if ( !$found && $args->getLength() == 0 ) {
 2907+ if ( !$found && $args->length == 0 ) {
29122908 $id = $this->mVariables->matchStartToEnd( $part1 );
29132909 if ( $id !== false ) {
29142910 $text = $this->getVariableValue( $id, $frame );
@@ -2964,14 +2960,12 @@
29652961 # Add a frame parameter, and pass the arguments as an array
29662962 $allArgs = $initialArgs;
29672963 $allArgs[] = $frame;
2968 - for ( $i = 0; $i < $args->getLength(); $i++ ) {
2969 - $funcArgs[] = $args->item( $i );
2970 - }
 2964+ $funcArgs = array_merge( $funcArgs, $args );
29712965 $allArgs[] = $funcArgs;
29722966 } else {
29732967 # Convert arguments to plain text
2974 - for ( $i = 0; $i < $args->getLength(); $i++ ) {
2975 - $funcArgs[] = trim( $frame->expand( $args->item( $i ) ) );
 2968+ foreach ($args as $arg) {
 2969+ $funcArgs[] = substr(ParseEngine::unparse($arg), 1);
29762970 }
29772971 $allArgs = array_merge( $initialArgs, $funcArgs );
29782972 }
@@ -3088,9 +3082,9 @@
30893083 # If we haven't found text to substitute by now, we're done
30903084 # Recover the source wikitext and return it
30913085 if ( !$found ) {
3092 - $text = $frame->virtualBracketedImplode( '{{', '|', '}}', $titleWithSpaces, $args );
 3086+ $outText = ParseEngine::unparse($template);
 3087+ $template->parentNode->replaceChild($template->ownerDocument->createTextNode($outText), $template);
30933088 wfProfileOut( __METHOD__ );
3094 - return array( 'object' => $text );
30953089 }
30963090
30973091 # Expand DOM-style return values in a child frame
@@ -3105,7 +3099,7 @@
31063100 if ( isset( $this->mTplExpandCache[$titleText] ) ) {
31073101 $text = $this->mTplExpandCache[$titleText];
31083102 } else {
3109 - $text = $newFrame->expand( $text );
 3103+ $text = $newFrame->expand( $text, self::PTD_FOR_INCLUSION );
31103104 $this->mTplExpandCache[$titleText] = $text;
31113105 }
31123106 } else {
@@ -3141,15 +3135,11 @@
31423136 $this->insertStripItem( '<!-- WARNING: template omitted, post-expand include size too large -->' );
31433137 $this->limitationWarn( 'post-expand-template-inclusion' );
31443138 }
3145 -
3146 - if ( $isLocalObj ) {
3147 - $ret = array( 'object' => $text );
3148 - } else {
3149 - $ret = array( 'text' => $text );
 3139+ if ($template->parentNode != NULL) {
 3140+ $template->parentNode->replaceChild($template->ownerDocument->createTextNode($text), $template);
31503141 }
31513142
31523143 wfProfileOut( __METHOD__ );
3153 - return $ret;
31543144 }
31553145
31563146 /**
@@ -3314,16 +3304,16 @@
33153305 * Triple brace replacement -- used for template arguments
33163306 * @private
33173307 */
3318 - function argSubstitution( $piece, $frame ) {
 3308+ function argSubstitution( $tplArg, $frame ) {
33193309 wfProfileIn( __METHOD__ );
33203310
3321 - $error = false;
3322 - $parts = $piece['parts'];
3323 - $nameWithSpaces = $frame->expand( $piece['title'] );
 3311+ $xpath = new DOMXPath($tplArg->ownerDocument);
 3312+ $parts = $xpath->query("part", $tplArg);
 3313+ $nameWithSpaces = $xpath->query("title", $tplArg)->item(0)->textContent;
33243314 $argName = trim( $nameWithSpaces );
33253315 $object = false;
33263316 $text = $frame->getArgument( $argName );
3327 - if ( $text === false && $parts->getLength() > 0
 3317+ if ( $text === false && $parts->length > 0
33283318 && (
33293319 $this->ot['html']
33303320 || $this->ot['pre']
@@ -3331,28 +3321,18 @@
33323322 )
33333323 ) {
33343324 # No match in frame, use the supplied default
3335 - $object = $parts->item( 0 )->getChildren();
 3325+ $text = $parts->item( 0 )->firstChild->textContent;
33363326 }
33373327 if ( !$this->incrementIncludeSize( 'arg', strlen( $text ) ) ) {
3338 - $error = '<!-- WARNING: argument omitted, expansion size too large -->';
 3328+ $text .= '<!-- WARNING: argument omitted, expansion size too large -->';
33393329 $this->limitationWarn( 'post-expand-template-argument' );
33403330 }
3341 -
3342 - if ( $text === false && $object === false ) {
3343 - # No match anywhere
3344 - $object = $frame->virtualBracketedImplode( '{{{', '|', '}}}', $nameWithSpaces, $parts );
 3331+ if ($text == NULL) {
 3332+ $text = ParseEngine::unparse($tplArg);
33453333 }
3346 - if ( $error !== false ) {
3347 - $text .= $error;
3348 - }
3349 - if ( $object !== false ) {
3350 - $ret = array( 'object' => $object );
3351 - } else {
3352 - $ret = array( 'text' => $text );
3353 - }
 3334+ $tplArg->parentNode->replaceChild($tplArg->ownerDocument->createTextNode($text), $tplArg);
33543335
33553336 wfProfileOut( __METHOD__ );
3356 - return $ret;
33573337 }
33583338
33593339 /**
@@ -3367,12 +3347,14 @@
33683348 * noClose Original text did not have a close tag
33693349 * @param PPFrame $frame
33703350 */
3371 - function extensionSubstitution( $params, $frame ) {
 3351+ function extensionSubstitution( $xmltag, $frame ) {
33723352 global $wgRawHtml, $wgContLang;
33733353
3374 - $name = $frame->expand( $params['name'] );
3375 - $attrText = !isset( $params['attr'] ) ? null : $frame->expand( $params['attr'] );
3376 - $content = !isset( $params['inner'] ) ? null : $frame->expand( $params['inner'] );
 3354+ $xpath = new DOMXPath($xmltag->ownerDocument);
 3355+ $name = substr($xmltag->getAttribute("startTag"), 1);
 3356+ $attrText = $xpath->query("attr", $xmltag)->item(0)->textContent;
 3357+ $inner = $xpath->query("inner", $xmltag);
 3358+ $content = $inner->length == 0 ? NULL : $inner->item(0)->textContent;
33773359 $marker = "{$this->mUniqPrefix}-$name-" . sprintf('%08X', $this->mMarkerIndex++) . self::MARKER_SUFFIX;
33783360
33793361 $isFunctionTag = isset( $this->mFunctionTagHooks[strtolower($name)] ) &&
@@ -3416,21 +3398,7 @@
34173399 extract( $flags );
34183400 }
34193401 } else {
3420 - if ( is_null( $attrText ) ) {
3421 - $attrText = '';
3422 - }
3423 - if ( isset( $params['attributes'] ) ) {
3424 - foreach ( $params['attributes'] as $attrName => $attrValue ) {
3425 - $attrText .= ' ' . htmlspecialchars( $attrName ) . '="' .
3426 - htmlspecialchars( $attrValue ) . '"';
3427 - }
3428 - }
3429 - if ( $content === null ) {
3430 - $output = "<$name$attrText/>";
3431 - } else {
3432 - $close = is_null( $params['close'] ) ? '' : $frame->expand( $params['close'] );
3433 - $output = "<$name$attrText>$content$close";
3434 - }
 3402+ $output = ParseEngine::unparse($xmltag);
34353403 }
34363404
34373405 if( $markerType === 'none' ) {
@@ -3631,9 +3599,9 @@
36323600 $baseTitleText = $this->mTitle->getPrefixedDBkey();
36333601 $oldType = $this->mOutputType;
36343602 $this->setOutputType( self::OT_WIKI );
3635 - $frame = $this->getPreprocessor()->newFrame();
 3603+ $frame = new PPFrame($this);
36363604 $root = $this->preprocessToDom( $origText );
3637 - $node = $root->getFirstChild();
 3605+ $node = $root->firstChild;
36383606 $byteOffset = 0;
36393607 $tocraw = array();
36403608
@@ -3810,14 +3778,14 @@
38113779 # Add the section to the section tree
38123780 # Find the DOM node for this header
38133781 while ( $node && !$isTemplate ) {
3814 - if ( $node->getName() === 'h' ) {
 3782+ if ( $node->nodeName === 'h' ) {
38153783 $bits = $node->splitHeading();
38163784 if ( $bits['i'] == $sectionIndex )
38173785 break;
38183786 }
38193787 $byteOffset += mb_strlen( $this->mStripState->unstripBoth(
38203788 $frame->expand( $node, PPFrame::RECOVER_ORIG ) ) );
3821 - $node = $node->getNextSibling();
 3789+ $node = $node->nextSibling;
38223790 }
38233791 $tocraw[] = array(
38243792 'toclevel' => $toclevel,
@@ -4196,7 +4164,7 @@
41974165 $text = preg_replace( $substRegex, $substText, $text );
41984166 $text = $this->cleanSigInSig( $text );
41994167 $dom = $this->preprocessToDom( $text );
4200 - $frame = $this->getPreprocessor()->newFrame();
 4168+ $frame = new PPFrame($this);
42014169 $text = $frame->expand( $dom );
42024170
42034171 if ( !$parsing ) {
@@ -4834,7 +4802,7 @@
48354803 $this->mOptions = new ParserOptions;
48364804 $this->setOutputType( self::OT_WIKI );
48374805 $outText = '';
4838 - $frame = $this->getPreprocessor()->newFrame();
 4806+ $frame = new PPFrame($this);
48394807
48404808 // Process section extraction flags
48414809 $flags = 0;
@@ -4847,10 +4815,11 @@
48484816 }
48494817 // Preprocess the text
48504818 $root = $this->preprocessToDom( $text, $flags );
 4819+ PPFrame::updateIncTags($root, $flags);
48514820
48524821 // <h> nodes indicate section breaks
48534822 // They can only occur at the top level, so we can find them by iterating the root's children
4854 - $node = $root->getFirstChild();
 4823+ $node = $root->firstChild->firstChild;
48554824
48564825 // Find the target section
48574826 $ind = 1;
@@ -4859,18 +4828,17 @@
48604829 $targetLevel = 1000;
48614830 } else {
48624831 while ( $node ) {
4863 - if ( $node->getName() === 'h' ) {
 4832+ if ( $node->nodeName === 'h' ) {
48644833 if ( $ind == $sectionIndex ) {
4865 - $bits = $node->splitHeading();
4866 - $targetLevel = $bits['level'];
 4834+ $targetLevel = strlen($node->getAttribute("endTag"));
48674835 break;
48684836 }
48694837 $ind ++;
48704838 }
48714839 if ( $mode === 'replace' ) {
4872 - $outText .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
 4840+ $outText .= ParseEngine::unparse($node);
48734841 }
4874 - $node = $node->getNextSibling();
 4842+ $node = $node->nextSibling;
48754843 }
48764844 }
48774845
@@ -4885,18 +4853,17 @@
48864854
48874855 // Find the end of the section, including nested sections
48884856 do {
4889 - if ( $node->getName() === 'h' ) {
4890 - $bits = $node->splitHeading();
4891 - $curLevel = $bits['level'];
 4857+ if ( $node->nodeName === 'h' ) {
 4858+ $curLevel = strlen($node->getAttribute("endTag"));
48924859 if ( $ind != $sectionIndex && $curLevel <= $targetLevel ) {
48934860 break;
48944861 }
48954862 $ind ++;
48964863 }
48974864 if ( $mode === 'get' ) {
4898 - $outText .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
 4865+ $outText .= ParseEngine::unparse($node);
48994866 }
4900 - $node = $node->getNextSibling();
 4867+ $node = $node->nextSibling;
49014868 } while ( $node );
49024869
49034870 // Write out the remainder (in replace mode only)
@@ -4911,7 +4878,7 @@
49124879
49134880 while ( $node ) {
49144881 $outText .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
4915 - $node = $node->getNextSibling();
 4882+ $node = $node->nextSibling;
49164883 }
49174884 }
49184885
@@ -4919,6 +4886,9 @@
49204887 // Re-insert stripped tags
49214888 $outText = rtrim( $this->mStripState->unstripBoth( $outText ) );
49224889 }
 4890+ if ($outText[0] == "\n") {
 4891+ $outText = substr($outText, 1);
 4892+ }
49234893
49244894 return $outText;
49254895 }
Index: branches/parser-work/phase3/includes/parser/ParseEngine.php
@@ -1,216 +1,254 @@
22 <?php
33 /**
44 * Acts as the primary interface between the world and the parser.
 5+ * mStartRule - the first rule to use while parsing
56 * mRules - The list of rules to use while parsing
6 - * mStartRule - the first rule to use while parsing
77 * mDom - Used to create Dom objects and get's returned at the end of parsing
 8+ * mIter - Keeps track of how many times the parser recurses to stop endless loops
89 */
910 class ParseEngine {
10 - const maxIter = 8192;
11 - private $mRules, $mStartRule, $mDom, $mIter;
 11+ const maxIter = 2048;
 12+ private $mGrammar, $mTextPats;
1213
13 - function __construct($rules, $startRule) {
14 - $this->mRules = $rules;
15 - $this->mStartRule = $startRule;
 14+ function __construct($grammarFile) {
 15+ global $IP;
 16+ $this->mGrammar = new DOMDocument();
 17+ if (! $this->mGrammar->load("$IP/$grammarFile", LIBXML_NOBLANKS)) {
 18+ throw new MWException("Failed to load $grammarFile.");
 19+ }
 20+ foreach ($this->mGrammar->documentElement->childNodes as $crrnt) {
 21+ $this->pushTags($crrnt, NULL);
 22+ }
1623 }
1724
18 - function parse(&$text) {
 25+ function parse($text) {
1926 global $wgDebugParserLog;
2027 if ($wgDebugParserLog != '') {
2128 wfErrorLog("==========Start Parsing==========\n", $wgDebugParserLog);
2229 }
23 - $this->mIter = 0;
24 - $this->mDom = new DOMDocument();
25 - if (! $this->callParser($this->mStartRule, $text, $children, NULL)) {
26 - throw new MWException("Parser rejected text.");
 30+ $doc = new DOMDocument();
 31+ $rule = $this->mGrammar->documentElement;
 32+ $rootTag = $doc->createElement($rule->getAttribute("rootTag"));
 33+ $doc->appendChild($rootTag);
 34+ $xpath = new DOMXPath($this->mGrammar);
 35+ $startRule = $xpath->query("/Grammar/*[@name='{$rule->getAttribute("startRule")}']")->item(0);
 36+ $iter = 0;
 37+ if (! $this->parseRec($startRule, "", $saveTags, $iter, $text, $rootTag)) {
 38+ throw new MWException("Failed to parse the given text.");
2739 }
28 - $this->mDom->appendChild($children[0]);
29 - $this->mDom->normalizeDocument();
 40+ $doc->normalizeDocument();
3041 if ($wgDebugParserLog != '') {
31 - wfErrorLog("XML - {$this->mDom->saveXML()}\n", $wgDebugParserLog);
 42+ wfErrorLog("XML - {$doc->saveXML()}\n", $wgDebugParserLog);
3243 }
33 - return $this->mDom;
 44+ return $doc;
3445 }
3546
36 - function callParser($child, &$text, &$children, $replaceStr) {
37 - $childName = get_class($child);
38 - if (is_string($child)) {
39 - $childName = $child;
40 - $child = $this->mRules[$childName];
 47+ static function unparse($node) {
 48+ $retStr = "";
 49+ if ($node instanceof DOMElement) {
 50+ $retStr .= $node->getAttribute("startTag");
 51+ foreach ($node->childNodes as $child) {
 52+ $retStr .= ParseEngine::unparse($child);
 53+ }
 54+ $retStr .= $node->getAttribute("endTag");
 55+ } else {
 56+ $retStr .= $node->textContent;
4157 }
 58+ return $retStr;
 59+ }
 60+
 61+ private function parseRec($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
4262 global $wgDebugParserLog;
4363 if ($wgDebugParserLog != '') {
44 - wfErrorLog("Entering $childName\n", $wgDebugParserLog);
 64+ wfErrorLog("Entering {$rule->nodeName}, {$rule->getAttribute("name")}\n", $wgDebugParserLog);
4565 }
46 - $this->mIter ++;
47 - if ($this->mIter > ParseEngine::maxIter) {
 66+ $iter ++;
 67+ if ($iter > ParseEngine::maxIter) {
4868 throw new MWException("Parser iterated too many times. Probable loop in grammar.");
4969 }
50 - $retCode = $child->parse($text, $this, $this->mDom, $children, $replaceStr);
 70+ if ($rule->nodeName == "Assignment" || $rule->nodeName == "Reference" || $rule->nodeName == "Text") {
 71+ $saveTags = str_replace("~r", preg_quote($replaceStr, "/"), $saveTags);
 72+ $newTags = $rule->getAttribute("saveTags");
 73+ if ($saveTags == "") {
 74+ $saveTags = $newTags;
 75+ } elseif ($newTags != "") {
 76+ $saveTags .= "|" . $newTags;
 77+ }
 78+ }
 79+ $dom = $outNode->ownerDocument;
 80+ $retCode = FALSE;
 81+ if ($rule->nodeName == "Assignment") {
 82+ $startPat = $rule->getAttribute("tag");
 83+ $startTag = NULL;
 84+ if ($rule->getAttribute("regex") != NULL) {
 85+ if (preg_match("/^$startPat/s", $text, $matches)) {
 86+ $startTag = $matches[0];
 87+ if (isset($matches[1])) {
 88+ $replaceStr = $matches[1];
 89+ }
 90+ }
 91+ } elseif ($startPat != NULL && strncmp($startPat, $text, strlen($startPat)) == 0) {
 92+ $startTag = $startPat;
 93+ }
 94+ if ($startTag != NULL || $startPat == NULL) {
 95+ $newText = $text;
 96+ $newElement = $dom->createElement($rule->getAttribute("name"));
 97+ if ($startTag != NULL) {
 98+ $newText = substr($newText, strlen($startTag));
 99+ $newElement->setAttribute("startTag", $startTag);
 100+ }
 101+ $retCode = $rule->firstChild == NULL || $this->parseRec($rule->firstChild, $replaceStr, $saveTags, $iter, $newText, $newElement);
 102+ if ($retCode) {
 103+ $outNode->appendChild($newElement);
 104+ $text = $newText;
 105+ }
 106+ }
 107+ } elseif ($rule->nodeName == "EndTag") {
 108+ $tag = str_replace("~r", $replaceStr, $rule->getAttribute("tag"));
 109+ $tagLength = strlen($tag);
 110+ if (strncmp($tag, $text, $tagLength) == 0) {
 111+ $text = substr($text, $tagLength);
 112+ $outNode->setAttribute("endTag", $tag);
 113+ $retCode = TRUE;
 114+ }
 115+ } elseif ($rule->nodeName == "Sequence") {
 116+ $saveText = $text;
 117+ $saveNode = $outNode->cloneNode(TRUE);
 118+ $pushInd = $rule->getAttribute("pushInd");
 119+ foreach ($rule->childNodes as $i => $crrnt) {
 120+ $pushTags = $i >= $pushInd ? $saveTags : "";
 121+ $retCode = $this->parseRec($crrnt, $replaceStr, $pushTags, $iter, $text, $outNode);
 122+ if (! $retCode) {
 123+ $text = $saveText;
 124+ $outNode = $saveNode;
 125+ break;
 126+ }
 127+ }
 128+ } elseif ($rule->nodeName == "Choice") {
 129+ foreach ($rule->childNodes as $crrnt) {
 130+ $retCode = $this->parseRec($crrnt, $replaceStr, $saveTags, $iter, $text, $outNode);
 131+ if ($retCode) {
 132+ break;
 133+ }
 134+ }
 135+ $retCode |= $rule->getAttribute("failSafe") != NULL;
 136+ } elseif ($rule->nodeName == "Reference") {
 137+ $varAttr = $rule->getAttribute("var");
 138+ $newVar = $varAttr == NULL ? $replaceStr : str_replace("~r", $replaceStr, $varAttr);
 139+ $xpath = new DOMXPath($this->mGrammar);
 140+ $refRule = $xpath->query("/Grammar/*[@name='{$rule->getAttribute("name")}']")->item(0);
 141+ $retCode = $this->parseRec($refRule, $newVar, $saveTags, $iter, $text, $outNode);
 142+ } elseif ($rule->nodeName == "Text") {
 143+ $tagSearch = $rule->getAttribute("childTags");
 144+ if ($saveTags != "") {
 145+ $tagSearch .= "|" . $saveTags;
 146+ }
 147+ while ($text != "" && ($saveTags == "" || ! preg_match("/^($saveTags)/s", $text))) {
 148+ $offset = 1;
 149+ foreach ($rule->childNodes as $crrnt) {
 150+ if ($this->parseRec($crrnt, $replaceStr, "", $iter, $text, $outNode)) {
 151+ $offset = 0;
 152+ break;
 153+ }
 154+ }
 155+ if (preg_match("/$tagSearch/s", $text, $matches, PREG_OFFSET_CAPTURE, $offset)) {
 156+ if ($matches[0][1] > 0) {
 157+ $outNode->appendChild($dom->createTextNode(substr($text, 0, $matches[0][1])));
 158+ $text = substr($text, $matches[0][1]);
 159+ }
 160+ } else {
 161+ $outNode->appendChild($dom->createTextNode($text));
 162+ $text = "";
 163+ }
 164+ }
 165+ $retCode = true;
 166+ }
51167 if ($wgDebugParserLog != '') {
52 - wfErrorLog("Exiting $childName, Return Code - $retCode\n", $wgDebugParserLog);
 168+ wfErrorLog("Exiting {$rule->nodeName}, Return Code - $retCode\n", $wgDebugParserLog);
53169 wfErrorLog("Text - $text\n", $wgDebugParserLog);
54170 }
55171 return $retCode;
56172 }
57 -}
58173
59 -
60 -// Interface for Parse objects each with a specialized task while parsing
61 -interface ParseObject {
62 - // Does the parse task specific to each parse object
63 - function parse(&$text, &$engine, &$dom, &$children, $replaceStr);
64 -}
65 -
66 -/**
67 - * Deals with pattern matching and saving strings from the text.
68 - * mMatchPat - the regular expression used to determine if this is the rule that should be used
69 - */
70 -class ParsePattern implements ParseObject {
71 - private $mMatchPat;
72 -
73 - function __construct($matchPat) {
74 - $this->mMatchPat = $matchPat;
75 - }
76 -
77 - function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
78 - $regEx = $this->mMatchPat;
79 - if ($replaceStr != NULL) {
80 - $regEx = str_replace('~r', $replaceStr, $regEx);
81 - }
82 - if (! preg_match($regEx, $text, $matches)) {
83 - return FALSE;
84 - }
85 - $text = substr($text, strlen($matches[0]));
86 - $children = array();
87 - if (isset($matches[1])) {
88 - $children[] = $dom->createTextNode($matches[1]);
89 - }
90 - return TRUE;
91 - }
92 -}
93 -
94 -/**
95 - * Deals with cases where a rule can be matched multiple or 0 times.
96 - * mChildRule - What Parse rule to quantify
97 - * mMinChildren - Minimum amount of children for this rule
98 - * mMaxChildren - Maximum amount of children for this rule, 0 means unlimited
99 - */
100 -class ParseQuant implements ParseObject {
101 - private $mChildRule, $mMinChildren, $mMaxChildren;
102 -
103 - function __construct($childRule, $minChildren = 0, $maxChildren = 0) {
104 - $this->mChildRule = $childRule;
105 - $this->mMinChildren = $minChildren;
106 - $this->mMaxChildren = $maxChildren;
107 - }
108 -
109 - function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
110 - $children = array();
111 - for ($i = 0; $this->mMaxChildren <= 0 || $i < $this->mMaxChildren; $i ++) {
112 - if (! $engine->callParser($this->mChildRule, $text, $retChildren, $replaceStr)) {
113 - if ($i < $this->mMinChildren) {
114 - return FALSE;
 174+ private function pushTags($rule, $tagStr) {
 175+ $iter = 0;
 176+ if ($rule->nodeName == "Sequence") {
 177+ $pushInd = $rule->childNodes->length - 1;
 178+ $shouldPush = true;
 179+ for ($child = $rule->lastChild; $child != NULL; $child = $child->previousSibling) {
 180+ $this->pushTags($child, $tagStr);
 181+ if ($child->previousSibling != NULL) {
 182+ if ($this->pullTags($child, $iter, $childTag)) {
 183+ if ($shouldPush) {
 184+ $pushInd --;
 185+ }
 186+ if ($tagStr == "") {
 187+ $tagStr = $childTag;
 188+ } elseif ($childTag != "") {
 189+ $tagStr .= "|" . $childTag;
 190+ }
 191+ } else {
 192+ $shouldPush = false;
 193+ $tagStr = $childTag;
 194+ }
115195 }
116 - break;
117196 }
118 - $children = array_merge($children, $retChildren);
119 - }
120 - return TRUE;
121 - }
122 -}
123 -
124 -/**
125 - * Cycles throug array of rules until it finds one that succeeds
126 - * mList - The list of rules
127 - * mMatchChar - This is a shortcut. If the starting char of the text is different then parse will return FALSE.
128 - */
129 -class ParseChoice implements ParseObject {
130 - private $mList;
131 -
132 - function __construct() {
133 - $this->mList = $args = func_get_args();
134 - }
135 -
136 - function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
137 - foreach ($this->mList as $crrnt) {
138 - $newText = $text;
139 - if ($engine->callParser($crrnt, $newText, $children, $replaceStr)) {
140 - $text = $newText;
141 - return TRUE;
 197+ $rule->setAttribute("pushInd", $pushInd);
 198+ } else {
 199+ if ($rule->nodeName != "Choice" && $rule->nodeName != "EndTag") {
 200+ $rule->setAttribute("saveTags", $tagStr);
 201+ $tagStr = NULL;
 202+ if ($rule->nodeName == "Text") {
 203+ $childTags = "";
 204+ foreach ($rule->childNodes as $crrnt) {
 205+ if ($childTags != "") {
 206+ $childTags .= "|";
 207+ }
 208+ $this->pullTags($crrnt, $iter, $childTag);
 209+ $childTags .= $childTag;
 210+ }
 211+ $rule->setAttribute("childTags", $childTags);
 212+ }
142213 }
 214+ foreach ($rule->childNodes as $crrnt) {
 215+ $this->pushTags($crrnt, $tagStr);
 216+ }
143217 }
144 - return FALSE;
145218 }
146 -}
147219
148 -/**
149 - * Contains a sequence of rules all of which must pass
150 - * mList - The sequence of rules
151 - * mReplaceStr - A string used to determine the close tag of bracketed markup
152 - * mSaveStr - Boolean specifying wheter to pull mReplaceStr from text
153 - */
154 -class ParseSeq implements ParseObject {
155 - private $mList, $mReplaceStr, $mSaveStr;
156 -
157 - function __construct($list, $replaceStr = NULL, $saveStr = FALSE) {
158 - $this->mList = $list;
159 - $this->mReplaceStr = $replaceStr;
160 - $this->mSaveStr = $saveStr;
161 - }
162 -
163 - function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
164 - if ($this->mReplaceStr != NULL) {
165 - if ($replaceStr != NULL) {
166 - $replaceStr = str_replace('~r', $replaceStr, $this->mReplaceStr);
167 - } else {
168 - $replaceStr = $this->mReplaceStr;
169 - }
 220+ private function pullTags($rule, &$iter, &$childTags) {
 221+ $iter ++;
 222+ if ($iter > ParseEngine::maxIter) {
 223+ throw new MWException("Collecter iterated too many times. Probable loop in grammar.");
170224 }
171 - $children = array();
172 - foreach ($this->mList as $i => $crrnt) {
173 - if (! $engine->callParser($crrnt, $text, $retChildren, $replaceStr)) {
174 - return FALSE;
 225+ $childTags = "";
 226+ $failSafe = TRUE;
 227+ if ($rule->nodeName == "EndTag" || $rule->nodeName == "Assignment") {
 228+ $childTags = $rule->getAttribute("tag");
 229+ if ($rule->nodeName != "Assignment" || $rule->getAttribute("regex") == NULL) {
 230+ $childTags = preg_quote($childTags, "/");
175231 }
176 - if ($i == 0 && $this->mSaveStr && isset($retChildren[0]) && $retChildren[0] instanceof DOMText) {
177 - $replaceStr = $retChildren[0]->wholeText;
178 - } else {
179 - $children = array_merge($children, $retChildren);
 232+ $failSafe = FALSE;
 233+ } elseif ($rule->nodeName == "Choice" || $rule->nodeName == "Sequence") {
 234+ $failSafe = $rule->nodeName == "Sequence";
 235+ foreach ($rule->childNodes as $child) {
 236+ $failSafe = $this->pullTags($child, $iter, $newTags);
 237+ if ($childTags == "") {
 238+ $childTags = $newTags;
 239+ } elseif ($newTags != "") {
 240+ $childTags .= "|" . $newTags;
 241+ }
 242+ if (($failSafe && $rule->nodeName == "Choice") || (! $failSafe && $rule->nodeName == "Sequence")) {
 243+ break;
 244+ }
180245 }
 246+ $failSafe |= $rule->nodeName == "Choice" && $rule->getAttribute("failSafe") != NULL;
 247+ } elseif ($rule->nodeName == "Reference") {
 248+ $xpath = new DOMXPath($this->mGrammar);
 249+ $refRule = $xpath->query("/Grammar/*[@name='{$rule->getAttribute("name")}']")->item(0);
 250+ $failSafe = $this->pullTags($refRule, $iter, $childTags);
181251 }
182 - return TRUE;
 252+ return $failSafe;
183253 }
184254 }
185255
186 -/**
187 - * Creates a Dom element
188 - * mName - The name to give the resultant ParseTree object
189 - * mAttrName - name of an attribute to add to the element
190 - * mAttrValue - value of the attribute
191 - */
192 -class ParseAssign implements ParseObject {
193 - private $mName, $mChildRule, $mAttrName, $mAttrValue;
194 -
195 - function __construct($name, $childRule, $attrName = NULL, $attrValue = NULL) {
196 - $this->mName = $name;
197 - $this->mChildRule = $childRule;
198 - $this->mAttrName = $attrName;
199 - $this->mAttrValue = $attrValue;
200 - }
201 -
202 - function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
203 - if (! $engine->callParser($this->mChildRule, $text, $retChildren, $replaceStr)) {
204 - return FALSE;
205 - }
206 - $retNode = $dom->createElement($this->mName);
207 - foreach ($retChildren as $child) {
208 - $retNode->appendChild($child);
209 - }
210 - if ($this->mAttrName != NULL && $this->mAttrValue != NULL) {
211 - $retNode->setAttribute($this->mAttrName, $this->mAttrValue);
212 - }
213 - $children = array($retNode);
214 - return TRUE;
215 - }
216 -}
217 -
Index: branches/parser-work/phase3/includes/parser/Preprocessor.php
@@ -3,24 +3,119 @@
44 /**
55 * @ingroup Parser
66 */
7 -interface Preprocessor {
8 - /** Create a new preprocessor object based on an initialised Parser object */
9 - function __construct( $parser );
 7+class Preprocessor {
 8+ private $mParser, $memoryLimit;
109
11 - /** Create a new top-level frame for expansion of a page */
12 - function newFrame();
 10+ const CACHE_VERSION = 1;
1311
14 - /** Create a new custom frame for programmatic use of parameter replacement as used in some extensions */
15 - function newCustomFrame( $args );
 12+ function __construct( $parser ) {
 13+ $this->mParser = $parser;
 14+ $mem = ini_get( 'memory_limit' );
 15+ $this->memoryLimit = false;
 16+ if ( strval( $mem ) !== '' && $mem != -1 ) {
 17+ if ( preg_match( '/^\d+$/', $mem ) ) {
 18+ $this->memoryLimit = $mem;
 19+ } elseif ( preg_match( '/^(\d+)M$/i', $mem, $m ) ) {
 20+ $this->memoryLimit = $m[1] * 1048576;
 21+ }
 22+ }
 23+ }
1624
17 - /** Preprocess text to a PPNode */
18 - function preprocessToObj( $text, $flags = 0 );
 25+ function memCheck() {
 26+ if ( $this->memoryLimit === false ) {
 27+ return;
 28+ }
 29+ $usage = memory_get_usage();
 30+ if ( $usage > $this->memoryLimit * 0.9 ) {
 31+ $limit = intval( $this->memoryLimit * 0.9 / 1048576 + 0.5 );
 32+ throw new MWException( "Preprocessor hit 90% memory limit ($limit MB)" );
 33+ }
 34+ return $usage <= $this->memoryLimit * 0.8;
 35+ }
 36+
 37+ /**
 38+ * Preprocess some wikitext and return the document tree.
 39+ * This is the ghost of Parser::replace_variables().
 40+ *
 41+ * @param string $text The text to parse
 42+ * @param integer $flags Bitwise combination of:
 43+ * Parser::PTD_FOR_INCLUSION Handle <noinclude>/<includeonly> as if the text is being
 44+ * included. Default is to assume a direct page view.
 45+ *
 46+ * The generated DOM tree must depend only on the input text and the flags.
 47+ * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
 48+ *
 49+ * Any flag added to the $flags parameter here, or any other parameter liable to cause a
 50+ * change in the DOM tree for a given text, must be passed through the section identifier
 51+ * in the section edit link and thus back to extractSections().
 52+ *
 53+ * The output of this function is currently only cached in process memory, but a persistent
 54+ * cache may be implemented at a later date which takes further advantage of these strict
 55+ * dependency requirements.
 56+ *
 57+ * @private
 58+ */
 59+ function preprocessToObj( $text, $flags = 0 ) {
 60+ wfProfileIn( __METHOD__ );
 61+ global $wgMemc, $wgPreprocessorCacheThreshold;
 62+
 63+ $xml = false;
 64+ $cacheable = strlen( $text ) > $wgPreprocessorCacheThreshold;
 65+ if ( $cacheable ) {
 66+ wfProfileIn( __METHOD__.'-cacheable' );
 67+
 68+ $cacheKey = wfMemcKey( 'preprocess-xml', md5($text), $flags );
 69+ $cacheValue = $wgMemc->get( $cacheKey );
 70+ if ( $cacheValue ) {
 71+ $version = substr( $cacheValue, 0, 8 );
 72+ if ( intval( $version ) == self::CACHE_VERSION ) {
 73+ $xml = substr( $cacheValue, 8 );
 74+ // From the cache
 75+ wfDebugLog( "Preprocessor", "Loaded preprocessor XML from memcached (key $cacheKey)" );
 76+ }
 77+ }
 78+ }
 79+ $dom = false;
 80+ if ( $xml === false ) {
 81+ if ( $cacheable ) {
 82+ wfProfileIn( __METHOD__.'-cache-miss' );
 83+ }
 84+ $dom = $this->mParser->parse($text);
 85+ if ( $cacheable ) {
 86+ $cacheValue = sprintf( "%08d", self::CACHE_VERSION ) . $dom->saveXML();
 87+ $wgMemc->set( $cacheKey, $cacheValue, 86400 );
 88+ wfProfileOut( __METHOD__.'-cache-miss' );
 89+ wfDebugLog( "Preprocessor", "Saved preprocessor XML to memcached (key $cacheKey)" );
 90+ }
 91+ } else {
 92+ wfProfileIn( __METHOD__.'-loadXML' );
 93+ $dom = new DOMDocument;
 94+ wfSuppressWarnings();
 95+ $result = $dom->loadXML( $xml );
 96+ wfRestoreWarnings();
 97+ if ( !$result ) {
 98+ // Try running the XML through UtfNormal to get rid of invalid characters
 99+ $xml = UtfNormal::cleanUp( $xml );
 100+ $result = $dom->loadXML( $xml );
 101+ if ( !$result ) {
 102+ throw new MWException( __METHOD__.' generated invalid XML' );
 103+ }
 104+ }
 105+ wfProfileOut( __METHOD__.'-loadXML' );
 106+ }
 107+ if ( $cacheable ) {
 108+ wfProfileOut( __METHOD__.'-cacheable' );
 109+ }
 110+ wfProfileOut( __METHOD__ );
 111+ return $dom;
 112+ }
19113 }
20114
21115 /**
 116+ * An expansion frame, used as a context to expand the result of preprocessToObj()
22117 * @ingroup Parser
23118 */
24 -interface PPFrame {
 119+class PPFrame {
25120 const NO_ARGS = 1;
26121 const NO_TEMPLATES = 2;
27122 const STRIP_COMMENTS = 4;
@@ -29,150 +124,470 @@
30125
31126 const RECOVER_ORIG = 27; // = 1|2|8|16 no constant expression support in PHP yet
32127
33 - /**
34 - * Create a child frame
35 - */
36 - function newChild( $args = false, $title = false );
 128+ protected $parser, $title, $titleCache;
37129
38130 /**
39 - * Expand a document tree node
 131+ * Hashtable listing templates which are disallowed for expansion in this frame,
 132+ * having been encountered previously in parent frames.
40133 */
41 - function expand( $root, $flags = 0 );
 134+ protected $loopCheckHash;
42135
43136 /**
44 - * Implode with flags for expand()
 137+ * Recursion depth of this frame, top = 0
 138+ * Note that this is NOT the same as expansion depth in expand()
45139 */
46 - function implodeWithFlags( $sep, $flags /*, ... */ );
 140+ protected $depth;
47141
48 - /**
49 - * Implode with no flags specified
50 - */
51 - function implode( $sep /*, ... */ );
52142
53143 /**
54 - * Makes an object that, when expand()ed, will be the same as one obtained
55 - * with implode()
 144+ * Construct a new preprocessor frame.
 145+ * @param Parser $parser The parent parser
56146 */
57 - function virtualImplode( $sep /*, ... */ );
 147+ function __construct( $parser ) {
 148+ $this->parser = $parser;
 149+ $this->title = $this->parser->mTitle;
 150+ $this->titleCache = array( $this->title ? $this->title->getPrefixedDBkey() : false );
 151+ $this->loopCheckHash = array();
 152+ $this->depth = 0;
 153+ }
58154
 155+ function __get($var) {
 156+ $retVal = NULL;
 157+ if ($var = "depth") {
 158+ return $depth;
 159+ }
 160+ return $retVal;
 161+ }
59162 /**
60 - * Virtual implode with brackets
 163+ * Create a new child frame
 164+ * $args is optionally a multi-root PPNode or array containing the template arguments
61165 */
62 - function virtualBracketedImplode( $start, $sep, $end /*, ... */ );
 166+ function newChild( $args = false, $title = false ) {
 167+ $namedArgs = array();
 168+ $numberedArgs = array();
 169+ if ( $title === false ) {
 170+ $title = $this->title;
 171+ }
 172+ if ($args !== false) {
 173+ $xpath = false;
 174+ $index = 1;
 175+ foreach ( $args as $arg ) {
 176+ if ( !$xpath ) {
 177+ $xpath = new DOMXPath( $arg->ownerDocument );
 178+ }
 179+ $first = $xpath->query( 'first', $arg )->item(0)->textContent;
 180+ $value = $xpath->query( 'value', $arg );
 181+ if ($value->length <= 0) {
 182+ // Numbered parameter
 183+ $numberedArgs[$index] = $first;
 184+ $index ++;
 185+ } else {
 186+ // Named parameter
 187+ $namedArgs[trim($first)] = $value->item( 0 )->textContent;
 188+ }
 189+ }
 190+ }
 191+ return new PPTemplateFrame( $this, $numberedArgs, $namedArgs, $title );
 192+ }
63193
64 - /**
65 - * Returns true if there are no arguments in this frame
66 - */
67 - function isEmpty();
 194+ function expand( $root, $flags = 0 ) {
 195+ static $expansionDepth = 0;
 196+ if ( is_string( $root ) ) {
 197+ return $root;
 198+ }
68199
69 - /**
70 - * Returns all arguments of this frame
71 - */
72 - function getArguments();
 200+ if ( ++$this->parser->mPPNodeCount > $this->parser->mOptions->mMaxPPNodeCount )
 201+ {
 202+ return '<span class="error">Node-count limit exceeded</span>';
 203+ }
73204
74 - /**
75 - * Returns all numbered arguments of this frame
76 - */
77 - function getNumberedArguments();
 205+ if ( $expansionDepth > $this->parser->mOptions->mMaxPPExpandDepth ) {
 206+ return '<span class="error">Expansion depth limit exceeded</span>';
 207+ }
 208+ wfProfileIn( __METHOD__ );
 209+ ++$expansionDepth;
78210
79 - /**
80 - * Returns all named arguments of this frame
81 - */
82 - function getNamedArguments();
 211+ if ( $root instanceof DOMDocument ) {
 212+ $root = $root->documentElement;
 213+ }
 214+ if (! $root instanceof DOMElement ) {
 215+ throw new MWException( __METHOD__.': Invalid parameter type' );
 216+ }
 217+//print("UpdIn - {$root->ownerDocument->saveXML()}\n");
 218+ PPFrame::updateIncTags($root, $flags);
83219
 220+print("ParseIn - {$root->ownerDocument->saveXML()}\n");
 221+ $headingIndex = 1;
 222+ $this->expandRec($root->childNodes, $flags, $headingIndex);
 223+ $output = $root->textContent;
 224+print("ParseOut - {$output}\n");
 225+
 226+ --$expansionDepth;
 227+ wfProfileOut( __METHOD__ );
 228+ return $output;
 229+ }
 230+
 231+ private function expandRec($contextNode, $flags, &$headingIndex) {
 232+ if ($contextNode instanceof DOMNodeList) {
 233+ for ($i = 0; $i < $contextNode->length; $i ++) {
 234+ $child = $contextNode->item($i);
 235+ if ($child instanceof DOMElement) {
 236+ $this->expandRec($child, $flags, $headingIndex);
 237+ $i --;
 238+ }
 239+ }
 240+ } else {
 241+print("ParseRecIn - {$contextNode->nodeName}\n");
 242+ if (($contextNode->nodeName == 'template' || $contextNode->nodeName == 'tplarg') && ! ($flags & self::NO_ARGS)) {
 243+ foreach ($contextNode->childNodes as $child) {
 244+ if ($child->nodeName == "part") {
 245+ foreach ($child->childNodes as $partChild) {
 246+ $this->expandRec($partChild->childNodes, $flags, $headingIndex);
 247+ }
 248+ } else {
 249+ $this->expandRec($child->childNodes, $flags, $headingIndex);
 250+ }
 251+ }
 252+ if ( $contextNode->nodeName == 'template' ) {
 253+ $this->parser->braceSubstitution($contextNode, $this);
 254+ } else {
 255+ $this->parser->argSubstitution($contextNode, $this);
 256+ }
 257+ } elseif ( $contextNode->nodeName == 'comment' ) {
 258+ $comment = $contextNode->getAttribute("startTag");
 259+ # HTML-style comment
 260+ # Remove it in HTML, pre+remove and STRIP_COMMENTS modes
 261+ if ( $this->parser->ot['html']
 262+ || ( $this->parser->ot['pre'] && $this->parser->mOptions->getRemoveComments() )
 263+ || ( $flags & self::STRIP_COMMENTS ) )
 264+ {
 265+ if ($comment[0] == "\n" || $comment[strlen($comment) - 1] == "\n") {
 266+ $contextNode->parentNode->replaceChild($contextNode->ownerDocument->createTextNode("\n"), $contextNode);
 267+ } else {
 268+ $contextNode->parentNode->removeChild($contextNode);
 269+ }
 270+ }
 271+ # Add a strip marker in PST mode so that pstPass2() can run some old-fashioned regexes on the result
 272+ # Not in RECOVER_COMMENTS mode (extractSections) though
 273+ elseif ( $this->parser->ot['wiki'] && ! ( $flags & self::RECOVER_COMMENTS ) ) {
 274+ $outText = $this->parser->insertStripItem($contextNode->getAttribute("startTag"));
 275+ $contextNode->parentNode->replaceChild($contextNode->ownerDocument->createTextNode($outText), $contextNode);
 276+ }
 277+ # Recover the literal comment in RECOVER_COMMENTS and pre+no-remove
 278+ else {
 279+ $contextNode->parentNode->replaceChild($contextNode->ownerDocument->createTextNode($comment), $contextNode);
 280+ }
 281+ } elseif ($contextNode->nodeName == 'ignore') {
 282+ # Output suppression used by <includeonly> etc.
 283+ # OT_WIKI will only respect <ignore> in substed templates.
 284+ # The other output types respect it unless NO_IGNORE is set.
 285+ # extractSections() sets NO_IGNORE and so never respects it.
 286+ if (($this instanceof PPTemplateFrame || ! $this->parser->ot['wiki']) && ! ($flags & self::NO_IGNORE)) {
 287+ $contextNode->parentNode->removeChild($contextNode);
 288+ } else {
 289+ $outText = ParseEngine::unparse($contextNode);
 290+ $contextNode->parentNode->replaceChild($contextNode->ownerDocument->createTextNode($outText), $contextNode);
 291+ }
 292+ } elseif ( $contextNode->nodeName == 'xmltag' ) {
 293+ foreach ($contextNode->childNodes as $child) {
 294+ $this->expandRec($child->childNodes, $flags, $headingIndex);
 295+ }
 296+ $tagName = substr($contextNode->getAttribute("startTag"), 1);
 297+ $isStripTag = false;
 298+ foreach ($this->parser->getStripList() as $stripTag) {
 299+ $isStripTag = $tagName == $stripTag;
 300+ if ($isStripTag) {
 301+ break;
 302+ }
 303+ }
 304+ if ($isStripTag) {
 305+ $outText = $this->parser->extensionSubstitution($contextNode, $this);
 306+ } else {
 307+ $outText = ParseEngine::unparse($contextNode);
 308+ }
 309+ $contextNode->parentNode->replaceChild($contextNode->ownerDocument->createTextNode($outText), $contextNode);
 310+ } elseif ($contextNode->nodeName == 'h' && $contextNode->parentNode->nodeName == 'root' && $this->parser->ot['html']) {
 311+ # Insert a heading marker only for <h> children of <root>
 312+ # This is to stop extractSections from going over multiple tree levels
 313+ # Insert heading index marker
 314+ $this->expandRec($contextNode->childNodes, $flags, $headingIndex);
 315+ $titleText = $this->title->getPrefixedDBkey();
 316+ $this->parser->mHeadings[] = array( $titleText, $headingIndex );
 317+ $serial = count( $this->parser->mHeadings ) - 1;
 318+ $marker = "{$this->parser->mUniqPrefix}-h-$serial-" . Parser::MARKER_SUFFIX;
 319+ $this->parser->mStripState->general->setPair( $marker, '' );
 320+ $outText = $contextNode->getAttribute("startTag") . $marker . $contextNode->firstChild->wholeText .
 321+ $contextNode->getAttribute("endTag");
 322+ $contextNode->parentNode->replaceChild($contextNode->ownerDocument->createTextNode($outText), $contextNode);
 323+ $headingIndex ++;
 324+ } else {
 325+ $this->expandRec($contextNode->childNodes, $flags, $headingIndex);
 326+ $outText = ParseEngine::unparse($contextNode);
 327+ $contextNode->parentNode->replaceChild($contextNode->ownerDocument->createTextNode($outText), $contextNode);
 328+ }
 329+print("ParseRecOut - {$contextNode->ownerDocument->saveXML()}\n");
 330+ }
 331+ }
 332+
 333+ static function updateIncTags($root, $flags = 0) {
 334+ if ( $root instanceof DOMDocument ) {
 335+ $root = $root->documentElement;
 336+ }
 337+ $parent = $root;
 338+ if ($parent instanceof DOMNodeList) {
 339+ $parent = $parent->item(0)->parentNode;
 340+ }
 341+ $xpath = new DOMXPath( $parent->ownerDocument );
 342+ $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
 343+ $ignoreRest = $forInclusion && $xpath->query("xmltag[@startTag='<onlyinclude']", $parent)->length > 0;
 344+ $children = array();
 345+ $ind = -1;
 346+ while ($parent->hasChildNodes()) {
 347+ $child = $parent->firstChild;
 348+ $parent->removeChild($child);
 349+ $tagName = $child instanceof DOMElement ? substr($child->getAttribute("startTag"), 1) : "";
 350+ if ($tagName != "onlyinclude" && $ignoreRest) {
 351+ if ($ind < 0 || $children[$ind]->nodeName != "ignore") {
 352+ $children[] = $parent->ownerDocument->createElement("ignore");
 353+ $ind ++;
 354+ }
 355+ $children[$ind]->appendChild($child);
 356+ } elseif ($tagName == "includeonly" || $tagName == "noinclude" || $tagName == "onlyinclude") {
 357+ $leftTag = $parent->ownerDocument->createTextNode("<$tagName>");
 358+ $rightTag = $parent->ownerDocument->createTextNode("</$tagName>");
 359+ $inner = $child->lastChild;
 360+ if (($tagName == "includeonly" && ! $forInclusion) || ($tagName == "noinclude" && $forInclusion)) {
 361+ $children[] = $parent->ownerDocument->createElement("ignore");
 362+ $ind ++;
 363+ $children[$ind]->appendChild($leftTag);
 364+ while ($inner->hasChildNodes()) {
 365+ $gChild = $inner->firstChild;
 366+ $inner->removeChild($gChild);
 367+ $children[$ind]->appendChild($gChild);
 368+ }
 369+ $children[$ind]->appendChild($rightTag);
 370+ } else {
 371+ $children[] = $parent->ownerDocument->createElement("ignore");
 372+ $ind ++;
 373+ $children[$ind]->appendChild($leftTag);
 374+ while ($inner->hasChildNodes()) {
 375+ $children[] = $inner->firstChild;
 376+ $ind ++;
 377+ $inner->removeChild($inner->firstChild);
 378+ }
 379+ $children[] = $parent->ownerDocument->createElement("ignore");
 380+ $ind ++;
 381+ $children[$ind]->appendChild($rightTag);
 382+ }
 383+ } else {
 384+ $children[] = $child;
 385+ $ind ++;
 386+ }
 387+ }
 388+ foreach ($children as $child) {
 389+ $parent->appendChild($child);
 390+ }
 391+ }
 392+
 393+ function __toString() {
 394+ return 'frame{}';
 395+ }
 396+
 397+ function getPDBK( $level = false ) {
 398+ if ( $level === false ) {
 399+ return $this->title->getPrefixedDBkey();
 400+ } else {
 401+ return isset( $this->titleCache[$level] ) ? $this->titleCache[$level] : false;
 402+ }
 403+ }
 404+
 405+ function getArguments() {
 406+ return array();
 407+ }
 408+
 409+ function getNumberedArguments() {
 410+ return array();
 411+ }
 412+
 413+ function getNamedArguments() {
 414+ return array();
 415+ }
 416+
84417 /**
85 - * Get an argument to this frame by name
 418+ * Returns true if there are no arguments in this frame
86419 */
87 - function getArgument( $name );
 420+ function isEmpty() {
 421+ return true;
 422+ }
88423
 424+ function getArgument( $name ) {
 425+ return false;
 426+ }
 427+
89428 /**
90429 * Returns true if the infinite loop check is OK, false if a loop is detected
91430 */
92 - function loopCheck( $title );
 431+ function loopCheck( $title ) {
 432+ return !isset( $this->loopCheckHash[$title->getPrefixedDBkey()] );
 433+ }
93434
94435 /**
95436 * Return true if the frame is a template frame
96437 */
97 - function isTemplate();
 438+ function isTemplate() {
 439+ return false;
 440+ }
98441 }
99442
100443 /**
101 - * There are three types of nodes:
102 - * * Tree nodes, which have a name and contain other nodes as children
103 - * * Array nodes, which also contain other nodes but aren't considered part of a tree
104 - * * Leaf nodes, which contain the actual data
105 - *
106 - * This interface provides access to the tree structure and to the contents of array nodes,
107 - * but it does not provide access to the internal structure of leaf nodes. Access to leaf
108 - * data is provided via two means:
109 - * * PPFrame::expand(), which provides expanded text
110 - * * The PPNode::split*() functions, which provide metadata about certain types of tree node
 444+ * Expansion frame with template arguments
111445 * @ingroup Parser
112446 */
113 -interface PPNode {
114 - /**
115 - * Get an array-type node containing the children of this node.
116 - * Returns false if this is not a tree node.
117 - */
118 - function getChildren();
 447+class PPTemplateFrame extends PPFrame {
 448+ private $numberedArgs, $namedArgs, $parent, $numberedExpansionCache, $namedExpansionCache;
119449
120 - /**
121 - * Get the first child of a tree node. False if there isn't one.
122 - */
123 - function getFirstChild();
 450+ function __construct( $parent = false, $numberedArgs = array(), $namedArgs = array(), $title = false ) {
 451+ PPFrame::__construct( $parent->parser );
 452+ $this->parent = $parent;
 453+ $this->numberedArgs = $numberedArgs;
 454+ $this->namedArgs = $namedArgs;
 455+ $this->title = $title;
 456+ $pdbk = $title ? $title->getPrefixedDBkey() : false;
 457+ $this->titleCache = $parent->titleCache;
 458+ $this->titleCache[] = $pdbk;
 459+ $this->loopCheckHash = /*clone*/ $parent->loopCheckHash;
 460+ if ( $pdbk !== false ) {
 461+ $this->loopCheckHash[$pdbk] = true;
 462+ }
 463+ $this->depth = $parent->depth + 1;
 464+ $this->numberedExpansionCache = $this->namedExpansionCache = array();
 465+ }
124466
 467+ function __toString() {
 468+ $s = 'tplframe{';
 469+ $first = true;
 470+ $args = $this->numberedArgs + $this->namedArgs;
 471+ foreach ( $args as $name => $value ) {
 472+ if ( $first ) {
 473+ $first = false;
 474+ } else {
 475+ $s .= ', ';
 476+ }
 477+ $s .= "\"$name\":\"" .
 478+ str_replace( '"', '\\"', $value->ownerDocument->saveXML( $value ) ) . '"';
 479+ }
 480+ $s .= '}';
 481+ return $s;
 482+ }
125483 /**
126 - * Get the next sibling of any node. False if there isn't one
 484+ * Returns true if there are no arguments in this frame
127485 */
128 - function getNextSibling();
 486+ function isEmpty() {
 487+ return !count( $this->numberedArgs ) && !count( $this->namedArgs );
 488+ }
129489
130 - /**
131 - * Get all children of this tree node which have a given name.
132 - * Returns an array-type node, or false if this is not a tree node.
133 - */
134 - function getChildrenOfType( $type );
 490+ function getArguments() {
 491+ $arguments = array();
 492+ foreach ( array_merge(
 493+ array_keys($this->numberedArgs),
 494+ array_keys($this->namedArgs)) as $key ) {
 495+ $arguments[$key] = $this->getArgument($key);
 496+ }
 497+ return $arguments;
 498+ }
 499+
 500+ function getNumberedArguments() {
 501+ $arguments = array();
 502+ foreach ( array_keys($this->numberedArgs) as $key ) {
 503+ $arguments[$key] = $this->getArgument($key);
 504+ }
 505+ return $arguments;
 506+ }
 507+
 508+ function getNamedArguments() {
 509+ $arguments = array();
 510+ foreach ( array_keys($this->namedArgs) as $key ) {
 511+ $arguments[$key] = $this->getArgument($key);
 512+ }
 513+ return $arguments;
 514+ }
135515
 516+ function getNumberedArgument( $index ) {
 517+ if ( !isset( $this->numberedArgs[$index] ) ) {
 518+ return false;
 519+ }
 520+ if ( !isset( $this->numberedExpansionCache[$index] ) ) {
 521+ # No trimming for unnamed arguments
 522+ $this->numberedExpansionCache[$index] = $this->parent->expand( $this->numberedArgs[$index], self::STRIP_COMMENTS );
 523+ }
 524+ return $this->numberedExpansionCache[$index];
 525+ }
136526
137 - /**
138 - * Returns the length of the array, or false if this is not an array-type node
139 - */
140 - function getLength();
 527+ function getNamedArgument( $name ) {
 528+ if ( !isset( $this->namedArgs[$name] ) ) {
 529+ return false;
 530+ }
 531+ if ( !isset( $this->namedExpansionCache[$name] ) ) {
 532+ # Trim named arguments post-expand, for backwards compatibility
 533+ $this->namedExpansionCache[$name] = trim(
 534+ $this->parent->expand( $this->namedArgs[$name], self::STRIP_COMMENTS ) );
 535+ }
 536+ return $this->namedExpansionCache[$name];
 537+ }
141538
142 - /**
143 - * Returns an item of an array-type node
144 - */
145 - function item( $i );
 539+ function getArgument( $name ) {
 540+ $text = $this->getNumberedArgument( $name );
 541+ if ( $text === false ) {
 542+ $text = $this->getNamedArgument( $name );
 543+ }
 544+ return $text;
 545+ }
146546
147547 /**
148 - * Get the name of this node. The following names are defined here:
149 - *
150 - * h A heading node.
151 - * template A double-brace node.
152 - * tplarg A triple-brace node.
153 - * title The first argument to a template or tplarg node.
154 - * part Subsequent arguments to a template or tplarg node.
155 - * #nodelist An array-type node
156 - *
157 - * The subclass may define various other names for tree and leaf nodes.
 548+ * Return true if the frame is a template frame
158549 */
159 - function getName();
 550+ function isTemplate() {
 551+ return true;
 552+ }
 553+}
160554
161 - /**
162 - * Split a <part> node into an associative array containing:
163 - * name PPNode name
164 - * index String index
165 - * value PPNode value
166 - */
167 - function splitArg();
 555+/**
 556+ * Expansion frame with custom arguments
 557+ * @ingroup Parser
 558+ */
 559+class PPCustomFrame extends PPFrame {
 560+ private $args;
168561
169 - /**
170 - * Split an <ext> node into an associative array containing name, attr, inner and close
171 - * All values in the resulting array are PPNodes. Inner and close are optional.
172 - */
173 - function splitExt();
 562+ function __construct( $args ) { // Builds a frame whose arguments are supplied directly by the caller
 563+ PPFrame::__construct( ); // NOTE(review): PPFrame::__construct( $parser ) declares a required parser and dereferences it; none is passed here - confirm how custom frames should obtain one
 564+ $this->args = $args;
 565+ }
174566
175 - /**
176 - * Split an <h> node
177 - */
178 - function splitHeading();
 567+ function __toString() {
 568+ $s = 'cstmframe{';
 569+ $first = true;
 570+ foreach ( $this->args as $name => $value ) {
 571+ if ( $first ) {
 572+ $first = false;
 573+ } else {
 574+ $s .= ', ';
 575+ }
 576+ $s .= "\"$name\":\"" .
 577+ str_replace( '"', '\\"', $value->__toString() ) . '"';
 578+ }
 579+ $s .= '}';
 580+ return $s;
 581+ }
 582+
 583+ function isEmpty() {
 584+ return !count( $this->args );
 585+ }
 586+
 587+ function getArgument( $index ) {
 588+ if ( !isset( $this->args[$index] ) ) {
 589+ return false;
 590+ }
 591+ return $this->args[$index];
 592+ }
179593 }
 594+
Index: branches/parser-work/phase3/includes/parser/WikiTextGrammar.xml
@@ -0,0 +1,67 @@
 2+<?xml version="1.0"?>
 3+<Grammar rootTag="root" startRule="start">
 4+ <Sequence name="start" >
 5+ <Choice failSafe="true">
 6+ <Assignment name="h" tag="(={1,6})" regex="true">
 7+ <Reference name="endText" />
 8+ </Assignment>
 9+ </Choice>
 10+ <Reference name="main" />
 11+ </Sequence>
 12+ <Text name="main">
 13+ <Assignment name="link" tag="[[">
 14+ <Reference name="endText" var="]]" />
 15+ </Assignment>
 16+ <Assignment name="h" tag="\n(={1,6})" regex="true">
 17+ <Reference name="endText" />
 18+ </Assignment>
 19+ <Assignment name="tplarg" tag="{{{(?!{)" regex="true">
 20+ <Reference name="tplSeq" var="}}}" />
 21+ </Assignment>
 22+ <Assignment name="template" tag="{{">
 23+ <Reference name="tplSeq" var="}}" />
 24+ </Assignment>
 25+ <Assignment name="comment" tag="\n?(?:&lt;!--.*?(?:-->\n?|$))+" regex="true" />
 26+ <Assignment name="xmltag" tag="&lt;(\w+)(?= |>)" regex="true">
 27+ <Sequence>
 28+ <Assignment name="attr">
 29+ <Reference name="main" />
 30+ </Assignment>
 31+ <Choice>
 32+ <EndTag tag="/>" />
 33+ <Assignment name="inner" tag=">">
 34+ <Reference name="endText" var="&lt;/~r>" />
 35+ </Assignment>
 36+ </Choice>
 37+ </Sequence>
 38+ </Assignment>
 39+ </Text>
 40+ <Sequence name="endText">
 41+ <Reference name="main" />
 42+ <EndTag tag="~r" />
 43+ </Sequence>
 44+ <Sequence name="tplSeq">
 45+ <Assignment name="title">
 46+ <Reference name="main" />
 47+ </Assignment>
 48+ <Reference name="partList" />
 49+ <EndTag tag="~r" />
 50+ </Sequence>
 51+ <Choice name="partList" failSafe="true">
 52+ <Sequence>
 53+ <Assignment name="part" tag="|">
 54+ <Sequence>
 55+ <Assignment name="first">
 56+ <Reference name="main" />
 57+ </Assignment>
 58+ <Choice failSafe="true">
 59+ <Assignment name="value" tag="=">
 60+ <Reference name="main" />
 61+ </Assignment>
 62+ </Choice>
 63+ </Sequence>
 64+ </Assignment>
 65+ <Reference name="partList" />
 66+ </Sequence>
 67+ </Choice>
 68+</Grammar>
Index: branches/parser-work/phase3/includes/AutoLoader.php
@@ -443,30 +443,10 @@
444444 'LinkHolderArray' => 'includes/parser/LinkHolderArray.php',
445445 'LinkMarkerReplacer' => 'includes/parser/Parser_LinkHooks.php',
446446 'OnlyIncludeReplacer' => 'includes/parser/Parser.php',
447 - 'PPCustomFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
448 - 'PPCustomFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
449 - 'PPDAccum_Hash' => 'includes/parser/Preprocessor_Hash.php',
450 - 'PPDPart_Hash' => 'includes/parser/Preprocessor_Hash.php',
451 - 'PPDStackElement_Hash' => 'includes/parser/Preprocessor_Hash.php',
452 - 'PPDStack_Hash' => 'includes/parser/Preprocessor_Hash.php',
 447+ 'PPCustomFrame' => 'includes/parser/Preprocessor.php',
453448 'PPFrame' => 'includes/parser/Preprocessor.php',
454 - 'PPFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
455 - 'PPFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
456 - 'PPNode' => 'includes/parser/Preprocessor.php',
457 - 'PPNode_DOM' => 'includes/parser/Preprocessor_DOM.php',
458 - 'PPNode_Hash_Array' => 'includes/parser/Preprocessor_Hash.php',
459 - 'PPNode_Hash_Attr' => 'includes/parser/Preprocessor_Hash.php',
460 - 'PPNode_Hash_Text' => 'includes/parser/Preprocessor_Hash.php',
461 - 'PPNode_Hash_Tree' => 'includes/parser/Preprocessor_Hash.php',
462 - 'PPTemplateFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
463 - 'PPTemplateFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
464 - 'ParseAssign' => 'includes/parser/ParseEngine.php',
465 - 'ParseChoice' => 'includes/parser/ParseEngine.php',
 449+ 'PPTemplateFrame' => 'includes/parser/Preprocessor.php',
466450 'ParseEngine' => 'includes/parser/ParseEngine.php',
467 - 'ParseObject' => 'includes/parser/ParseEngine.php',
468 - 'ParsePattern' => 'includes/parser/ParseEngine.php',
469 - 'ParseQuant' => 'includes/parser/ParseEngine.php',
470 - 'ParseSeq' => 'includes/parser/ParseEngine.php',
471451 'Parser' => 'includes/parser/Parser.php',
472452 'ParserCache' => 'includes/parser/ParserCache.php',
473453 'ParserOptions' => 'includes/parser/ParserOptions.php',
@@ -474,8 +454,6 @@
475455 'Parser_DiffTest' => 'includes/parser/Parser_DiffTest.php',
476456 'Parser_LinkHooks' => 'includes/parser/Parser_LinkHooks.php',
477457 'Preprocessor' => 'includes/parser/Preprocessor.php',
478 - 'Preprocessor_DOM' => 'includes/parser/Preprocessor_DOM.php',
479 - 'Preprocessor_Hash' => 'includes/parser/Preprocessor_Hash.php',
480458 'StripState' => 'includes/parser/Parser.php',
481459 'MWTidy' => 'includes/parser/Tidy.php',
482460

Status & tagging log