r62084 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r62083‎ | r62084 | r62085 >
Date:14:22, 7 February 2010
Author:happy-melon
Status:deferred (Comments)
Tags:
Comment:
Apply initial patch by Nathanael Thompson (than4213)
Modified paths:
  • /branches/parser-work/phase3/includes/AutoLoader.php (modified) (history)
  • /branches/parser-work/phase3/includes/parser/ParseTree.php (added) (history)
  • /branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php (modified) (history)

Diff [purge]

Index: branches/parser-work/phase3/includes/parser/ParseTree.php
@@ -0,0 +1,228 @@
 2+<?php
 3+
/**
 * A rule specifying how to parse the text.
 * If the text matches mBeginTag then a ParseTree object is created with the appropriate info.
 * mName - The name to give the resultant ParseTree object
 * mBeginTag - the regular expression used to determine if this is the rule that should be used
 * mEndTag - If ParseTrees of this type are to have children, mEndTag specifies when all of the
 *   children are collected. "~N" inside it is replaced by the literal text of capture group N
 *   of the begin-tag match.
 * mStopChars - extra characters that indicate markup
 * mChildRule - an extra rule to consider when collecting children, it is only used for
 *   situations covered by the HHP21 parser test
 * @ingroup Parser
 */
class ParseRule {
	private $mName, $mBeginTag, $mEndTag, $mStopChars, $mChildRule;

	/**
	 * @param $name String name given to the ParseTree nodes this rule produces
	 * @param $beginTag String regular expression that must match at the start of the text
	 * @param $endTag String|null regular expression that closes the node, or null for a leaf rule
	 * @param $stopChars String extra stop characters handed to the ParseList while collecting children
	 * @param $childRule ParseRule|null extra rule tried before the ParseList on every iteration
	 */
	function __construct( $name, $beginTag, $endTag = null, $stopChars = '', $childRule = null ) {
		$this->mName = $name;
		$this->mBeginTag = $beginTag;
		$this->mEndTag = $endTag;
		$this->mStopChars = $stopChars;
		$this->mChildRule = $childRule;
	}

	/**
	 * Try to apply this rule at the start of $text.
	 *
	 * On success the matched text (begin tag, children and, when found, end tag) is removed
	 * from the front of $text. When an end tag is found its match groups are appended to the
	 * begin-tag groups; when the text runs out first, only the begin-tag groups are stored.
	 *
	 * @param $text String remaining input, consumed in place
	 * @param $parseList ParseList used to collect the children between begin and end tag
	 * @return ParseTree|null the new node, or null if the begin tag does not match
	 */
	function parse( &$text, $parseList ) {
		if ( !preg_match( $this->mBeginTag, $text, $matches ) ) {
			return null;
		}

		// substr() returns false instead of '' at the end of the string on PHP < 8;
		// the cast keeps $text a string for the caller.
		$text = (string)substr( $text, strlen( $matches[0] ) );
		$children = array();

		if ( $this->mEndTag !== null && $this->mEndTag !== '' ) {
			$endTag = $this->buildEndTag( $matches );
			$endMatches = array();
			$closed = false;

			while ( (string)$text !== '' ) {
				if ( preg_match( $endTag, $text, $endMatches ) ) {
					$closed = true;
					break;
				}
				if ( $this->mChildRule !== null ) {
					$child = $this->mChildRule->parse( $text, $parseList );
					if ( $child !== null ) {
						$children[] = $child;
					}
				}
				// The child rule may have consumed the rest of the input; the list
				// parser must not be asked to read a character from an empty string.
				if ( (string)$text === '' ) {
					break;
				}
				$children = array_merge( $children, $parseList->parse( $text, $this->mStopChars ) );
			}

			if ( $closed ) {
				$text = (string)substr( $text, strlen( $endMatches[0] ) );
				$matches = array_merge( $matches, $endMatches );
			}
		}

		return new ParseTree( $this->mName, $matches, $children );
	}

	/**
	 * Expand the "~N" placeholders of the end tag with the begin-tag capture groups.
	 *
	 * The captured text is quoted so that it is matched literally, and all placeholders are
	 * replaced in a single pass (longest placeholder first) so that "~1" cannot clobber
	 * "~10" and inserted text is never re-scanned for placeholders.
	 *
	 * @param $matches Array capture groups of the begin-tag match
	 * @return String the end-tag regular expression to use for this node
	 */
	private function buildEndTag( $matches ) {
		// The first character of the pattern is its delimiter and must be quoted as well.
		$delimiter = $this->mEndTag[0];
		$replacements = array();
		foreach ( $matches as $i => $captured ) {
			$replacements['~' . $i] = preg_quote( $captured, $delimiter );
		}
		return strtr( $this->mEndTag, $replacements );
	}
}
 57+
/**
 * Contains a list of rules to cycle through when creating a parse tree
 * mList - The list of rules
 * mStopChars - the characters used to find markup; they are inserted verbatim into a
 *   regular-expression character class, so the caller must escape them accordingly
 * @ingroup Parser
 */
class ParseList {
	private $mList, $mStopChars;

	/**
	 * @param $list Array of ParseRule objects, tried in order
	 * @param $stopChars String characters that end a run of literal text
	 */
	function __construct( $list, $stopChars ) {
		$this->mList = $list;
		$this->mStopChars = $stopChars;
	}

	/**
	 * Consume one item from the front of $text, plus the literal run that follows it.
	 *
	 * The first rule that matches yields a ParseTree child; if none matches, a single
	 * character is taken as a literal child. After that, everything up to the next stop
	 * character is taken as one more literal child.
	 *
	 * @param $text String remaining input, consumed in place
	 * @param $stopChars String additional stop characters for this call (regex character-class syntax)
	 * @return Array of ParseTree objects and literal strings, in input order
	 */
	function parse( &$text, $stopChars ) {
		$children = array();
		// Initialised up front so an empty rule list does not read an undefined variable.
		$child = null;

		foreach ( $this->mList as $rule ) {
			$child = $rule->parse( $text, $this );
			if ( $child !== null ) {
				$children[] = $child;
				break;
			}
		}

		if ( $child === null ) {
			if ( (string)$text === '' ) {
				// Nothing left to consume; reading $text[0] here would be an invalid offset.
				return $children;
			}
			$children[] = $text[0];
			// substr() returns false at the end of the string on PHP < 8; keep $text a string.
			$text = (string)substr( $text, 1 );
		}

		if ( preg_match( '/^[^' . $this->mStopChars . $stopChars . ']+/s', $text, $matches ) ) {
			$children[] = $matches[0];
			$text = (string)substr( $text, strlen( $matches[0] ) );
		}

		return $children;
	}
}
 94+
/**
 * The parse tree of the data.
 * printTree translates the parse tree to xml, eventually this should be separated into a
 * data and engine layer.
 * mName - Indicates what ParseRule was used to create this node
 * mMatches - The text groups that were collected by the regular expressions used when creating this rule
 * mChildren - The child ParseTree nodes and literal strings in this tree
 * @ingroup Parser
 */
class ParseTree {
	private $mName, $mMatches, $mChildren;

	/**
	 * @param $name String name of the rule that produced this node
	 * @param $matches Array begin-tag match groups, followed by the end-tag groups when the node was closed
	 * @param $children Array of ParseTree objects and literal strings
	 */
	function __construct( $name, $matches, $children ) {
		$this->mName = $name;
		$this->mMatches = $matches;
		$this->mChildren = $children;
	}

	/**
	 * Parse $text into a tree rooted at a "Root" node.
	 *
	 * @param $text String the text to parse
	 * @param $parseList ParseList rules used to build the children of the root
	 * @return ParseTree the root node
	 */
	static function createParseTree( $text, $parseList ) {
		wfProfileIn( __METHOD__ );

		// "~BOF" marks the beginning of the input; the rules visible in the preprocessor
		// match it to recognise constructs at the very start of the text.
		$text = "~BOF" . $text;
		// The root rule matches the empty string and collects children until the input ends.
		$root = new ParseRule( "Root", '/^/', '/^\Z/' );
		$retTree = $root->parse( $text, $parseList );

		wfProfileOut( __METHOD__ );
		return $retTree;
	}

	/**
	 * Serialise this node and its descendants to the preprocessor XML format.
	 *
	 * This function will definitely need to be separated into data and engine layers.
	 * It does not modify the tree, so it may be called repeatedly.
	 *
	 * @param $headingInd Integer index given to the next heading; incremented for every
	 *   Heading node visited, closed or not
	 * @return String XML fragment
	 */
	function printTree( &$headingInd = 1 ) {
		$name = $this->mName;
		$retString = "";

		if ( $name === "Literal" || $name === "BugHHP21" ) {
			$retString = htmlspecialchars( $this->mMatches[0] );
		} elseif ( $name === "Comment" ) {
			$retString = "<comment>" . htmlspecialchars( $this->mMatches[0] ) . "</comment>";
		} elseif ( $name === "CommentLine" ) {
			$retString = htmlspecialchars( $this->mMatches[1] ) .
				"<comment>" . htmlspecialchars( $this->mMatches[2] ) . "</comment>";
		} elseif ( $name === "IncludeOnly" || $name === "NoInclude" || $name === "OnlyInclude" ) {
			$retString = "<ignore>" . htmlspecialchars( $this->mMatches[0] ) . "</ignore>";
		} elseif ( $name === "XmlClosed" ) {
			$retString = "<ext><name>" . htmlspecialchars( $this->mMatches[1] ) .
				"</name><attr>" . htmlspecialchars( $this->mMatches[2] ) . "</attr></ext>";
		} elseif ( $name === "XmlOpened" ) {
			$closeTag = "";
			if ( isset( $this->mMatches[4] ) && $this->mMatches[4] !== "" ) {
				$closeTag = "<close>" . htmlspecialchars( $this->mMatches[4] ) . "</close>";
			}
			$retString = "<ext><name>" . htmlspecialchars( $this->mMatches[1] ) .
				"</name><attr>" . htmlspecialchars( $this->mMatches[2] ) .
				"</attr><inner>" . htmlspecialchars( $this->mMatches[3] ) . "</inner>" .
				$closeTag . "</ext>";
		} elseif ( $name === "BeginFile" ) {
			if ( isset( $this->mMatches[1] ) ) {
				$retString = "<ignore>" . htmlspecialchars( $this->mMatches[1] ) . "</ignore>";
			}
		} elseif ( ( $name === "Template" && isset( $this->mMatches[2] ) )
			|| ( $name === "TplArg" && isset( $this->mMatches[1] ) )
		) {
			// The extra match group is only present when the closing braces were found.
			$retString = $this->printInvocation( $headingInd );
		} else {
			foreach ( $this->mChildren as $crrnt ) {
				if ( $crrnt instanceof ParseTree ) {
					$retString .= $crrnt->printTree( $headingInd );
				} else {
					$retString .= htmlspecialchars( $crrnt );
				}
			}
			if ( $name === "Root" ) {
				$retString = "<root>" . $retString . "</root>";
			} elseif ( $name === "TplArg" ) {
				// Unclosed {{{ : emit the opening braces as literal text.
				$retString = htmlspecialchars( $this->mMatches[0] ) . $retString;
			} elseif ( $name === "Template" ) {
				// Unclosed {{ : emit the braces literally, keeping a leading newline.
				$retString = "{{" . $retString;
				if ( $this->mMatches[1] === "\n" ) {
					$retString = $this->mMatches[1] . $retString;
				}
			} elseif ( $name === "Link" ) {
				$retString = htmlspecialchars( $this->mMatches[0] ) . $retString;
				if ( isset( $this->mMatches[1] ) ) {
					$retString .= htmlspecialchars( $this->mMatches[1] );
				}
			} elseif ( $name === "Heading" ) {
				$retString = htmlspecialchars( $this->mMatches[2] ) . $retString;
				if ( isset( $this->mMatches[3] ) ) {
					$retString = "<h level=\"" . strlen( $this->mMatches[2] ) .
						"\" i=\"" . $headingInd . "\">" .
						$retString . htmlspecialchars( $this->mMatches[3] ) . "</h>";
				}
				if ( $this->mMatches[1] === "\n" ) {
					$retString = "\n" . $retString;
				}
				$headingInd++;
			}
		}

		return $retString;
	}

	/**
	 * Serialise a closed Template or TplArg node: a title followed by its parts.
	 *
	 * Literal '|' children separate the parts; the first literal '=' child inside a part
	 * separates its name from its value. Parts without '=' get a numeric index.
	 *
	 * @param $headingInd Integer running heading index, passed on to nested nodes
	 * @return String XML fragment
	 */
	private function printInvocation( &$headingInd ) {
		$retString = "";
		$inTitle = true;
		$foundEquals = false;
		$currentItem = "";
		$partInd = 1;

		// A trailing '|' flushes the last part. It is added to a local copy so that the
		// tree itself is left unchanged and repeated calls give the same output.
		$children = $this->mChildren;
		$children[] = '|';

		foreach ( $children as $crrnt ) {
			if ( $crrnt instanceof ParseTree ) {
				$currentItem .= $crrnt->printTree( $headingInd );
			} elseif ( $crrnt === '|' ) {
				if ( $inTitle ) {
					$retString .= "<title>" . $currentItem . "</title>";
					$inTitle = false;
				} else {
					if ( !$foundEquals ) {
						$retString .= "<part><name index=\"" . $partInd . "\" />";
						$partInd++;
					}
					$retString .= "<value>" . $currentItem . "</value></part>";
					$foundEquals = false;
				}
				$currentItem = "";
			} elseif ( $crrnt === '=' && !$inTitle && !$foundEquals ) {
				$retString .= "<part><name>" . $currentItem . "</name>=";
				$foundEquals = true;
				$currentItem = "";
			} else {
				$currentItem .= htmlspecialchars( $crrnt );
			}
		}

		if ( $this->mName === "Template" ) {
			$templateAttr = "";
			// A non-empty first group means the begin tag matched a preceding newline
			// or the start-of-input marker.
			if ( isset( $this->mMatches[1] ) && $this->mMatches[1] !== "" ) {
				$templateAttr = " lineStart=\"1\"";
			}
			$retString = "<template" . $templateAttr . ">" . $retString . "</template>";
			if ( $this->mMatches[1] === "\n" ) {
				$retString = $this->mMatches[1] . $retString;
			}
		} else {
			$retString = "<tplarg>" . $retString . "</tplarg>";
		}

		return $retString;
	}
}
 229+
Property changes on: branches/parser-work/phase3/includes/parser/ParseTree.php
___________________________________________________________________
Name: svn:eol-style
1230 + native
Index: branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php
@@ -118,688 +118,46 @@
119119 return $obj;
120120 }
121121
 122+ /**
 123+ * Preprocessor that reads in wiki text and returns xml.
 124+ * This is the data layer of the new wikitext parser.
 125+ */
122126 function preprocessToXml( $text, $flags = 0 ) {
123127 wfProfileIn( __METHOD__ );
 128+
 129+ $xmlishRegex = implode('|', $this->parser->getStripList());
 130+ $bugHHP21 = new ParseRule("BugHHP21", '/^\n(?==[^=])/s');
124131 $rules = array(
125 - '{' => array(
126 - 'end' => '}',
127 - 'names' => array(
128 - 2 => 'template',
129 - 3 => 'tplarg',
130 - ),
131 - 'min' => 2,
132 - 'max' => 3,
133 - ),
134 - '[' => array(
135 - 'end' => ']',
136 - 'names' => array( 2 => null ),
137 - 'min' => 2,
138 - 'max' => 2,
139 - )
140 - );
 132+ new ParseRule("Template", '/^((?:\n|~BOF)?){{(?!{[^{])/s', '/^}}/s', '}|=', $bugHHP21),
 133+ new ParseRule("TplArg", '/^{{{/s', '/^}}}/s', '}|=', $bugHHP21),
 134+ new ParseRule("Link", '/^\[\[/s', '/^]]/s', '\]'),
 135+ new ParseRule("Heading", '/^(\n|~BOF)(={1,6})/s', '/^~2(?: *<!--.*?(?:-->|\Z))*(?=\n|\Z)/s', '='),
 136+ new ParseRule("CommentLine", '/^(\n *)((?:<!--.*?(?:-->|\Z)(?: *\n)?)+)/s'),
 137+ new ParseRule("Comment", '/^<!--.*?(?:-->|\Z)/s'),
 138+ new ParseRule("OnlyInclude", '/^<\/?onlyinclude>/s'),
 139+ new ParseRule("NoInclude", '/^<\/?noinclude>/s'),
 140+ new ParseRule("IncludeOnly", '/^<includeonly>.*?(?:<\/includeonly>|\Z)/s'),
 141+ new ParseRule("XmlClosed", '/^<(' . $xmlishRegex . ')([^>]*)\/>/si'),
 142+ new ParseRule("XmlOpened", '/^<(' . $xmlishRegex . ')(.*?)>(.*?)(<\/\1>|\Z)/si'),
 143+ new ParseRule("BeginFile", '/^~BOF/s'));
141144
142 - $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
143 -
144 - $xmlishElements = $this->parser->getStripList();
145 - $enableOnlyinclude = false;
146 - if ( $forInclusion ) {
147 - $ignoredTags = array( 'includeonly', '/includeonly' );
148 - $ignoredElements = array( 'noinclude' );
149 - $xmlishElements[] = 'noinclude';
150 - if ( strpos( $text, '<onlyinclude>' ) !== false && strpos( $text, '</onlyinclude>' ) !== false ) {
151 - $enableOnlyinclude = true;
152 - }
153 - } else {
154 - $ignoredTags = array( 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' );
155 - $ignoredElements = array( 'includeonly' );
156 - $xmlishElements[] = 'includeonly';
 145+ if ($flags & Parser::PTD_FOR_INCLUSION) {
 146+ $rules[6] = new ParseRule("OnlyInclude", '/^<\/onlyinclude>.*?(?:<onlyinclude>|\Z)/s');
 147+ $rules[7] = new ParseRule("NoInclude", '/^<noinclude>.*?(?:<\/noinclude>|\Z)/s');
 148+ $rules[8] = new ParseRule("IncludeOnly", '/^<\/?includeonly>/s');
 149+ $rules[11] = new ParseRule("BeginFile", '/^~BOF(.*?<onlyinclude>)?/s');
157150 }
158 - $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
159151
160 - // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
161 - $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";
 152+ $parseList = new ParseList($rules, '{\[<\n');
 153+ $parseTree = ParseTree::createParseTree($text, $parseList);
 154+ $xml = $parseTree->printTree();
162155
163 - $stack = new PPDStack;
164 -
165 - $searchBase = "[{<\n"; #}
166 - $revText = strrev( $text ); // For fast reverse searches
167 -
168 - $i = 0; # Input pointer, starts out pointing to a pseudo-newline before the start
169 - $accum =& $stack->getAccum(); # Current accumulator
170 - $accum = '<root>';
171 - $findEquals = false; # True to find equals signs in arguments
172 - $findPipe = false; # True to take notice of pipe characters
173 - $headingIndex = 1;
174 - $inHeading = false; # True if $i is inside a possible heading
175 - $noMoreGT = false; # True if there are no more greater-than (>) signs right of $i
176 - $findOnlyinclude = $enableOnlyinclude; # True to ignore all input up to the next <onlyinclude>
177 - $fakeLineStart = true; # Do a line-start run without outputting an LF character
178 -
179 - while ( true ) {
180 - //$this->memCheck();
181 -
182 - if ( $findOnlyinclude ) {
183 - // Ignore all input up to the next <onlyinclude>
184 - $startPos = strpos( $text, '<onlyinclude>', $i );
185 - if ( $startPos === false ) {
186 - // Ignored section runs to the end
187 - $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i ) ) . '</ignore>';
188 - break;
189 - }
190 - $tagEndPos = $startPos + strlen( '<onlyinclude>' ); // past-the-end
191 - $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i ) ) . '</ignore>';
192 - $i = $tagEndPos;
193 - $findOnlyinclude = false;
194 - }
195 -
196 - if ( $fakeLineStart ) {
197 - $found = 'line-start';
198 - $curChar = '';
199 - } else {
200 - # Find next opening brace, closing brace or pipe
201 - $search = $searchBase;
202 - if ( $stack->top === false ) {
203 - $currentClosing = '';
204 - } else {
205 - $currentClosing = $stack->top->close;
206 - $search .= $currentClosing;
207 - }
208 - if ( $findPipe ) {
209 - $search .= '|';
210 - }
211 - if ( $findEquals ) {
212 - // First equals will be for the template
213 - $search .= '=';
214 - }
215 - $rule = null;
216 - # Output literal section, advance input counter
217 - $literalLength = strcspn( $text, $search, $i );
218 - if ( $literalLength > 0 ) {
219 - $accum .= htmlspecialchars( substr( $text, $i, $literalLength ) );
220 - $i += $literalLength;
221 - }
222 - if ( $i >= strlen( $text ) ) {
223 - if ( $currentClosing == "\n" ) {
224 - // Do a past-the-end run to finish off the heading
225 - $curChar = '';
226 - $found = 'line-end';
227 - } else {
228 - # All done
229 - break;
230 - }
231 - } else {
232 - $curChar = $text[$i];
233 - if ( $curChar == '|' ) {
234 - $found = 'pipe';
235 - } elseif ( $curChar == '=' ) {
236 - $found = 'equals';
237 - } elseif ( $curChar == '<' ) {
238 - $found = 'angle';
239 - } elseif ( $curChar == "\n" ) {
240 - if ( $inHeading ) {
241 - $found = 'line-end';
242 - } else {
243 - $found = 'line-start';
244 - }
245 - } elseif ( $curChar == $currentClosing ) {
246 - $found = 'close';
247 - } elseif ( isset( $rules[$curChar] ) ) {
248 - $found = 'open';
249 - $rule = $rules[$curChar];
250 - } else {
251 - # Some versions of PHP have a strcspn which stops on null characters
252 - # Ignore and continue
253 - ++$i;
254 - continue;
255 - }
256 - }
257 - }
258 -
259 - if ( $found == 'angle' ) {
260 - $matches = false;
261 - // Handle </onlyinclude>
262 - if ( $enableOnlyinclude && substr( $text, $i, strlen( '</onlyinclude>' ) ) == '</onlyinclude>' ) {
263 - $findOnlyinclude = true;
264 - continue;
265 - }
266 -
267 - // Determine element name
268 - if ( !preg_match( $elementsRegex, $text, $matches, 0, $i + 1 ) ) {
269 - // Element name missing or not listed
270 - $accum .= '&lt;';
271 - ++$i;
272 - continue;
273 - }
274 - // Handle comments
275 - if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
276 - // To avoid leaving blank lines, when a comment is both preceded
277 - // and followed by a newline (ignoring spaces), trim leading and
278 - // trailing spaces and one of the newlines.
279 -
280 - // Find the end
281 - $endPos = strpos( $text, '-->', $i + 4 );
282 - if ( $endPos === false ) {
283 - // Unclosed comment in input, runs to end
284 - $inner = substr( $text, $i );
285 - $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
286 - $i = strlen( $text );
287 - } else {
288 - // Search backwards for leading whitespace
289 - $wsStart = $i ? ( $i - strspn( $revText, ' ', strlen( $text ) - $i ) ) : 0;
290 - // Search forwards for trailing whitespace
291 - // $wsEnd will be the position of the last space
292 - $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
293 - // Eat the line if possible
294 - // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
295 - // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
296 - // it's a possible beneficial b/c break.
297 - if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
298 - && substr( $text, $wsEnd + 1, 1 ) == "\n" )
299 - {
300 - $startPos = $wsStart;
301 - $endPos = $wsEnd + 1;
302 - // Remove leading whitespace from the end of the accumulator
303 - // Sanity check first though
304 - $wsLength = $i - $wsStart;
305 - if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {
306 - $accum = substr( $accum, 0, -$wsLength );
307 - }
308 - // Do a line-start run next time to look for headings after the comment
309 - $fakeLineStart = true;
310 - } else {
311 - // No line to eat, just take the comment itself
312 - $startPos = $i;
313 - $endPos += 2;
314 - }
315 -
316 - if ( $stack->top ) {
317 - $part = $stack->top->getCurrentPart();
318 - if ( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) {
319 - // Comments abutting, no change in visual end
320 - $part->commentEnd = $wsEnd;
321 - } else {
322 - $part->visualEnd = $wsStart;
323 - $part->commentEnd = $endPos;
324 - }
325 - }
326 - $i = $endPos + 1;
327 - $inner = substr( $text, $startPos, $endPos - $startPos + 1 );
328 - $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
329 - }
330 - continue;
331 - }
332 - $name = $matches[1];
333 - $lowerName = strtolower( $name );
334 - $attrStart = $i + strlen( $name ) + 1;
335 -
336 - // Find end of tag
337 - $tagEndPos = $noMoreGT ? false : strpos( $text, '>', $attrStart );
338 - if ( $tagEndPos === false ) {
339 - // Infinite backtrack
340 - // Disable tag search to prevent worst-case O(N^2) performance
341 - $noMoreGT = true;
342 - $accum .= '&lt;';
343 - ++$i;
344 - continue;
345 - }
346 -
347 - // Handle ignored tags
348 - if ( in_array( $lowerName, $ignoredTags ) ) {
349 - $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i + 1 ) ) . '</ignore>';
350 - $i = $tagEndPos + 1;
351 - continue;
352 - }
353 -
354 - $tagStartPos = $i;
355 - if ( $text[$tagEndPos-1] == '/' ) {
356 - $attrEnd = $tagEndPos - 1;
357 - $inner = null;
358 - $i = $tagEndPos + 1;
359 - $close = '';
360 - } else {
361 - $attrEnd = $tagEndPos;
362 - // Find closing tag
363 - if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
364 - $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) )
365 - {
366 - $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
367 - $i = $matches[0][1] + strlen( $matches[0][0] );
368 - $close = '<close>' . htmlspecialchars( $matches[0][0] ) . '</close>';
369 - } else {
370 - // No end tag -- let it run out to the end of the text.
371 - $inner = substr( $text, $tagEndPos + 1 );
372 - $i = strlen( $text );
373 - $close = '';
374 - }
375 - }
376 - // <includeonly> and <noinclude> just become <ignore> tags
377 - if ( in_array( $lowerName, $ignoredElements ) ) {
378 - $accum .= '<ignore>' . htmlspecialchars( substr( $text, $tagStartPos, $i - $tagStartPos ) )
379 - . '</ignore>';
380 - continue;
381 - }
382 -
383 - $accum .= '<ext>';
384 - if ( $attrEnd <= $attrStart ) {
385 - $attr = '';
386 - } else {
387 - $attr = substr( $text, $attrStart, $attrEnd - $attrStart );
388 - }
389 - $accum .= '<name>' . htmlspecialchars( $name ) . '</name>' .
390 - // Note that the attr element contains the whitespace between name and attribute,
391 - // this is necessary for precise reconstruction during pre-save transform.
392 - '<attr>' . htmlspecialchars( $attr ) . '</attr>';
393 - if ( $inner !== null ) {
394 - $accum .= '<inner>' . htmlspecialchars( $inner ) . '</inner>';
395 - }
396 - $accum .= $close . '</ext>';
397 - }
398 -
399 - elseif ( $found == 'line-start' ) {
400 - // Is this the start of a heading?
401 - // Line break belongs before the heading element in any case
402 - if ( $fakeLineStart ) {
403 - $fakeLineStart = false;
404 - } else {
405 - $accum .= $curChar;
406 - $i++;
407 - }
408 -
409 - $count = strspn( $text, '=', $i, 6 );
410 - if ( $count == 1 && $findEquals ) {
411 - // DWIM: This looks kind of like a name/value separator
412 - // Let's let the equals handler have it and break the potential heading
413 - // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
414 - } elseif ( $count > 0 ) {
415 - $piece = array(
416 - 'open' => "\n",
417 - 'close' => "\n",
418 - 'parts' => array( new PPDPart( str_repeat( '=', $count ) ) ),
419 - 'startPos' => $i,
420 - 'count' => $count );
421 - $stack->push( $piece );
422 - $accum =& $stack->getAccum();
423 - $flags = $stack->getFlags();
424 - extract( $flags );
425 - $i += $count;
426 - }
427 - }
428 -
429 - elseif ( $found == 'line-end' ) {
430 - $piece = $stack->top;
431 - // A heading must be open, otherwise \n wouldn't have been in the search list
432 - assert( $piece->open == "\n" );
433 - $part = $piece->getCurrentPart();
434 - // Search back through the input to see if it has a proper close
435 - // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
436 - $wsLength = strspn( $revText, " \t", strlen( $text ) - $i );
437 - $searchStart = $i - $wsLength;
438 - if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) {
439 - // Comment found at line end
440 - // Search for equals signs before the comment
441 - $searchStart = $part->visualEnd;
442 - $searchStart -= strspn( $revText, " \t", strlen( $text ) - $searchStart );
443 - }
444 - $count = $piece->count;
445 - $equalsLength = strspn( $revText, '=', strlen( $text ) - $searchStart );
446 - if ( $equalsLength > 0 ) {
447 - if ( $i - $equalsLength == $piece->startPos ) {
448 - // This is just a single string of equals signs on its own line
449 - // Replicate the doHeadings behaviour /={count}(.+)={count}/
450 - // First find out how many equals signs there really are (don't stop at 6)
451 - $count = $equalsLength;
452 - if ( $count < 3 ) {
453 - $count = 0;
454 - } else {
455 - $count = min( 6, intval( ( $count - 1 ) / 2 ) );
456 - }
457 - } else {
458 - $count = min( $equalsLength, $count );
459 - }
460 - if ( $count > 0 ) {
461 - // Normal match, output <h>
462 - $element = "<h level=\"$count\" i=\"$headingIndex\">$accum</h>";
463 - $headingIndex++;
464 - } else {
465 - // Single equals sign on its own line, count=0
466 - $element = $accum;
467 - }
468 - } else {
469 - // No match, no <h>, just pass down the inner text
470 - $element = $accum;
471 - }
472 - // Unwind the stack
473 - $stack->pop();
474 - $accum =& $stack->getAccum();
475 - $flags = $stack->getFlags();
476 - extract( $flags );
477 -
478 - // Append the result to the enclosing accumulator
479 - $accum .= $element;
480 - // Note that we do NOT increment the input pointer.
481 - // This is because the closing linebreak could be the opening linebreak of
482 - // another heading. Infinite loops are avoided because the next iteration MUST
483 - // hit the heading open case above, which unconditionally increments the
484 - // input pointer.
485 - }
486 -
487 - elseif ( $found == 'open' ) {
488 - # count opening brace characters
489 - $count = strspn( $text, $curChar, $i );
490 -
491 - # we need to add to stack only if opening brace count is enough for one of the rules
492 - if ( $count >= $rule['min'] ) {
493 - # Add it to the stack
494 - $piece = array(
495 - 'open' => $curChar,
496 - 'close' => $rule['end'],
497 - 'count' => $count,
498 - 'lineStart' => ($i > 0 && $text[$i-1] == "\n"),
499 - );
500 -
501 - $stack->push( $piece );
502 - $accum =& $stack->getAccum();
503 - $flags = $stack->getFlags();
504 - extract( $flags );
505 - } else {
506 - # Add literal brace(s)
507 - $accum .= htmlspecialchars( str_repeat( $curChar, $count ) );
508 - }
509 - $i += $count;
510 - }
511 -
512 - elseif ( $found == 'close' ) {
513 - $piece = $stack->top;
514 - # lets check if there are enough characters for closing brace
515 - $maxCount = $piece->count;
516 - $count = strspn( $text, $curChar, $i, $maxCount );
517 -
518 - # check for maximum matching characters (if there are 5 closing
519 - # characters, we will probably need only 3 - depending on the rules)
520 - $matchingCount = 0;
521 - $rule = $rules[$piece->open];
522 - if ( $count > $rule['max'] ) {
523 - # The specified maximum exists in the callback array, unless the caller
524 - # has made an error
525 - $matchingCount = $rule['max'];
526 - } else {
527 - # Count is less than the maximum
528 - # Skip any gaps in the callback array to find the true largest match
529 - # Need to use array_key_exists not isset because the callback can be null
530 - $matchingCount = $count;
531 - while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $rule['names'] ) ) {
532 - --$matchingCount;
533 - }
534 - }
535 -
536 - if ($matchingCount <= 0) {
537 - # No matching element found in callback array
538 - # Output a literal closing brace and continue
539 - $accum .= htmlspecialchars( str_repeat( $curChar, $count ) );
540 - $i += $count;
541 - continue;
542 - }
543 - $name = $rule['names'][$matchingCount];
544 - if ( $name === null ) {
545 - // No element, just literal text
546 - $element = $piece->breakSyntax( $matchingCount ) . str_repeat( $rule['end'], $matchingCount );
547 - } else {
548 - # Create XML element
549 - # Note: $parts is already XML, does not need to be encoded further
550 - $parts = $piece->parts;
551 - $title = $parts[0]->out;
552 - unset( $parts[0] );
553 -
554 - # The invocation is at the start of the line if lineStart is set in
555 - # the stack, and all opening brackets are used up.
556 - if ( $maxCount == $matchingCount && !empty( $piece->lineStart ) ) {
557 - $attr = ' lineStart="1"';
558 - } else {
559 - $attr = '';
560 - }
561 -
562 - $element = "<$name$attr>";
563 - $element .= "<title>$title</title>";
564 - $argIndex = 1;
565 - foreach ( $parts as $partIndex => $part ) {
566 - if ( isset( $part->eqpos ) ) {
567 - $argName = substr( $part->out, 0, $part->eqpos );
568 - $argValue = substr( $part->out, $part->eqpos + 1 );
569 - $element .= "<part><name>$argName</name>=<value>$argValue</value></part>";
570 - } else {
571 - $element .= "<part><name index=\"$argIndex\" /><value>{$part->out}</value></part>";
572 - $argIndex++;
573 - }
574 - }
575 - $element .= "</$name>";
576 - }
577 -
578 - # Advance input pointer
579 - $i += $matchingCount;
580 -
581 - # Unwind the stack
582 - $stack->pop();
583 - $accum =& $stack->getAccum();
584 -
585 - # Re-add the old stack element if it still has unmatched opening characters remaining
586 - if ($matchingCount < $piece->count) {
587 - $piece->parts = array( new PPDPart );
588 - $piece->count -= $matchingCount;
589 - # do we still qualify for any callback with remaining count?
590 - $names = $rules[$piece->open]['names'];
591 - $skippedBraces = 0;
592 - $enclosingAccum =& $accum;
593 - while ( $piece->count ) {
594 - if ( array_key_exists( $piece->count, $names ) ) {
595 - $stack->push( $piece );
596 - $accum =& $stack->getAccum();
597 - break;
598 - }
599 - --$piece->count;
600 - $skippedBraces ++;
601 - }
602 - $enclosingAccum .= str_repeat( $piece->open, $skippedBraces );
603 - }
604 - $flags = $stack->getFlags();
605 - extract( $flags );
606 -
607 - # Add XML element to the enclosing accumulator
608 - $accum .= $element;
609 - }
610 -
611 - elseif ( $found == 'pipe' ) {
612 - $findEquals = true; // shortcut for getFlags()
613 - $stack->addPart();
614 - $accum =& $stack->getAccum();
615 - ++$i;
616 - }
617 -
618 - elseif ( $found == 'equals' ) {
619 - $findEquals = false; // shortcut for getFlags()
620 - $stack->getCurrentPart()->eqpos = strlen( $accum );
621 - $accum .= '=';
622 - ++$i;
623 - }
624 - }
625 -
626 - # Output any remaining unclosed brackets
627 - foreach ( $stack->stack as $piece ) {
628 - $stack->rootAccum .= $piece->breakSyntax();
629 - }
630 - $stack->rootAccum .= '</root>';
631 - $xml = $stack->rootAccum;
632 -
633156 wfProfileOut( __METHOD__ );
634 -
635157 return $xml;
636158 }
637159 }
638160
639161 /**
640 - * Stack class to help Preprocessor::preprocessToObj()
641 - * @ingroup Parser
642 - */
643 -class PPDStack {
644 - var $stack, $rootAccum, $top;
645 - var $out;
646 - var $elementClass = 'PPDStackElement';
647 -
648 - static $false = false;
649 -
650 - function __construct() {
651 - $this->stack = array();
652 - $this->top = false;
653 - $this->rootAccum = '';
654 - $this->accum =& $this->rootAccum;
655 - }
656 -
657 - function count() {
658 - return count( $this->stack );
659 - }
660 -
661 - function &getAccum() {
662 - return $this->accum;
663 - }
664 -
665 - function getCurrentPart() {
666 - if ( $this->top === false ) {
667 - return false;
668 - } else {
669 - return $this->top->getCurrentPart();
670 - }
671 - }
672 -
673 - function push( $data ) {
674 - if ( $data instanceof $this->elementClass ) {
675 - $this->stack[] = $data;
676 - } else {
677 - $class = $this->elementClass;
678 - $this->stack[] = new $class( $data );
679 - }
680 - $this->top = $this->stack[ count( $this->stack ) - 1 ];
681 - $this->accum =& $this->top->getAccum();
682 - }
683 -
684 - function pop() {
685 - if ( !count( $this->stack ) ) {
686 - throw new MWException( __METHOD__.': no elements remaining' );
687 - }
688 - $temp = array_pop( $this->stack );
689 -
690 - if ( count( $this->stack ) ) {
691 - $this->top = $this->stack[ count( $this->stack ) - 1 ];
692 - $this->accum =& $this->top->getAccum();
693 - } else {
694 - $this->top = self::$false;
695 - $this->accum =& $this->rootAccum;
696 - }
697 - return $temp;
698 - }
699 -
700 - function addPart( $s = '' ) {
701 - $this->top->addPart( $s );
702 - $this->accum =& $this->top->getAccum();
703 - }
704 -
705 - function getFlags() {
706 - if ( !count( $this->stack ) ) {
707 - return array(
708 - 'findEquals' => false,
709 - 'findPipe' => false,
710 - 'inHeading' => false,
711 - );
712 - } else {
713 - return $this->top->getFlags();
714 - }
715 - }
716 -}
717 -
718 -/**
719 - * @ingroup Parser
720 - */
721 -class PPDStackElement {
722 - var $open, // Opening character (\n for heading)
723 - $close, // Matching closing character
724 - $count, // Number of opening characters found (number of "=" for heading)
725 - $parts, // Array of PPDPart objects describing pipe-separated parts.
726 - $lineStart; // True if the open char appeared at the start of the input line. Not set for headings.
727 -
728 - var $partClass = 'PPDPart';
729 -
730 - function __construct( $data = array() ) {
731 - $class = $this->partClass;
732 - $this->parts = array( new $class );
733 -
734 - foreach ( $data as $name => $value ) {
735 - $this->$name = $value;
736 - }
737 - }
738 -
739 - function &getAccum() {
740 - return $this->parts[count($this->parts) - 1]->out;
741 - }
742 -
743 - function addPart( $s = '' ) {
744 - $class = $this->partClass;
745 - $this->parts[] = new $class( $s );
746 - }
747 -
748 - function getCurrentPart() {
749 - return $this->parts[count($this->parts) - 1];
750 - }
751 -
752 - function getFlags() {
753 - $partCount = count( $this->parts );
754 - $findPipe = $this->open != "\n" && $this->open != '[';
755 - return array(
756 - 'findPipe' => $findPipe,
757 - 'findEquals' => $findPipe && $partCount > 1 && !isset( $this->parts[$partCount - 1]->eqpos ),
758 - 'inHeading' => $this->open == "\n",
759 - );
760 - }
761 -
762 - /**
763 - * Get the output string that would result if the close is not found.
764 - */
765 - function breakSyntax( $openingCount = false ) {
766 - if ( $this->open == "\n" ) {
767 - $s = $this->parts[0]->out;
768 - } else {
769 - if ( $openingCount === false ) {
770 - $openingCount = $this->count;
771 - }
772 - $s = str_repeat( $this->open, $openingCount );
773 - $first = true;
774 - foreach ( $this->parts as $part ) {
775 - if ( $first ) {
776 - $first = false;
777 - } else {
778 - $s .= '|';
779 - }
780 - $s .= $part->out;
781 - }
782 - }
783 - return $s;
784 - }
785 -}
786 -
787 -/**
788 - * @ingroup Parser
789 - */
790 -class PPDPart {
791 - var $out; // Output accumulator string
792 -
793 - // Optional member variables:
794 - // eqpos Position of equals sign in output accumulator
795 - // commentEnd Past-the-end input pointer for the last comment encountered
796 - // visualEnd Past-the-end input pointer for the end of the accumulator minus comments
797 -
798 - function __construct( $out = '' ) {
799 - $this->out = $out;
800 - }
801 -}
802 -
803 -/**
804162 * An expansion frame, used as a context to expand the result of preprocessToObj()
805163 * @ingroup Parser
806164 */
Index: branches/parser-work/phase3/includes/AutoLoader.php
@@ -446,10 +446,7 @@
447447 'PPCustomFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
448448 'PPCustomFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
449449 'PPDAccum_Hash' => 'includes/parser/Preprocessor_Hash.php',
450 - 'PPDPart' => 'includes/parser/Preprocessor_DOM.php',
451450 'PPDPart_Hash' => 'includes/parser/Preprocessor_Hash.php',
452 - 'PPDStack' => 'includes/parser/Preprocessor_DOM.php',
453 - 'PPDStackElement' => 'includes/parser/Preprocessor_DOM.php',
454451 'PPDStackElement_Hash' => 'includes/parser/Preprocessor_Hash.php',
455452 'PPDStack_Hash' => 'includes/parser/Preprocessor_Hash.php',
456453 'PPFrame' => 'includes/parser/Preprocessor.php',
@@ -463,6 +460,9 @@
464461 'PPNode_Hash_Tree' => 'includes/parser/Preprocessor_Hash.php',
465462 'PPTemplateFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
466463 'PPTemplateFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
 464+ 'ParseList' => 'includes/parser/ParseTree.php',
 465+ 'ParseRule' => 'includes/parser/ParseTree.php',
 466+ 'ParseTree' => 'includes/parser/ParseTree.php',
467467 'Parser' => 'includes/parser/Parser.php',
468468 'ParserCache' => 'includes/parser/ParserCache.php',
469469 'ParserOptions' => 'includes/parser/ParserOptions.php',

Comments

#Comment by Bryan (talk | contribs)   20:56, 7 February 2010

Needs stylize.php

#Comment by Bryan (talk | contribs)   20:58, 7 February 2010

Needs stylize.php

Status & tagging log