r62084 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r62083 \| r62084 \| r62085 >
Date:	14:22, 7 February 2010
Author:	happy-melon
Status:	deferred (Comments)
Tags:
Comment:	Apply initial patch by Nathanael Thompson (than4213)
Modified paths:	/branches/parser-work/phase3/includes/AutoLoader.php (modified) (history) /branches/parser-work/phase3/includes/parser/ParseTree.php (added) (history) /branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php (modified) (history)

Diff [purge]

Index: branches/parser-work/phase3/includes/parser/ParseTree.php
—	—	@@ -0,0 +1,228 @@
	2	+<?php
	3	+
	4	+/**
	5	+ * A rule specifying how to parse the text.
	6	+ * If the text matches mBeginTag then a ParseTree object is created with the appropriate info.
	7	+ * mName - The name to give the resultant ParseTree object
	8	+ * mBeginTag - the regular expression used to determine if this is the rule that should be used
	9	+ * mEndTag - If ParseTrees of this type are to have children, mEndTag specifies when all of the children are collected
	10	+ * mStopChars - extra characters that indicate markup
	11	+ * mChildRule - an extra rule to consider when collecting children, it is only used for situations covered by the HHP21 parser test
	12	+ * @ingroup Parser
	13	+ */
	14	+class ParseRule {
	15	+ private $mName, $mBeginTag, $mEndTag, $mStopChars, $mChildRule;
	16	+
	17	+ function __construct($name, $beginTag, $endTag = NULL, $stopChars = '', $childRule = NULL) {
	18	+ $this->mName = $name;
	19	+ $this->mBeginTag = $beginTag;
	20	+ $this->mEndTag = $endTag;
	21	+ $this->mStopChars = $stopChars;
	22	+ $this->mChildRule = $childRule;
	23	+ }
	24	+
	25	+ function parse(&$text, $parseList) {
	26	+ $retTree = NULL;
	27	+
	28	+ if (preg_match($this->mBeginTag, $text, $matches)) {
	29	+ $text = substr($text, strlen($matches[0]));
	30	+ $children = array();
	31	+ if ($this->mEndTag != NULL) {
	32	+ $endTag = $this->mEndTag;
	33	+ foreach ($matches as $i => $crrnt) {
	34	+ $endTag = str_replace('~' . $i, $crrnt, $endTag);
	35	+ }
	36	+ while ($text != "" && ($endTag == NULL \|\| ! preg_match($endTag, $text, $endMatches))) {
	37	+ if ($this->mChildRule != NULL) {
	38	+ $child = $this->mChildRule->parse($text, $parseList);
	39	+ if ($child != NULL) {
	40	+ $children[] = $child;
	41	+ }
	42	+ }
	43	+ $moreChildren = $parseList->parse($text, $this->mStopChars);
	44	+ $children = array_merge($children, $moreChildren);
	45	+ }
	46	+ if ($text != "") {
	47	+ $text = substr($text, strlen($endMatches[0]));
	48	+ $matches = array_merge($matches, $endMatches);
	49	+ }
	50	+ }
	51	+ $retTree = new ParseTree($this->mName, $matches, $children);
	52	+ }
	53	+
	54	+ return $retTree;
	55	+ }
	56	+}
	57	+
	58	+/**
	59	+ * Contains a list of rules to cycle through when creating a parse tree
	60	+ * mList - The list of rules
	61	+ * mStopChars - the characters used to find markup
	62	+ * @ingroup Parser
	63	+ */
	64	+class ParseList {
	65	+ private $mList, $mStopChars;
	66	+
	67	+ function __construct($list, $stopChars) {
	68	+ $this->mList = $list;
	69	+ $this->mStopChars = $stopChars;
	70	+ }
	71	+
	72	+ function parse(&$text, $stopChars) {
	73	+ $children = array();
	74	+
	75	+ foreach ($this->mList as $crrnt) {
	76	+ $child = $crrnt->parse($text, $this);
	77	+ if ($child != NULL) {
	78	+ $children[] = $child;
	79	+ break;
	80	+ }
	81	+ }
	82	+ if ($child == NULL) {
	83	+ $children[] = $text[0];
	84	+ $text = substr($text, 1);
	85	+ }
	86	+ if (preg_match('/^[^' . $this->mStopChars . $stopChars . ']+/s', $text, $matches)) {
	87	+ $children[] = $matches[0];
	88	+ $text = substr($text, strlen($matches[0]));
	89	+ }
	90	+
	91	+ return $children;
	92	+ }
	93	+}
	94	+
	95	+/**
	96	+ * The parse tree of the data.
	97	+ * printTree translates the parse tree to xml, eventually this should be seperated into a data and engine layer.
	98	+ * mName - Indicates what ParseRule was used to create this node
	99	+ * mMatches - The text groups that were collected by the regular expressions used when creating this rule
	100	+ * mChildren - The child ParseTree nodes in this tree
	101	+ * @ingroup Parser
	102	+ */
	103	+class ParseTree {
	104	+ private $mName, $mMatches, $mChildren;
	105	+
	106	+ function __construct($name, $matches, $children) {
	107	+ $this->mName = $name;
	108	+ $this->mMatches = $matches;
	109	+ $this->mChildren = $children;
	110	+ }
	111	+
	112	+ static function createParseTree($text, $parseList) {
	113	+ wfProfileIn( __METHOD__ );
	114	+
	115	+ $text = "~BOF" . $text;
	116	+ $root = new ParseRule("Root", '/^/', '/^\Z/');
	117	+ $retTree = $root->parse($text, $parseList);
	118	+
	119	+ wfProfileOut( __METHOD__ );
	120	+ return $retTree;
	121	+ }
	122	+
	123	+ //this function will definitely need to be seperated into data and engine layers
	124	+ function printTree(&$headingInd = 1) {
	125	+ $retString = "";
	126	+
	127	+ if ($this->mName == "Literal" \|\| $this->mName == "BugHHP21") {
	128	+ $retString = htmlspecialchars($this->mMatches[0]);
	129	+ } elseif ($this->mName == "Comment") {
	130	+ $retString = "<comment>" . htmlspecialchars($this->mMatches[0]) . "</comment>";
	131	+ } elseif ($this->mName == "CommentLine") {
	132	+ $retString = htmlspecialchars($this->mMatches[1]) . "<comment>" . htmlspecialchars($this->mMatches[2]) . "</comment>";
	133	+ } elseif ($this->mName == "IncludeOnly" \|\| $this->mName == "NoInclude" \|\| $this->mName == "OnlyInclude") {
	134	+ $retString = "<ignore>" . htmlspecialchars($this->mMatches[0]) . "</ignore>";
	135	+ } elseif ($this->mName == "XmlClosed") {
	136	+ $retString = "<ext><name>" . htmlspecialchars($this->mMatches[1]) .
	137	+ "</name><attr>" . htmlspecialchars($this->mMatches[2]) . "</attr></ext>";
	138	+ } elseif ($this->mName == "XmlOpened") {
	139	+ $closeTag = "";
	140	+ if ($this->mMatches[4] != "") {
	141	+ $closeTag = "<close>" . htmlspecialchars($this->mMatches[4]) . "</close>";
	142	+ }
	143	+ $retString = "<ext><name>" . htmlspecialchars($this->mMatches[1]) . "</name><attr>" . htmlspecialchars($this->mMatches[2]) .
	144	+ "</attr><inner>" . htmlspecialchars($this->mMatches[3]) . "</inner>" . $closeTag . "</ext>";
	145	+ } elseif ($this->mName == "BeginFile") {
	146	+ if (isset($this->mMatches[1])) {
	147	+ $retString = "<ignore>" . htmlspecialchars($this->mMatches[1]) . "</ignore>";
	148	+ }
	149	+ } elseif (($this->mName == "Template" && isset($this->mMatches[2])) \|\| ($this->mName == "TplArg" && isset($this->mMatches[1]))) {
	150	+ $inTitle = true;
	151	+ $foundEquals = false;
	152	+ $currentItem = "";
	153	+ $partInd = 1;
	154	+ $this->mChildren[] = '\|';
	155	+ foreach ($this->mChildren as $crrnt) {
	156	+ if ($crrnt instanceof ParseTree) {
	157	+ $currentItem .= $crrnt->printTree($headingInd);
	158	+ } elseif ($crrnt == '\|') {
	159	+ if ($inTitle) {
	160	+ $retString .= "<title>" . $currentItem . "</title>";
	161	+ $inTitle = false;
	162	+ } else {
	163	+ if (! $foundEquals) {
	164	+ $retString .= "<part><name index=\"" . $partInd . "\" />";
	165	+ $partInd ++;
	166	+ }
	167	+ $retString .= "<value>" . $currentItem . "</value></part>";
	168	+ $foundEquals = false;
	169	+ }
	170	+ $currentItem = "";
	171	+ } elseif ($crrnt == '=' && ! $inTitle && ! $foundEquals) {
	172	+ $retString .= "<part><name>" . $currentItem . "</name>=";
	173	+ $foundEquals = true;
	174	+ $currentItem = "";
	175	+ } else {
	176	+ $currentItem .= htmlspecialchars($crrnt);
	177	+ }
	178	+ }
	179	+ if ($this->mName == "Template") {
	180	+ $templateAttr = "";
	181	+ if ($this->mMatches[1] != "") {
	182	+ $templateAttr = " lineStart=\"1\"";
	183	+ }
	184	+ $retString = "<template" . $templateAttr . ">" . $retString . "</template>";
	185	+ if ($this->mMatches[1] == "\n") {
	186	+ $retString = $this->mMatches[1] . $retString;
	187	+ }
	188	+ } else {
	189	+ $retString = "<tplarg>" . $retString . "</tplarg>";
	190	+ }
	191	+ } else {
	192	+ foreach ($this->mChildren as $crrnt) {
	193	+ if ($crrnt instanceof ParseTree) {
	194	+ $retString .= $crrnt->printTree($headingInd);
	195	+ } else {
	196	+ $retString .= htmlspecialchars($crrnt);
	197	+ }
	198	+ }
	199	+ if ($this->mName == "Root") {
	200	+ $retString = "<root>" . $retString . "</root>";
	201	+ } elseif ($this->mName == "TplArg") {
	202	+ $retString = htmlspecialchars($this->mMatches[0]) . $retString;
	203	+ } elseif ($this->mName == "Template") {
	204	+ $retString = "{{" . $retString;
	205	+ if ($this->mMatches[1] == "\n") {
	206	+ $retString = $this->mMatches[1] . $retString;
	207	+ }
	208	+ } elseif ($this->mName == "Link") {
	209	+ $retString = htmlspecialchars($this->mMatches[0]) . $retString;
	210	+ if (isset($this->mMatches[1])) {
	211	+ $retString .= htmlspecialchars($this->mMatches[1]);
	212	+ }
	213	+ } elseif ($this->mName == "Heading") {
	214	+ $retString = htmlspecialchars($this->mMatches[2]) . $retString;
	215	+ if (isset($this->mMatches[3])) {
	216	+ $retString = "<h level=\"" . strlen($this->mMatches[2]) . "\" i=\"" . $headingInd . "\">" .
	217	+ $retString . htmlspecialchars($this->mMatches[3]) . "</h>";
	218	+ }
	219	+ if ($this->mMatches[1] == "\n") {
	220	+ $retString = "\n" . $retString;
	221	+ }
	222	+ $headingInd ++;
	223	+ }
	224	+ }
	225	+
	226	+ return $retString;
	227	+ }
	228	+}
	229	+
Property changes on: branches/parser-work/phase3/includes/parser/ParseTree.php
___________________________________________________________________
Name: svn:eol-style
1	230	+ native
Index: branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php
—	—	@@ -118,688 +118,46 @@
119	119	return $obj;
120	120	}
121	121
	122	+ /**
	123	+ * Preprocessor that reads in wiki text and returns xml.
	124	+ * This is the data layer of the new wikitext parser.
	125	+ */
122	126	function preprocessToXml( $text, $flags = 0 ) {
123	127	wfProfileIn( __METHOD__ );
	128	+
	129	+ $xmlishRegex = implode('\|', $this->parser->getStripList());
	130	+ $bugHHP21 = new ParseRule("BugHHP21", '/^\n(?==[^=])/s');
124	131	$rules = array(
125		~~- '{' => array(~~
126		~~- 'end' => '}',~~
127		~~- 'names' => array(~~
128		~~- 2 => 'template',~~
129		~~- 3 => 'tplarg',~~
130		~~- ),~~
131		~~- 'min' => 2,~~
132		~~- 'max' => 3,~~
133		~~- ),~~
134		~~- '[' => array(~~
135		~~- 'end' => ']',~~
136		~~- 'names' => array( 2 => null ),~~
137		~~- 'min' => 2,~~
138		~~- 'max' => 2,~~
139		~~- )~~
140		~~- );~~
	132	+ new ParseRule("Template", '/^((?:\n\|~BOF)?){{(?!{[^{])/s', '/^}}/s', '}\|=', $bugHHP21),
	133	+ new ParseRule("TplArg", '/^{{{/s', '/^}}}/s', '}\|=', $bugHHP21),
	134	+ new ParseRule("Link", '/^\[\[/s', '/^]]/s', '\]'),
	135	+ new ParseRule("Heading", '/^(\n\|~BOF)(={1,6})/s', '/^~2(?: *<!--.*?(?:-->\|\Z))*(?=\n\|\Z)/s', '='),
	136	+ new ParseRule("CommentLine", '/^(\n *)((?:<!--.*?(?:-->\|\Z)(?: *\n)?)+)/s'),
	137	+ new ParseRule("Comment", '/^<!--.*?(?:-->\|\Z)/s'),
	138	+ new ParseRule("OnlyInclude", '/^<\/?onlyinclude>/s'),
	139	+ new ParseRule("NoInclude", '/^<\/?noinclude>/s'),
	140	+ new ParseRule("IncludeOnly", '/^<includeonly>.*?(?:<\/includeonly>\|\Z)/s'),
	141	+ new ParseRule("XmlClosed", '/^<(' . $xmlishRegex . ')([^>]*)\/>/si'),
	142	+ new ParseRule("XmlOpened", '/^<(' . $xmlishRegex . ')(.*?)>(.*?)(<\/\1>\|\Z)/si'),
	143	+ new ParseRule("BeginFile", '/^~BOF/s'));
141	144
142		~~- $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;~~
143		-
144		~~- $xmlishElements = $this->parser->getStripList();~~
145		~~- $enableOnlyinclude = false;~~
146		~~- if ( $forInclusion ) {~~
147		~~- $ignoredTags = array( 'includeonly', '/includeonly' );~~
148		~~- $ignoredElements = array( 'noinclude' );~~
149		~~- $xmlishElements[] = 'noinclude';~~
150		~~- if ( strpos( $text, '<onlyinclude>' ) !== false && strpos( $text, '</onlyinclude>' ) !== false ) {~~
151		~~- $enableOnlyinclude = true;~~
152		~~- }~~
153		~~- } else {~~
154		~~- $ignoredTags = array( 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' );~~
155		~~- $ignoredElements = array( 'includeonly' );~~
156		~~- $xmlishElements[] = 'includeonly';~~
	145	+ if ($flags & Parser::PTD_FOR_INCLUSION) {
	146	+ $rules[6] = new ParseRule("OnlyInclude", '/^<\/onlyinclude>.*?(?:<onlyinclude>\|\Z)/s');
	147	+ $rules[7] = new ParseRule("NoInclude", '/^<noinclude>.*?(?:<\/noinclude>\|\Z)/s');
	148	+ $rules[8] = new ParseRule("IncludeOnly", '/^<\/?includeonly>/s');
	149	+ $rules[11] = new ParseRule("BeginFile", '/^~BOF(.*?<onlyinclude>)?/s');
157	150	}
158		~~- $xmlishRegex = implode( '\|', array_merge( $xmlishElements, $ignoredTags ) );~~
159	151
160		~~- // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset~~
161		~~- $elementsRegex = "~($xmlishRegex)(?:\s\|\/>\|>)\|(!--)~iA";~~
	152	+ $parseList = new ParseList($rules, '{\[<\n');
	153	+ $parseTree = ParseTree::createParseTree($text, $parseList);
	154	+ $xml = $parseTree->printTree();
162	155
163		~~- $stack = new PPDStack;~~
164		-
165		~~- $searchBase = "[{<\n"; #}~~
166		~~- $revText = strrev( $text ); // For fast reverse searches~~
167		-
168		~~- $i = 0; # Input pointer, starts out pointing to a pseudo-newline before the start~~
169		~~- $accum =& $stack->getAccum(); # Current accumulator~~
170		~~- $accum = '<root>';~~
171		~~- $findEquals = false; # True to find equals signs in arguments~~
172		~~- $findPipe = false; # True to take notice of pipe characters~~
173		~~- $headingIndex = 1;~~
174		~~- $inHeading = false; # True if $i is inside a possible heading~~
175		~~- $noMoreGT = false; # True if there are no more greater-than (>) signs right of $i~~
176		~~- $findOnlyinclude = $enableOnlyinclude; # True to ignore all input up to the next <onlyinclude>~~
177		~~- $fakeLineStart = true; # Do a line-start run without outputting an LF character~~
178		-
179		~~- while ( true ) {~~
180		~~- //$this->memCheck();~~
181		-
182		~~- if ( $findOnlyinclude ) {~~
183		~~- // Ignore all input up to the next <onlyinclude>~~
184		~~- $startPos = strpos( $text, '<onlyinclude>', $i );~~
185		~~- if ( $startPos === false ) {~~
186		~~- // Ignored section runs to the end~~
187		~~- $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i ) ) . '</ignore>';~~
188		~~- break;~~
189		~~- }~~
190		~~- $tagEndPos = $startPos + strlen( '<onlyinclude>' ); // past-the-end~~
191		~~- $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i ) ) . '</ignore>';~~
192		~~- $i = $tagEndPos;~~
193		~~- $findOnlyinclude = false;~~
194		~~- }~~
195		-
196		~~- if ( $fakeLineStart ) {~~
197		~~- $found = 'line-start';~~
198		~~- $curChar = '';~~
199		~~- } else {~~
200		~~- # Find next opening brace, closing brace or pipe~~
201		~~- $search = $searchBase;~~
202		~~- if ( $stack->top === false ) {~~
203		~~- $currentClosing = '';~~
204		~~- } else {~~
205		~~- $currentClosing = $stack->top->close;~~
206		~~- $search .= $currentClosing;~~
207		~~- }~~
208		~~- if ( $findPipe ) {~~
209		~~- $search .= '\|';~~
210		~~- }~~
211		~~- if ( $findEquals ) {~~
212		~~- // First equals will be for the template~~
213		~~- $search .= '=';~~
214		~~- }~~
215		~~- $rule = null;~~
216		~~- # Output literal section, advance input counter~~
217		~~- $literalLength = strcspn( $text, $search, $i );~~
218		~~- if ( $literalLength > 0 ) {~~
219		~~- $accum .= htmlspecialchars( substr( $text, $i, $literalLength ) );~~
220		~~- $i += $literalLength;~~
221		~~- }~~
222		~~- if ( $i >= strlen( $text ) ) {~~
223		~~- if ( $currentClosing == "\n" ) {~~
224		~~- // Do a past-the-end run to finish off the heading~~
225		~~- $curChar = '';~~
226		~~- $found = 'line-end';~~
227		~~- } else {~~
228		~~- # All done~~
229		~~- break;~~
230		~~- }~~
231		~~- } else {~~
232		~~- $curChar = $text[$i];~~
233		~~- if ( $curChar == '\|' ) {~~
234		~~- $found = 'pipe';~~
235		~~- } elseif ( $curChar == '=' ) {~~
236		~~- $found = 'equals';~~
237		~~- } elseif ( $curChar == '<' ) {~~
238		~~- $found = 'angle';~~
239		~~- } elseif ( $curChar == "\n" ) {~~
240		~~- if ( $inHeading ) {~~
241		~~- $found = 'line-end';~~
242		~~- } else {~~
243		~~- $found = 'line-start';~~
244		~~- }~~
245		~~- } elseif ( $curChar == $currentClosing ) {~~
246		~~- $found = 'close';~~
247		~~- } elseif ( isset( $rules[$curChar] ) ) {~~
248		~~- $found = 'open';~~
249		~~- $rule = $rules[$curChar];~~
250		~~- } else {~~
251		~~- # Some versions of PHP have a strcspn which stops on null characters~~
252		~~- # Ignore and continue~~
253		~~- ++$i;~~
254		~~- continue;~~
255		~~- }~~
256		~~- }~~
257		~~- }~~
258		-
259		~~- if ( $found == 'angle' ) {~~
260		~~- $matches = false;~~
261		~~- // Handle </onlyinclude>~~
262		~~- if ( $enableOnlyinclude && substr( $text, $i, strlen( '</onlyinclude>' ) ) == '</onlyinclude>' ) {~~
263		~~- $findOnlyinclude = true;~~
264		~~- continue;~~
265		~~- }~~
266		-
267		~~- // Determine element name~~
268		~~- if ( !preg_match( $elementsRegex, $text, $matches, 0, $i + 1 ) ) {~~
269		~~- // Element name missing or not listed~~
270		~~- $accum .= '<';~~
271		~~- ++$i;~~
272		~~- continue;~~
273		~~- }~~
274		~~- // Handle comments~~
275		~~- if ( isset( $matches[2] ) && $matches[2] == '!--' ) {~~
276		~~- // To avoid leaving blank lines, when a comment is both preceded~~
277		~~- // and followed by a newline (ignoring spaces), trim leading and~~
278		~~- // trailing spaces and one of the newlines.~~
279		-
280		~~- // Find the end~~
281		~~- $endPos = strpos( $text, '-->', $i + 4 );~~
282		~~- if ( $endPos === false ) {~~
283		~~- // Unclosed comment in input, runs to end~~
284		~~- $inner = substr( $text, $i );~~
285		~~- $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';~~
286		~~- $i = strlen( $text );~~
287		~~- } else {~~
288		~~- // Search backwards for leading whitespace~~
289		~~- $wsStart = $i ? ( $i - strspn( $revText, ' ', strlen( $text ) - $i ) ) : 0;~~
290		~~- // Search forwards for trailing whitespace~~
291		~~- // $wsEnd will be the position of the last space~~
292		~~- $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );~~
293		~~- // Eat the line if possible~~
294		~~- // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at~~
295		~~- // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but~~
296		~~- // it's a possible beneficial b/c break.~~
297		~~- if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"~~
298		~~- && substr( $text, $wsEnd + 1, 1 ) == "\n" )~~
299		~~- {~~
300		~~- $startPos = $wsStart;~~
301		~~- $endPos = $wsEnd + 1;~~
302		~~- // Remove leading whitespace from the end of the accumulator~~
303		~~- // Sanity check first though~~
304		~~- $wsLength = $i - $wsStart;~~
305		~~- if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {~~
306		~~- $accum = substr( $accum, 0, -$wsLength );~~
307		~~- }~~
308		~~- // Do a line-start run next time to look for headings after the comment~~
309		~~- $fakeLineStart = true;~~
310		~~- } else {~~
311		~~- // No line to eat, just take the comment itself~~
312		~~- $startPos = $i;~~
313		~~- $endPos += 2;~~
314		~~- }~~
315		-
316		~~- if ( $stack->top ) {~~
317		~~- $part = $stack->top->getCurrentPart();~~
318		~~- if ( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) {~~
319		~~- // Comments abutting, no change in visual end~~
320		~~- $part->commentEnd = $wsEnd;~~
321		~~- } else {~~
322		~~- $part->visualEnd = $wsStart;~~
323		~~- $part->commentEnd = $endPos;~~
324		~~- }~~
325		~~- }~~
326		~~- $i = $endPos + 1;~~
327		~~- $inner = substr( $text, $startPos, $endPos - $startPos + 1 );~~
328		~~- $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';~~
329		~~- }~~
330		~~- continue;~~
331		~~- }~~
332		~~- $name = $matches[1];~~
333		~~- $lowerName = strtolower( $name );~~
334		~~- $attrStart = $i + strlen( $name ) + 1;~~
335		-
336		~~- // Find end of tag~~
337		~~- $tagEndPos = $noMoreGT ? false : strpos( $text, '>', $attrStart );~~
338		~~- if ( $tagEndPos === false ) {~~
339		~~- // Infinite backtrack~~
340		~~- // Disable tag search to prevent worst-case O(N^2) performance~~
341		~~- $noMoreGT = true;~~
342		~~- $accum .= '<';~~
343		~~- ++$i;~~
344		~~- continue;~~
345		~~- }~~
346		-
347		~~- // Handle ignored tags~~
348		~~- if ( in_array( $lowerName, $ignoredTags ) ) {~~
349		~~- $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i + 1 ) ) . '</ignore>';~~
350		~~- $i = $tagEndPos + 1;~~
351		~~- continue;~~
352		~~- }~~
353		-
354		~~- $tagStartPos = $i;~~
355		~~- if ( $text[$tagEndPos-1] == '/' ) {~~
356		~~- $attrEnd = $tagEndPos - 1;~~
357		~~- $inner = null;~~
358		~~- $i = $tagEndPos + 1;~~
359		~~- $close = '';~~
360		~~- } else {~~
361		~~- $attrEnd = $tagEndPos;~~
362		~~- // Find closing tag~~
363		~~- if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",~~
364		~~- $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) )~~
365		~~- {~~
366		~~- $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );~~
367		~~- $i = $matches[0][1] + strlen( $matches[0][0] );~~
368		~~- $close = '<close>' . htmlspecialchars( $matches[0][0] ) . '</close>';~~
369		~~- } else {~~
370		~~- // No end tag -- let it run out to the end of the text.~~
371		~~- $inner = substr( $text, $tagEndPos + 1 );~~
372		~~- $i = strlen( $text );~~
373		~~- $close = '';~~
374		~~- }~~
375		~~- }~~
376		~~- // <includeonly> and <noinclude> just become <ignore> tags~~
377		~~- if ( in_array( $lowerName, $ignoredElements ) ) {~~
378		~~- $accum .= '<ignore>' . htmlspecialchars( substr( $text, $tagStartPos, $i - $tagStartPos ) )~~
379		~~- . '</ignore>';~~
380		~~- continue;~~
381		~~- }~~
382		-
383		~~- $accum .= '<ext>';~~
384		~~- if ( $attrEnd <= $attrStart ) {~~
385		~~- $attr = '';~~
386		~~- } else {~~
387		~~- $attr = substr( $text, $attrStart, $attrEnd - $attrStart );~~
388		~~- }~~
389		~~- $accum .= '<name>' . htmlspecialchars( $name ) . '</name>' .~~
390		~~- // Note that the attr element contains the whitespace between name and attribute,~~
391		~~- // this is necessary for precise reconstruction during pre-save transform.~~
392		~~- '<attr>' . htmlspecialchars( $attr ) . '</attr>';~~
393		~~- if ( $inner !== null ) {~~
394		~~- $accum .= '<inner>' . htmlspecialchars( $inner ) . '</inner>';~~
395		~~- }~~
396		~~- $accum .= $close . '</ext>';~~
397		~~- }~~
398		-
399		~~- elseif ( $found == 'line-start' ) {~~
400		~~- // Is this the start of a heading?~~
401		~~- // Line break belongs before the heading element in any case~~
402		~~- if ( $fakeLineStart ) {~~
403		~~- $fakeLineStart = false;~~
404		~~- } else {~~
405		~~- $accum .= $curChar;~~
406		~~- $i++;~~
407		~~- }~~
408		-
409		~~- $count = strspn( $text, '=', $i, 6 );~~
410		~~- if ( $count == 1 && $findEquals ) {~~
411		~~- // DWIM: This looks kind of like a name/value separator~~
412		~~- // Let's let the equals handler have it and break the potential heading~~
413		~~- // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.~~
414		~~- } elseif ( $count > 0 ) {~~
415		~~- $piece = array(~~
416		~~- 'open' => "\n",~~
417		~~- 'close' => "\n",~~
418		~~- 'parts' => array( new PPDPart( str_repeat( '=', $count ) ) ),~~
419		~~- 'startPos' => $i,~~
420		~~- 'count' => $count );~~
421		~~- $stack->push( $piece );~~
422		~~- $accum =& $stack->getAccum();~~
423		~~- $flags = $stack->getFlags();~~
424		~~- extract( $flags );~~
425		~~- $i += $count;~~
426		~~- }~~
427		~~- }~~
428		-
429		~~- elseif ( $found == 'line-end' ) {~~
430		~~- $piece = $stack->top;~~
431		~~- // A heading must be open, otherwise \n wouldn't have been in the search list~~
432		~~- assert( $piece->open == "\n" );~~
433		~~- $part = $piece->getCurrentPart();~~
434		~~- // Search back through the input to see if it has a proper close~~
435		~~- // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient~~
436		~~- $wsLength = strspn( $revText, " \t", strlen( $text ) - $i );~~
437		~~- $searchStart = $i - $wsLength;~~
438		~~- if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) {~~
439		~~- // Comment found at line end~~
440		~~- // Search for equals signs before the comment~~
441		~~- $searchStart = $part->visualEnd;~~
442		~~- $searchStart -= strspn( $revText, " \t", strlen( $text ) - $searchStart );~~
443		~~- }~~
444		~~- $count = $piece->count;~~
445		~~- $equalsLength = strspn( $revText, '=', strlen( $text ) - $searchStart );~~
446		~~- if ( $equalsLength > 0 ) {~~
447		~~- if ( $i - $equalsLength == $piece->startPos ) {~~
448		~~- // This is just a single string of equals signs on its own line~~
449		~~- // Replicate the doHeadings behaviour /={count}(.+)={count}/~~
450		~~- // First find out how many equals signs there really are (don't stop at 6)~~
451		~~- $count = $equalsLength;~~
452		~~- if ( $count < 3 ) {~~
453		~~- $count = 0;~~
454		~~- } else {~~
455		~~- $count = min( 6, intval( ( $count - 1 ) / 2 ) );~~
456		~~- }~~
457		~~- } else {~~
458		~~- $count = min( $equalsLength, $count );~~
459		~~- }~~
460		~~- if ( $count > 0 ) {~~
461		~~- // Normal match, output <h>~~
462		~~- $element = "<h level=\"$count\" i=\"$headingIndex\">$accum</h>";~~
463		~~- $headingIndex++;~~
464		~~- } else {~~
465		~~- // Single equals sign on its own line, count=0~~
466		~~- $element = $accum;~~
467		~~- }~~
468		~~- } else {~~
469		~~- // No match, no <h>, just pass down the inner text~~
470		~~- $element = $accum;~~
471		~~- }~~
472		~~- // Unwind the stack~~
473		~~- $stack->pop();~~
474		~~- $accum =& $stack->getAccum();~~
475		~~- $flags = $stack->getFlags();~~
476		~~- extract( $flags );~~
477		-
478		~~- // Append the result to the enclosing accumulator~~
479		~~- $accum .= $element;~~
480		~~- // Note that we do NOT increment the input pointer.~~
481		~~- // This is because the closing linebreak could be the opening linebreak of~~
482		~~- // another heading. Infinite loops are avoided because the next iteration MUST~~
483		~~- // hit the heading open case above, which unconditionally increments the~~
484		~~- // input pointer.~~
485		~~- }~~
486		-
487		~~- elseif ( $found == 'open' ) {~~
488		~~- # count opening brace characters~~
489		~~- $count = strspn( $text, $curChar, $i );~~
490		-
491		~~- # we need to add to stack only if opening brace count is enough for one of the rules~~
492		~~- if ( $count >= $rule['min'] ) {~~
493		~~- # Add it to the stack~~
494		~~- $piece = array(~~
495		~~- 'open' => $curChar,~~
496		~~- 'close' => $rule['end'],~~
497		~~- 'count' => $count,~~
498		~~- 'lineStart' => ($i > 0 && $text[$i-1] == "\n"),~~
499		~~- );~~
500		-
501		~~- $stack->push( $piece );~~
502		~~- $accum =& $stack->getAccum();~~
503		~~- $flags = $stack->getFlags();~~
504		~~- extract( $flags );~~
505		~~- } else {~~
506		~~- # Add literal brace(s)~~
507		~~- $accum .= htmlspecialchars( str_repeat( $curChar, $count ) );~~
508		~~- }~~
509		~~- $i += $count;~~
510		~~- }~~
511		-
512		~~- elseif ( $found == 'close' ) {~~
513		~~- $piece = $stack->top;~~
514		~~- # lets check if there are enough characters for closing brace~~
515		~~- $maxCount = $piece->count;~~
516		~~- $count = strspn( $text, $curChar, $i, $maxCount );~~
517		-
518		~~- # check for maximum matching characters (if there are 5 closing~~
519		~~- # characters, we will probably need only 3 - depending on the rules)~~
520		~~- $matchingCount = 0;~~
521		~~- $rule = $rules[$piece->open];~~
522		~~- if ( $count > $rule['max'] ) {~~
523		~~- # The specified maximum exists in the callback array, unless the caller~~
524		~~- # has made an error~~
525		~~- $matchingCount = $rule['max'];~~
526		~~- } else {~~
527		~~- # Count is less than the maximum~~
528		~~- # Skip any gaps in the callback array to find the true largest match~~
529		~~- # Need to use array_key_exists not isset because the callback can be null~~
530		~~- $matchingCount = $count;~~
531		~~- while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $rule['names'] ) ) {~~
532		~~- --$matchingCount;~~
533		~~- }~~
534		~~- }~~
535		-
536		~~- if ($matchingCount <= 0) {~~
537		~~- # No matching element found in callback array~~
538		~~- # Output a literal closing brace and continue~~
539		~~- $accum .= htmlspecialchars( str_repeat( $curChar, $count ) );~~
540		~~- $i += $count;~~
541		~~- continue;~~
542		~~- }~~
543		~~- $name = $rule['names'][$matchingCount];~~
544		~~- if ( $name === null ) {~~
545		~~- // No element, just literal text~~
546		~~- $element = $piece->breakSyntax( $matchingCount ) . str_repeat( $rule['end'], $matchingCount );~~
547		~~- } else {~~
548		~~- # Create XML element~~
549		~~- # Note: $parts is already XML, does not need to be encoded further~~
550		~~- $parts = $piece->parts;~~
551		~~- $title = $parts[0]->out;~~
552		~~- unset( $parts[0] );~~
553		-
554		~~- # The invocation is at the start of the line if lineStart is set in~~
555		~~- # the stack, and all opening brackets are used up.~~
556		~~- if ( $maxCount == $matchingCount && !empty( $piece->lineStart ) ) {~~
557		~~- $attr = ' lineStart="1"';~~
558		~~- } else {~~
559		~~- $attr = '';~~
560		~~- }~~
561		-
562		~~- $element = "<$name$attr>";~~
563		~~- $element .= "<title>$title</title>";~~
564		~~- $argIndex = 1;~~
565		~~- foreach ( $parts as $partIndex => $part ) {~~
566		~~- if ( isset( $part->eqpos ) ) {~~
567		~~- $argName = substr( $part->out, 0, $part->eqpos );~~
568		~~- $argValue = substr( $part->out, $part->eqpos + 1 );~~
569		~~- $element .= "<part><name>$argName</name>=<value>$argValue</value></part>";~~
570		~~- } else {~~
571		~~- $element .= "<part><name index=\"$argIndex\" /><value>{$part->out}</value></part>";~~
572		~~- $argIndex++;~~
573		~~- }~~
574		~~- }~~
575		~~- $element .= "</$name>";~~
576		~~- }~~
577		-
578		~~- # Advance input pointer~~
579		~~- $i += $matchingCount;~~
580		-
581		~~- # Unwind the stack~~
582		~~- $stack->pop();~~
583		~~- $accum =& $stack->getAccum();~~
584		-
585		~~- # Re-add the old stack element if it still has unmatched opening characters remaining~~
586		~~- if ($matchingCount < $piece->count) {~~
587		~~- $piece->parts = array( new PPDPart );~~
588		~~- $piece->count -= $matchingCount;~~
589		~~- # do we still qualify for any callback with remaining count?~~
590		~~- $names = $rules[$piece->open]['names'];~~
591		~~- $skippedBraces = 0;~~
592		~~- $enclosingAccum =& $accum;~~
593		~~- while ( $piece->count ) {~~
594		~~- if ( array_key_exists( $piece->count, $names ) ) {~~
595		~~- $stack->push( $piece );~~
596		~~- $accum =& $stack->getAccum();~~
597		~~- break;~~
598		~~- }~~
599		~~- --$piece->count;~~
600		~~- $skippedBraces ++;~~
601		~~- }~~
602		~~- $enclosingAccum .= str_repeat( $piece->open, $skippedBraces );~~
603		~~- }~~
604		~~- $flags = $stack->getFlags();~~
605		~~- extract( $flags );~~
606		-
607		~~- # Add XML element to the enclosing accumulator~~
608		~~- $accum .= $element;~~
609		~~- }~~
610		-
611		~~- elseif ( $found == 'pipe' ) {~~
612		~~- $findEquals = true; // shortcut for getFlags()~~
613		~~- $stack->addPart();~~
614		~~- $accum =& $stack->getAccum();~~
615		~~- ++$i;~~
616		~~- }~~
617		-
618		~~- elseif ( $found == 'equals' ) {~~
619		~~- $findEquals = false; // shortcut for getFlags()~~
620		~~- $stack->getCurrentPart()->eqpos = strlen( $accum );~~
621		~~- $accum .= '=';~~
622		~~- ++$i;~~
623		~~- }~~
624		~~- }~~
625		-
626		~~- # Output any remaining unclosed brackets~~
627		~~- foreach ( $stack->stack as $piece ) {~~
628		~~- $stack->rootAccum .= $piece->breakSyntax();~~
629		~~- }~~
630		~~- $stack->rootAccum .= '</root>';~~
631		~~- $xml = $stack->rootAccum;~~
632		-
633	156	wfProfileOut( __METHOD__ );
634		-
635	157	return $xml;
636	158	}
637	159	}
638	160
639	161	/**
640		~~- * Stack class to help Preprocessor::preprocessToObj()~~
641		~~- * @ingroup Parser~~
642		~~- */~~
643		~~-class PPDStack {~~
644		~~- var $stack, $rootAccum, $top;~~
645		~~- var $out;~~
646		~~- var $elementClass = 'PPDStackElement';~~
647		-
648		~~- static $false = false;~~
649		-
650		~~- function __construct() {~~
651		~~- $this->stack = array();~~
652		~~- $this->top = false;~~
653		~~- $this->rootAccum = '';~~
654		~~- $this->accum =& $this->rootAccum;~~
655		~~- }~~
656		-
657		~~- function count() {~~
658		~~- return count( $this->stack );~~
659		~~- }~~
660		-
661		~~- function &getAccum() {~~
662		~~- return $this->accum;~~
663		~~- }~~
664		-
665		~~- function getCurrentPart() {~~
666		~~- if ( $this->top === false ) {~~
667		~~- return false;~~
668		~~- } else {~~
669		~~- return $this->top->getCurrentPart();~~
670		~~- }~~
671		~~- }~~
672		-
673		~~- function push( $data ) {~~
674		~~- if ( $data instanceof $this->elementClass ) {~~
675		~~- $this->stack[] = $data;~~
676		~~- } else {~~
677		~~- $class = $this->elementClass;~~
678		~~- $this->stack[] = new $class( $data );~~
679		~~- }~~
680		~~- $this->top = $this->stack[ count( $this->stack ) - 1 ];~~
681		~~- $this->accum =& $this->top->getAccum();~~
682		~~- }~~
683		-
684		~~- function pop() {~~
685		~~- if ( !count( $this->stack ) ) {~~
686		~~- throw new MWException( __METHOD__.': no elements remaining' );~~
687		~~- }~~
688		~~- $temp = array_pop( $this->stack );~~
689		-
690		~~- if ( count( $this->stack ) ) {~~
691		~~- $this->top = $this->stack[ count( $this->stack ) - 1 ];~~
692		~~- $this->accum =& $this->top->getAccum();~~
693		~~- } else {~~
694		~~- $this->top = self::$false;~~
695		~~- $this->accum =& $this->rootAccum;~~
696		~~- }~~
697		~~- return $temp;~~
698		~~- }~~
699		-
700		~~- function addPart( $s = '' ) {~~
701		~~- $this->top->addPart( $s );~~
702		~~- $this->accum =& $this->top->getAccum();~~
703		~~- }~~
704		-
705		~~- function getFlags() {~~
706		~~- if ( !count( $this->stack ) ) {~~
707		~~- return array(~~
708		~~- 'findEquals' => false,~~
709		~~- 'findPipe' => false,~~
710		~~- 'inHeading' => false,~~
711		~~- );~~
712		~~- } else {~~
713		~~- return $this->top->getFlags();~~
714		~~- }~~
715		~~- }~~
716		-}
717		-
718		-/**
719		~~- * @ingroup Parser~~
720		~~- */~~
721		~~-class PPDStackElement {~~
722		~~- var $open, // Opening character (\n for heading)~~
723		~~- $close, // Matching closing character~~
724		~~- $count, // Number of opening characters found (number of "=" for heading)~~
725		~~- $parts, // Array of PPDPart objects describing pipe-separated parts.~~
726		~~- $lineStart; // True if the open char appeared at the start of the input line. Not set for headings.~~
727		-
728		~~- var $partClass = 'PPDPart';~~
729		-
730		~~- function __construct( $data = array() ) {~~
731		~~- $class = $this->partClass;~~
732		~~- $this->parts = array( new $class );~~
733		-
734		~~- foreach ( $data as $name => $value ) {~~
735		~~- $this->$name = $value;~~
736		~~- }~~
737		~~- }~~
738		-
739		~~- function &getAccum() {~~
740		~~- return $this->parts[count($this->parts) - 1]->out;~~
741		~~- }~~
742		-
743		~~- function addPart( $s = '' ) {~~
744		~~- $class = $this->partClass;~~
745		~~- $this->parts[] = new $class( $s );~~
746		~~- }~~
747		-
748		~~- function getCurrentPart() {~~
749		~~- return $this->parts[count($this->parts) - 1];~~
750		~~- }~~
751		-
752		~~- function getFlags() {~~
753		~~- $partCount = count( $this->parts );~~
754		~~- $findPipe = $this->open != "\n" && $this->open != '[';~~
755		~~- return array(~~
756		~~- 'findPipe' => $findPipe,~~
757		~~- 'findEquals' => $findPipe && $partCount > 1 && !isset( $this->parts[$partCount - 1]->eqpos ),~~
758		~~- 'inHeading' => $this->open == "\n",~~
759		~~- );~~
760		~~- }~~
761		-
762		- /**
763		~~- * Get the output string that would result if the close is not found.~~
764		~~- */~~
765		~~- function breakSyntax( $openingCount = false ) {~~
766		~~- if ( $this->open == "\n" ) {~~
767		~~- $s = $this->parts[0]->out;~~
768		~~- } else {~~
769		~~- if ( $openingCount === false ) {~~
770		~~- $openingCount = $this->count;~~
771		~~- }~~
772		~~- $s = str_repeat( $this->open, $openingCount );~~
773		~~- $first = true;~~
774		~~- foreach ( $this->parts as $part ) {~~
775		~~- if ( $first ) {~~
776		~~- $first = false;~~
777		~~- } else {~~
778		~~- $s .= '|';~~
779		~~- }~~
780		~~- $s .= $part->out;~~
781		~~- }~~
782		~~- }~~
783		~~- return $s;~~
784		~~- }~~
785		-}
786		-
787		-/**
788		~~- * @ingroup Parser~~
789		~~- */~~
790		~~-class PPDPart {~~
791		~~- var $out; // Output accumulator string~~
792		-
793		~~- // Optional member variables:~~
794		~~- // eqpos Position of equals sign in output accumulator~~
795		~~- // commentEnd Past-the-end input pointer for the last comment encountered~~
796		~~- // visualEnd Past-the-end input pointer for the end of the accumulator minus comments~~
797		-
798		~~- function __construct( $out = '' ) {~~
799		~~- $this->out = $out;~~
800		~~- }~~
801		-}
802		-
803		-/**
804	162	* An expansion frame, used as a context to expand the result of preprocessToObj()
805	163	* @ingroup Parser
806	164	*/
Index: branches/parser-work/phase3/includes/AutoLoader.php
—	—	@@ -446,10 +446,7 @@
447	447	'PPCustomFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
448	448	'PPCustomFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
449	449	'PPDAccum_Hash' => 'includes/parser/Preprocessor_Hash.php',
450		~~- 'PPDPart' => 'includes/parser/Preprocessor_DOM.php',~~
451	450	'PPDPart_Hash' => 'includes/parser/Preprocessor_Hash.php',
452		~~- 'PPDStack' => 'includes/parser/Preprocessor_DOM.php',~~
453		~~- 'PPDStackElement' => 'includes/parser/Preprocessor_DOM.php',~~
454	451	'PPDStackElement_Hash' => 'includes/parser/Preprocessor_Hash.php',
455	452	'PPDStack_Hash' => 'includes/parser/Preprocessor_Hash.php',
456	453	'PPFrame' => 'includes/parser/Preprocessor.php',
—	—	@@ -463,6 +460,9 @@
464	461	'PPNode_Hash_Tree' => 'includes/parser/Preprocessor_Hash.php',
465	462	'PPTemplateFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
466	463	'PPTemplateFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
	464	+ 'ParseList' => 'includes/parser/ParseTree.php',
	465	+ 'ParseRule' => 'includes/parser/ParseTree.php',
	466	+ 'ParseTree' => 'includes/parser/ParseTree.php',
467	467	'Parser' => 'includes/parser/Parser.php',
468	468	'ParserCache' => 'includes/parser/ParserCache.php',
469	469	'ParserOptions' => 'includes/parser/ParserOptions.php',

Comments

#Comment by Bryan (talk | contribs) 20:56, 7 February 2010

Needs stylize.php

#Comment by Bryan (talk | contribs) 20:58, 7 February 2010

Needs stylize.php

Status & tagging log

18:57, 16 February 2010 😂 (talk | contribs) changed the status of r62084 [removed: new added: deferred]