Index: branches/parser-work/phase3/includes/parser/ParseTree.php |
— | — | @@ -0,0 +1,228 @@ |
| 2 | +<?php |
| 3 | + |
/**
 * A rule specifying how to parse the text.
 * If the text matches mBeginTag then a ParseTree object is created with the appropriate info.
 * mName - The name to give the resultant ParseTree object
 * mBeginTag - the regular expression used to determine if this is the rule that should be used
 * mEndTag - If ParseTrees of this type are to have children, mEndTag specifies when all of the
 *           children are collected. "~N" inside it is replaced by the (regex-quoted) text of
 *           capture group N of mBeginTag.
 * mStopChars - extra characters that indicate markup
 * mChildRule - an extra rule to consider when collecting children, it is only used for
 *              situations covered by the HHP21 parser test
 * @ingroup Parser
 */
class ParseRule {
	private $mName, $mBeginTag, $mEndTag, $mStopChars, $mChildRule;

	function __construct($name, $beginTag, $endTag = NULL, $stopChars = '', $childRule = NULL) {
		$this->mName = $name;
		$this->mBeginTag = $beginTag;
		$this->mEndTag = $endTag;
		$this->mStopChars = $stopChars;
		$this->mChildRule = $childRule;
	}

	/**
	 * Try to apply this rule at the start of $text.
	 *
	 * @param string $text Remaining input; consumed text is removed from its front (by reference)
	 * @param object $parseList Object whose parse( &$text, $stopChars ) returns an array of
	 *        children; only used when this rule has an end tag
	 * @return ParseTree|null The node built from the match, or NULL if mBeginTag did not match
	 */
	function parse(&$text, $parseList) {
		if (! preg_match($this->mBeginTag, $text, $matches)) {
			return NULL;
		}

		// substr() may return false on older PHP when nothing is left; keep $text a string.
		$text = (string)substr($text, strlen($matches[0]));
		$children = array();

		if ($this->mEndTag != NULL) {
			// The first character of a PCRE pattern is its delimiter.
			$delimiter = substr($this->mEndTag, 0, 1);
			$endTag = $this->mEndTag;
			foreach ($matches as $i => $crrnt) {
				// Quote the captured text so that it is matched literally and cannot
				// inject regex syntax into (or break) the end-tag pattern.
				$endTag = str_replace('~' . $i, preg_quote($crrnt, $delimiter), $endTag);
			}

			$endMatches = array();
			while ($text != "" && ($endTag == NULL || ! preg_match($endTag, $text, $endMatches))) {
				if ($this->mChildRule != NULL) {
					$child = $this->mChildRule->parse($text, $parseList);
					if ($child != NULL) {
						$children[] = $child;
					}
				}
				// The child rule may have consumed the rest of the input.
				if ($text != "") {
					$moreChildren = $parseList->parse($text, $this->mStopChars);
					$children = array_merge($children, $moreChildren);
				}
			}

			// Text is left over only when the end tag matched: consume it and append its
			// groups after the begin-tag groups.
			if ($text != "" && isset($endMatches[0])) {
				$text = (string)substr($text, strlen($endMatches[0]));
				$matches = array_merge($matches, $endMatches);
			}
		}

		return new ParseTree($this->mName, $matches, $children);
	}
}
| 57 | + |
/**
 * Contains a list of rules to cycle through when creating a parse tree
 * mList - The list of rules
 * mStopChars - the characters used to find markup
 * @ingroup Parser
 */
class ParseList {
	private $mList, $mStopChars;

	function __construct($list, $stopChars) {
		$this->mList = $list;
		$this->mStopChars = $stopChars;
	}

	/**
	 * Consume one unit of input from the front of $text.
	 *
	 * The first rule in mList that matches contributes its node. If no rule matches, the
	 * first character is taken as a plain string. In both cases the following run of
	 * characters that are not stop characters is then taken as a plain string as well.
	 *
	 * @param string $text Remaining input; consumed text is removed from its front (by reference)
	 * @param string $stopChars Additional stop characters, written so that they are valid
	 *        inside a PCRE character class (they are concatenated with mStopChars)
	 * @return array The collected children (rule results and/or strings), in input order
	 */
	function parse(&$text, $stopChars) {
		$children = array();
		// Stays NULL when the rule list is empty or no rule matches.
		$child = NULL;

		foreach ($this->mList as $crrnt) {
			$child = $crrnt->parse($text, $this);
			if ($child != NULL) {
				$children[] = $child;
				break;
			}
		}

		// No rule matched: pass one character through so that parsing always advances.
		// Nothing is taken from empty input (reading offset 0 of "" would be an error).
		if ($child == NULL && (string)$text !== '') {
			$children[] = $text[0];
			$text = (string)substr($text, 1);
		}

		if (preg_match('/^[^' . $this->mStopChars . $stopChars . ']+/s', $text, $matches)) {
			$children[] = $matches[0];
			$text = (string)substr($text, strlen($matches[0]));
		}

		return $children;
	}
}
| 94 | + |
/**
 * The parse tree of the data.
 * printTree translates the parse tree to xml, eventually this should be separated into a data and engine layer.
 * mName - Indicates what ParseRule was used to create this node
 * mMatches - The text groups that were collected by the regular expressions used when creating this rule
 * mChildren - The children of this node: ParseTree nodes and plain strings
 * @ingroup Parser
 */
class ParseTree {
	private $mName, $mMatches, $mChildren;

	function __construct($name, $matches, $children) {
		$this->mName = $name;
		$this->mMatches = $matches;
		$this->mChildren = $children;
	}

	/**
	 * Build the tree for a whole text by running a "Root" rule over it.
	 *
	 * @param string $text The text to parse; a "~BOF" marker is prepended before parsing
	 * @param ParseList $parseList The rules used to collect the root's children
	 * @return ParseTree|null The result of the root rule
	 */
	static function createParseTree($text, $parseList) {
		wfProfileIn( __METHOD__ );

		$text = "~BOF" . $text;
		$root = new ParseRule("Root", '/^/', '/^\Z/');
		$retTree = $root->parse($text, $parseList);

		wfProfileOut( __METHOD__ );
		return $retTree;
	}

	/**
	 * Serialise this node and its descendants to xml.
	 * This function will definitely need to be separated into data and engine layers.
	 *
	 * @param int $headingInd Index given to the next heading; incremented (by reference)
	 *        for every Heading node visited
	 * @return string
	 */
	function printTree(&$headingInd = 1) {
		$retString = "";

		if ($this->mName == "Literal" || $this->mName == "BugHHP21") {
			$retString = htmlspecialchars($this->mMatches[0]);
		} elseif ($this->mName == "Comment") {
			$retString = "<comment>" . htmlspecialchars($this->mMatches[0]) . "</comment>";
		} elseif ($this->mName == "CommentLine") {
			$retString = htmlspecialchars($this->mMatches[1]) . "<comment>" . htmlspecialchars($this->mMatches[2]) . "</comment>";
		} elseif ($this->mName == "IncludeOnly" || $this->mName == "NoInclude" || $this->mName == "OnlyInclude") {
			$retString = "<ignore>" . htmlspecialchars($this->mMatches[0]) . "</ignore>";
		} elseif ($this->mName == "XmlClosed") {
			$retString = "<ext><name>" . htmlspecialchars($this->mMatches[1]) .
				"</name><attr>" . htmlspecialchars($this->mMatches[2]) . "</attr></ext>";
		} elseif ($this->mName == "XmlOpened") {
			$closeTag = "";
			if (isset($this->mMatches[4]) && $this->mMatches[4] != "") {
				$closeTag = "<close>" . htmlspecialchars($this->mMatches[4]) . "</close>";
			}
			$retString = "<ext><name>" . htmlspecialchars($this->mMatches[1]) . "</name><attr>" . htmlspecialchars($this->mMatches[2]) .
				"</attr><inner>" . htmlspecialchars($this->mMatches[3]) . "</inner>" . $closeTag . "</ext>";
		} elseif ($this->mName == "BeginFile") {
			if (isset($this->mMatches[1])) {
				$retString = "<ignore>" . htmlspecialchars($this->mMatches[1]) . "</ignore>";
			}
		} elseif (($this->mName == "Template" && isset($this->mMatches[2])) || ($this->mName == "TplArg" && isset($this->mMatches[1]))) {
			// The extra match index is only present when the end tag was found.
			$retString = $this->printInvocation($headingInd);
		} else {
			foreach ($this->mChildren as $crrnt) {
				if ($crrnt instanceof ParseTree) {
					$retString .= $crrnt->printTree($headingInd);
				} else {
					$retString .= htmlspecialchars($crrnt);
				}
			}
			if ($this->mName == "Root") {
				$retString = "<root>" . $retString . "</root>";
			} elseif ($this->mName == "TplArg") {
				// Unclosed argument: emit the opening braces as text
				$retString = htmlspecialchars($this->mMatches[0]) . $retString;
			} elseif ($this->mName == "Template") {
				// Unclosed template: emit the opening braces as text
				$retString = "{{" . $retString;
				if ($this->mMatches[1] == "\n") {
					$retString = $this->mMatches[1] . $retString;
				}
			} elseif ($this->mName == "Link") {
				$retString = htmlspecialchars($this->mMatches[0]) . $retString;
				if (isset($this->mMatches[1])) {
					$retString .= htmlspecialchars($this->mMatches[1]);
				}
			} elseif ($this->mName == "Heading") {
				$retString = htmlspecialchars($this->mMatches[2]) . $retString;
				if (isset($this->mMatches[3])) {
					$retString = "<h level=\"" . strlen($this->mMatches[2]) . "\" i=\"" . $headingInd . "\">" .
						$retString . htmlspecialchars($this->mMatches[3]) . "</h>";
				}
				if ($this->mMatches[1] == "\n") {
					$retString = "\n" . $retString;
				}
				$headingInd ++;
			}
		}

		return $retString;
	}

	/**
	 * Serialise a closed Template or TplArg node as title plus parts.
	 *
	 * @param int $headingInd Heading index passed through to child nodes (by reference)
	 * @return string
	 */
	private function printInvocation(&$headingInd) {
		$retString = "";
		$inTitle = true;
		$foundEquals = false;
		$currentItem = "";
		$partInd = 1;

		// A trailing pipe flushes the last item. It is added to a local copy so that the
		// node itself is not modified and printTree can be called repeatedly.
		$items = $this->mChildren;
		$items[] = '|';

		foreach ($items as $crrnt) {
			if ($crrnt instanceof ParseTree) {
				$currentItem .= $crrnt->printTree($headingInd);
			} elseif ($crrnt === '|') {
				if ($inTitle) {
					$retString .= "<title>" . $currentItem . "</title>";
					$inTitle = false;
				} else {
					if (! $foundEquals) {
						// Unnamed part: number it
						$retString .= "<part><name index=\"" . $partInd . "\" />";
						$partInd ++;
					}
					$retString .= "<value>" . $currentItem . "</value></part>";
					$foundEquals = false;
				}
				$currentItem = "";
			} elseif ($crrnt === '=' && ! $inTitle && ! $foundEquals) {
				// Only the first equals sign of a part separates name and value
				$retString .= "<part><name>" . $currentItem . "</name>=";
				$foundEquals = true;
				$currentItem = "";
			} else {
				$currentItem .= htmlspecialchars($crrnt);
			}
		}

		if ($this->mName == "Template") {
			$templateAttr = "";
			if ($this->mMatches[1] != "") {
				$templateAttr = " lineStart=\"1\"";
			}
			$retString = "<template" . $templateAttr . ">" . $retString . "</template>";
			if ($this->mMatches[1] == "\n") {
				$retString = $this->mMatches[1] . $retString;
			}
		} else {
			$retString = "<tplarg>" . $retString . "</tplarg>";
		}

		return $retString;
	}
}
| 229 | + |
Property changes on: branches/parser-work/phase3/includes/parser/ParseTree.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 230 | + native |
Index: branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php |
— | — | @@ -118,688 +118,46 @@ |
119 | 119 | return $obj; |
120 | 120 | } |
121 | 121 | |
/**
 * Preprocessor that reads in wiki text and returns xml.
 * This is the data layer of the new wikitext parser.
 *
 * @param string $text The wikitext to preprocess
 * @param int $flags Bit field; Parser::PTD_FOR_INCLUSION selects the rules used when the
 *        text is being included rather than viewed directly
 * @return string The xml produced by ParseTree::printTree()
 */
function preprocessToXml( $text, $flags = 0 ) {
	wfProfileIn( __METHOD__ );

	// Tag names are matched literally, so quote them before building the alternation.
	$tagNames = array();
	foreach ( $this->parser->getStripList() as $tagName ) {
		$tagNames[] = preg_quote( $tagName, '/' );
	}
	$xmlishRegex = implode( '|', $tagNames );
	if ( $xmlishRegex === '' ) {
		// No extension tags: use a pattern that never matches instead of an empty group,
		// which would accept any tag.
		$xmlishRegex = '(?!)';
	}
	// A tag name must be followed by whitespace, "/>" or ">", otherwise e.g. "<prefoo>"
	// would be taken as the extension tag "pre".
	$nameEnd = '(?=\s|\/>|>)';

	$bugHHP21 = new ParseRule("BugHHP21", '/^\n(?==[^=])/s');

	// Rules are tried in this order. They are keyed by name so that the inclusion
	// variants below replace a rule in place without relying on its position.
	$rules = array(
		'Template' => new ParseRule("Template", '/^((?:\n|~BOF)?){{(?!{[^{])/s', '/^}}/s', '}|=', $bugHHP21),
		'TplArg' => new ParseRule("TplArg", '/^{{{/s', '/^}}}/s', '}|=', $bugHHP21),
		'Link' => new ParseRule("Link", '/^\[\[/s', '/^]]/s', '\]'),
		'Heading' => new ParseRule("Heading", '/^(\n|~BOF)(={1,6})/s', '/^~2(?: *<!--.*?(?:-->|\Z))*(?=\n|\Z)/s', '='),
		'CommentLine' => new ParseRule("CommentLine", '/^(\n *)((?:<!--.*?(?:-->|\Z)(?: *\n)?)+)/s'),
		'Comment' => new ParseRule("Comment", '/^<!--.*?(?:-->|\Z)/s'),
		'OnlyInclude' => new ParseRule("OnlyInclude", '/^<\/?onlyinclude>/s'),
		'NoInclude' => new ParseRule("NoInclude", '/^<\/?noinclude>/s'),
		'IncludeOnly' => new ParseRule("IncludeOnly", '/^<includeonly>.*?(?:<\/includeonly>|\Z)/s'),
		'XmlClosed' => new ParseRule("XmlClosed", '/^<(' . $xmlishRegex . ')' . $nameEnd . '([^>]*)\/>/si'),
		'XmlOpened' => new ParseRule("XmlOpened", '/^<(' . $xmlishRegex . ')' . $nameEnd . '(.*?)>(.*?)(<\/\1>|\Z)/si'),
		'BeginFile' => new ParseRule("BeginFile", '/^~BOF/s'),
	);

	if ( $flags & Parser::PTD_FOR_INCLUSION ) {
		// Assigning to an existing key keeps the rule's position in the list.
		$rules['OnlyInclude'] = new ParseRule("OnlyInclude", '/^<\/onlyinclude>.*?(?:<onlyinclude>|\Z)/s');
		$rules['NoInclude'] = new ParseRule("NoInclude", '/^<noinclude>.*?(?:<\/noinclude>|\Z)/s');
		$rules['IncludeOnly'] = new ParseRule("IncludeOnly", '/^<\/?includeonly>/s');
		$rules['BeginFile'] = new ParseRule("BeginFile", '/^~BOF(.*?<onlyinclude>)?/s');
	}

	$parseList = new ParseList( $rules, '{\[<\n' );
	$parseTree = ParseTree::createParseTree( $text, $parseList );
	$xml = $parseTree->printTree();

	wfProfileOut( __METHOD__ );
	return $xml;
}
637 | 159 | } |
638 | 160 | |
639 | 161 | /** |
640 | | - * Stack class to help Preprocessor::preprocessToObj() |
641 | | - * @ingroup Parser |
642 | | - */ |
643 | | -class PPDStack { |
644 | | - var $stack, $rootAccum, $top; |
645 | | - var $out; |
646 | | - var $elementClass = 'PPDStackElement'; |
647 | | - |
648 | | - static $false = false; |
649 | | - |
650 | | - function __construct() { |
651 | | - $this->stack = array(); |
652 | | - $this->top = false; |
653 | | - $this->rootAccum = ''; |
654 | | - $this->accum =& $this->rootAccum; |
655 | | - } |
656 | | - |
657 | | - function count() { |
658 | | - return count( $this->stack ); |
659 | | - } |
660 | | - |
661 | | - function &getAccum() { |
662 | | - return $this->accum; |
663 | | - } |
664 | | - |
665 | | - function getCurrentPart() { |
666 | | - if ( $this->top === false ) { |
667 | | - return false; |
668 | | - } else { |
669 | | - return $this->top->getCurrentPart(); |
670 | | - } |
671 | | - } |
672 | | - |
673 | | - function push( $data ) { |
674 | | - if ( $data instanceof $this->elementClass ) { |
675 | | - $this->stack[] = $data; |
676 | | - } else { |
677 | | - $class = $this->elementClass; |
678 | | - $this->stack[] = new $class( $data ); |
679 | | - } |
680 | | - $this->top = $this->stack[ count( $this->stack ) - 1 ]; |
681 | | - $this->accum =& $this->top->getAccum(); |
682 | | - } |
683 | | - |
684 | | - function pop() { |
685 | | - if ( !count( $this->stack ) ) { |
686 | | - throw new MWException( __METHOD__.': no elements remaining' ); |
687 | | - } |
688 | | - $temp = array_pop( $this->stack ); |
689 | | - |
690 | | - if ( count( $this->stack ) ) { |
691 | | - $this->top = $this->stack[ count( $this->stack ) - 1 ]; |
692 | | - $this->accum =& $this->top->getAccum(); |
693 | | - } else { |
694 | | - $this->top = self::$false; |
695 | | - $this->accum =& $this->rootAccum; |
696 | | - } |
697 | | - return $temp; |
698 | | - } |
699 | | - |
700 | | - function addPart( $s = '' ) { |
701 | | - $this->top->addPart( $s ); |
702 | | - $this->accum =& $this->top->getAccum(); |
703 | | - } |
704 | | - |
705 | | - function getFlags() { |
706 | | - if ( !count( $this->stack ) ) { |
707 | | - return array( |
708 | | - 'findEquals' => false, |
709 | | - 'findPipe' => false, |
710 | | - 'inHeading' => false, |
711 | | - ); |
712 | | - } else { |
713 | | - return $this->top->getFlags(); |
714 | | - } |
715 | | - } |
716 | | -} |
717 | | - |
718 | | -/** |
719 | | - * @ingroup Parser |
720 | | - */ |
721 | | -class PPDStackElement { |
722 | | - var $open, // Opening character (\n for heading) |
723 | | - $close, // Matching closing character |
724 | | - $count, // Number of opening characters found (number of "=" for heading) |
725 | | - $parts, // Array of PPDPart objects describing pipe-separated parts. |
726 | | - $lineStart; // True if the open char appeared at the start of the input line. Not set for headings. |
727 | | - |
728 | | - var $partClass = 'PPDPart'; |
729 | | - |
730 | | - function __construct( $data = array() ) { |
731 | | - $class = $this->partClass; |
732 | | - $this->parts = array( new $class ); |
733 | | - |
734 | | - foreach ( $data as $name => $value ) { |
735 | | - $this->$name = $value; |
736 | | - } |
737 | | - } |
738 | | - |
739 | | - function &getAccum() { |
740 | | - return $this->parts[count($this->parts) - 1]->out; |
741 | | - } |
742 | | - |
743 | | - function addPart( $s = '' ) { |
744 | | - $class = $this->partClass; |
745 | | - $this->parts[] = new $class( $s ); |
746 | | - } |
747 | | - |
748 | | - function getCurrentPart() { |
749 | | - return $this->parts[count($this->parts) - 1]; |
750 | | - } |
751 | | - |
752 | | - function getFlags() { |
753 | | - $partCount = count( $this->parts ); |
754 | | - $findPipe = $this->open != "\n" && $this->open != '['; |
755 | | - return array( |
756 | | - 'findPipe' => $findPipe, |
757 | | - 'findEquals' => $findPipe && $partCount > 1 && !isset( $this->parts[$partCount - 1]->eqpos ), |
758 | | - 'inHeading' => $this->open == "\n", |
759 | | - ); |
760 | | - } |
761 | | - |
762 | | - /** |
763 | | - * Get the output string that would result if the close is not found. |
764 | | - */ |
765 | | - function breakSyntax( $openingCount = false ) { |
766 | | - if ( $this->open == "\n" ) { |
767 | | - $s = $this->parts[0]->out; |
768 | | - } else { |
769 | | - if ( $openingCount === false ) { |
770 | | - $openingCount = $this->count; |
771 | | - } |
772 | | - $s = str_repeat( $this->open, $openingCount ); |
773 | | - $first = true; |
774 | | - foreach ( $this->parts as $part ) { |
775 | | - if ( $first ) { |
776 | | - $first = false; |
777 | | - } else { |
778 | | - $s .= '|'; |
779 | | - } |
780 | | - $s .= $part->out; |
781 | | - } |
782 | | - } |
783 | | - return $s; |
784 | | - } |
785 | | -} |
786 | | - |
787 | | -/** |
788 | | - * @ingroup Parser |
789 | | - */ |
790 | | -class PPDPart { |
791 | | - var $out; // Output accumulator string |
792 | | - |
793 | | - // Optional member variables: |
794 | | - // eqpos Position of equals sign in output accumulator |
795 | | - // commentEnd Past-the-end input pointer for the last comment encountered |
796 | | - // visualEnd Past-the-end input pointer for the end of the accumulator minus comments |
797 | | - |
798 | | - function __construct( $out = '' ) { |
799 | | - $this->out = $out; |
800 | | - } |
801 | | -} |
802 | | - |
803 | | -/** |
804 | 162 | * An expansion frame, used as a context to expand the result of preprocessToObj() |
805 | 163 | * @ingroup Parser |
806 | 164 | */ |
Index: branches/parser-work/phase3/includes/AutoLoader.php |
— | — | @@ -446,10 +446,7 @@ |
447 | 447 | 'PPCustomFrame_Hash' => 'includes/parser/Preprocessor_Hash.php', |
448 | 448 | 'PPCustomFrame_DOM' => 'includes/parser/Preprocessor_DOM.php', |
449 | 449 | 'PPDAccum_Hash' => 'includes/parser/Preprocessor_Hash.php', |
450 | | - 'PPDPart' => 'includes/parser/Preprocessor_DOM.php', |
451 | 450 | 'PPDPart_Hash' => 'includes/parser/Preprocessor_Hash.php', |
452 | | - 'PPDStack' => 'includes/parser/Preprocessor_DOM.php', |
453 | | - 'PPDStackElement' => 'includes/parser/Preprocessor_DOM.php', |
454 | 451 | 'PPDStackElement_Hash' => 'includes/parser/Preprocessor_Hash.php', |
455 | 452 | 'PPDStack_Hash' => 'includes/parser/Preprocessor_Hash.php', |
456 | 453 | 'PPFrame' => 'includes/parser/Preprocessor.php', |
— | — | @@ -463,6 +460,9 @@ |
464 | 461 | 'PPNode_Hash_Tree' => 'includes/parser/Preprocessor_Hash.php', |
465 | 462 | 'PPTemplateFrame_DOM' => 'includes/parser/Preprocessor_DOM.php', |
466 | 463 | 'PPTemplateFrame_Hash' => 'includes/parser/Preprocessor_Hash.php', |
| 464 | + 'ParseList' => 'includes/parser/ParseTree.php', |
| 465 | + 'ParseRule' => 'includes/parser/ParseTree.php', |
| 466 | + 'ParseTree' => 'includes/parser/ParseTree.php', |
467 | 467 | 'Parser' => 'includes/parser/Parser.php', |
468 | 468 | 'ParserCache' => 'includes/parser/ParserCache.php', |
469 | 469 | 'ParserOptions' => 'includes/parser/ParserOptions.php', |