r62939 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r62938‎ | r62939 | r62940 >
Date:23:44, 24 February 2010
Author:than4213
Status:deferred
Tags:
Comment:
Finish change to have parser directly create DOM. The parse engine is finally complete.
Modified paths:
  • /branches/parser-work/phase3/includes/AutoLoader.php (modified) (history)
  • /branches/parser-work/phase3/includes/DefaultSettings.php (modified) (history)
  • /branches/parser-work/phase3/includes/parser/ParseEngine.php (added) (history)
  • /branches/parser-work/phase3/includes/parser/ParseTree.php (deleted) (history)
  • /branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php (modified) (history)

Diff [purge]

Index: branches/parser-work/phase3/includes/parser/ParseTree.php
@@ -1,218 +0,0 @@
2 -<?php
3 -/**
4 - * Interface for Parse Object each with a specialized task while parsing
5 - * @ingroup Parser
6 - */
7 -abstract class ParseObject {
8 - protected $mName;
9 -
10 - function __construct($name) {
11 - $this->mName = $name;
12 - }
13 -
14 - // Does the parse task specific to each parse object
15 - abstract function parse(&$text, &$rules, $replaceStr = NULL);
16 -}
17 -
18 -/**
19 - * A rule specifying how to parse the text.
20 - * If the text matches mBeginTag then a ParseTree object is created with the appropriate info.
21 - * mName - The name to give the resultant ParseTree object
22 - * mBeginTag - the regular expression used to determine if this is the rule that should be used
23 - * mReplaceStr - Collected patterns that should be passed to children
24 - * mChildRule - What Parse rule to use to gather children for this element
25 - * @ingroup Parser
26 - */
27 -class ParsePattern extends ParseObject {
28 - private $mBeginTag, $mChildRule, $mReplaceStr;
29 -
30 - function __construct($name, $beginTag, $childRule = NULL, $replaceStr = NULL) {
31 - parent::__construct($name);
32 - $this->mBeginTag = $beginTag;
33 - $this->mChildRule = $childRule;
34 - $this->mReplaceStr = $replaceStr;
35 - }
36 -
37 - function parse(&$text, &$rules, $replaceStr = NULL) {
38 - $beginTag = $this->mBeginTag;
39 - if ($replaceStr != NULL) {
40 - $beginTag = str_replace('~r', $replaceStr, $beginTag);
41 - }
42 - if (! preg_match($beginTag, $text, $matches)) {
43 - return NULL;
44 - }
45 - $text = substr($text, strlen($matches[0]));
46 - $children = NULL;
47 - if ($this->mChildRule != NULL) {
48 - if ($this->mReplaceStr != NULL) {
49 - $replaceStr = $this->mReplaceStr;
50 - foreach ($matches as $i => $crrnt) {
51 - $replaceStr = str_replace('~' . $i, $crrnt, $replaceStr);
52 - }
53 - }
54 - $child = $rules[$this->mChildRule]->parse($text, $rules, $replaceStr);
55 - if ($child == NULL) {
56 - return NULL;
57 - }
58 - $children = array($child);
59 - }
60 - return new ParseTree($this->mName, $matches, $children);
61 - }
62 -}
63 -
64 -/**
65 - * A rule specifying how to parse the text.
66 - * If the text matches mBeginTag then a ParseTree object is created with the appropriate info.
67 - * mName - The name to give the resultant ParseTree object
68 - * mChildRule - What Parse rule to use to gather children for this element
69 - * mEndTag - If ParseTrees of this type are to have children, mEndTag specifies when all of the children are collected
70 - * mMinChildren - Minimum amount of children for this rule
71 - * mMaxChildren - Maximum amount of children for this rule, 0 means unlimited
72 - * @ingroup Parser
73 - */
74 -class ParseQuant extends ParseObject {
75 - private $mChildRule, $mEndTag, $mMinChildren, $mMaxChildren;
76 -
77 - function __construct($name, $childRule, $endTag = NULL, $minChildren = 0, $maxChildren = 0) {
78 - parent::__construct($name);
79 - $this->mChildRule = $childRule;
80 - $this->mEndTag = $endTag;
81 - $this->mMinChildren = $minChildren;
82 - $this->mMaxChildren = $maxChildren;
83 - }
84 -
85 - function parse(&$text, &$rules, $replaceStr = NULL) {
86 - $endTag = $this->mEndTag;
87 - if ($endTag != NULL && $replaceStr != NULL) {
88 - $endTag = str_replace('~r', $replaceStr, $endTag);
89 - }
90 - $children = array();
91 - for ($i = 0; $i < $this->mMinChildren || (($endTag == NULL || ! preg_match($endTag, $text, $matches)) &&
92 - ($this->mMaxChildren <= 0 || $i < $this->mMaxChildren)); $i ++) {
93 - $child = $rules[$this->mChildRule]->parse($text, $rules, $replaceStr);
94 - if ($child == NULL) {
95 - if ($endTag != NULL || $i < $this->mMinChildren) {
96 - return NULL;
97 - }
98 - break;
99 - }
100 - $children[] = $child;
101 - }
102 - if ($endTag != NULL) {
103 - if (! isset($matches[0])) {
104 - return NULL;
105 - }
106 - $text = substr($text, strlen($matches[0]));
107 - }
108 - return new ParseTree($this->mName, NULL, $children);
109 - }
110 -}
111 -
112 -/**
113 - * Contains a list of rules to cycle through when creating a parse tree
114 - * mList - The list of rules
115 - * @ingroup Parser
116 - */
117 -class ParseChoice extends ParseObject {
118 - private $mList, $matchChar;
119 -
120 - function __construct($name, $list, $matchChar = null) {
121 - parent::__construct($name);
122 - $this->mList = $list;
123 - $this->mMatchChar = $matchChar;
124 - }
125 -
126 - function parse(&$text, &$rules, $replaceStr = NULL) {
127 - if ($this->mMatchChar != NULL && $text[0] != $this->mMatchChar) {
128 - return NULL;
129 - }
130 - foreach ($this->mList as $crrnt) {
131 - $newText = $text;
132 - $child = $rules[$crrnt]->parse($newText, $rules, $replaceStr);
133 - if ($child != NULL) {
134 - $text = $newText;
135 - return new ParseTree($this->mName, NULL, array($child));
136 - }
137 - }
138 - return NULL;
139 - }
140 -}
141 -
142 -/**
143 - * Contains a sequence of rules all of which must pass
144 - * mName - The name to give the resultant ParseTree object
145 - * mList - The sequence of rules
146 - * @ingroup Parser
147 - */
148 -class ParseSeq extends ParseObject {
149 - private $mList;
150 -
151 - function __construct($name, $list) {
152 - parent::__construct($name);
153 - $this->mList = $list;
154 - }
155 -
156 - function parse(&$text, &$rules, $replaceStr = NULL) {
157 - $children = array();
158 - foreach ($this->mList as $crrnt) {
159 - $child = $rules[$crrnt]->parse($text, $rules, $replaceStr);
160 - if ($child == NULL) {
161 - return NULL;
162 - }
163 - $children[] = $child;
164 - }
165 - return new ParseTree($this->mName, NULL, $children);
166 - }
167 -}
168 -
169 -/**
170 - * The parse tree of the data.
171 - * printTree translates the parse tree to xml, eventually this should be seperated into a data and engine layer.
172 - * mName - Indicates what ParseObject was used to create this node
173 - * mMatches - The text groups that were collected by the regular expressions used when creating this rule
174 - * mChildren - The child ParseTree nodes in this tree
175 - * @ingroup Parser
176 - */
177 -class ParseTree {
178 - private $mName, $mMatches, $mChildren;
179 -
180 - function __construct($name, $matches, $children) {
181 - $this->mName = $name;
182 - $this->mMatches = $matches;
183 - $this->mChildren = $children;
184 - }
185 -
186 - function getName() {
187 - return $this->mName;
188 - }
189 -
190 - //this function will definitely need to be seperated into data and engine layers
191 - function printTree() {
192 - $retString = "";
193 -
194 - if ($this->mName == "text") {
195 - $retString = htmlspecialchars($this->mMatches[0]);
196 - } elseif ($this->mName == "newline") {
197 - $retString = htmlspecialchars($this->mMatches[0]) . $this->mChildren[0]->printTree();
198 - } elseif ($this->mName == "link") {
199 - $retString = htmlspecialchars($this->mMatches[0]) . $this->mChildren[0]->printTree() . "]]";
200 - } elseif ($this->mName == "h") {
201 - $retString = "<h>" . htmlspecialchars($this->mMatches[0]) . $this->mChildren[0]->printTree() .
202 - htmlspecialchars($this->mMatches[0]) . "</h>";
203 - } elseif ($this->mName != "unUsed") {
204 - if ($this->mChildren != NULL) {
205 - foreach ($this->mChildren as $crrnt) {
206 - $retString .= $crrnt->printTree();
207 - }
208 - } else {
209 - $retString = htmlspecialchars($this->mMatches[0]);
210 - }
211 - if ($this->mName != "unnamed") {
212 - $retString = "<" . $this->mName . ">" . $retString . "</" . $this->mName . ">";
213 - }
214 - }
215 -
216 - return $retString;
217 - }
218 -}
219 -
Index: branches/parser-work/phase3/includes/parser/ParseEngine.php
@@ -0,0 +1,207 @@
 2+<?php
/**
 * Acts as the primary interface between the world and the parser.
 *
 * Holds the table of named rules and drives parsing from a start rule,
 * building the result directly as a DOMDocument.
 *
 * mRules     - map of rule name => rule object; each rule is invoked as
 *              parse( &$text, &$engine, &$dom, &$children, $replaceStr )
 * mStartRule - name of the first rule to use while parsing
 * mDom       - document used to create DOM nodes; returned at the end of parsing
 */
class ParseEngine {
	private $mRules, $mStartRule, $mDom;

	/**
	 * @param $rules Array: rule name => rule object
	 * @param $startRule String: key in $rules at which parsing starts
	 */
	function __construct($rules, $startRule) {
		$this->mRules = $rules;
		$this->mStartRule = $startRule;
	}

	/**
	 * Parse $text with the start rule and return the resulting document.
	 *
	 * @param $text String: text to parse; taken by reference and handed to
	 *   the rules, which shorten it as they match
	 * @return DOMDocument whose root is the first node produced by the start rule
	 * @throws MWException if the start rule fails or produces no node
	 */
	function parse(&$text) {
		global $wgDebugParserLog;
		if ($wgDebugParserLog != '') {
			wfErrorLog("==========Start Parsing==========\n", $wgDebugParserLog);
		}
		$this->mDom = new DOMDocument();
		$children = array();
		if (! $this->callParser($this->mStartRule, $text, $children, NULL)) {
			throw new MWException('Parser rejected text.');
		}
		// A start rule that succeeds without producing a node would otherwise
		// hand NULL to appendChild() and die with an unrelated fatal error.
		if (! isset($children[0]) || ! ($children[0] instanceof DOMNode)) {
			throw new MWException('Parser start rule "' . $this->mStartRule . '" produced no root node.');
		}
		$this->mDom->appendChild($children[0]);
		if ($wgDebugParserLog != '') {
			wfErrorLog("XML - " . $this->mDom->saveXML() . "\n", $wgDebugParserLog);
		}
		return $this->mDom;
	}

	/**
	 * Run the named rule against $text, logging entry and exit when
	 * $wgDebugParserLog is set.
	 *
	 * @param $childName String: name of the rule to run
	 * @param $text String: remaining input, by reference
	 * @param $children Array: receives the nodes produced by the rule
	 * @param $replaceStr String|null: passed through to the rule unchanged
	 * @return Boolean: the rule's own return value
	 * @throws MWException if no rule is registered under $childName
	 */
	function callParser($childName, &$text, &$children, $replaceStr) {
		global $wgDebugParserLog;
		if (! isset($this->mRules[$childName])) {
			throw new MWException('Parser rule "' . $childName . '" is not defined.');
		}
		if ($wgDebugParserLog != '') {
			wfErrorLog("Entering " . $childName . ", Text - " . $text . "\n", $wgDebugParserLog);
		}
		// The rule signature takes the engine by reference; hand it a local
		// alias rather than $this so there is an ordinary variable to bind.
		$engine = $this;
		$retCode = $this->mRules[$childName]->parse($text, $engine, $this->mDom, $children, $replaceStr);
		if ($wgDebugParserLog != '') {
			wfErrorLog("Exiting " . $childName . ", Text - " . $text . "\n", $wgDebugParserLog);
		}
		return $retCode;
	}
}
 45+
 46+
/**
 * Contract shared by all parse rules, each of which performs one
 * specialized task while parsing.
 */
interface ParseObject {
	/**
	 * Perform the parse task specific to this rule.
	 *
	 * @param $text String: remaining input, by reference
	 * @param $engine Object: engine used to invoke other rules by name, by reference
	 * @param $dom DOMDocument: document used to create nodes, by reference
	 * @param $children Array: receives the nodes produced by the rule, by reference
	 * @param $replaceStr String|null: replacement string handed down from the calling rule
	 * @return Boolean
	 */
	function parse(&$text, &$engine, &$dom, &$children, $replaceStr);
}
 52+
/**
 * Rule that matches a regular expression against the text and saves the
 * first captured group.
 *
 * mMatchPat - regular expression deciding whether this rule applies; every
 *             '~r' in it is replaced by the replace string before matching
 */
class ParsePattern implements ParseObject {
	private $mMatchPat;

	function __construct($matchPat) {
		$this->mMatchPat = $matchPat;
	}

	/**
	 * On a match, drops strlen( whole match ) bytes from the front of $text
	 * and sets $children to a single text node holding capture group 1, or
	 * to an empty array when the pattern has no such group.
	 *
	 * @return Boolean: TRUE on a match, FALSE otherwise ($text and $children untouched)
	 */
	function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
		$pattern = ($replaceStr != NULL)
			? str_replace('~r', $replaceStr, $this->mMatchPat)
			: $this->mMatchPat;
		if (preg_match($pattern, $text, $found)) {
			$text = substr($text, strlen($found[0]));
			$children = isset($found[1])
				? array($dom->createTextNode($found[1]))
				: array();
			return TRUE;
		}
		return FALSE;
	}
}
 80+
/**
 * Deals with cases where a rule can be matched multiple or 0 times.
 *
 * mChildRule   - name of the parse rule to quantify
 * mMinChildren - minimum number of successful matches required
 * mMaxChildren - maximum number of matches to attempt, 0 means unlimited
 */
class ParseQuant implements ParseObject {
	private $mChildRule, $mMinChildren, $mMaxChildren;

	function __construct($childRule, $minChildren = 0, $maxChildren = 0) {
		$this->mChildRule = $childRule;
		$this->mMinChildren = $minChildren;
		$this->mMaxChildren = $maxChildren;
	}

	/**
	 * Apply the child rule repeatedly, collecting the nodes of every
	 * successful match into $children.
	 *
	 * @return Boolean: FALSE if fewer than mMinChildren matches succeeded
	 *   (in which case $text is restored), TRUE otherwise
	 */
	function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
		$startText = $text;
		$unbounded = $this->mMaxChildren <= 0;
		$children = array();
		for ($i = 0; $unbounded || $i < $this->mMaxChildren; $i ++) {
			$textBefore = $text;
			$retChildren = array();
			if (! $engine->callParser($this->mChildRule, $text, $retChildren, $replaceStr)) {
				if ($i < $this->mMinChildren) {
					// Overall failure: give back everything consumed so far.
					$text = $startText;
					return FALSE;
				}
				// Undo whatever the failed attempt consumed before it gave up.
				$text = $textBefore;
				break;
			}
			$children = array_merge($children, $retChildren);
			// A child that succeeds without consuming input would succeed
			// forever; with no upper bound that is an endless loop, so stop
			// as soon as the minimum is satisfied.
			if ($unbounded && $i + 1 >= $this->mMinChildren && strlen($text) === strlen($textBefore)) {
				break;
			}
		}
		return TRUE;
	}
}
 110+
/**
 * Cycles through an array of rules until it finds one that succeeds.
 *
 * mList      - the list of rule names, tried in order
 * mMatchChar - a shortcut: if the text does not start with this string,
 *              parse returns FALSE without trying any rule
 */
class ParseChoice implements ParseObject {
	private $mList, $mMatchChar;

	function __construct($list, $matchChar = NULL) {
		$this->mList = $list;
		$this->mMatchChar = $matchChar;
	}

	/**
	 * Try each rule on a copy of $text; the first success commits its
	 * consumed text and children.
	 *
	 * @return Boolean: TRUE if some rule succeeded, FALSE otherwise
	 *   ($text is left unchanged on failure)
	 */
	function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
		// Prefix test instead of $text[0]: indexing an empty string raises
		// an "Uninitialized string offset" error, and a loose NULL check
		// would silently ignore a match char of "0".
		$prefix = (string)$this->mMatchChar;
		if ($prefix !== '' && strncmp((string)$text, $prefix, strlen($prefix)) !== 0) {
			return FALSE;
		}
		foreach ($this->mList as $crrnt) {
			// Work on a copy so a failed alternative cannot eat input.
			$newText = $text;
			if ($engine->callParser($crrnt, $newText, $children, $replaceStr)) {
				$text = $newText;
				return TRUE;
			}
		}
		return FALSE;
	}
}
 138+
/**
 * Contains a sequence of rules all of which must pass.
 *
 * mList       - the sequence of rule names
 * mReplaceStr - a string used to determine the close tag of bracketed markup;
 *               any '~r' in it is replaced by the incoming replace string
 * mSaveStr    - Boolean specifying whether to pull the replace string from
 *               the text matched by the first rule of the sequence
 */
class ParseSeq implements ParseObject {
	private $mList, $mReplaceStr, $mSaveStr;

	function __construct($list, $replaceStr = NULL, $saveStr = FALSE) {
		$this->mList = $list;
		$this->mReplaceStr = $replaceStr;
		$this->mSaveStr = $saveStr;
	}

	/**
	 * Run every rule of the sequence in order, concatenating their nodes
	 * into $children.
	 *
	 * @return Boolean: TRUE if every rule succeeded; FALSE as soon as one
	 *   fails, in which case $text is restored to its value on entry
	 */
	function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
		if ($this->mReplaceStr != NULL) {
			if ($replaceStr != NULL) {
				$replaceStr = str_replace('~r', $replaceStr, $this->mReplaceStr);
			} else {
				$replaceStr = $this->mReplaceStr;
			}
		}
		$startText = $text;
		$children = array();
		$isFirst = TRUE;
		foreach ($this->mList as $crrnt) {
			// Fresh array per rule so nodes of an earlier rule are never merged twice.
			$retChildren = array();
			if (! $engine->callParser($crrnt, $text, $retChildren, $replaceStr)) {
				// Earlier rules already consumed input; hand it back so the
				// caller sees the text exactly as it was passed in.
				$text = $startText;
				return FALSE;
			}
			if ($isFirst && $this->mSaveStr && isset($retChildren[0]) && $retChildren[0] instanceof DOMText) {
				// The first rule's text becomes the replace string for the
				// remaining rules and is not emitted as a child.
				$replaceStr = $retChildren[0]->wholeText;
			} else {
				$children = array_merge($children, $retChildren);
			}
			$isFirst = FALSE;
		}
		return TRUE;
	}
}
 176+
/**
 * Creates a DOM element that wraps the nodes produced by a child rule.
 *
 * mName      - the tag name of the element to create
 * mChildRule - name of the rule whose nodes become the element's children
 * mAttrName  - name of an attribute to add to the element (optional)
 * mAttrValue - value of the attribute
 */
class ParseAssign implements ParseObject {
	private $mName, $mChildRule, $mAttrName, $mAttrValue;

	function __construct($name, $childRule, $attrName = NULL, $attrValue = NULL) {
		$this->mName = $name;
		$this->mChildRule = $childRule;
		$this->mAttrName = $attrName;
		$this->mAttrValue = $attrValue;
	}

	/**
	 * Run the child rule and, on success, set $children to a single new
	 * element containing everything the child rule produced.
	 *
	 * @return Boolean: the child rule's success
	 */
	function parse(&$text, &$engine, &$dom, &$children, $replaceStr) {
		$retChildren = array();
		if (! $engine->callParser($this->mChildRule, $text, $retChildren, $replaceStr)) {
			return FALSE;
		}
		$retNode = $dom->createElement($this->mName);
		foreach ($retChildren as $child) {
			$retNode->appendChild($child);
		}
		// Compare the value as a string: a loose NULL check would silently
		// drop legitimate values such as "0".
		if ($this->mAttrName != NULL && (string)$this->mAttrValue !== '') {
			$retNode->setAttribute($this->mAttrName, $this->mAttrValue);
		}
		$children = array($retNode);
		return TRUE;
	}
}
 208+
Property changes on: branches/parser-work/phase3/includes/parser/ParseEngine.php
___________________________________________________________________
Name: svn:eol-style
1209 + native
Index: branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php
@@ -57,7 +57,6 @@
5858 * change in the DOM tree for a given text, must be passed through the section identifier
5959 * in the section edit link and thus back to extractSections().
6060 *
61 - * Temporarily removed the cache because the parser now parses straight to DOM
6261 * The output of this function is currently only cached in process memory, but a persistent
6362 * cache may be implemented at a later date which takes further advantage of these strict
6463 * dependency requirements.
@@ -66,90 +65,171 @@
6766 */
6867 function preprocessToObj( $text, $flags = 0 ) {
6968 wfProfileIn( __METHOD__ );
 69+ global $wgMemc, $wgPreprocessorCacheThreshold;
7070
71 - // To XML
72 - $xmlishRegex = implode('|', $this->parser->getStripList());
73 - $rules = array(
74 - "Template" => new ParsePattern("template", '/^{{(?!{[^{])/s', "TemplateSeq", '}}'),
75 - "TplArg" => new ParsePattern("tplarg", '/^{{{/s', "TemplateSeq", '}}}'),
76 - "TplPart" => new ParsePattern("part", '/^\|/s', "TplPartList"),
77 - "Link" => new ParsePattern("link", '/^\[\[/s', "MainQuant", ']]'),
78 - "NewLine" => new ParsePattern("newline", '/^\n/s', "NewLineChoice"),
79 - "Heading" => new ParsePattern("h", '/^={1,6}/s', "HeadingQuant", '~0'),
80 - "CommentLine" => new ParsePattern("commentline", '/^(?:<!--.*?-->\n)+/s'),
81 - "XmlExt" => new ParsePattern("ext", '/^<(?=(' . $xmlishRegex . '))/si', "XmlExtSeq", '~1'),
82 - "Comment" => new ParsePattern("comment", '/^<!--.*?(?:-->|$)/s'),
83 - "OnlyInclude" => new ParsePattern("ignore", '/^<\/?onlyinclude>/s'),
84 - "NoInclude" => new ParsePattern("ignore", '/^<\/?noinclude>/s'),
85 - "IncludeOnly" => new ParsePattern("ignore", '/^<includeonly>.*?<\/includeonly>/s'),
86 - "MainText" => new ParsePattern("text", '/^.[^{}\[\]<\n|=]*/s'),
87 - "XmlName" => new ParsePattern("name", '/^.*?(?= |\/>|>)/s'),
88 - "XmlAttr" => new ParsePattern("attr", '/^.*?(?=\/>|>)/s'),
89 - "XmlClosed" => new ParsePattern("unUsed", '/^\/>/si'),
90 - "XmlOpened" => new ParsePattern("unUsed", '/^>/si'),
91 - "XmlInner" => new ParsePattern("inner", '/^.*?(?=<\/~r>|$)/si'),
92 - "XmlCloseTag" => new ParsePattern("close", '/^<\/~r>/si'),
93 - "StartQuant" => new ParseQuant("unnamed", "MainChoice", '/^$/'),
94 - "BOFQuant" => new ParseQuant("unnamed", "NewLineChoice", NULL, 0, 1),
95 - "MainQuant" => new ParseQuant("unnamed", "MainChoice", '/^~r/s'),
96 - "HeadingQuant" => new ParseQuant("unnamed", "MainChoice", '/^~r(?=(?: *<!--.*?-->)*(?:\n|$))/s'),
97 - "TplTitle" => new ParseQuant("title", "MainChoice", '/^(?=~r|\|)/s'),
98 - "TplPartQuant" => new ParseQuant("unnamed", "TplPart", '/^~r/s'),
99 - "TplTest" => new ParseQuant("unnamed", "MainChoice", '/^(?=~r|\||=(?!~r|\|))/s'),
100 - "TplName" => new ParseQuant("name", "TplTest", '/^=/s', 0, 1),
101 - "TplValue" => new ParseQuant("value", "MainChoice", '/^(?=~r|\|)/s'),
102 - "XmlCloseQuant" => new ParseQuant("unnamed", "XmlCloseTag", NULL, 0, 1),
103 - "MainChoice" => new ParseChoice("unnamed", array("CurlyChoice", "XmlChoice", "NewLine", "Link", "MainText")),
104 - "CurlyChoice" => new ParseChoice("unnamed", array("Template", "TplArg"), "{"),
105 - "XmlChoice" => new ParseChoice("unnamed", array("Comment", "OnlyInclude", "NoInclude", "IncludeOnly", "XmlExt"), "<"),
106 - "NewLineChoice" => new ParseChoice("unnamed", array("Heading", "CommentLine")),
107 - "TplPartList" => new ParseChoice("unnamed", array("TplPartSeq", "TplValue")),
108 - "XmlClose" => new ParseChoice("unnamed", array("XmlClosed", "XmlOpenedSeq")),
109 - "StartSeq" => new ParseSeq("root", array("BOFQuant", "StartQuant")),
110 - "TemplateSeq" => new ParseSeq("unnamed", array("TplTitle", "TplPartQuant")),
111 - "TplPartSeq" => new ParseSeq("unnamed", array("TplName", "TplValue")),
112 - "XmlExtSeq" => new ParseSeq("unnamed", array("XmlName", "XmlAttr", "XmlClose")),
113 - "XmlOpenedSeq" => new ParseSeq("unnamed", array("XmlOpened", "XmlInner", "XmlCloseQuant")));
114 - if ($flags & Parser::PTD_FOR_INCLUSION) {
115 - $rules["BOFQuant"] = new ParseQuant("unnamed", "StartChoice", NULL, 0, 1);
116 - $rules["StartChoice"] = new ParseChoice("unnamed", array("OnlyIncludeBOF", "NewLineChoice"));
117 - $rules["OnlyIncludeBOF"] = new ParsePattern("ignore", '/^.*?<onlyinclude>/s');
118 - $rules["OnlyInclude"] = new ParsePattern("ignore", '/^<\/onlyinclude>.*?(?:<onlyinclude>|$)/s');
119 - $rules["NoInclude"] = new ParsePattern("ignore", '/^<noinclude>.*?<\/noinclude>/s');
120 - $rules["IncludeOnly"] = new ParsePattern("ignore", '/^<\/?includeonly>/s');
 71+ $xml = false;
 72+ $cacheable = strlen( $text ) > $wgPreprocessorCacheThreshold;
 73+ if ( $cacheable ) {
 74+ wfProfileIn( __METHOD__.'-cacheable' );
 75+
 76+ $cacheKey = wfMemcKey( 'preprocess-xml', md5($text), $flags );
 77+ $cacheValue = $wgMemc->get( $cacheKey );
 78+ if ( $cacheValue ) {
 79+ $version = substr( $cacheValue, 0, 8 );
 80+ if ( intval( $version ) == self::CACHE_VERSION ) {
 81+ $xml = substr( $cacheValue, 8 );
 82+ // From the cache
 83+ wfDebugLog( "Preprocessor", "Loaded preprocessor XML from memcached (key $cacheKey)" );
 84+ }
 85+ }
12186 }
122 -
123 - $parseTree = $rules["StartSeq"]->parse($text, $rules);
124 - $xml = $parseTree->printTree();
125 -
126 - // To DOM
127 - wfProfileIn( __METHOD__.'-loadXML' );
128 - $dom = new DOMDocument;
129 - wfSuppressWarnings();
130 - $result = $dom->loadXML( $xml );
131 - wfRestoreWarnings();
132 - if ( !$result ) {
133 - // Try running the XML through UtfNormal to get rid of invalid characters
134 - $xml = UtfNormal::cleanUp( $xml );
 87+ $dom = false;
 88+ if ( $xml === false ) {
 89+ if ( $cacheable ) {
 90+ wfProfileIn( __METHOD__.'-cache-miss' );
 91+ }
 92+ $dom = $this->preprocessToDom( $text, $flags );
 93+ if ( $cacheable ) {
 94+ $cacheValue = sprintf( "%08d", self::CACHE_VERSION ) . $dom->saveXML();
 95+ $wgMemc->set( $cacheKey, $cacheValue, 86400 );
 96+ wfProfileOut( __METHOD__.'-cache-miss' );
 97+ wfDebugLog( "Preprocessor", "Saved preprocessor XML to memcached (key $cacheKey)" );
 98+ }
 99+ } else {
 100+ wfProfileIn( __METHOD__.'-loadXML' );
 101+ $dom = new DOMDocument;
 102+ wfSuppressWarnings();
135103 $result = $dom->loadXML( $xml );
 104+ wfRestoreWarnings();
136105 if ( !$result ) {
137 - throw new MWException( __METHOD__.' generated invalid XML' );
 106+ // Try running the XML through UtfNormal to get rid of invalid characters
 107+ $xml = UtfNormal::cleanUp( $xml );
 108+ $result = $dom->loadXML( $xml );
 109+ if ( !$result ) {
 110+ throw new MWException( __METHOD__.' generated invalid XML' );
 111+ }
138112 }
 113+ wfProfileOut( __METHOD__.'-loadXML' );
139114 }
140 - $this->transformDOM($dom);
141 -
142 - // To Obj
143115 $obj = new PPNode_DOM( $dom->documentElement );
144 -
 116+ if ( $cacheable ) {
 117+ wfProfileOut( __METHOD__.'-cacheable' );
 118+ }
145119 wfProfileOut( __METHOD__ );
146120 return $obj;
147121 }
148122
 123+ // Set up parser data for wikitext then feed the given text to the parser
 124+ private function preprocessToDom(&$text, $flags = 0) {
 125+ wfProfileIn( __METHOD__ );
 126+
 127+ $xmlishRegex = implode('|', $this->parser->getStripList());
 128+ $rules = array(
 129+ "Root" => new ParseAssign("root", "StartSeq"),
 130+ "StartSeq" => new ParseSeq(array("BOFQuant", "MainQuant"), '$'),
 131+ "BOFQuant" => new ParseQuant("HeadingChoice", 0, 1),
 132+ "DefaultPat" => new ParsePattern('/^~r/s'),
 133+ "SavedPat" => new ParsePattern('/^(~r)/s'),
 134+ "MainQuant" => new ParseQuant("MainChoice"),
 135+ "MainChoice"=> new ParseChoice(array("CurlyChoice", "XmlChoice", "NewLineSeq", "LinkSeq", "MainText")),
 136+ "CurlyChoice" => new ParseChoice(array("Template", "TplArg"), "{"),
 137+ "Template" => new ParseAssign("template", "TplSeq"),
 138+ "TplSeq" => new ParseSeq(array("TemplatePat", "TemplateSeq"), '}}'),
 139+ "TemplatePat" => new ParsePattern('/^{{(?!{[^{])/s'),
 140+ "TplArg" => new ParseAssign("tplarg", "TplArgSeq"),
 141+ "TplArgSeq" => new ParseSeq(array("TplArgPat", "TemplateSeq"), '}}}'),
 142+ "TplArgPat" => new ParsePattern('/^{{{/s'),
 143+ "XmlChoice" => new ParseChoice(array("Comment", "OnlyInclude", "NoInclude", "IncludeOnly", "XmlExt"), "<"),
 144+ "Comment" => new ParseAssign("comment", "CommentPat"),
 145+ "CommentPat" => new ParsePattern('/^(<!--.*?(?:-->|$))/s'),
 146+ "OnlyInclude" => new ParseAssign("ignore", "OnlyIncludePat"),
 147+ "OnlyIncludePat" => new ParsePattern('/^(<\/?onlyinclude>)/s'),
 148+ "NoInclude" => new ParseAssign("ignore", "NoIncludePat"),
 149+ "NoIncludePat" => new ParsePattern('/^(<\/?noinclude>)/s'),
 150+ "IncludeOnly" => new ParseAssign("ignore", "IncludeOnlyPat"),
 151+ "IncludeOnlyPat" => new ParsePattern('/^(<includeonly>.*?<\/includeonly>)/s'),
 152+ "XmlExt" => new ParseAssign("ext", "XmlExtSeq"),
 153+ "XmlExtSeq" => new ParseSeq(array("XmlExtPat", "XmlName", "XmlAttr", "XmlClose"), NULL, TRUE),
 154+ "XmlExtPat" => new ParsePattern('/^<(?=(' . $xmlishRegex . '))/si'),
 155+ "XmlName" => new ParseAssign("name", "SavedPat"),
 156+ "XmlAttr" => new ParseAssign("attr", "XmlAttrPat"),
 157+ "XmlAttrPat" => new ParsePattern('/^(.*?)(?=\/>|>)/s'),
 158+ "XmlClose" => new ParseChoice(array("XmlClosed", "XmlOpenedSeq")),
 159+ "XmlClosed" => new ParsePattern('/^\/>/s'),
 160+ "XmlOpenedSeq" => new ParseSeq(array("XmlOpened", "XmlInner", "XmlCloseQuant")),
 161+ "XmlOpened" => new ParsePattern('/^>/s'),
 162+ "XmlInner" => new ParseAssign("inner", "XmlInnerPat"),
 163+ "XmlInnerPat" => new ParsePattern('/^(.*?)(?=<\/~r>|$)/si'),
 164+ "XmlCloseQuant" => new ParseQuant("XmlCloseTag", 0, 1),
 165+ "XmlCloseTag" => new ParseAssign("close", "XmlClosePat"),
 166+ "XmlClosePat" => new ParsePattern('/^(<\/~r>)/si'),
 167+ "NewLineSeq" => new ParseSeq(array("NewLine", "NewLineChoice")),
 168+ "NewLine" => new ParsePattern('/^(\n)/s'),
 169+ "NewLineChoice" => new ParseChoice(array("HeadingChoice", "CommentLine")),
 170+ "CommentLine" => new ParseAssign("comment", "CommentLinePat"),
 171+ "CommentLinePat" => new ParsePattern('/^((?:<!--.*?-->\n)+)/s'),
 172+ "LinkSeq" => new ParseSeq(array("Link", "MainQuant", "SavedPat"), ']]'),
 173+ "Link" => new ParsePattern('/^(\[\[)/s'),
 174+ "MainText" => new ParsePattern('/^(?!~r)(.[^{}\[\]<\n|=]*)/s'),
 175+ "HeadingChoice" => new ParseChoice(array("Heading6", "Heading5", "Heading4", "Heading3", "Heading2", "Heading1"), "="),
 176+ "Heading6" => new ParseAssign("h", "Heading6Seq", "level", "6"),
 177+ "Heading6Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '======'),
 178+ "Heading5" => new ParseAssign("h", "Heading5Seq", "level", "5"),
 179+ "Heading5Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '====='),
 180+ "Heading4" => new ParseAssign("h", "Heading4Seq", "level", "4"),
 181+ "Heading4Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '===='),
 182+ "Heading3" => new ParseAssign("h", "Heading3Seq", "level", "3"),
 183+ "Heading3Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '==='),
 184+ "Heading2" => new ParseAssign("h", "Heading2Seq", "level", "2"),
 185+ "Heading2Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '=='),
 186+ "Heading1" => new ParseAssign("h", "Heading1Seq", "level", "1"),
 187+ "Heading1Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '='),
 188+ "HeadingSeq" => new ParseSeq(array("MainQuant", "DefaultPat"), '~r(?=(?: *<!--.*?-->)*(?:\n|$))'),
 189+ "TemplateSeq" => new ParseSeq(array("TplTitle", "TplPartQuant", "DefaultPat"), '~r|\|'),
 190+ "TplTitle" => new ParseAssign("title", "MainQuant"),
 191+ "TplPartQuant" => new ParseQuant("TplPart"),
 192+ "TplPart" => new ParseAssign("part", "TplPartSeq"),
 193+ "TplPartSeq" => new ParseSeq(array("TplPipe", "TplPartList")),
 194+ "TplPipe" => new ParsePattern('/^\|/s'),
 195+ "TplPartList" => new ParseChoice(array("NamedPartSeq", "TplValue")),
 196+ "NamedPartSeq" => new ParseSeq(array("TplName", "TplValue")),
 197+ "TplName" => new ParseAssign("name", "TplNameSeq"),
 198+ "TplNameSeq" => new ParseSeq(array("MainQuant", "TplEquals"), '~r|\||=(?!~r|\|)'),
 199+ "TplEquals" => new ParsePattern('/^=/s'),
 200+ "TplValue" => new ParseAssign("value", "MainQuant"));
 201+ if ($flags & Parser::PTD_FOR_INCLUSION) {
 202+ $rules["BOFQuant"] = new ParseQuant("StartChoice", 0, 1);
 203+ $rules["StartChoice"] = new ParseChoice(array("OnlyIncludeBOF", "HeadingChoice"));
 204+ $rules["OnlyIncludeBOF"] = new ParseAssign("ignore", "OnlyIncludeBOFPat");
 205+ $rules["OnlyIncludeBOFPat"] = new ParsePattern('/^(.*?<onlyinclude>)/s');
 206+ $rules["OnlyIncludePat"] = new ParsePattern('/^(<\/onlyinclude>.*?(?:<onlyinclude>|$))/s');
 207+ $rules["NoIncludePat"] = new ParsePattern('/^(<noinclude>.*?<\/noinclude>)/s');
 208+ $rules["IncludeOnlyPat"] = new ParsePattern('/^(<\/?includeonly>)/s');
 209+ }
 210+ $parser = new ParseEngine($rules, "Root");
 211+
 212+ $dom = $parser->parse($text);
 213+ $this->transformDOM($dom);
 214+
 215+ wfProfileOut( __METHOD__ );
 216+ return $dom;
 217+ }
 218+
149219 // Temporary function to add needed redundant info to the parse tree after parsing.
150220 private function transformDOM(&$node, &$headingInd = 1) {
151221 if ($node->hasChildNodes()) {
152222 if ($node->nodeName == "h") {
153 - $node->setAttribute("level", strspn($node->firstChild->wholeText, "=", 0, 6 ));
 223+ $headerTag = str_repeat("=", $node->getAttribute("level"));
 224+ if ($node->firstChild instanceof DOMText) {
 225+ $node->firstChild->insertData(0, $headerTag);
 226+ } else {
 227+ $node->insertBefore($node->ownerDocument->createTextNode($headerTag), $crrnt->firstChild);
 228+ }
 229+ if ($node->lastChild instanceof DOMText) {
 230+ $node->lastChild->appendData($headerTag);
 231+ } else {
 232+ $node->appendChild($node->ownerDocument->createTextNode($headerTag));
 233+ }
154234 $node->setAttribute("i", $headingInd);
155235 $headingInd ++;
156236 } elseif ($node->nodeName == "template" && $node->previousSibling instanceof DOMText) {
Index: branches/parser-work/phase3/includes/AutoLoader.php
@@ -460,12 +460,13 @@
461461 'PPNode_Hash_Tree' => 'includes/parser/Preprocessor_Hash.php',
462462 'PPTemplateFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
463463 'PPTemplateFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
464 - 'ParseChoice' => 'includes/parser/ParseTree.php',
465 - 'ParseObject' => 'includes/parser/ParseTree.php',
466 - 'ParsePattern' => 'includes/parser/ParseTree.php',
467 - 'ParseQuant' => 'includes/parser/ParseTree.php',
468 - 'ParseSeq' => 'includes/parser/ParseTree.php',
469 - 'ParseTree' => 'includes/parser/ParseTree.php',
 464+ 'ParseAssign' => 'includes/parser/ParseEngine.php',
 465+ 'ParseChoice' => 'includes/parser/ParseEngine.php',
 466+ 'ParseEngine' => 'includes/parser/ParseEngine.php',
 467+ 'ParseObject' => 'includes/parser/ParseEngine.php',
 468+ 'ParsePattern' => 'includes/parser/ParseEngine.php',
 469+ 'ParseQuant' => 'includes/parser/ParseEngine.php',
 470+ 'ParseSeq' => 'includes/parser/ParseEngine.php',
470471 'Parser' => 'includes/parser/Parser.php',
471472 'ParserCache' => 'includes/parser/ParserCache.php',
472473 'ParserOptions' => 'includes/parser/ParserOptions.php',
Index: branches/parser-work/phase3/includes/DefaultSettings.php
@@ -1232,6 +1232,11 @@
12331233 $wgDebugPrintHttpHeaders = true;
12341234
12351235 /**
 1236+ * Log file for debugging the parser. If left empty, no parser log is written.
 1237+ */
 1238+$wgDebugParserLog = '';
 1239+
 1240+/**
12361241 * Show the contents of $wgHooks in Special:Version
12371242 */
12381243 $wgSpecialVersionShowHooks = false;

Status & tagging log