Index: branches/parser-work/phase3/includes/parser/ParseTree.php |
— | — | @@ -1,218 +0,0 @@ |
2 | | -<?php |
3 | | -/** |
4 | | - * Interface for Parse Object each with a specialized task while parsing |
5 | | - * @ingroup Parser |
6 | | - */ |
7 | | -abstract class ParseObject { |
8 | | - protected $mName; |
9 | | - |
10 | | - function __construct($name) { |
11 | | - $this->mName = $name; |
12 | | - } |
13 | | - |
14 | | - // Does the parse task specific to each parse object |
15 | | - abstract function parse(&$text, &$rules, $replaceStr = NULL); |
16 | | -} |
17 | | - |
18 | | -/** |
19 | | - * A rule specifying how to parse the text. |
20 | | - * If the text matches mBeginTag then a ParseTree object is created with the appropriate info. |
21 | | - * mName - The name to give the resultant ParseTree object |
22 | | - * mBeginTag - the regular expression used to determine if this is the rule that should be used |
23 | | - * mReplaceStr - Collected patterns that should be passed to children |
24 | | - * mChildRule - What Parse rule to use to gather children for this element |
25 | | - * @ingroup Parser |
26 | | - */ |
27 | | -class ParsePattern extends ParseObject { |
28 | | - private $mBeginTag, $mChildRule, $mReplaceStr; |
29 | | - |
30 | | - function __construct($name, $beginTag, $childRule = NULL, $replaceStr = NULL) { |
31 | | - parent::__construct($name); |
32 | | - $this->mBeginTag = $beginTag; |
33 | | - $this->mChildRule = $childRule; |
34 | | - $this->mReplaceStr = $replaceStr; |
35 | | - } |
36 | | - |
37 | | - function parse(&$text, &$rules, $replaceStr = NULL) { |
38 | | - $beginTag = $this->mBeginTag; |
39 | | - if ($replaceStr != NULL) { |
40 | | - $beginTag = str_replace('~r', $replaceStr, $beginTag); |
41 | | - } |
42 | | - if (! preg_match($beginTag, $text, $matches)) { |
43 | | - return NULL; |
44 | | - } |
45 | | - $text = substr($text, strlen($matches[0])); |
46 | | - $children = NULL; |
47 | | - if ($this->mChildRule != NULL) { |
48 | | - if ($this->mReplaceStr != NULL) { |
49 | | - $replaceStr = $this->mReplaceStr; |
50 | | - foreach ($matches as $i => $crrnt) { |
51 | | - $replaceStr = str_replace('~' . $i, $crrnt, $replaceStr); |
52 | | - } |
53 | | - } |
54 | | - $child = $rules[$this->mChildRule]->parse($text, $rules, $replaceStr); |
55 | | - if ($child == NULL) { |
56 | | - return NULL; |
57 | | - } |
58 | | - $children = array($child); |
59 | | - } |
60 | | - return new ParseTree($this->mName, $matches, $children); |
61 | | - } |
62 | | -} |
63 | | - |
64 | | -/** |
65 | | - * A rule specifying how to parse the text. |
66 | | - * If the text matches mBeginTag then a ParseTree object is created with the appropriate info. |
67 | | - * mName - The name to give the resultant ParseTree object |
68 | | - * mChildRule - What Parse rule to use to gather children for this element |
69 | | - * mEndTag - If ParseTrees of this type are to have children, mEndTag specifies when all of the children are collected |
70 | | - * mMinChildren - Minimum amount of children for this rule |
71 | | - * mMaxChildren - Maximum amount of children for this rule, 0 means unlimited |
72 | | - * @ingroup Parser |
73 | | - */ |
74 | | -class ParseQuant extends ParseObject { |
75 | | - private $mChildRule, $mEndTag, $mMinChildren, $mMaxChildren; |
76 | | - |
77 | | - function __construct($name, $childRule, $endTag = NULL, $minChildren = 0, $maxChildren = 0) { |
78 | | - parent::__construct($name); |
79 | | - $this->mChildRule = $childRule; |
80 | | - $this->mEndTag = $endTag; |
81 | | - $this->mMinChildren = $minChildren; |
82 | | - $this->mMaxChildren = $maxChildren; |
83 | | - } |
84 | | - |
85 | | - function parse(&$text, &$rules, $replaceStr = NULL) { |
86 | | - $endTag = $this->mEndTag; |
87 | | - if ($endTag != NULL && $replaceStr != NULL) { |
88 | | - $endTag = str_replace('~r', $replaceStr, $endTag); |
89 | | - } |
90 | | - $children = array(); |
91 | | - for ($i = 0; $i < $this->mMinChildren || (($endTag == NULL || ! preg_match($endTag, $text, $matches)) && |
92 | | - ($this->mMaxChildren <= 0 || $i < $this->mMaxChildren)); $i ++) { |
93 | | - $child = $rules[$this->mChildRule]->parse($text, $rules, $replaceStr); |
94 | | - if ($child == NULL) { |
95 | | - if ($endTag != NULL || $i < $this->mMinChildren) { |
96 | | - return NULL; |
97 | | - } |
98 | | - break; |
99 | | - } |
100 | | - $children[] = $child; |
101 | | - } |
102 | | - if ($endTag != NULL) { |
103 | | - if (! isset($matches[0])) { |
104 | | - return NULL; |
105 | | - } |
106 | | - $text = substr($text, strlen($matches[0])); |
107 | | - } |
108 | | - return new ParseTree($this->mName, NULL, $children); |
109 | | - } |
110 | | -} |
111 | | - |
112 | | -/** |
113 | | - * Contains a list of rules to cycle through when creating a parse tree |
114 | | - * mList - The list of rules |
115 | | - * @ingroup Parser |
116 | | - */ |
117 | | -class ParseChoice extends ParseObject { |
118 | | - private $mList, $matchChar; |
119 | | - |
120 | | - function __construct($name, $list, $matchChar = null) { |
121 | | - parent::__construct($name); |
122 | | - $this->mList = $list; |
123 | | - $this->mMatchChar = $matchChar; |
124 | | - } |
125 | | - |
126 | | - function parse(&$text, &$rules, $replaceStr = NULL) { |
127 | | - if ($this->mMatchChar != NULL && $text[0] != $this->mMatchChar) { |
128 | | - return NULL; |
129 | | - } |
130 | | - foreach ($this->mList as $crrnt) { |
131 | | - $newText = $text; |
132 | | - $child = $rules[$crrnt]->parse($newText, $rules, $replaceStr); |
133 | | - if ($child != NULL) { |
134 | | - $text = $newText; |
135 | | - return new ParseTree($this->mName, NULL, array($child)); |
136 | | - } |
137 | | - } |
138 | | - return NULL; |
139 | | - } |
140 | | -} |
141 | | - |
142 | | -/** |
143 | | - * Contains a sequence of rules all of which must pass |
144 | | - * mName - The name to give the resultant ParseTree object |
145 | | - * mList - The sequence of rules |
146 | | - * @ingroup Parser |
147 | | - */ |
148 | | -class ParseSeq extends ParseObject { |
149 | | - private $mList; |
150 | | - |
151 | | - function __construct($name, $list) { |
152 | | - parent::__construct($name); |
153 | | - $this->mList = $list; |
154 | | - } |
155 | | - |
156 | | - function parse(&$text, &$rules, $replaceStr = NULL) { |
157 | | - $children = array(); |
158 | | - foreach ($this->mList as $crrnt) { |
159 | | - $child = $rules[$crrnt]->parse($text, $rules, $replaceStr); |
160 | | - if ($child == NULL) { |
161 | | - return NULL; |
162 | | - } |
163 | | - $children[] = $child; |
164 | | - } |
165 | | - return new ParseTree($this->mName, NULL, $children); |
166 | | - } |
167 | | -} |
168 | | - |
169 | | -/** |
170 | | - * The parse tree of the data. |
171 | | - * printTree translates the parse tree to xml, eventually this should be seperated into a data and engine layer. |
172 | | - * mName - Indicates what ParseObject was used to create this node |
173 | | - * mMatches - The text groups that were collected by the regular expressions used when creating this rule |
174 | | - * mChildren - The child ParseTree nodes in this tree |
175 | | - * @ingroup Parser |
176 | | - */ |
177 | | -class ParseTree { |
178 | | - private $mName, $mMatches, $mChildren; |
179 | | - |
180 | | - function __construct($name, $matches, $children) { |
181 | | - $this->mName = $name; |
182 | | - $this->mMatches = $matches; |
183 | | - $this->mChildren = $children; |
184 | | - } |
185 | | - |
186 | | - function getName() { |
187 | | - return $this->mName; |
188 | | - } |
189 | | - |
190 | | - //this function will definitely need to be seperated into data and engine layers |
191 | | - function printTree() { |
192 | | - $retString = ""; |
193 | | - |
194 | | - if ($this->mName == "text") { |
195 | | - $retString = htmlspecialchars($this->mMatches[0]); |
196 | | - } elseif ($this->mName == "newline") { |
197 | | - $retString = htmlspecialchars($this->mMatches[0]) . $this->mChildren[0]->printTree(); |
198 | | - } elseif ($this->mName == "link") { |
199 | | - $retString = htmlspecialchars($this->mMatches[0]) . $this->mChildren[0]->printTree() . "]]"; |
200 | | - } elseif ($this->mName == "h") { |
201 | | - $retString = "<h>" . htmlspecialchars($this->mMatches[0]) . $this->mChildren[0]->printTree() . |
202 | | - htmlspecialchars($this->mMatches[0]) . "</h>"; |
203 | | - } elseif ($this->mName != "unUsed") { |
204 | | - if ($this->mChildren != NULL) { |
205 | | - foreach ($this->mChildren as $crrnt) { |
206 | | - $retString .= $crrnt->printTree(); |
207 | | - } |
208 | | - } else { |
209 | | - $retString = htmlspecialchars($this->mMatches[0]); |
210 | | - } |
211 | | - if ($this->mName != "unnamed") { |
212 | | - $retString = "<" . $this->mName . ">" . $retString . "</" . $this->mName . ">"; |
213 | | - } |
214 | | - } |
215 | | - |
216 | | - return $retString; |
217 | | - } |
218 | | -} |
219 | | - |
Index: branches/parser-work/phase3/includes/parser/ParseEngine.php |
— | — | @@ -0,0 +1,207 @@ |
| 2 | +<?php |
| 3 | +/** |
| 4 | + * Acts as the primary interface between the world and the parser: parse() runs the start rule over the text and returns the resulting DOMDocument. |
| 5 | + * mRules - Array of ParseObject rules, keyed by rule name |
| 6 | + * mStartRule - Name of the first rule to use while parsing |
| 7 | + * mDom - DOMDocument used to create nodes while parsing; returned at the end of parsing |
| 8 | + */ |
| 9 | +class ParseEngine { |
| 10 | + private $mRules, $mStartRule, $mDom; |
| 11 | + |
| 12 | + function __construct($rules, $startRule) { |
| 13 | + $this->mRules = $rules; |
| 14 | + $this->mStartRule = $startRule; |
| 15 | + } |
| 16 | + |
| 17 | + function parse(&$text) { // Parses $text (modified in place by the rules) and returns the DOMDocument; throws MWException when the start rule fails |
| 18 | + global $wgDebugParserLog; |
| 19 | + if ($wgDebugParserLog != '') { |
| 20 | + wfErrorLog("==========Start Parsing==========\n", $wgDebugParserLog); |
| 21 | + } |
| 22 | + $this->mDom = new DOMDocument(); |
| 23 | + if (! $this->callParser($this->mStartRule, $text, $children, NULL) || ! isset($children[0])) { // a start rule that yields no node leaves nothing to attach to the document |
| 24 | + throw new MWException('Parser rejected text.'); |
| 25 | + } |
| 26 | + $this->mDom->appendChild($children[0]); |
| 27 | + if ($wgDebugParserLog != '') { |
| 28 | + wfErrorLog("XML - " . $this->mDom->saveXML() . "\n", $wgDebugParserLog); |
| 29 | + } |
| 30 | + return $this->mDom; |
| 31 | + } |
| 32 | + |
| 33 | + function callParser($childName, &$text, &$children, $replaceStr) { // Runs the rule named $childName on $text and returns its TRUE/FALSE result; logs entry and exit when $wgDebugParserLog is set |
| 34 | + global $wgDebugParserLog; |
| 35 | + if ($wgDebugParserLog != '') { |
| 36 | + wfErrorLog("Entering " . $childName . ", Text - " . $text . "\n", $wgDebugParserLog); |
| 37 | + } |
| 38 | + $retCode = $this->mRules[$childName]->parse($text, $this, $this->mDom, $children, $replaceStr); |
| 39 | + if ($wgDebugParserLog != '') { |
| 40 | + wfErrorLog("Exiting " . $childName . ", Text - " . $text . "\n", $wgDebugParserLog); |
| 41 | + } |
| 42 | + return $retCode; |
| 43 | + } |
| 44 | +} |
| 45 | + |
| 46 | + |
| 47 | +// Interface for Parse objects, each with a specialized task while parsing; $engine runs other rules by name, $dom creates nodes, $replaceStr is substituted for ~r |
| 48 | +interface ParseObject { |
| 49 | + // Does the parse task specific to each parse object: returns TRUE on success, with $children holding the DOM nodes produced, or FALSE on failure |
| 50 | + function parse(&$text, &$engine, &$dom, &$children, $replaceStr); |
| 51 | +} |
| 52 | + |
| 53 | +/** |
| 54 | + * Deals with pattern matching and saving strings from the text: on a match, as many leading bytes as the match is long are removed from the text and capture group 1, if set, is returned as a text node. |
| 55 | + * mMatchPat - the regular expression matched against the text (assumed to be anchored with ^); any ~r in it is replaced by the replace string when one is given |
| 56 | + */ |
| 57 | +class ParsePattern implements ParseObject { |
| 58 | + private $mMatchPat; |
| 59 | + |
| 60 | + function __construct($matchPat) { |
| 61 | + $this->mMatchPat = $matchPat; |
| 62 | + } |
| 63 | + |
| 64 | + function parse(&$text, &$engine, &$dom, &$children, $replaceStr) { |
| 65 | + $regEx = $this->mMatchPat; |
| 66 | + if ($replaceStr != NULL) { |
| 67 | + $regEx = str_replace('~r', $replaceStr, $regEx); |
| 68 | + } |
| 69 | + if (! preg_match($regEx, $text, $matches)) { |
| 70 | + return FALSE; |
| 71 | + } |
| 72 | + $text = substr($text, strlen($matches[0])); |
| 73 | + $children = array(); |
| 74 | + if (isset($matches[1])) { |
| 75 | + $children[] = $dom->createTextNode($matches[1]); |
| 76 | + } |
| 77 | + return TRUE; |
| 78 | + } |
| 79 | +} |
| 80 | + |
| 81 | +/** |
| 82 | + * Deals with cases where a rule can be matched multiple or 0 times: applies mChildRule repeatedly and collects the nodes from every successful pass. |
| 83 | + * mChildRule - Name of the parse rule to quantify |
| 84 | + * mMinChildren - Minimum number of successful matches; with fewer, parse returns FALSE |
| 85 | + * mMaxChildren - Maximum number of matches for this rule, 0 (or less) means unlimited |
| 86 | + */ |
| 87 | +class ParseQuant implements ParseObject { |
| 88 | + private $mChildRule, $mMinChildren, $mMaxChildren; |
| 89 | + |
| 90 | + function __construct($childRule, $minChildren = 0, $maxChildren = 0) { |
| 91 | + $this->mChildRule = $childRule; |
| 92 | + $this->mMinChildren = $minChildren; |
| 93 | + $this->mMaxChildren = $maxChildren; |
| 94 | + } |
| 95 | + |
| 96 | + function parse(&$text, &$engine, &$dom, &$children, $replaceStr) { |
| 97 | + $children = array(); |
| 98 | + for ($i = 0; $this->mMaxChildren <= 0 || $i < $this->mMaxChildren; $i ++) { |
| 99 | + if (! $engine->callParser($this->mChildRule, $text, $retChildren, $replaceStr)) { |
| 100 | + if ($i < $this->mMinChildren) { |
| 101 | + return FALSE; |
| 102 | + } |
| 103 | + break; |
| 104 | + } |
| 105 | + foreach ($retChildren as $retChild) { $children[] = $retChild; } // append in place; array_merge() would recopy the whole list on every pass |
| 106 | + } |
| 107 | + return TRUE; |
| 108 | + } |
| 109 | +} |
| 110 | + |
| 111 | +/** |
| 112 | + * Cycles through an array of rules until it finds one that succeeds; each rule is tried on a copy of the text, which is only committed on success. |
| 113 | + * mList - The list of rule names, tried in order |
| 114 | + * mMatchChar - This is a shortcut. If set and the text is empty or starts with a different char, parse returns FALSE without trying any rule. |
| 115 | + */ |
| 116 | +class ParseChoice implements ParseObject { |
| 117 | + private $mList, $mMatchChar; |
| 118 | + |
| 119 | + function __construct($list, $matchChar = NULL) { |
| 120 | + $this->mList = $list; |
| 121 | + $this->mMatchChar = $matchChar; |
| 122 | + } |
| 123 | + |
| 124 | + function parse(&$text, &$engine, &$dom, &$children, $replaceStr) { |
| 125 | + if ($this->mMatchChar != NULL && (! isset($text[0]) || $text[0] != $this->mMatchChar)) { // isset() avoids reading offset 0 of empty text at end of input |
| 126 | + return FALSE; |
| 127 | + } |
| 128 | + foreach ($this->mList as $crrnt) { |
| 129 | + $newText = $text; |
| 130 | + if ($engine->callParser($crrnt, $newText, $children, $replaceStr)) { |
| 131 | + $text = $newText; |
| 132 | + return TRUE; |
| 133 | + } |
| 134 | + } |
| 135 | + return FALSE; |
| 136 | + } |
| 137 | +} |
| 138 | + |
| 139 | +/** |
| 140 | + * Contains a sequence of rules all of which must pass; the nodes they produce are concatenated in order. |
| 141 | + * mList - The sequence of rule names |
| 142 | + * mReplaceStr - A string used to determine the close tag of bracketed markup; passed to the child rules as their replace string, with any ~r in it replaced by the incoming replace string |
| 143 | + * mSaveStr - Boolean specifying whether to pull the replace string from the text: if the first rule's first node is a DOMText, its text becomes the replace string for the remaining rules and the first rule's nodes are not kept |
| 144 | + */ |
| 145 | +class ParseSeq implements ParseObject { |
| 146 | + private $mList, $mReplaceStr, $mSaveStr; |
| 147 | + |
| 148 | + function __construct($list, $replaceStr = NULL, $saveStr = FALSE) { |
| 149 | + $this->mList = $list; |
| 150 | + $this->mReplaceStr = $replaceStr; |
| 151 | + $this->mSaveStr = $saveStr; |
| 152 | + } |
| 153 | + |
| 154 | + function parse(&$text, &$engine, &$dom, &$children, $replaceStr) { |
| 155 | + if ($this->mReplaceStr != NULL) { |
| 156 | + if ($replaceStr != NULL) { |
| 157 | + $replaceStr = str_replace('~r', $replaceStr, $this->mReplaceStr); |
| 158 | + } else { |
| 159 | + $replaceStr = $this->mReplaceStr; |
| 160 | + } |
| 161 | + } |
| 162 | + $children = array(); |
| 163 | + foreach ($this->mList as $i => $crrnt) { |
| 164 | + if (! $engine->callParser($crrnt, $text, $retChildren, $replaceStr)) { |
| 165 | + return FALSE; // NOTE: earlier rules in the sequence may already have consumed part of $text |
| 166 | + } |
| 167 | + if ($i == 0 && $this->mSaveStr && isset($retChildren[0]) && $retChildren[0] instanceof DOMText) { |
| 168 | + $replaceStr = $retChildren[0]->wholeText; |
| 169 | + } else { |
| 170 | + $children = array_merge($children, $retChildren); |
| 171 | + } |
| 172 | + } |
| 173 | + return TRUE; |
| 174 | + } |
| 175 | +} |
| 176 | + |
| 177 | +/** |
| 178 | + * Creates a DOM element named mName and appends to it the nodes produced by the rule named mChildRule |
| 179 | + * mName - The tag name to give the resultant DOM element |
| 180 | + * mAttrName - name of an attribute to add to the element (optional) |
| 181 | + * mAttrValue - value of the attribute; the attribute is only set when both name and value are given |
| 182 | + */ |
| 183 | +class ParseAssign implements ParseObject { |
| 184 | + private $mName, $mChildRule, $mAttrName, $mAttrValue; |
| 185 | + |
| 186 | + function __construct($name, $childRule, $attrName = NULL, $attrValue = NULL) { |
| 187 | + $this->mName = $name; |
| 188 | + $this->mChildRule = $childRule; |
| 189 | + $this->mAttrName = $attrName; |
| 190 | + $this->mAttrValue = $attrValue; |
| 191 | + } |
| 192 | + |
| 193 | + function parse(&$text, &$engine, &$dom, &$children, $replaceStr) { |
| 194 | + if (! $engine->callParser($this->mChildRule, $text, $retChildren, $replaceStr)) { |
| 195 | + return FALSE; |
| 196 | + } |
| 197 | + $retNode = $dom->createElement($this->mName); |
| 198 | + foreach ($retChildren as $child) { |
| 199 | + $retNode->appendChild($child); |
| 200 | + } |
| 201 | + if ($this->mAttrName != NULL && $this->mAttrValue != NULL) { |
| 202 | + $retNode->setAttribute($this->mAttrName, $this->mAttrValue); |
| 203 | + } |
| 204 | + $children = array($retNode); |
| 205 | + return TRUE; |
| 206 | + } |
| 207 | +} |
| 208 | + |
Property changes on: branches/parser-work/phase3/includes/parser/ParseEngine.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 209 | + native |
Index: branches/parser-work/phase3/includes/parser/Preprocessor_DOM.php |
— | — | @@ -57,7 +57,6 @@ |
58 | 58 | * change in the DOM tree for a given text, must be passed through the section identifier |
59 | 59 | * in the section edit link and thus back to extractSections(). |
60 | 60 | * |
61 | | - * Temporarily removed the cache because the parser now parses straight to DOM |
62 | 61 | * The output of this function is currently only cached in process memory, but a persistent |
63 | 62 | * cache may be implemented at a later date which takes further advantage of these strict |
64 | 63 | * dependency requirements. |
— | — | @@ -66,90 +65,171 @@ |
67 | 66 | */ |
68 | 67 | function preprocessToObj( $text, $flags = 0 ) { |
69 | 68 | wfProfileIn( __METHOD__ ); |
| 69 | + global $wgMemc, $wgPreprocessorCacheThreshold; |
70 | 70 | |
71 | | - // To XML |
72 | | - $xmlishRegex = implode('|', $this->parser->getStripList()); |
73 | | - $rules = array( |
74 | | - "Template" => new ParsePattern("template", '/^{{(?!{[^{])/s', "TemplateSeq", '}}'), |
75 | | - "TplArg" => new ParsePattern("tplarg", '/^{{{/s', "TemplateSeq", '}}}'), |
76 | | - "TplPart" => new ParsePattern("part", '/^\|/s', "TplPartList"), |
77 | | - "Link" => new ParsePattern("link", '/^\[\[/s', "MainQuant", ']]'), |
78 | | - "NewLine" => new ParsePattern("newline", '/^\n/s', "NewLineChoice"), |
79 | | - "Heading" => new ParsePattern("h", '/^={1,6}/s', "HeadingQuant", '~0'), |
80 | | - "CommentLine" => new ParsePattern("commentline", '/^(?:<!--.*?-->\n)+/s'), |
81 | | - "XmlExt" => new ParsePattern("ext", '/^<(?=(' . $xmlishRegex . '))/si', "XmlExtSeq", '~1'), |
82 | | - "Comment" => new ParsePattern("comment", '/^<!--.*?(?:-->|$)/s'), |
83 | | - "OnlyInclude" => new ParsePattern("ignore", '/^<\/?onlyinclude>/s'), |
84 | | - "NoInclude" => new ParsePattern("ignore", '/^<\/?noinclude>/s'), |
85 | | - "IncludeOnly" => new ParsePattern("ignore", '/^<includeonly>.*?<\/includeonly>/s'), |
86 | | - "MainText" => new ParsePattern("text", '/^.[^{}\[\]<\n|=]*/s'), |
87 | | - "XmlName" => new ParsePattern("name", '/^.*?(?= |\/>|>)/s'), |
88 | | - "XmlAttr" => new ParsePattern("attr", '/^.*?(?=\/>|>)/s'), |
89 | | - "XmlClosed" => new ParsePattern("unUsed", '/^\/>/si'), |
90 | | - "XmlOpened" => new ParsePattern("unUsed", '/^>/si'), |
91 | | - "XmlInner" => new ParsePattern("inner", '/^.*?(?=<\/~r>|$)/si'), |
92 | | - "XmlCloseTag" => new ParsePattern("close", '/^<\/~r>/si'), |
93 | | - "StartQuant" => new ParseQuant("unnamed", "MainChoice", '/^$/'), |
94 | | - "BOFQuant" => new ParseQuant("unnamed", "NewLineChoice", NULL, 0, 1), |
95 | | - "MainQuant" => new ParseQuant("unnamed", "MainChoice", '/^~r/s'), |
96 | | - "HeadingQuant" => new ParseQuant("unnamed", "MainChoice", '/^~r(?=(?: *<!--.*?-->)*(?:\n|$))/s'), |
97 | | - "TplTitle" => new ParseQuant("title", "MainChoice", '/^(?=~r|\|)/s'), |
98 | | - "TplPartQuant" => new ParseQuant("unnamed", "TplPart", '/^~r/s'), |
99 | | - "TplTest" => new ParseQuant("unnamed", "MainChoice", '/^(?=~r|\||=(?!~r|\|))/s'), |
100 | | - "TplName" => new ParseQuant("name", "TplTest", '/^=/s', 0, 1), |
101 | | - "TplValue" => new ParseQuant("value", "MainChoice", '/^(?=~r|\|)/s'), |
102 | | - "XmlCloseQuant" => new ParseQuant("unnamed", "XmlCloseTag", NULL, 0, 1), |
103 | | - "MainChoice" => new ParseChoice("unnamed", array("CurlyChoice", "XmlChoice", "NewLine", "Link", "MainText")), |
104 | | - "CurlyChoice" => new ParseChoice("unnamed", array("Template", "TplArg"), "{"), |
105 | | - "XmlChoice" => new ParseChoice("unnamed", array("Comment", "OnlyInclude", "NoInclude", "IncludeOnly", "XmlExt"), "<"), |
106 | | - "NewLineChoice" => new ParseChoice("unnamed", array("Heading", "CommentLine")), |
107 | | - "TplPartList" => new ParseChoice("unnamed", array("TplPartSeq", "TplValue")), |
108 | | - "XmlClose" => new ParseChoice("unnamed", array("XmlClosed", "XmlOpenedSeq")), |
109 | | - "StartSeq" => new ParseSeq("root", array("BOFQuant", "StartQuant")), |
110 | | - "TemplateSeq" => new ParseSeq("unnamed", array("TplTitle", "TplPartQuant")), |
111 | | - "TplPartSeq" => new ParseSeq("unnamed", array("TplName", "TplValue")), |
112 | | - "XmlExtSeq" => new ParseSeq("unnamed", array("XmlName", "XmlAttr", "XmlClose")), |
113 | | - "XmlOpenedSeq" => new ParseSeq("unnamed", array("XmlOpened", "XmlInner", "XmlCloseQuant"))); |
114 | | - if ($flags & Parser::PTD_FOR_INCLUSION) { |
115 | | - $rules["BOFQuant"] = new ParseQuant("unnamed", "StartChoice", NULL, 0, 1); |
116 | | - $rules["StartChoice"] = new ParseChoice("unnamed", array("OnlyIncludeBOF", "NewLineChoice")); |
117 | | - $rules["OnlyIncludeBOF"] = new ParsePattern("ignore", '/^.*?<onlyinclude>/s'); |
118 | | - $rules["OnlyInclude"] = new ParsePattern("ignore", '/^<\/onlyinclude>.*?(?:<onlyinclude>|$)/s'); |
119 | | - $rules["NoInclude"] = new ParsePattern("ignore", '/^<noinclude>.*?<\/noinclude>/s'); |
120 | | - $rules["IncludeOnly"] = new ParsePattern("ignore", '/^<\/?includeonly>/s'); |
| 71 | + $xml = false; |
| 72 | + $cacheable = strlen( $text ) > $wgPreprocessorCacheThreshold; |
| 73 | + if ( $cacheable ) { |
| 74 | + wfProfileIn( __METHOD__.'-cacheable' ); |
| 75 | + |
| 76 | + $cacheKey = wfMemcKey( 'preprocess-xml', md5($text), $flags ); |
| 77 | + $cacheValue = $wgMemc->get( $cacheKey ); |
| 78 | + if ( $cacheValue ) { |
| 79 | + $version = substr( $cacheValue, 0, 8 ); |
| 80 | + if ( intval( $version ) == self::CACHE_VERSION ) { |
| 81 | + $xml = substr( $cacheValue, 8 ); |
| 82 | + // From the cache |
| 83 | + wfDebugLog( "Preprocessor", "Loaded preprocessor XML from memcached (key $cacheKey)" ); |
| 84 | + } |
| 85 | + } |
121 | 86 | } |
122 | | - |
123 | | - $parseTree = $rules["StartSeq"]->parse($text, $rules); |
124 | | - $xml = $parseTree->printTree(); |
125 | | - |
126 | | - // To DOM |
127 | | - wfProfileIn( __METHOD__.'-loadXML' ); |
128 | | - $dom = new DOMDocument; |
129 | | - wfSuppressWarnings(); |
130 | | - $result = $dom->loadXML( $xml ); |
131 | | - wfRestoreWarnings(); |
132 | | - if ( !$result ) { |
133 | | - // Try running the XML through UtfNormal to get rid of invalid characters |
134 | | - $xml = UtfNormal::cleanUp( $xml ); |
| 87 | + $dom = false; |
| 88 | + if ( $xml === false ) { |
| 89 | + if ( $cacheable ) { |
| 90 | + wfProfileIn( __METHOD__.'-cache-miss' ); |
| 91 | + } |
| 92 | + $dom = $this->preprocessToDom( $text, $flags ); |
| 93 | + if ( $cacheable ) { |
| 94 | + $cacheValue = sprintf( "%08d", self::CACHE_VERSION ) . $dom->saveXML(); |
| 95 | + $wgMemc->set( $cacheKey, $cacheValue, 86400 ); |
| 96 | + wfProfileOut( __METHOD__.'-cache-miss' ); |
| 97 | + wfDebugLog( "Preprocessor", "Saved preprocessor XML to memcached (key $cacheKey)" ); |
| 98 | + } |
| 99 | + } else { |
| 100 | + wfProfileIn( __METHOD__.'-loadXML' ); |
| 101 | + $dom = new DOMDocument; |
| 102 | + wfSuppressWarnings(); |
135 | 103 | $result = $dom->loadXML( $xml ); |
| 104 | + wfRestoreWarnings(); |
136 | 105 | if ( !$result ) { |
137 | | - throw new MWException( __METHOD__.' generated invalid XML' ); |
| 106 | + // Try running the XML through UtfNormal to get rid of invalid characters |
| 107 | + $xml = UtfNormal::cleanUp( $xml ); |
| 108 | + $result = $dom->loadXML( $xml ); |
| 109 | + if ( !$result ) { |
| 110 | + throw new MWException( __METHOD__.' generated invalid XML' ); |
| 111 | + } |
138 | 112 | } |
| 113 | + wfProfileOut( __METHOD__.'-loadXML' ); |
139 | 114 | } |
140 | | - $this->transformDOM($dom); |
141 | | - |
142 | | - // To Obj |
143 | 115 | $obj = new PPNode_DOM( $dom->documentElement ); |
144 | | - |
| 116 | + if ( $cacheable ) { |
| 117 | + wfProfileOut( __METHOD__.'-cacheable' ); |
| 118 | + } |
145 | 119 | wfProfileOut( __METHOD__ ); |
146 | 120 | return $obj; |
147 | 121 | } |
148 | 122 | |
| 123 | + // Set up parser data for wikitext then feed the given text to the parser: builds the table of named rules, parses $text with a ParseEngine starting at "Root", runs transformDOM() on the result and returns the DOMDocument |
| 124 | + private function preprocessToDom(&$text, $flags = 0) { |
| 125 | + wfProfileIn( __METHOD__ ); |
| 126 | + |
| 127 | + $xmlishRegex = implode('|', $this->parser->getStripList()); // alternation of the names from getStripList(), used by XmlExtPat below |
| 128 | + $rules = array( |
| 129 | + "Root" => new ParseAssign("root", "StartSeq"), |
| 130 | + "StartSeq" => new ParseSeq(array("BOFQuant", "MainQuant"), '$'), |
| 131 | + "BOFQuant" => new ParseQuant("HeadingChoice", 0, 1), |
| 132 | + "DefaultPat" => new ParsePattern('/^~r/s'), |
| 133 | + "SavedPat" => new ParsePattern('/^(~r)/s'), |
| 134 | + "MainQuant" => new ParseQuant("MainChoice"), |
| 135 | + "MainChoice"=> new ParseChoice(array("CurlyChoice", "XmlChoice", "NewLineSeq", "LinkSeq", "MainText")), |
| 136 | + "CurlyChoice" => new ParseChoice(array("Template", "TplArg"), "{"), |
| 137 | + "Template" => new ParseAssign("template", "TplSeq"), |
| 138 | + "TplSeq" => new ParseSeq(array("TemplatePat", "TemplateSeq"), '}}'), |
| 139 | + "TemplatePat" => new ParsePattern('/^{{(?!{[^{])/s'), |
| 140 | + "TplArg" => new ParseAssign("tplarg", "TplArgSeq"), |
| 141 | + "TplArgSeq" => new ParseSeq(array("TplArgPat", "TemplateSeq"), '}}}'), |
| 142 | + "TplArgPat" => new ParsePattern('/^{{{/s'), |
| 143 | + "XmlChoice" => new ParseChoice(array("Comment", "OnlyInclude", "NoInclude", "IncludeOnly", "XmlExt"), "<"), |
| 144 | + "Comment" => new ParseAssign("comment", "CommentPat"), |
| 145 | + "CommentPat" => new ParsePattern('/^(<!--.*?(?:-->|$))/s'), |
| 146 | + "OnlyInclude" => new ParseAssign("ignore", "OnlyIncludePat"), |
| 147 | + "OnlyIncludePat" => new ParsePattern('/^(<\/?onlyinclude>)/s'), |
| 148 | + "NoInclude" => new ParseAssign("ignore", "NoIncludePat"), |
| 149 | + "NoIncludePat" => new ParsePattern('/^(<\/?noinclude>)/s'), |
| 150 | + "IncludeOnly" => new ParseAssign("ignore", "IncludeOnlyPat"), |
| 151 | + "IncludeOnlyPat" => new ParsePattern('/^(<includeonly>.*?<\/includeonly>)/s'), |
| 152 | + "XmlExt" => new ParseAssign("ext", "XmlExtSeq"), |
| 153 | + "XmlExtSeq" => new ParseSeq(array("XmlExtPat", "XmlName", "XmlAttr", "XmlClose"), NULL, TRUE), |
| 154 | + "XmlExtPat" => new ParsePattern('/^<(?=(' . $xmlishRegex . '))/si'), |
| 155 | + "XmlName" => new ParseAssign("name", "SavedPat"), |
| 156 | + "XmlAttr" => new ParseAssign("attr", "XmlAttrPat"), |
| 157 | + "XmlAttrPat" => new ParsePattern('/^(.*?)(?=\/>|>)/s'), |
| 158 | + "XmlClose" => new ParseChoice(array("XmlClosed", "XmlOpenedSeq")), |
| 159 | + "XmlClosed" => new ParsePattern('/^\/>/s'), |
| 160 | + "XmlOpenedSeq" => new ParseSeq(array("XmlOpened", "XmlInner", "XmlCloseQuant")), |
| 161 | + "XmlOpened" => new ParsePattern('/^>/s'), |
| 162 | + "XmlInner" => new ParseAssign("inner", "XmlInnerPat"), |
| 163 | + "XmlInnerPat" => new ParsePattern('/^(.*?)(?=<\/~r>|$)/si'), |
| 164 | + "XmlCloseQuant" => new ParseQuant("XmlCloseTag", 0, 1), |
| 165 | + "XmlCloseTag" => new ParseAssign("close", "XmlClosePat"), |
| 166 | + "XmlClosePat" => new ParsePattern('/^(<\/~r>)/si'), |
| 167 | + "NewLineSeq" => new ParseSeq(array("NewLine", "NewLineChoice")), |
| 168 | + "NewLine" => new ParsePattern('/^(\n)/s'), |
| 169 | + "NewLineChoice" => new ParseChoice(array("HeadingChoice", "CommentLine")), |
| 170 | + "CommentLine" => new ParseAssign("comment", "CommentLinePat"), |
| 171 | + "CommentLinePat" => new ParsePattern('/^((?:<!--.*?-->\n)+)/s'), |
| 172 | + "LinkSeq" => new ParseSeq(array("Link", "MainQuant", "SavedPat"), ']]'), |
| 173 | + "Link" => new ParsePattern('/^(\[\[)/s'), |
| 174 | + "MainText" => new ParsePattern('/^(?!~r)(.[^{}\[\]<\n|=]*)/s'), |
| 175 | + "HeadingChoice" => new ParseChoice(array("Heading6", "Heading5", "Heading4", "Heading3", "Heading2", "Heading1"), "="), |
| 176 | + "Heading6" => new ParseAssign("h", "Heading6Seq", "level", "6"), |
| 177 | + "Heading6Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '======'), |
| 178 | + "Heading5" => new ParseAssign("h", "Heading5Seq", "level", "5"), |
| 179 | + "Heading5Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '====='), |
| 180 | + "Heading4" => new ParseAssign("h", "Heading4Seq", "level", "4"), |
| 181 | + "Heading4Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '===='), |
| 182 | + "Heading3" => new ParseAssign("h", "Heading3Seq", "level", "3"), |
| 183 | + "Heading3Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '==='), |
| 184 | + "Heading2" => new ParseAssign("h", "Heading2Seq", "level", "2"), |
| 185 | + "Heading2Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '=='), |
| 186 | + "Heading1" => new ParseAssign("h", "Heading1Seq", "level", "1"), |
| 187 | + "Heading1Seq" => new ParseSeq(array("DefaultPat", "HeadingSeq"), '='), |
| 188 | + "HeadingSeq" => new ParseSeq(array("MainQuant", "DefaultPat"), '~r(?=(?: *<!--.*?-->)*(?:\n|$))'), |
| 189 | + "TemplateSeq" => new ParseSeq(array("TplTitle", "TplPartQuant", "DefaultPat"), '~r|\|'), |
| 190 | + "TplTitle" => new ParseAssign("title", "MainQuant"), |
| 191 | + "TplPartQuant" => new ParseQuant("TplPart"), |
| 192 | + "TplPart" => new ParseAssign("part", "TplPartSeq"), |
| 193 | + "TplPartSeq" => new ParseSeq(array("TplPipe", "TplPartList")), |
| 194 | + "TplPipe" => new ParsePattern('/^\|/s'), |
| 195 | + "TplPartList" => new ParseChoice(array("NamedPartSeq", "TplValue")), |
| 196 | + "NamedPartSeq" => new ParseSeq(array("TplName", "TplValue")), |
| 197 | + "TplName" => new ParseAssign("name", "TplNameSeq"), |
| 198 | + "TplNameSeq" => new ParseSeq(array("MainQuant", "TplEquals"), '~r|\||=(?!~r|\|)'), |
| 199 | + "TplEquals" => new ParsePattern('/^=/s'), |
| 200 | + "TplValue" => new ParseAssign("value", "MainQuant")); |
| 201 | + if ($flags & Parser::PTD_FOR_INCLUSION) { // with this flag set, the start-of-text and include-tag rules below replace the defaults |
| 202 | + $rules["BOFQuant"] = new ParseQuant("StartChoice", 0, 1); |
| 203 | + $rules["StartChoice"] = new ParseChoice(array("OnlyIncludeBOF", "HeadingChoice")); |
| 204 | + $rules["OnlyIncludeBOF"] = new ParseAssign("ignore", "OnlyIncludeBOFPat"); |
| 205 | + $rules["OnlyIncludeBOFPat"] = new ParsePattern('/^(.*?<onlyinclude>)/s'); |
| 206 | + $rules["OnlyIncludePat"] = new ParsePattern('/^(<\/onlyinclude>.*?(?:<onlyinclude>|$))/s'); |
| 207 | + $rules["NoIncludePat"] = new ParsePattern('/^(<noinclude>.*?<\/noinclude>)/s'); |
| 208 | + $rules["IncludeOnlyPat"] = new ParsePattern('/^(<\/?includeonly>)/s'); |
| 209 | + } |
| 210 | + $parser = new ParseEngine($rules, "Root"); // "Root" is the start rule |
| 211 | + |
| 212 | + $dom = $parser->parse($text); |
| 213 | + $this->transformDOM($dom); |
| 214 | + |
| 215 | + wfProfileOut( __METHOD__ ); |
| 216 | + return $dom; |
| 217 | + } |
| 218 | + |
149 | 219 | // Temporary function to add needed redundant info to the parse tree after parsing. |
150 | 220 | private function transformDOM(&$node, &$headingInd = 1) { |
151 | 221 | if ($node->hasChildNodes()) { |
152 | 222 | if ($node->nodeName == "h") { |
153 | | - $node->setAttribute("level", strspn($node->firstChild->wholeText, "=", 0, 6 )); |
| 223 | + $headerTag = str_repeat("=", $node->getAttribute("level")); |
| 224 | + if ($node->firstChild instanceof DOMText) { |
| 225 | + $node->firstChild->insertData(0, $headerTag); |
| 226 | + } else { |
| 227 | + $node->insertBefore($node->ownerDocument->createTextNode($headerTag), $crrnt->firstChild); |
| 228 | + } |
| 229 | + if ($node->lastChild instanceof DOMText) { |
| 230 | + $node->lastChild->appendData($headerTag); |
| 231 | + } else { |
| 232 | + $node->appendChild($node->ownerDocument->createTextNode($headerTag)); |
| 233 | + } |
154 | 234 | $node->setAttribute("i", $headingInd); |
155 | 235 | $headingInd ++; |
156 | 236 | } elseif ($node->nodeName == "template" && $node->previousSibling instanceof DOMText) { |
Index: branches/parser-work/phase3/includes/AutoLoader.php |
— | — | @@ -460,12 +460,13 @@ |
461 | 461 | 'PPNode_Hash_Tree' => 'includes/parser/Preprocessor_Hash.php', |
462 | 462 | 'PPTemplateFrame_DOM' => 'includes/parser/Preprocessor_DOM.php', |
463 | 463 | 'PPTemplateFrame_Hash' => 'includes/parser/Preprocessor_Hash.php', |
464 | | - 'ParseChoice' => 'includes/parser/ParseTree.php', |
465 | | - 'ParseObject' => 'includes/parser/ParseTree.php', |
466 | | - 'ParsePattern' => 'includes/parser/ParseTree.php', |
467 | | - 'ParseQuant' => 'includes/parser/ParseTree.php', |
468 | | - 'ParseSeq' => 'includes/parser/ParseTree.php', |
469 | | - 'ParseTree' => 'includes/parser/ParseTree.php', |
| 464 | + 'ParseAssign' => 'includes/parser/ParseEngine.php', |
| 465 | + 'ParseChoice' => 'includes/parser/ParseEngine.php', |
| 466 | + 'ParseEngine' => 'includes/parser/ParseEngine.php', |
| 467 | + 'ParseObject' => 'includes/parser/ParseEngine.php', |
| 468 | + 'ParsePattern' => 'includes/parser/ParseEngine.php', |
| 469 | + 'ParseQuant' => 'includes/parser/ParseEngine.php', |
| 470 | + 'ParseSeq' => 'includes/parser/ParseEngine.php', |
470 | 471 | 'Parser' => 'includes/parser/Parser.php', |
471 | 472 | 'ParserCache' => 'includes/parser/ParserCache.php', |
472 | 473 | 'ParserOptions' => 'includes/parser/ParserOptions.php', |
Index: branches/parser-work/phase3/includes/DefaultSettings.php |
— | — | @@ -1232,6 +1232,11 @@ |
1233 | 1233 | $wgDebugPrintHttpHeaders = true; |
1234 | 1234 | |
1235 | 1235 | /** |
| 1236 | + * Log file for debugging the parser; if not set, no log is created |
| 1237 | + */ |
| 1238 | +$wgDebugParserLog = ''; |
| 1239 | + |
| 1240 | +/** |
1236 | 1241 | * Show the contents of $wgHooks in Special:Version |
1237 | 1242 | */ |
1238 | 1243 | $wgSpecialVersionShowHooks = false; |