Index: trunk/extensions/ParseEngine/ParseEngine.body.php |
— | — | @@ -0,0 +1,236 @@ |
| 2 | +<?php |
/**
 * Acts as the primary interface between the world and the parser.
 *
 * A grammar is an XML document named "<grammarName>.xml" stored in
 * $IP/extensions/ParseEngine. Its root element carries a "rootTag" attribute
 * (name of the output root element) and a "startRule" attribute (name of the
 * top-level rule to begin with). Rules are built from five element kinds:
 *
 *  - Assignment: matches the "tag" attribute (a literal, or a regular
 *    expression when the "regex" attribute is set) at the current position and
 *    emits an output element named after "tagName"; the matched text is stored
 *    in the output element's "tag" attribute.
 *  - Sequence:   every child must match, in order; otherwise nothing is kept.
 *  - Choice:     the first matching child wins; with "failSafe" set the choice
 *    succeeds even when no child matches.
 *  - Reference:  jumps to the top-level rule whose "name" attribute matches.
 *  - Text:       consumes plain text, repeatedly trying its first child, until
 *    one of the terminating tags is reached.
 *
 * "~r" inside a tag (or a Reference's "var" attribute) is replaced by the
 * current replacement string, which is the first capture group of the most
 * recent regex Assignment or the "var" of the most recent Reference.
 *
 * mGrammars - cache of loaded and annotated grammar documents, by grammar name
 */
class ParseEngine {
	/** Maximum number of rule visits in one parse or one tag collection; guards against looping grammars. */
	const maxIter = 2048;

	/** Grammar name => annotated DOMDocument. */
	private $mGrammars;

	public function __construct() {
		$this->mGrammars = array();
	}

	/**
	 * Parses $text with the named grammar and replaces it with the serialised
	 * XML parse tree.
	 *
	 * $text is left untouched when the grammar cannot be loaded, when its start
	 * rule does not exist, or when the start rule does not match.
	 *
	 * @param $grammarName string Base name of the grammar file (without ".xml")
	 * @param $text string Text to parse; receives the XML on success
	 * @return bool Always TRUE
	 * @throws MWException when the rule visit limit is exceeded or a Reference
	 *         names a rule that does not exist
	 */
	public function parse($grammarName, &$text) {
		global $IP;
		wfDebugLog("ParseEngine", "==========Start Parse Engine==========\n");
		$grammar = isset($this->mGrammars[$grammarName]) ? $this->mGrammars[$grammarName] : NULL;
		if ($grammar === NULL) {
			$grammar = new DOMDocument();
			if (! $grammar->load("$IP/extensions/ParseEngine/$grammarName.xml", LIBXML_NOBLANKS)) {
				wfDebugLog("ParseEngine", "Unable to load grammar '$grammarName'\n");
				return TRUE;
			}
			// The rule walkers treat every child node as a rule element, so
			// comments have to go before the grammar is annotated.
			$xpath = new DOMXPath($grammar);
			foreach ($xpath->query("//comment()") as $comment) {
				$comment->parentNode->removeChild($comment);
			}
			$this->pushTags($grammar->documentElement, NULL);
			$this->mGrammars[$grammarName] = $grammar;
		}
		$grammarRoot = $grammar->documentElement;
		$startName = $grammarRoot->getAttribute("startRule");
		$startRule = $this->findRule($grammar, $startName);
		if ($startRule === NULL) {
			wfDebugLog("ParseEngine", "Start rule '$startName' not found in grammar '$grammarName'\n");
			return TRUE;
		}
		$doc = new DOMDocument();
		$rootTag = $doc->createElement($grammarRoot->getAttribute("rootTag"));
		$iter = 0;
		// Work on a copy so that a failed parse leaves the caller's text alone.
		$refText = $text;
		if (! $this->parseRec($startRule, "", "", $iter, $refText, $rootTag)) {
			return TRUE;
		}
		$doc->appendChild($rootTag);
		$text = $doc->saveXML();
		wfDebugLog("ParseEngine", "Parsed text - $text\n");
		return TRUE;
	}

	/**
	 * Rebuilds the original text from parsed nodes by concatenating text nodes
	 * and the "tag" attribute of every element, depth first.
	 *
	 * @param $inNodes DOMNodeList|array Nodes of a parse tree
	 * @return string
	 */
	public static function unparse($inNodes) {
		$retStr = "";
		foreach ($inNodes as $child) {
			if ($child instanceof DOMText) {
				$retStr .= $child->data;
			} elseif ($child instanceof DOMElement) {
				$retStr .= $child->getAttribute("tag") . self::unparse($child->childNodes);
			}
		}
		return $retStr;
	}

	/**
	 * Applies one grammar rule at the start of $text.
	 *
	 * @param $rule DOMElement Grammar rule to apply
	 * @param $replaceStr string Current substitution for "~r"
	 * @param $saveTags string Regex alternatives that terminate enclosing text
	 * @param $iter int Running count of rule visits
	 * @param $text string Remaining input; consumed on success
	 * @param $outNode DOMElement Output node that receives the parsed children
	 * @return bool Whether the rule matched
	 * @throws MWException when the rule visit limit is exceeded
	 */
	private function parseRec($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
		wfDebugLog("ParseEngine", "Entering {$rule->nodeName}, {$rule->getAttribute("name")}\n");
		$iter ++;
		if ($iter > ParseEngine::maxIter) {
			throw new MWException("Parser iterated too many times. Probable loop in grammar.");
		}
		$kind = $rule->nodeName;
		if ($kind == "Assignment" || $kind == "Reference" || $kind == "Text") {
			// Resolve "~r" in the inherited terminators, then add the ones
			// pushTags() stored on this rule.
			$saveTags = str_replace("~r", preg_quote($replaceStr, "/"), $saveTags);
			$saveTags = self::joinTags($saveTags, $rule->getAttribute("saveTags"));
		}
		switch ($kind) {
			case "Assignment":
				$retCode = $this->parseAssignment($rule, $replaceStr, $saveTags, $iter, $text, $outNode);
				break;
			case "Sequence":
				$retCode = $this->parseSequence($rule, $replaceStr, $saveTags, $iter, $text, $outNode);
				break;
			case "Choice":
				$retCode = $this->parseChoice($rule, $replaceStr, $saveTags, $iter, $text, $outNode);
				break;
			case "Reference":
				$retCode = $this->parseReference($rule, $replaceStr, $saveTags, $iter, $text, $outNode);
				break;
			case "Text":
				$retCode = $this->parseText($rule, $replaceStr, $saveTags, $iter, $text, $outNode);
				break;
			default:
				$retCode = FALSE;
		}
		wfDebugLog("ParseEngine", "Exiting {$rule->nodeName}, Return Code - $retCode\n");
		wfDebugLog("ParseEngine", "Text - $text\n");
		return $retCode;
	}

	/** Matches an Assignment's tag at the start of $text and emits its element. */
	private function parseAssignment($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
		$tag = $rule->getAttribute("tag");
		// A rule without a tag always matches and consumes nothing itself.
		$foundTag = $tag == NULL;
		if (! $foundTag) {
			if ($rule->getAttribute("regex") != NULL) {
				$tag = str_replace("~r", preg_quote($replaceStr, "/"), $tag);
				// Group the tag so that "^" anchors every alternative, not
				// just the first one; (?:) keeps capture numbering intact.
				$foundTag = preg_match("/^(?:$tag)/s", $text, $matches);
				if ($foundTag) {
					$tag = $matches[0];
					if (isset($matches[1])) {
						$replaceStr = $matches[1];
					}
				}
			} else {
				$tag = str_replace("~r", $replaceStr, $tag);
				$foundTag = strncmp($tag, $text, strlen($tag)) == 0;
			}
		}
		if (! $foundTag) {
			return FALSE;
		}
		$newText = $text;
		$newElement = $outNode->ownerDocument->createElement($rule->getAttribute("tagName"));
		if ($tag != NULL) {
			$newText = (string)substr($newText, strlen($tag));
			$newElement->setAttribute("tag", $tag);
		}
		$retCode = $rule->firstChild == NULL || $this->parseRec($rule->firstChild, $replaceStr, $saveTags, $iter, $newText, $newElement);
		if ($retCode) {
			// Commit only once the whole assignment matched.
			$outNode->appendChild($newElement);
			$text = $newText;
		}
		return $retCode;
	}

	/** Matches all children of a Sequence in order, undoing everything on failure. */
	private function parseSequence($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
		$saveText = $text;
		// Snapshot of the output so far; swapped back in if a child fails.
		$saveNode = $outNode->cloneNode(TRUE);
		$pushInd = (int)$rule->getAttribute("pushInd");
		$retCode = FALSE;
		foreach ($rule->childNodes as $i => $crrnt) {
			// Only children from index pushInd onwards inherit the terminators.
			$pushTags = $i >= $pushInd ? $saveTags : "";
			$retCode = $this->parseRec($crrnt, $replaceStr, $pushTags, $iter, $text, $outNode);
			if (! $retCode) {
				$text = $saveText;
				$outNode = $saveNode;
				break;
			}
		}
		return $retCode;
	}

	/** Tries the children of a Choice in order and stops at the first match. */
	private function parseChoice($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
		$matched = FALSE;
		foreach ($rule->childNodes as $crrnt) {
			if ($this->parseRec($crrnt, $replaceStr, $saveTags, $iter, $text, $outNode)) {
				$matched = TRUE;
				break;
			}
		}
		// A failSafe choice is optional: it succeeds even with no match.
		return $matched || $rule->getAttribute("failSafe") != NULL;
	}

	/** Follows a Reference to the named top-level rule. */
	private function parseReference($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
		$newVar = $rule->hasAttribute("var") ? str_replace("~r", $replaceStr, $rule->getAttribute("var")) : $replaceStr;
		$refRule = $this->requireRule($rule);
		return $this->parseRec($refRule, $newVar, $saveTags, $iter, $text, $outNode);
	}

	/**
	 * Consumes text until a terminating tag ($saveTags) is at the current
	 * position or the input ends. Always succeeds.
	 */
	private function parseText($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
		$dom = $outNode->ownerDocument;
		// Plain text runs up to the next tag the child could start with, or
		// the next terminator, whichever comes first.
		$tagSearch = self::joinTags($rule->getAttribute("childTags"), $saveTags);
		while ($text != "" && ($saveTags == "" || ! preg_match("/^($saveTags)/s", $text))) {
			// If the child did not match here, the current character is plain
			// text, so the search for the next tag starts one character on.
			$offset = $rule->firstChild != NULL && $this->parseRec($rule->firstChild, $replaceStr, "", $iter, $text, $outNode) ? 0 : 1;
			if (preg_match("/$tagSearch/s", $text, $matches, PREG_OFFSET_CAPTURE, $offset)) {
				$tagPos = $matches[0][1];
				if ($tagPos > 0) {
					$outNode->appendChild($dom->createTextNode(substr($text, 0, $tagPos)));
					$text = (string)substr($text, $tagPos);
				}
			} else {
				// No further tag: the rest of the input is plain text.
				$outNode->appendChild($dom->createTextNode($text));
				$text = "";
			}
		}
		return TRUE;
	}

	/**
	 * Annotates a freshly loaded grammar: stores on each rule the tags that may
	 * follow it ("saveTags"), on each Text rule the tags its children may start
	 * with ("childTags"), and on each Sequence the index of the first child
	 * that inherits the enclosing terminators ("pushInd").
	 *
	 * @param $rule DOMElement Rule (or grammar root) to annotate
	 * @param $tagStr string|null Tags that may follow $rule
	 */
	private function pushTags($rule, $tagStr) {
		if ($rule->nodeName == "Sequence") {
			$pushInd = $rule->childNodes->length - 1;
			$shouldPush = true;
			// Walk backwards so each child learns what can follow it.
			for ($child = $rule->lastChild; $child !== NULL; $child = $child->previousSibling) {
				$this->pushTags($child, $tagStr);
				if ($child->previousSibling !== NULL) {
					$iter = 0;
					$childTag = "";
					if ($this->pullTags($child, $iter, $childTag)) {
						// The child may match nothing, so whatever follows it
						// may also follow its predecessor.
						if ($shouldPush) {
							$pushInd --;
						}
						$tagStr = self::joinTags($tagStr, $childTag);
					} else {
						$shouldPush = false;
						$tagStr = $childTag;
					}
				}
			}
			$rule->setAttribute("pushInd", (string)$pushInd);
		} else {
			if ($rule->nodeName != "Choice") {
				$rule->setAttribute("saveTags", (string)$tagStr);
				$tagStr = NULL;
				if ($rule->nodeName == "Text") {
					$childTags = "";
					foreach ($rule->childNodes as $crrnt) {
						$iter = 0;
						$childTag = "";
						$this->pullTags($crrnt, $iter, $childTag);
						// joinTags() skips empty entries, which would otherwise
						// become an alternative that matches everywhere.
						$childTags = self::joinTags($childTags, $childTag);
					}
					$rule->setAttribute("childTags", $childTags);
				}
			}
			foreach ($rule->childNodes as $crrnt) {
				$this->pushTags($crrnt, $tagStr);
			}
		}
	}

	/**
	 * Collects the tags a rule can start with.
	 *
	 * @param $rule DOMElement Rule to inspect
	 * @param $iter int Running count of rule visits
	 * @param $childTags string Receives the tags as regex alternatives
	 * @return bool Whether the rule can succeed without matching any tag
	 * @throws MWException when the rule visit limit is exceeded or a Reference
	 *         names a rule that does not exist
	 */
	private function pullTags($rule, &$iter, &$childTags) {
		$iter ++;
		if ($iter > ParseEngine::maxIter) {
			throw new MWException("Collecter iterated too many times. Probable loop in grammar.");
		}
		$childTags = "";
		$failSafe = TRUE;
		if ($rule->nodeName == "Assignment") {
			$childTags = $rule->getAttribute("tag");
			if ($rule->getAttribute("regex") == NULL) {
				// Literal tags end up inside a regex, so escape them.
				$childTags = preg_quote($childTags, "/");
			}
			$failSafe = FALSE;
		} elseif ($rule->nodeName == "Choice" || $rule->nodeName == "Sequence") {
			$isChoice = $rule->nodeName == "Choice";
			$failSafe = ! $isChoice;
			foreach ($rule->childNodes as $child) {
				$newTags = "";
				$failSafe = $this->pullTags($child, $iter, $newTags);
				$childTags = self::joinTags($childTags, $newTags);
				// A Choice stops at its first optional child, a Sequence at
				// its first mandatory one.
				if ($isChoice ? $failSafe : ! $failSafe) {
					break;
				}
			}
			$failSafe = $failSafe || ($isChoice && $rule->getAttribute("failSafe") != NULL);
		} elseif ($rule->nodeName == "Reference") {
			$failSafe = $this->pullTags($this->requireRule($rule), $iter, $childTags);
		}
		return $failSafe;
	}

	/**
	 * Joins two lists of regex alternatives with "|", skipping empty ones.
	 */
	private static function joinTags($first, $second) {
		if ($first == "") {
			return $second;
		}
		if ($second != "") {
			return $first . "|" . $second;
		}
		return $first;
	}

	/**
	 * Looks up a top-level rule of a grammar by its "name" attribute.
	 *
	 * @return DOMElement|null NULL when no such rule exists
	 */
	private function findRule($grammar, $name) {
		$xpath = new DOMXPath($grammar);
		$found = $xpath->query("/Grammar/*[@name='$name']");
		return $found ? $found->item(0) : NULL;
	}

	/**
	 * Resolves the rule a Reference element points to.
	 *
	 * @throws MWException when the referenced rule does not exist
	 */
	private function requireRule($reference) {
		$name = $reference->getAttribute("name");
		$target = $this->findRule($reference->ownerDocument, $name);
		if ($target === NULL) {
			throw new MWException("Grammar references unknown rule '$name'.");
		}
		return $target;
	}
}
| 237 | + |
Property changes on: trunk/extensions/ParseEngine/ParseEngine.body.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 238 | + native |
Index: trunk/extensions/ParseEngine/ParseEngine.php |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +<?php |
| 3 | +/** |
| 4 | + * Allows people to define a grammar in a wiki format then use that grammar to input information to the wiki |
| 5 | + * @file |
| 6 | + * @ingroup Extensions |
| 7 | + * @author Nathanael Thompson <than4213@gmail.com> |
| 8 | + * @copyright Copyright © 2009 Nathanael Thompson |
| 9 | + * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License |
| 10 | + */ |
if ( !defined( "MEDIAWIKI" ) ) {
	die( "This is not a valid entry point.\n" );
}

$wgExtensionCredits["other"][] = array(
	"path" => __FILE__,
	"name" => "ParseEngine",
	"author" => "Nathanael Thompson",
	"url" => "http://www.mediawiki.org/wiki/Extension:ParseEngine",
	"version" => "1.0",
	"descriptionmsg" => "parseengine-desc",
);

$dir = dirname( __FILE__ );
$wgAutoloadClasses["ParseEngine"] = "$dir/ParseEngine.body.php";

// The grammar name is bound into the hook at registration time, so
// $wgParseEngineGrammar has to be set before this file is included. Without
// it there is nothing to parse with: skip the hook rather than read an
// undefined variable and try to load a nameless grammar file on every save.
if ( isset( $wgParseEngineGrammar ) ) {
	$wgHooks["BeforePreSaveTransform"][] = array( new ParseEngine(), "parse", $wgParseEngineGrammar );
}
| 27 | + |
Property changes on: trunk/extensions/ParseEngine/ParseEngine.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 28 | + native |
Index: trunk/extensions/ParseEngine/WikiTextGrammar.xml |
— | — | @@ -0,0 +1,145 @@ |
<?xml version="1.0"?>
<!--
	Wikitext grammar for ParseEngine.

	Grammar attributes: rootTag names the root element of the parse tree,
	startRule names the rule parsing begins with.

	Rule elements:
	  Assignment  matches "tag" at the current position and emits an element
	              called "tagName". The tag is a literal unless regex="true".
	              Every Assignment needs a tagName, because the engine creates
	              an output element with that name.
	  Sequence    all children must match, in order.
	  Choice      first matching child wins; failSafe="true" makes it optional.
	  Reference   jumps to the top level rule with the given name; "var" sets
	              the replacement string for that rule.
	  Text        plain text, interleaved with matches of its child rule.

	"~r" stands for the current replacement string: the first capture group of
	the closest regex Assignment, or the "var" of the closest Reference.

	Literal (non regex) tags are compared character by character, so they must
	not contain regex escapes such as a backslash before a slash.

	Markup characters inside attribute values are written as entities.

	Comments are kept outside the Grammar element so that the engine only ever
	sees rule elements.
-->
<Grammar rootTag="root" startRule="start" version="1.0">
	<Sequence name="start" >
		<Reference name="postNewLine" />
		<Reference name="main" />
	</Sequence>
	<Text name="main">
		<Choice>
			<Sequence>
				<Reference name="newLine" />
				<Reference name="postNewLine" />
			</Sequence>
			<Assignment tagName="link" tag="[[">
				<Reference name="endText" var="]]" />
			</Assignment>
			<Assignment tagName="tplArg" tag="{{{(?!{)" regex="true">
				<Sequence>
					<Reference name="name" />
					<Choice failSafe="true">
						<Assignment tagName="default" tag="|">
							<Reference name="main" />
						</Assignment>
					</Choice>
					<Assignment tagName="endTag" tag="}}}" />
				</Sequence>
			</Assignment>
			<Assignment tagName="template" tag="{{">
				<Sequence>
					<Reference name="name" />
					<Choice failSafe="true">
						<Assignment tagName="name2" tag=":">
							<Reference name="main" />
						</Assignment>
					</Choice>
					<Reference name="partList" />
					<Assignment tagName="endTag" tag="}}" />
				</Sequence>
			</Assignment>
			<Reference name="comment" />
			<Assignment tagName="noWiki" tag="&lt;nowiki&gt;">
				<Sequence>
					<Text />
					<Assignment tagName="endTag" tag="&lt;/nowiki&gt;" />
				</Sequence>
			</Assignment>
			<Assignment tagName="xmlTag" tag="&lt;(?=(\w+)[\s\/&gt;])" regex="true">
				<Sequence>
					<Assignment tagName="name" tag="~r" />
					<Reference name="attrList" />
					<Choice>
						<Assignment tagName="endTag" tag="\s*\/&gt;" regex="true" />
						<Sequence>
							<Assignment tagName="inner" tag="\s*&gt;" regex="true">
								<Reference name="main" />
							</Assignment>
							<Assignment tagName="endTag" tag="&lt;/~r&gt;" />
						</Sequence>
					</Choice>
				</Sequence>
			</Assignment>
		</Choice>
	</Text>
	<Sequence name="endText">
		<Reference name="main" />
		<Assignment tagName="endTag" tag="~r" />
	</Sequence>
	<Assignment name="newLine" tagName="newLine" tag="\r?\n" regex="true" />
	<Assignment name="eol" tagName="eol" tag="(?=\n|$)" regex="true" />
	<Choice name="ignoreList" failSafe="true">
		<Sequence>
			<Choice>
				<Assignment tagName="ignore" tag="[ \t]+" regex="true" />
				<Reference name="comment" />
			</Choice>
			<Reference name="ignoreList" />
		</Sequence>
	</Choice>
	<Choice name="postNewLine" failSafe="true">
		<Sequence>
			<Assignment tagName="h" tag="(={1,6})" regex="true">
				<Reference name="endText" />
			</Assignment>
			<Reference name="ignoreList" />
			<Reference name="eol" />
		</Sequence>
		<Reference name="listChoice" var="" />
	</Choice>
	<Choice name="listChoice">
		<Assignment tagName="orderedList" tag="(?=(~r#))" regex="true">
			<Reference name="itemList" />
		</Assignment>
		<Assignment tagName="unorderedList" tag="(?=(~r\*))" regex="true">
			<Reference name="itemList" />
		</Assignment>
	</Choice>
	<Sequence name="itemList">
		<Choice>
			<Reference name="listChoice" />
			<Assignment tagName="listItem" tag="~r">
				<Sequence>
					<Reference name="main" />
					<Reference name="eol" />
				</Sequence>
			</Assignment>
		</Choice>
		<Choice failSafe="true">
			<Sequence>
				<Reference name="newLine" />
				<Reference name="itemList" />
			</Sequence>
		</Choice>
	</Sequence>
	<Assignment name="comment" tagName="comment" tag="&lt;!--.*?(?:--&gt;|$)" regex="true" />
	<Assignment name="name" tagName="name">
		<Reference name="main" />
	</Assignment>
	<Choice name="partList" failSafe="true">
		<Sequence>
			<Assignment tagName="part" tag="|">
				<Sequence>
					<Reference name="name" />
					<Choice failSafe="true">
						<Assignment tagName="value" tag="=">
							<Reference name="main" />
						</Assignment>
					</Choice>
				</Sequence>
			</Assignment>
			<Reference name="partList" />
		</Sequence>
	</Choice>
	<Choice name="attrList" failSafe="true">
		<Sequence>
			<Assignment tagName="attribute" tag="\s+(?!\/?&gt;)" regex="true">
				<Sequence>
					<Reference name="name" />
					<Assignment tagName="value" tag="\s*=\s*(&quot;|')" regex="true">
						<Reference name="endText" />
					</Assignment>
				</Sequence>
			</Assignment>
			<Reference name="attrList" />
		</Sequence>
	</Choice>
</Grammar>