r39065 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r39064‎ | r39065 | r39066 >
Date:12:53, 10 August 2008
Author:river
Status:old
Tags:
Comment:
improved string literal parser. now \xHH works as expected (only matches two digits), and \uHHHH, \UHHHHHHHH are supported (16/32-bit Unicode characters)
Modified paths:
  • /trunk/extensions/AbuseFilter/parser_native/ast.h (modified) (history)
  • /trunk/extensions/AbuseFilter/parser_native/parser.h (modified) (history)

Diff [purge]

Index: trunk/extensions/AbuseFilter/parser_native/parser.h
@@ -22,6 +22,7 @@
2323 #include <boost/spirit/core.hpp>
2424 #include <boost/spirit/utility/confix.hpp>
2525 #include <boost/spirit/utility/chset.hpp>
 26+#include <boost/spirit/utility/loops.hpp>
2627 #include <boost/spirit/tree/ast.hpp>
2728 #include <boost/spirit/tree/tree_to_xml.hpp>
2829 #include <boost/spirit/symbols.hpp>
@@ -211,13 +212,28 @@
212213 | string
213214 ;
214215
 216+ hexchar = chset<>("a-fA-F0-9")
 217+ ;
 218+
 219+ octchar = chset<>("0-7")
 220+ ;
 221+
 222+ c_string_char =
 223+ "\\x" >> hexchar >> hexchar
 224+ | "\\u" >> repeat_p(4)[hexchar]
 225+ | "\\U" >> repeat_p(8)[hexchar]
 226+ | "\\o" >> octchar >> octchar >> octchar
 227+ | "\\" >> anychar_p - (ch_p('x') | 'u' | 'o')
 228+ | anychar_p - (ch_p('"') | '\\')
 229+ ;
 230+
215231 /*
216232 * confix_p can't be used here, because it will rewrite
217233 * *(c_escape_ch_p[x]) into (*c_escape_ch_p)[x]
218234 */
219235 string = inner_node_d[
220236 '"'
221 - >> leaf_node_d[ *(lex_escape_ch_p - '"') ]
 237+ >> leaf_node_d[ *(c_string_char) ]
222238 >> '"'
223239 ]
224240 ;
@@ -341,6 +357,7 @@
342358 return tern_expr;
343359 }
344360
 361+ rule<ScannerT> c_string_char, hexchar, octchar;
345362 rule<ScannerT, parser_context<>, parser_tag<id_value> > value;
346363 rule<ScannerT, parser_context<>, parser_tag<id_variable> > variable;
347364 rule<ScannerT, parser_context<>, parser_tag<id_basic> > basic;
Index: trunk/extensions/AbuseFilter/parser_native/ast.h
@@ -12,6 +12,32 @@
1313 #ifndef AST_H
1414 #define AST_H
1515
namespace {

/**
 * Convert a fixed-width run of hexadecimal digits to an integer.
 *
 * Reads exactly @p ndigits characters starting at @p str and interprets
 * them as a big-endian hexadecimal number.  Both upper- and lower-case
 * digits are accepted.  A character that is not a hexadecimal digit
 * contributes the value 0 for its position (the remaining digits are
 * still shifted as usual).
 *
 * @param str      Pointer to the first digit; at least @p ndigits
 *                 characters must be readable.
 * @param ndigits  Number of characters to consume.  Zero or a negative
 *                 value consumes nothing and yields 0.
 * @return         The decoded value.  Values above INT_MAX (possible with
 *                 eight digits) are returned as the corresponding
 *                 two's-complement bit pattern, e.g. "FFFFFFFF" -> -1.
 */
template<typename charT>
int
hex2int(charT const *str, int ndigits)
{
	/*
	 * Accumulate in an unsigned type: eight digits can exceed INT_MAX,
	 * and overflowing a signed int is undefined behaviour, whereas
	 * unsigned arithmetic wraps in a well-defined way.
	 */
	unsigned int ret = 0;

	/* A counted loop (rather than while (ndigits--)) stops immediately
	 * for negative counts instead of running off the end of the input. */
	for (int i = 0; i < ndigits; ++i) {
		charT const c = str[i];
		unsigned int digit = 0;

		if (c >= 'a' && c <= 'f')
			digit = 10u + static_cast<unsigned int>(c - 'a');
		else if (c >= 'A' && c <= 'F')
			digit = 10u + static_cast<unsigned int>(c - 'A');
		else if (c >= '0' && c <= '9')
			digit = static_cast<unsigned int>(c - '0');

		ret = ret * 0x10u + digit;
	}

	return static_cast<int>(ret);
}

}
 41+
1642 namespace afp {
1743
1844 template<typename T> struct parser_grammar;
@@ -234,6 +260,27 @@
235261 case 'v':
236262 ret.push_back('\v');
237263 break;
 264+ case 'x':
 265+ if (i + 3 >= end)
 266+ break;
 267+ ret.push_back(hex2int(s.data() + i + 2, 2));
 268+ i += 2;
 269+ break;
 270+
 271+ case 'u':
 272+ if (i + 5 >= end)
 273+ break;
 274+ ret.push_back(hex2int(s.data() + i + 2, 4));
 275+ i += 4;
 276+ break;
 277+
 278+ case 'U':
 279+ if (i + 9 >= end)
 280+ break;
 281+ ret.push_back(hex2int(s.data() + i + 2, 8));
 282+ i += 8;
 283+ break;
 284+
238285 default:
239286 ret.push_back(s[i + 1]);
240287 break;

Status & tagging log