Index: trunk/extensions/AbuseFilter/parser_native/affunctions.cpp |
— | — | @@ -1,205 +0,0 @@ |
2 | | -/* |
3 | | - * Copyright (c) 2008 Andrew Garrett. |
4 | | - * Copyright (c) 2008 River Tarnell <river@wikimedia.org> |
5 | | - * Derived from public domain code contributed by Victor Vasiliev. |
6 | | - * |
7 | | - * Permission is granted to anyone to use this software for any purpose, |
8 | | - * including commercial applications, and to alter it and redistribute it |
9 | | - * freely. This software is provided 'as-is', without any express or |
10 | | - * implied warranty. |
11 | | - */ |
12 | | - |
13 | | -#include <algorithm> |
14 | | -#include <fstream> |
15 | | -#include <sstream> |
16 | | -#include <ios> |
17 | | -#include <iostream> |
18 | | - |
19 | | -#include <unicode/uchar.h> |
20 | | - |
21 | | -#include <boost/format.hpp> |
22 | | - |
23 | | -#include "utf8.h" |
24 | | -#include "equiv.h" |
25 | | -#include "affunctions.h" |
26 | | - |
27 | | -namespace { |
28 | | - |
29 | | -struct too_many_arguments_exception : afp::exception { |
30 | | - too_many_arguments_exception(char const *what) |
31 | | - : afp::exception(what) {} |
32 | | -}; |
33 | | - |
34 | | -struct too_few_arguments_exception : afp::exception { |
35 | | - too_few_arguments_exception(char const *what) |
36 | | - : afp::exception(what) {} |
37 | | -}; |
38 | | - |
39 | | -void |
40 | | -check_args(std::string const &fname, int args, int min, int max = 0) |
41 | | -{ |
42 | | - if (max == 0) |
43 | | - max = min; |
44 | | - if (args < min) { |
45 | | - std::string s = str(boost::format( |
46 | | - "too few arguments for function %s (got %d, expected %d)") |
47 | | - % fname % args % min); |
48 | | - throw too_few_arguments_exception(s.c_str()); |
49 | | - } else if (args > max) { |
50 | | - std::string s = str(boost::format( |
51 | | - "too many arguments for function %s (got %d, expected %d)") |
52 | | - % fname % args % min); |
53 | | - throw too_many_arguments_exception(s.c_str()); |
54 | | - } |
55 | | -} |
56 | | - |
57 | | -} // anonymous namespace |
58 | | - |
59 | | -namespace afp { |
60 | | - |
61 | | -datum |
62 | | -af_count(std::vector<datum> const &args) { |
63 | | - check_args("count", args.size(), 1, 2); |
64 | | - |
65 | | - std::string needle, haystack; |
66 | | - |
67 | | - if (args.size() < 2) { |
68 | | - needle = ","; |
69 | | - haystack = args[0].toString(); |
70 | | - } else { |
71 | | - needle = args[0].toString(); |
72 | | - haystack = args[1].toString(); |
73 | | - } |
74 | | - |
75 | | - size_t last_pos = 0; |
76 | | - unsigned int count = 0; |
77 | | - |
78 | | - while (last_pos != haystack.npos) { |
79 | | - count++; |
80 | | - last_pos = haystack.find(needle, last_pos); |
81 | | - } |
82 | | - |
83 | | - // One extra was added, but one extra is needed if only one arg was supplied. |
84 | | - if (args.size() >= 2) |
85 | | - count--; |
86 | | - |
87 | | - return datum::from_int((long int)count); |
88 | | -} |
89 | | - |
90 | | -datum |
91 | | -af_norm(std::vector<datum> const &args) { |
92 | | - check_args("norm", args.size(), 1); |
93 | | - |
94 | | - std::string orig = args[0].toString(); |
95 | | - |
96 | | - int lastchr = 0; |
97 | | - equiv_set const &equivs = equiv_set::instance(); |
98 | | - std::string result; |
99 | | - |
100 | | - utf8::utf8_iterator<std::string::const_iterator> |
101 | | - it(orig.begin(), orig.end()), end; |
102 | | - |
103 | | - for (; it != end; ++it) { |
104 | | - int chr = equivs.get(*it); |
105 | | - |
106 | | - if (chr != lastchr && u_isalnum(chr)) |
107 | | - result.append(utf8::codepoint_to_utf8(chr)); |
108 | | - |
109 | | - lastchr = chr; |
110 | | - } |
111 | | - |
112 | | - return datum::from_string(result); |
113 | | -} |
114 | | - |
115 | | -std::string |
116 | | -rmdoubles(std::string const &orig) { |
117 | | - int lastchr = 0; |
118 | | - std::string result; |
119 | | - |
120 | | - utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
121 | | - for (; it != end; ++it) { |
122 | | - if (*it != lastchr) |
123 | | - result.append(utf8::codepoint_to_utf8(*it)); |
124 | | - |
125 | | - lastchr = *it; |
126 | | - } |
127 | | - |
128 | | - return result; |
129 | | -} |
130 | | - |
131 | | -datum |
132 | | -af_specialratio(std::vector<datum> const &args) { |
133 | | - check_args("specialratio", args.size(), 1); |
134 | | - |
135 | | - std::string orig = args[0].toString(); |
136 | | - int len = 0; |
137 | | - int specialcount = 0; |
138 | | - |
139 | | - utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
140 | | - for (; it != end; ++it) { |
141 | | - len++; |
142 | | - if (!u_isalnum(*it)) |
143 | | - specialcount++; |
144 | | - } |
145 | | - |
146 | | - double ratio = (float)specialcount / len; |
147 | | - |
148 | | - return datum::from_double(ratio); |
149 | | -} |
150 | | - |
151 | | -datum |
152 | | -af_rmspecials(std::vector<datum> const &args) { |
153 | | - check_args("rmspecials", args.size(), 1); |
154 | | - return datum::from_string(rmspecials(args[0].toString())); |
155 | | -} |
156 | | - |
157 | | -std::string |
158 | | -rmspecials(std::string const &orig) { |
159 | | - std::string result; |
160 | | - |
161 | | - utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
162 | | - for (; it != end; ++it) { |
163 | | - if (u_isalnum(*it)) |
164 | | - result.append(utf8::codepoint_to_utf8(*it)); |
165 | | - } |
166 | | - |
167 | | - return result; |
168 | | -} |
169 | | - |
170 | | -datum |
171 | | -af_ccnorm(std::vector<datum> const &args) { |
172 | | - check_args("ccnorm", args.size(), 1); |
173 | | - return datum::from_string(confusable_character_normalise( args[0].toString())); |
174 | | -} |
175 | | - |
176 | | -datum |
177 | | -af_rmdoubles(std::vector<datum> const &args) { |
178 | | - check_args("ccnorm", args.size(), 1); |
179 | | - return datum::from_string(rmdoubles(args[0].toString())); |
180 | | -} |
181 | | - |
182 | | -datum |
183 | | -af_length(std::vector<datum> const &args) { |
184 | | - check_args("ccnorm", args.size(), 1); |
185 | | - return datum::from_int((long int)utf8::utf8_strlen(args[0].toString())); |
186 | | -} |
187 | | - |
188 | | -datum |
189 | | -af_lcase(std::vector<datum> const &args) { |
190 | | - check_args("ccnorm", args.size(), 1); |
191 | | - return datum::from_string(utf8::utf8_tolower(args[0].toString())); |
192 | | -} |
193 | | - |
194 | | -std::string |
195 | | -confusable_character_normalise(std::string const &orig) { |
196 | | - equiv_set const &equivs = equiv_set::instance(); |
197 | | - std::string result; |
198 | | - |
199 | | - utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
200 | | - for (; it != end; ++it) |
201 | | - result += utf8::codepoint_to_utf8(equivs.get(*it)); |
202 | | - |
203 | | - return result; |
204 | | -} |
205 | | - |
206 | | -} // namespace afp |
Index: trunk/extensions/AbuseFilter/parser_native/filter_evaluator.h |
— | — | @@ -15,22 +15,66 @@ |
16 | 16 | #include <string> |
17 | 17 | #include <map> |
18 | 18 | |
| 19 | +#include <unicode/uchar.h> |
| 20 | + |
19 | 21 | #include "aftypes.h" |
20 | 22 | #include "parser.h" |
| 23 | +#include "affunctions.h" |
21 | 24 | |
22 | 25 | namespace afp { |
23 | 26 | |
24 | | -struct filter_evaluator { |
25 | | - filter_evaluator(); |
| 27 | +template<typename charT> |
| 28 | +struct basic_filter_evaluator { |
| 29 | + basic_filter_evaluator(); |
26 | 30 | |
27 | | - bool evaluate(std::string const &filter) const; |
| 31 | + bool evaluate(std::basic_string<charT> const &filter) const; |
28 | 32 | |
29 | | - void add_variable(std::string const &key, datum value); |
| 33 | + void add_variable( |
| 34 | + std::basic_string<charT> const &key, |
| 35 | + basic_datum<charT> value); |
30 | 36 | |
31 | 37 | private: |
32 | | - expressor e; |
| 38 | + basic_expressor<charT> e; |
33 | 39 | }; |
34 | 40 | |
| 41 | +typedef basic_filter_evaluator<char> filter_evaluator; |
| 42 | +typedef basic_filter_evaluator<UChar> u32filter_evaluator; |
| 43 | + |
| 44 | +template<typename charT> |
| 45 | +basic_filter_evaluator<charT>::basic_filter_evaluator() |
| 46 | +{ |
| 47 | + e.add_function("length", af_length<charT>); |
| 48 | + e.add_function("lcase", af_lcase<charT>); |
| 49 | + e.add_function("ccnorm", af_ccnorm<charT>); |
| 50 | + e.add_function("rmdoubles", af_rmdoubles<charT>); |
| 51 | + e.add_function("specialratio", af_specialratio<charT>); |
| 52 | + e.add_function("rmspecials", af_rmspecials<charT>); |
| 53 | + e.add_function("norm", af_norm<charT>); |
| 54 | + e.add_function("count", af_count<charT>); |
| 55 | +} |
| 56 | + |
| 57 | +template<typename charT> |
| 58 | +bool |
| 59 | +basic_filter_evaluator<charT>::evaluate( |
| 60 | + std::basic_string<charT> const &filter) const |
| 61 | +{ |
| 62 | + try { |
| 63 | + return e.evaluate(filter).toBool(); |
| 64 | + } catch (std::exception &e) { |
| 65 | + std::cerr << "can't evaluate filter: " << e.what() << '\n'; |
| 66 | + return false; |
| 67 | + } |
| 68 | +} |
| 69 | + |
| 70 | +template<typename charT> |
| 71 | +void |
| 72 | +basic_filter_evaluator<charT>::add_variable( |
| 73 | + std::basic_string<charT> const &key, |
| 74 | + basic_datum<charT> value) |
| 75 | +{ |
| 76 | + e.add_variable(key, value); |
| 77 | +} |
| 78 | + |
35 | 79 | } // namespace afp |
36 | 80 | |
37 | 81 | #endif /* !FILTER_EVALUATOR_H */ |
Index: trunk/extensions/AbuseFilter/parser_native/affunctions.h |
— | — | @@ -12,26 +12,250 @@ |
13 | 13 | #ifndef AFFUNCTIONS_H |
14 | 14 | #define AFFUNCTIONS_H |
15 | 15 | |
16 | | -#include <map> |
17 | | -#include <vector> |
| 16 | +#include <map> |
| 17 | +#include <vector> |
| 18 | +#include <algorithm> |
| 19 | +#include <fstream> |
| 20 | +#include <sstream> |
| 21 | +#include <ios> |
| 22 | +#include <iostream> |
18 | 23 | |
19 | | -#include "aftypes.h" |
| 24 | +#include <unicode/uchar.h> |
20 | 25 | |
| 26 | +#include <boost/format.hpp> |
| 27 | + |
| 28 | +#include "aftypes.h" |
| 29 | +#include "equiv.h" |
| 30 | + |
21 | 31 | namespace afp { |
22 | 32 | |
23 | | -datum af_length (std::vector<datum> const &args); |
24 | | -datum af_lcase (std::vector<datum> const &args); |
25 | | -datum af_ccnorm (std::vector<datum> const &args); |
26 | | -datum af_rmdoubles (std::vector<datum> const &args); |
27 | | -datum af_specialratio (std::vector<datum> const &args); |
28 | | -datum af_rmspecials (std::vector<datum> const &args); |
29 | | -datum af_norm (std::vector<datum> const &args); |
30 | | -datum af_count (std::vector<datum> const &args); |
| 33 | +template<typename charT> |
| 34 | +basic_datum<charT> |
| 35 | +af_length (std::vector<basic_datum<charT> > const &args); |
31 | 36 | |
32 | | -std::string confusable_character_normalise(std::string const &orig); |
33 | | -std::string rmdoubles(std::string const &orig); |
34 | | -std::string rmspecials(std::string const &orig); |
| 37 | +template<typename charT> |
| 38 | +basic_datum<charT> |
| 39 | +af_ccnorm (std::vector<basic_datum<charT> > const &args); |
35 | 40 | |
| 41 | +template<typename charT> |
| 42 | +basic_datum<charT> |
| 43 | +af_rmdoubles (std::vector<basic_datum<charT> > const &args); |
| 44 | + |
| 45 | +template<typename charT> |
| 46 | +basic_datum<charT> |
| 47 | +af_specialratio (std::vector<basic_datum<charT> > const &args); |
| 48 | + |
| 49 | +template<typename charT> |
| 50 | +basic_datum<charT> |
| 51 | +af_rmspecials (std::vector<basic_datum<charT> > const &args); |
| 52 | + |
| 53 | +template<typename charT> |
| 54 | +basic_datum<charT> |
| 55 | +af_norm (std::vector<basic_datum<charT> > const &args); |
| 56 | + |
| 57 | +template<typename charT> |
| 58 | +basic_datum<charT> |
| 59 | +af_count (std::vector<basic_datum<charT> > const &args); |
| 60 | + |
| 61 | +template<typename charT> |
| 62 | +std::basic_string<charT> |
| 63 | +confusable_character_normalise(std::basic_string<charT> const &orig); |
| 64 | + |
| 65 | +template<typename charT> |
| 66 | +std::basic_string<charT> |
| 67 | +rmdoubles(std::basic_string<charT> const &orig); |
| 68 | + |
| 69 | +template<typename charT> |
| 70 | +std::basic_string<charT> |
| 71 | +rmspecials(std::basic_string<charT> const &orig); |
| 72 | + |
| 73 | +struct too_many_arguments_exception : afp::exception { |
| 74 | + too_many_arguments_exception(char const *what) |
| 75 | + : afp::exception(what) {} |
| 76 | +}; |
| 77 | + |
| 78 | +struct too_few_arguments_exception : afp::exception { |
| 79 | + too_few_arguments_exception(char const *what) |
| 80 | + : afp::exception(what) {} |
| 81 | +}; |
| 82 | + |
| 83 | +namespace { |
| 84 | + |
| 85 | +void |
| 86 | +check_args(std::string const &fname, int args, int min, int max = 0) |
| 87 | +{ |
| 88 | + if (max == 0) |
| 89 | + max = min; |
| 90 | + if (args < min) { |
| 91 | + std::string s = str(boost::format( |
| 92 | + "too few arguments for function %s (got %d, expected %d)") |
| 93 | + % fname % args % min); |
| 94 | + throw too_few_arguments_exception(s.c_str()); |
| 95 | + } else if (args > max) { |
| 96 | + std::string s = str(boost::format( |
| 97 | + "too many arguments for function %s (got %d, expected %d)") |
| 98 | + % fname % args % min); |
| 99 | + throw too_many_arguments_exception(s.c_str()); |
| 100 | + } |
| 101 | +} |
| 102 | + |
| 103 | +} // anonymous namespace |
| 104 | + |
| 105 | +template<typename charT> |
| 106 | +basic_datum<charT> |
| 107 | +af_count(std::vector<basic_datum<charT> > const &args) { |
| 108 | + check_args("count", args.size(), 1, 2); |
| 109 | + |
| 110 | + std::basic_string<charT> needle, haystack; |
| 111 | + |
| 112 | + if (args.size() < 2) { |
| 113 | + needle = ","; |
| 114 | + haystack = args[0].toString(); |
| 115 | + } else { |
| 116 | + needle = args[0].toString(); |
| 117 | + haystack = args[1].toString(); |
| 118 | + } |
| 119 | + |
| 120 | + size_t last_pos = 0; |
| 121 | + unsigned int count = 0; |
| 122 | + |
| 123 | + while (last_pos != haystack.npos) { |
| 124 | + count++; |
| 125 | + last_pos = haystack.find(needle, last_pos); |
| 126 | + } |
| 127 | + |
| 128 | + // One extra was added, but one extra is needed if only one arg was supplied. |
| 129 | + if (args.size() >= 2) |
| 130 | + count--; |
| 131 | + |
| 132 | + return basic_datum<charT>::from_int((long int)count); |
| 133 | +} |
| 134 | + |
| 135 | +template<typename charT> |
| 136 | +basic_datum<charT> |
| 137 | +af_norm(std::vector<basic_datum<charT> > const &args) { |
| 138 | + check_args("norm", args.size(), 1); |
| 139 | + |
| 140 | + std::basic_string<charT> orig = args[0].toString(); |
| 141 | + |
| 142 | + int lastchr = 0; |
| 143 | + equiv_set const &equivs = equiv_set::instance(); |
| 144 | + std::basic_string<charT> result; |
| 145 | + |
| 146 | + for (std::size_t i = 0; i < orig.size(); ++i) { |
| 147 | + int chr = equivs.get(orig[i]); |
| 148 | + |
| 149 | + if (chr != lastchr && u_isalnum(chr)) |
| 150 | + result += chr; |
| 151 | + |
| 152 | + lastchr = chr; |
| 153 | + } |
| 154 | + |
| 155 | + return basic_datum<charT>::from_string(result); |
| 156 | +} |
| 157 | + |
/*
 * Collapse every run of identical adjacent elements in `orig` to a single
 * element and return the result, e.g. "aabbcc" -> "abc".
 *
 * Comparison is between adjacent elements only, so no sentinel "previous"
 * value is needed and a string that begins with NUL keeps its first NUL.
 */
template<typename charT>
std::basic_string<charT>
rmdoubles(std::basic_string<charT> const &orig) {
	std::basic_string<charT> result(orig);

	// std::unique moves the first element of each run to the front and
	// returns the new logical end; erase the leftover tail.
	result.erase(std::unique(result.begin(), result.end()), result.end());

	return result;
}
| 173 | + |
| 174 | +template<typename charT> |
| 175 | +basic_datum<charT> |
| 176 | +af_specialratio(std::vector<basic_datum<charT> > const &args) { |
| 177 | + check_args("specialratio", args.size(), 1); |
| 178 | + |
| 179 | + std::basic_string<charT> orig = args[0].toString(); |
| 180 | + int len = 0; |
| 181 | + int specialcount = 0; |
| 182 | + |
| 183 | + for (std::size_t i = 0; i < orig.size(); ++i) { |
| 184 | + len++; |
| 185 | + if (!u_isalnum(orig[i])) |
| 186 | + specialcount++; |
| 187 | + } |
| 188 | + |
| 189 | + double ratio = (float)specialcount / len; |
| 190 | + |
| 191 | + return basic_datum<charT>::from_double(ratio); |
| 192 | +} |
| 193 | + |
| 194 | +template<typename charT> |
| 195 | +basic_datum<charT> |
| 196 | +af_rmspecials(std::vector<basic_datum<charT> > const &args) { |
| 197 | + check_args("rmspecials", args.size(), 1); |
| 198 | + return basic_datum<charT>::from_string(rmspecials(args[0].toString())); |
| 199 | +} |
| 200 | + |
/*
 * Return a copy of `orig` containing only the elements for which
 * u_isalnum() is true, in their original order.
 */
template<typename charT>
std::basic_string<charT>
rmspecials(std::basic_string<charT> const &orig) {
	typedef typename std::basic_string<charT>::const_iterator const_iter;

	std::basic_string<charT> kept;

	for (const_iter it = orig.begin(), last = orig.end(); it != last; ++it) {
		if (!u_isalnum(*it))
			continue;

		kept += *it;
	}

	return kept;
}
| 213 | + |
| 214 | +template<typename charT> |
| 215 | +basic_datum<charT> |
| 216 | +af_ccnorm(std::vector<basic_datum<charT> > const &args) { |
| 217 | + check_args("ccnorm", args.size(), 1); |
| 218 | + return basic_datum<charT>::from_string(confusable_character_normalise(args[0].toString())); |
| 219 | +} |
| 220 | + |
| 221 | +template<typename charT> |
| 222 | +basic_datum<charT> |
| 223 | +af_rmdoubles(std::vector<basic_datum<charT> > const &args) { |
| 224 | + check_args("ccnorm", args.size(), 1); |
| 225 | + return basic_datum<charT>::from_string(rmdoubles(args[0].toString())); |
| 226 | +} |
| 227 | + |
| 228 | +template<typename charT> |
| 229 | +basic_datum<charT> |
| 230 | +af_length(std::vector<basic_datum<charT> > const &args) { |
| 231 | + check_args("ccnorm", args.size(), 1); |
| 232 | + return basic_datum<charT>::from_int(args[0].toString().size()); |
| 233 | +} |
| 234 | + |
| 235 | +template<typename charT> |
| 236 | +basic_datum<charT> |
| 237 | +af_lcase(std::vector<basic_datum<charT> > const &args) { |
| 238 | + check_args("ccnorm", args.size(), 1); |
| 239 | + std::basic_string<charT> result; |
| 240 | + std::basic_string<charT> const orig = args[0].toString(); |
| 241 | + |
| 242 | + for (std::size_t i = 0; i < orig.size(); ++i) |
| 243 | + result += u_tolower(orig[i]); |
| 244 | + |
| 245 | + return basic_datum<charT>::from_string(result); |
| 246 | +} |
| 247 | + |
| 248 | +template<typename charT> |
| 249 | +std::basic_string<charT> |
| 250 | +confusable_character_normalise(std::basic_string<charT> const &orig) { |
| 251 | + equiv_set const &equivs = equiv_set::instance(); |
| 252 | + std::basic_string<charT> result; |
| 253 | + |
| 254 | + for (std::size_t i = 0; i < orig.size(); ++i) |
| 255 | + result += equivs.get(orig[i]); |
| 256 | + |
| 257 | + return result; |
| 258 | +} |
| 259 | + |
36 | 260 | } // namespace afp |
37 | 261 | |
38 | 262 | #endif /* !AFFUNCTIONS_H */ |
Index: trunk/extensions/AbuseFilter/parser_native/makefile |
— | — | @@ -17,31 +17,23 @@ |
18 | 18 | expr: CPPFLAGS+=-DTEST_PARSER |
19 | 19 | |
20 | 20 | af_expr_objs = \ |
21 | | - af_expr-affunctions.o \ |
22 | | - af_expr-filter_evaluator.o \ |
23 | 21 | af_expr-eval.o \ |
24 | 22 | af_expr-utf8.o \ |
25 | 23 | af_expr-equiv.o \ |
26 | 24 | af_expr-request.o |
27 | 25 | |
28 | 26 | af_parser_objs = \ |
29 | | - af_parser-affunctions.o \ |
30 | 27 | af_parser-main.o \ |
31 | 28 | af_parser-request.o \ |
32 | 29 | af_parser-utf8.o \ |
33 | | - af_parser-equiv.o \ |
34 | | - af_parser-filter_evaluator.o |
| 30 | + af_parser-equiv.o |
35 | 31 | |
36 | 32 | check_objs = \ |
37 | | - check-affunctions.o \ |
38 | 33 | check-check.o \ |
39 | 34 | check-utf8.o \ |
40 | | - check-equiv.o \ |
41 | | - check-filter_evaluator.o |
| 35 | + check-equiv.o |
42 | 36 | |
43 | 37 | syntax_check_objs = \ |
44 | | - syntax_check-affunctions.o \ |
45 | | - syntax_check-filter_evaluator.o \ |
46 | 38 | syntax_check-utf8.o \ |
47 | 39 | syntax_check-equiv.o \ |
48 | 40 | syntax_check-syntax_check.o |