Index: trunk/extensions/AbuseFilter/parser_native/syntax_check |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: trunk/extensions/AbuseFilter/parser_native/af_parser |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: trunk/extensions/AbuseFilter/parser_native/check |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: trunk/extensions/AbuseFilter/parser_native/affunctions.cpp |
— | — | @@ -5,8 +5,10 @@ |
6 | 6 | #include <ios> |
7 | 7 | #include <iostream> |
8 | 8 | #include <ctype.h> |
9 | | -#include <glibmm/ustring.h> |
10 | 9 | |
| 10 | +#include <unicode/utf8.h> |
| 11 | +#include <unicode/ustring.h> |
| 12 | + |
11 | 13 | #define EQUIVSET_LOC "equivset.txt" |
12 | 14 | |
13 | 15 | map<string,AFPFunction> af_functions; |
— | — | @@ -179,7 +181,7 @@ |
180 | 182 | } |
181 | 183 | } |
182 | 184 | |
183 | | - double ratio = (float)(specialcount) / (float)(((Glib::ustring)orig).size()); |
| 185 | + double ratio = (float)(specialcount) / (float)(utf8_strlen(orig)); |
184 | 186 | |
185 | 187 | return AFPData(ratio); |
186 | 188 | } |
— | — | @@ -234,7 +236,7 @@ |
235 | 237 | throw AFPException( "Not enough arguments to lcase" ); |
236 | 238 | } |
237 | 239 | |
238 | | - return AFPData( (long int)((Glib::ustring)args[0].toString()).size() ); |
| 240 | + return AFPData( (long int)utf8_strlen(args[0].toString()) ); |
239 | 241 | } |
240 | 242 | |
241 | 243 | AFPData af_lcase( vector<AFPData> args ) { |
— | — | @@ -242,10 +244,7 @@ |
243 | 245 | throw AFPException( "Not enough arguments to lcase" ); |
244 | 246 | } |
245 | 247 | |
246 | | - Glib::ustring s = args[0].toString(); |
247 | | - string s2 = s.lowercase(); |
248 | | - |
249 | | - return AFPData(s); |
| 248 | + return AFPData(utf8_tolower(args[0].toString())); |
250 | 249 | } |
251 | 250 | |
252 | 251 | string confusable_character_normalise( string orig ) { |
— | — | @@ -354,6 +353,66 @@ |
355 | 354 | return c; |
356 | 355 | } |
357 | 356 | |
/*
 * Return the number of characters (Unicode code points) in the UTF-8
 * encoded string s, as opposed to s.size(), which counts bytes.
 *
 * Every byte that is not a UTF-8 continuation byte (bit pattern 10xxxxxx)
 * starts a new character, so counting those bytes counts the characters.
 * No validation is performed: in ill-formed input a stray continuation
 * byte is not counted, and a lead byte with missing continuation bytes
 * still counts as one character.  The empty string has length 0.
 */
std::size_t
utf8_strlen(std::string const &s)
{
	std::size_t ret = 0;

	for (std::string::const_iterator it = s.begin(), end = s.end();
	     it != end; ++it)
	{
		/*
		 * char may be signed, so go through unsigned char before
		 * looking at the top two bits.
		 */
		if ((static_cast<unsigned char>(*it) & 0xC0) != 0x80)
			++ret;
	}

	return ret;
}
| 387 | + |
| 388 | +/* |
| 389 | + * This could almost certainly be done in a nicer way. |
| 390 | + */ |
| 391 | +std::string utf8_tolower(std::string const &s) |
| 392 | +{ |
| 393 | + std::vector<UChar> ustring; |
| 394 | + UErrorCode error = U_ZERO_ERROR; |
| 395 | + |
| 396 | + for (int i = 0; i < s.size(); ) { |
| 397 | + UChar32 c; |
| 398 | + U8_NEXT(s.data(), i, s.size(), c); |
| 399 | + ustring.push_back(c); |
| 400 | + } |
| 401 | + |
| 402 | + std::vector<UChar> dest; |
| 403 | + u_strToLower(&dest[0], dest.size(), &ustring[0], ustring.size(), |
| 404 | + NULL, &error); |
| 405 | + |
| 406 | + if (U_FAILURE(error)) |
| 407 | + return s; |
| 408 | + |
| 409 | + std::vector<unsigned char> u8string; |
| 410 | + int i, j; |
| 411 | + for (i = 0, j = 0; i < dest.size(); j++) { |
| 412 | + U8_APPEND_UNSAFE(&u8string[0], i, dest[j]); |
| 413 | + } |
| 414 | + return std::string(u8string.begin(), u8string.begin() + i); |
| 415 | +} |
| 416 | + |
358 | 417 | // Ported from MediaWiki core function in PHP. |
359 | 418 | string codepointToUtf8( int codepoint ) { |
360 | 419 | string ret; |
Index: trunk/extensions/AbuseFilter/parser_native/affunctions.h |
— | — | @@ -1,3 +1,6 @@ |
| 2 | +#ifndef AFFUNCTIONS_H |
| 3 | +#define AFFUNCTIONS_H |
| 4 | + |
2 | 5 | #include "aftypes.h" |
3 | 6 | #include <map> |
4 | 7 | #include <vector> |
— | — | @@ -18,3 +21,7 @@ |
19 | 22 | AFPData callFunction( string name, AFPData arg ); |
20 | 23 | string rmdoubles( string orig ); |
21 | 24 | string rmspecials( string orig ); |
| 25 | +std::size_t utf8_strlen(std::string const &s); |
| 26 | +std::string utf8_tolower(std::string const &s); |
| 27 | + |
| 28 | +#endif /* !AFFUNCTIONS_H */ |
Index: trunk/extensions/AbuseFilter/parser_native/eval.cpp |
— | — | @@ -1,6 +1,5 @@ |
2 | 2 | #include "afeval.h" |
3 | 3 | #include "affunctions.h" |
4 | | -#include <libxml++/libxml++.h> |
5 | 4 | #include <iostream> |
6 | 5 | #include <string> |
7 | 6 | #include <sstream> |
Index: trunk/extensions/AbuseFilter/parser_native/makefile |
— | — | @@ -1,24 +1,41 @@ |
2 | | -all: check af_parser syntax_check af_expr |
| 2 | +CXX = g++ |
| 3 | +CXXFLAGS = -O3 |
| 4 | +BOOST_INCLUDES = /opt/boost/include/boost-1_35 |
| 5 | +BOOST_LIBS = /opt/boost/lib |
| 6 | +BOOST_TAG = -gcc34-mt |
| 7 | +ICU_INCLUDES = /opt/icu/include |
| 8 | +ICU_LIBS = /opt/icu/lib |
3 | 9 | |
4 | | -af_expr: afeval.o affunctions.o afparser.o aftypes.o afutils.o eval.o |
5 | | - g++ -O3 -o af_expr afeval.o affunctions.o afparser.o aftypes.o afutils.o eval.o -lboost_regex -lxml++-2.6 -lxml2 -lglibmm-2.4 -lgobject-2.0 -lsigc-2.0 -lglib-2.0 |
| 10 | +CPPFLAGS = -I$(BOOST_INCLUDES) -I$(ICU_INCLUDES) |
| 11 | +LDFLAGS = -L$(BOOST_LIBS) -L$(ICU_LIBS) -R$(ICU_LIBS) -R$(BOOST_LIBS) |
6 | 12 | |
7 | | -af_parser: afeval.o affunctions.o afparser.o aftypes.o afutils.o main.o |
8 | | - g++ -O3 -o af_parser afeval.o affunctions.o afparser.o aftypes.o afutils.o main.o -lboost_regex -lglibmm-2.4 |
| 13 | +LIBS = -lboost_regex$(BOOST_TAG) -licuuc -licui18n -licudata -licui18n |
9 | 14 | |
10 | | -check: afeval.o affunctions.o afparser.o aftypes.o afutils.o check.o |
11 | | - g++ -O3 -o check -lboost_regex afeval.o affunctions.o afparser.o aftypes.o afutils.o check.o -lglibmm-2.4 |
12 | | - |
13 | | -syntax_check: afeval.o affunctions.o afparser.o aftypes.o afutils.o syntax_check.o |
14 | | - g++ -O3 -o syntax_check afeval.o affunctions.o afparser.o aftypes.o afutils.o syntax_check.o -lboost_regex -lglibmm-2.4 |
| 15 | +af_expr_objs = afeval.o affunctions.o afparser.o aftypes.o afutils.o eval.o |
| 16 | +af_parser_objs = afeval.o affunctions.o afparser.o aftypes.o afutils.o main.o |
| 17 | +check_objs = afeval.o affunctions.o afparser.o aftypes.o afutils.o check.o |
| 18 | +syntax_check_objs = afeval.o affunctions.o afparser.o aftypes.o afutils.o syntax_check.o |
15 | 19 | |
| 20 | +progs = check af_parser syntax_check af_expr |
| 21 | + |
| 22 | +all: $(progs) |
| 23 | + |
| 24 | +af_expr: $(af_expr_objs) |
| 25 | + $(CXX) $(CXXFLAGS) -o $@ $(af_expr_objs) $(LDFLAGS) $(LIBS) |
| 26 | +af_parser: $(af_parser_objs) |
| 27 | + $(CXX) $(CXXFLAGS) -o $@ $(af_parser_objs) $(LDFLAGS) $(LIBS) |
| 28 | +check: $(check_objs) |
| 29 | + $(CXX) $(CXXFLAGS) -o $@ $(check_objs) $(LDFLAGS) $(LIBS) |
| 30 | +syntax_check: $(syntax_check_objs) |
| 31 | + $(CXX) $(CXXFLAGS) -o $@ $(syntax_check_objs) $(LDFLAGS) $(LIBS) |
| 32 | + |
16 | 33 | .cpp.o: |
17 | | - g++ -O3 -c $< -I/usr/include/libxml++-2.6 -I/usr/lib/libxml++-2.6/include -I/usr/include/libxml2 -I/usr/include/glibmm-2.4 -I/usr/lib/glibmm-2.4/include -I/usr/include/sigc++-2.0 -I/usr/lib/sigc++-2.0/include -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include |
| 34 | + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< |
18 | 35 | |
19 | 36 | clean: |
20 | | - rm -f *.o check af_parser syntax_check |
| 37 | + rm -f *.o $(progs) |
21 | 38 | |
22 | 39 | clean-final: |
23 | | - rm -f check af_parser syntax_check |
| 40 | + rm -f $(progs) |
24 | 41 | |
25 | 42 | .SUFFIXES: .cpp .o |