Index: trunk/extensions/AbuseFilter/parser_native/equiv.h |
— | — | @@ -0,0 +1,23 @@ |
| 2 | +#ifndef EQUIV_H |
| 3 | +#define EQUIV_H |
| 4 | + |
| 5 | +#include <map> |
| 6 | + |
| 7 | +#include <boost/noncopyable.hpp> |
| 8 | + |
| 9 | +namespace afp { |
| 10 | + |
| 11 | +struct equiv_set : boost::noncopyable { |
| 12 | + static equiv_set const &instance(); |
| 13 | + |
| 14 | + int get(int) const; |
| 15 | + |
| 16 | +private: |
| 17 | + equiv_set(); |
| 18 | + |
| 19 | + std::map<int, int> equivs_; |
| 20 | +}; |
| 21 | + |
| 22 | +} // namespace afp |
| 23 | + |
| 24 | +#endif /* !EQUIV_H */ |
Index: trunk/extensions/AbuseFilter/parser_native/affunctions.cpp |
— | — | @@ -45,22 +45,20 @@ |
46 | 46 | |
47 | 47 | datum |
48 | 48 | af_norm(std::vector<datum> const &args) { |
49 | | - if (!args.size()) { |
| 49 | + if (!args.size()) |
50 | 50 | throw exception( "Not enough arguments to norm" ); |
51 | | - } |
52 | 51 | |
53 | 52 | std::string orig = args[0].toString(); |
54 | 53 | |
55 | | - std::string::const_iterator p, charStart, end; |
56 | | - int chr = 0, lastchr = 0; |
| 54 | + int lastchr = 0; |
57 | 55 | equiv_set const &equivs = equiv_set::instance(); |
58 | 56 | std::string result; |
59 | 57 | |
60 | | - p = orig.begin(); |
61 | | - end = orig.end(); |
62 | | - |
63 | | - while (chr = utf8::next_utf8_char( p, charStart, end )) { |
64 | | - chr = equivs.get(chr); |
| 58 | + utf8::utf8_iterator<std::string::const_iterator> |
| 59 | + it(orig.begin(), orig.end()), end; |
| 60 | + |
| 61 | + for (; it != end; ++it) { |
| 62 | + int chr = equivs.get(*it); |
65 | 63 | |
66 | 64 | if (chr != lastchr && isalnum(chr)) |
67 | 65 | result.append(utf8::codepoint_to_utf8(chr)); |
— | — | @@ -73,18 +71,15 @@ |
74 | 72 | |
75 | 73 | std::string |
76 | 74 | rmdoubles(std::string const &orig) { |
77 | | - std::string::const_iterator p, charStart, end; |
78 | | - int chr,lastchr = 0; |
| 75 | + int lastchr = 0; |
79 | 76 | std::string result; |
80 | 77 | |
81 | | - p = orig.begin(); |
82 | | - end = orig.end(); |
83 | | - while (chr = utf8::next_utf8_char( p, charStart, end )) { |
84 | | - if (chr != lastchr) { |
85 | | - result.append(utf8::codepoint_to_utf8(chr)); |
86 | | - } |
| 78 | + utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
| 79 | + for (; it != end; ++it) { |
| 80 | + if (*it != lastchr) |
| 81 | + result.append(utf8::codepoint_to_utf8(*it)); |
87 | 82 | |
88 | | - lastchr = chr; |
| 83 | + lastchr = *it; |
89 | 84 | } |
90 | 85 | |
91 | 86 | return result; |
— | — | @@ -92,24 +87,21 @@ |
93 | 88 | |
94 | 89 | datum |
95 | 90 | af_specialratio(std::vector<datum> const &args) { |
96 | | - if (!args.size()) { |
| 91 | + if (!args.size()) |
97 | 92 | throw exception( "Not enough arguments to specialratio" ); |
98 | | - } |
99 | 93 | |
100 | 94 | std::string orig = args[0].toString(); |
101 | | - std::string::const_iterator p, charStart, end; |
102 | | - int chr; |
| 95 | + int len = 0; |
103 | 96 | int specialcount = 0; |
104 | 97 | |
105 | | - p = orig.begin(); |
106 | | - end = orig.end(); |
107 | | - while (chr = utf8::next_utf8_char( p, charStart, end )) { |
108 | | - if (!isalnum(chr)) { |
| 98 | + utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
| 99 | + for (; it != end; ++it) { |
| 100 | + len++; |
| 101 | + if (!isalnum(*it)) |
109 | 102 | specialcount++; |
110 | | - } |
111 | 103 | } |
112 | 104 | |
113 | | - double ratio = (float)(specialcount) / (float)(utf8::utf8_strlen(orig)); |
| 105 | + double ratio = (float)specialcount / len; |
114 | 106 | |
115 | 107 | return datum(ratio); |
116 | 108 | } |
— | — | @@ -125,16 +117,12 @@ |
126 | 118 | |
127 | 119 | std::string |
128 | 120 | rmspecials(std::string const &orig) { |
129 | | - std::string::const_iterator p, charStart, end; |
130 | | - int chr = 0; |
131 | 121 | std::string result; |
132 | 122 | |
133 | | - p = orig.begin(); |
134 | | - end = orig.end(); |
135 | | - while (chr = utf8::next_utf8_char( p, charStart, end )) { |
136 | | - if (isalnum(chr)) { |
137 | | - result.append(utf8::codepoint_to_utf8(chr)); |
138 | | - } |
| 123 | + utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
| 124 | + for (; it != end; ++it) { |
| 125 | + if (isalnum(*it)) |
| 126 | + result.append(utf8::codepoint_to_utf8(*it)); |
139 | 127 | } |
140 | 128 | |
141 | 129 | return result; |
— | — | @@ -178,16 +166,12 @@ |
179 | 167 | |
180 | 168 | std::string |
181 | 169 | confusable_character_normalise(std::string const &orig) { |
182 | | - std::string::const_iterator p, charStart, end; |
183 | | - int chr; |
184 | 170 | equiv_set const &equivs = equiv_set::instance(); |
185 | 171 | std::string result; |
186 | 172 | |
187 | | - p = orig.begin(); |
188 | | - end = orig.end(); |
189 | | - |
190 | | - while (chr = utf8::next_utf8_char( p, charStart, end )) { |
191 | | - chr = equivs.get(chr); |
| 173 | + utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end; |
| 174 | + for (; it != end; ++it) { |
| 175 | + int chr = equivs.get(*it); |
192 | 176 | result.append(utf8::codepoint_to_utf8(chr)); |
193 | 177 | } |
194 | 178 | |
Index: trunk/extensions/AbuseFilter/parser_native/utf8.cpp |
— | — | @@ -0,0 +1,94 @@ |
| 2 | +#include "utf8.h" |
| 3 | + |
| 4 | +#include <unicode/utf8.h> |
| 5 | +#include <unicode/ustring.h> |
| 6 | + |
| 7 | +#include "aftypes.h" |
| 8 | + |
| 9 | +namespace utf8 { |
| 10 | + |
| 11 | +// Ported from MediaWiki core function in PHP. |
| 12 | +std::string |
| 13 | +codepoint_to_utf8(int codepoint) { |
| 14 | + std::string ret; |
| 15 | + |
| 16 | + if(codepoint < 0x80) { |
| 17 | + ret.append(1, codepoint); |
| 18 | + return ret; |
| 19 | + } |
| 20 | + |
| 21 | + if(codepoint < 0x800) { |
| 22 | + ret.append(1, codepoint >> 6 & 0x3f | 0xc0); |
| 23 | + ret.append(1, codepoint & 0x3f | 0x80); |
| 24 | + return ret; |
| 25 | + } |
| 26 | + |
| 27 | + if(codepoint < 0x10000) { |
| 28 | + ret.append(1, codepoint >> 12 & 0x0f | 0xe0); |
| 29 | + ret.append(1, codepoint >> 6 & 0x3f | 0x80); |
| 30 | + ret.append(1, codepoint & 0x3f | 0x80); |
| 31 | + return ret; |
| 32 | + } |
| 33 | + |
| 34 | + if(codepoint < 0x110000) { |
| 35 | + ret.append(1, codepoint >> 18 & 0x07 | 0xf0); |
| 36 | + ret.append(1, codepoint >> 12 & 0x3f | 0x80); |
| 37 | + ret.append(1, codepoint >> 6 & 0x3f | 0x80); |
| 38 | + ret.append(1, codepoint & 0x3f | 0x80); |
| 39 | + return ret; |
| 40 | + } |
| 41 | + |
| 42 | + throw afp::exception("Asked for code outside of range ($codepoint)\n"); |
| 43 | +} |
| 44 | + |
/**
 * Count the characters (code points) in a UTF-8 encoded string.
 *
 * The input is not validated: every byte that is not a continuation byte
 * (bit pattern 10xxxxxx) is counted as the start of one character, which
 * gives the code point count for well-formed UTF-8.
 *
 * @param s  UTF-8 encoded text.
 * @return   Number of characters; 0 for an empty string.
 */
std::size_t
utf8_strlen(std::string const &s)
{
	std::size_t count = 0;

	for (std::string::const_iterator it = s.begin(), end = s.end();
	     it != end; ++it)
	{
		// Go through unsigned char so bytes >= 0x80 are not sign-extended.
		if ((static_cast<unsigned char>(*it) & 0xc0) != 0x80)
			++count;
	}

	return count;
}
| 63 | + |
| 64 | +/* |
| 65 | + * This could almost certainly be done in a nicer way. |
| 66 | + */ |
| 67 | +std::string |
| 68 | +utf8_tolower(std::string const &s) |
| 69 | +{ |
| 70 | + std::vector<UChar> ustring; |
| 71 | + UErrorCode error = U_ZERO_ERROR; |
| 72 | + |
| 73 | + for (int i = 0, end = s.size(); i < end; ) { |
| 74 | + UChar32 c; |
| 75 | + U8_NEXT(s.data(), i, end, c); |
| 76 | + ustring.push_back(c); |
| 77 | + } |
| 78 | + |
| 79 | + std::vector<UChar> dest; |
| 80 | + u_strToLower(&dest[0], dest.size(), &ustring[0], ustring.size(), |
| 81 | + NULL, &error); |
| 82 | + |
| 83 | + if (U_FAILURE(error)) |
| 84 | + return s; |
| 85 | + |
| 86 | + std::vector<unsigned char> u8string; |
| 87 | + int i, j, end; |
| 88 | + for (i = 0, j = 0, end = dest.size(); i < end; j++) { |
| 89 | + U8_APPEND_UNSAFE(&u8string[0], i, dest[j]); |
| 90 | + } |
| 91 | + return std::string(u8string.begin(), u8string.begin() + i); |
| 92 | +} |
| 93 | + |
| 94 | + |
| 95 | +} // namespace utf8 |
Index: trunk/extensions/AbuseFilter/parser_native/equiv.cpp |
— | — | @@ -0,0 +1,57 @@ |
| 2 | +#include <fstream> |
| 3 | +#include <string> |
| 4 | + |
| 5 | +#include <boost/lexical_cast.hpp> |
| 6 | + |
| 7 | +#include "equiv.h" |
| 8 | +#include "aftypes.h" |
| 9 | + |
| 10 | +#define EQUIVSET_LOC "equivset.txt" |
| 11 | + |
| 12 | +namespace afp { |
| 13 | + |
| 14 | +equiv_set::equiv_set() |
| 15 | +{ |
| 16 | + // Map of codepoint:codepoint |
| 17 | + |
| 18 | + std::ifstream eqsFile(EQUIVSET_LOC); |
| 19 | + |
| 20 | + if (!eqsFile) |
| 21 | + throw exception( "Unable to open equivalence sets!" ); |
| 22 | + |
| 23 | + std::string line; |
| 24 | + |
| 25 | + while (getline(eqsFile, line)) { |
| 26 | + size_t pos = line.find(':'); |
| 27 | + |
| 28 | + if (pos != line.npos) try { |
| 29 | + // We have a codepoint:codepoint thing. |
| 30 | + int actual = boost::lexical_cast<int>(line.substr(0, pos)); |
| 31 | + int canonical = boost::lexical_cast<int>(line.substr(pos + 1)); |
| 32 | + |
| 33 | + if (actual != 0 && canonical != 0) |
| 34 | + equivs_[actual] = canonical; |
| 35 | + } catch (boost::bad_lexical_cast &) {} |
| 36 | + } |
| 37 | +} |
| 38 | + |
| 39 | +int |
| 40 | +equiv_set::get(int c) const |
| 41 | +{ |
| 42 | + std::map<int, int>::const_iterator it; |
| 43 | + |
| 44 | + if ((it = equivs_.find(c)) == equivs_.end()) |
| 45 | + return c; |
| 46 | + |
| 47 | + return it->second; |
| 48 | +} |
| 49 | + |
| 50 | +equiv_set const & |
| 51 | +equiv_set::instance() |
| 52 | +{ |
| 53 | + static equiv_set inst; |
| 54 | + return inst; |
| 55 | +} |
| 56 | + |
| 57 | +} // namespace afp |
| 58 | + |
Index: trunk/extensions/AbuseFilter/parser_native/utf8.h |
— | — | @@ -0,0 +1,109 @@ |
| 2 | +#ifndef UTF8_H |
| 3 | +#define UTF8_H |
| 4 | + |
| 5 | +#include <string> |
| 6 | +#include <cstddef> |
| 7 | + |
| 8 | +namespace utf8 { |
| 9 | + |
| 10 | +int next_utf8_char(std::string::const_iterator &p, std::string::const_iterator &charStart, std::string::const_iterator end); |
| 11 | +std::string codepoint_to_utf8(int codepoint); |
| 12 | +std::size_t utf8_strlen(std::string const &s); |
| 13 | +std::string utf8_tolower(std::string const &s); |
| 14 | + |
// Weak UTF-8 decoder
// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
// Stolen from wikidiff2 extension by Tim Starling (no point in reinventing the wheel)
//
// Usage: construct one iterator from a [begin, end) byte range and compare it
// against a default-constructed iterator, which acts as the end marker:
//
//   utf8_iterator<std::string::const_iterator> it(s.begin(), s.end()), end;
//   for (; it != end; ++it) use(*it);
//
// Dereferencing yields the current code point as an int. An end iterator
// dereferences to 0.
template<typename InputIterator>
struct utf8_iterator {
	// Decodes the first character immediately, so *it is valid right after
	// construction (or the iterator is already at the end for an empty range).
	utf8_iterator(InputIterator begin, InputIterator end)
		: curval(0)
		, cur_(begin)
		, end_(end)
		, atend_(false)
	{
		advance();
	}

	// End marker. Every member is initialised so that copying or
	// dereferencing the marker never reads an indeterminate value.
	utf8_iterator()
		: curval(0)
		, cur_()
		, end_()
		, atend_(true)
	{
	}

	// Code point decoded by the most recent advance().
	int operator* (void) const {
		return curval;
	}

	// If either side is at the end, the iterators are equal only when both
	// are; otherwise they are compared by position in the underlying range.
	bool operator==(utf8_iterator<InputIterator> const &other) const {
		if (atend_ || other.atend_)
			return atend_ == other.atend_;

		return cur_ == other.cur_;
	}

	utf8_iterator<InputIterator> &operator++(void) {
		advance();
		return *this;
	}

private:
	int curval;			// current code point
	InputIterator cur_, end_;	// cur_ points just past the bytes of curval
	bool atend_;			// set once advance() finds nothing left to read

	void advance();
};

// Decode the next character into curval, or mark the iterator as finished
// when the underlying range is exhausted.
template<typename InputIterator>
void
utf8_iterator<InputIterator>::advance()
{
	int c=0;
	unsigned char byte;
	int bytes = 0;		// continuation bytes still expected

	if (cur_ == end_) {
		atend_ = true;
		curval = 0;
		return;
	}

	do {
		byte = (unsigned char)*cur_;
		if (byte < 0x80) {
			c = byte;
			bytes = 0;
		} else if (byte >= 0xc0) {
			// Start of UTF-8 character
			// If this is unexpected, due to an overshort sequence, we ignore the invalid
			// sequence and resynchronise here
			if (byte < 0xe0) {
				bytes = 1;
				c = byte & 0x1f;
			} else if (byte < 0xf0) {
				bytes = 2;
				c = byte & 0x0f;
			} else {
				bytes = 3;
				c = byte & 7;
			}
		} else if (bytes) {
			c <<= 6;
			c |= byte & 0x3f;
			--bytes;
		} else {
			// Unexpected continuation byte with no sequence in progress:
			// it is consumed and, since c is still 0, decodes as 0.
		}
		++cur_;
	} while (bytes && cur_ != end_);
	// A sequence cut short by the end of the range leaves its partial value here.
	curval = c;
}

template<typename InputIterator>
bool operator!= (utf8_iterator<InputIterator> const &a, utf8_iterator<InputIterator> const &b)
{
	return !(a == b);
}
| 107 | + |
| 108 | +} // namespace utf8 |
| 109 | + |
| 110 | +#endif /* !UTF8_H */ |