r38835 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r38834‎ | r38835 | r38836 >
Date:03:46, 8 August 2008
Author:river
Status:old
Tags:
Comment:
- add missing files from last commit
- convert next_utf8_char into an iterator
Modified paths:
  • /trunk/extensions/AbuseFilter/parser_native/affunctions.cpp (modified) (history)
  • /trunk/extensions/AbuseFilter/parser_native/equiv.cpp (added) (history)
  • /trunk/extensions/AbuseFilter/parser_native/equiv.h (added) (history)
  • /trunk/extensions/AbuseFilter/parser_native/utf8.cpp (added) (history)
  • /trunk/extensions/AbuseFilter/parser_native/utf8.h (added) (history)

Diff [purge]

Index: trunk/extensions/AbuseFilter/parser_native/equiv.h
@@ -0,0 +1,23 @@
 2+#ifndef EQUIV_H
 3+#define EQUIV_H
 4+
 5+#include <map>
 6+
 7+#include <boost/noncopyable.hpp>
 8+
 9+namespace afp {
 10+
 11+struct equiv_set : boost::noncopyable {
 12+ static equiv_set const &instance();
 13+
 14+ int get(int) const;
 15+
 16+private:
 17+ equiv_set();
 18+
 19+ std::map<int, int> equivs_;
 20+};
 21+
 22+} // namespace afp
 23+
 24+#endif /* !EQUIV_H */
Index: trunk/extensions/AbuseFilter/parser_native/affunctions.cpp
@@ -45,22 +45,20 @@
4646
4747 datum
4848 af_norm(std::vector<datum> const &args) {
49 - if (!args.size()) {
 49+ if (!args.size())
5050 throw exception( "Not enough arguments to norm" );
51 - }
5251
5352 std::string orig = args[0].toString();
5453
55 - std::string::const_iterator p, charStart, end;
56 - int chr = 0, lastchr = 0;
 54+ int lastchr = 0;
5755 equiv_set const &equivs = equiv_set::instance();
5856 std::string result;
5957
60 - p = orig.begin();
61 - end = orig.end();
62 -
63 - while (chr = utf8::next_utf8_char( p, charStart, end )) {
64 - chr = equivs.get(chr);
 58+ utf8::utf8_iterator<std::string::const_iterator>
 59+ it(orig.begin(), orig.end()), end;
 60+
 61+ for (; it != end; ++it) {
 62+ int chr = equivs.get(*it);
6563
6664 if (chr != lastchr && isalnum(chr))
6765 result.append(utf8::codepoint_to_utf8(chr));
@@ -73,18 +71,15 @@
7472
7573 std::string
7674 rmdoubles(std::string const &orig) {
77 - std::string::const_iterator p, charStart, end;
78 - int chr,lastchr = 0;
 75+ int lastchr = 0;
7976 std::string result;
8077
81 - p = orig.begin();
82 - end = orig.end();
83 - while (chr = utf8::next_utf8_char( p, charStart, end )) {
84 - if (chr != lastchr) {
85 - result.append(utf8::codepoint_to_utf8(chr));
86 - }
 78+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
 79+ for (; it != end; ++it) {
 80+ if (*it != lastchr)
 81+ result.append(utf8::codepoint_to_utf8(*it));
8782
88 - lastchr = chr;
 83+ lastchr = *it;
8984 }
9085
9186 return result;
@@ -92,24 +87,21 @@
9388
9489 datum
9590 af_specialratio(std::vector<datum> const &args) {
96 - if (!args.size()) {
 91+ if (!args.size())
9792 throw exception( "Not enough arguments to specialratio" );
98 - }
9993
10094 std::string orig = args[0].toString();
101 - std::string::const_iterator p, charStart, end;
102 - int chr;
 95+ int len = 0;
10396 int specialcount = 0;
10497
105 - p = orig.begin();
106 - end = orig.end();
107 - while (chr = utf8::next_utf8_char( p, charStart, end )) {
108 - if (!isalnum(chr)) {
 98+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
 99+ for (; it != end; ++it) {
 100+ len++;
 101+ if (!isalnum(*it))
109102 specialcount++;
110 - }
111103 }
112104
113 - double ratio = (float)(specialcount) / (float)(utf8::utf8_strlen(orig));
 105+ double ratio = (float)specialcount / len;
114106
115107 return datum(ratio);
116108 }
@@ -125,16 +117,12 @@
126118
127119 std::string
128120 rmspecials(std::string const &orig) {
129 - std::string::const_iterator p, charStart, end;
130 - int chr = 0;
131121 std::string result;
132122
133 - p = orig.begin();
134 - end = orig.end();
135 - while (chr = utf8::next_utf8_char( p, charStart, end )) {
136 - if (isalnum(chr)) {
137 - result.append(utf8::codepoint_to_utf8(chr));
138 - }
 123+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
 124+ for (; it != end; ++it) {
 125+ if (isalnum(*it))
 126+ result.append(utf8::codepoint_to_utf8(*it));
139127 }
140128
141129 return result;
@@ -178,16 +166,12 @@
179167
180168 std::string
181169 confusable_character_normalise(std::string const &orig) {
182 - std::string::const_iterator p, charStart, end;
183 - int chr;
184170 equiv_set const &equivs = equiv_set::instance();
185171 std::string result;
186172
187 - p = orig.begin();
188 - end = orig.end();
189 -
190 - while (chr = utf8::next_utf8_char( p, charStart, end )) {
191 - chr = equivs.get(chr);
 173+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
 174+ for (; it != end; ++it) {
 175+ int chr = equivs.get(*it);
192176 result.append(utf8::codepoint_to_utf8(chr));
193177 }
194178
Index: trunk/extensions/AbuseFilter/parser_native/utf8.cpp
@@ -0,0 +1,94 @@
 2+#include "utf8.h"
 3+
 4+#include <unicode/utf8.h>
 5+#include <unicode/ustring.h>
 6+
 7+#include "aftypes.h"
 8+
 9+namespace utf8 {
 10+
 11+// Ported from MediaWiki core function in PHP.
 12+std::string
 13+codepoint_to_utf8(int codepoint) {
 14+ std::string ret;
 15+
 16+ if(codepoint < 0x80) {
 17+ ret.append(1, codepoint);
 18+ return ret;
 19+ }
 20+
 21+ if(codepoint < 0x800) {
 22+ ret.append(1, codepoint >> 6 & 0x3f | 0xc0);
 23+ ret.append(1, codepoint & 0x3f | 0x80);
 24+ return ret;
 25+ }
 26+
 27+ if(codepoint < 0x10000) {
 28+ ret.append(1, codepoint >> 12 & 0x0f | 0xe0);
 29+ ret.append(1, codepoint >> 6 & 0x3f | 0x80);
 30+ ret.append(1, codepoint & 0x3f | 0x80);
 31+ return ret;
 32+ }
 33+
 34+ if(codepoint < 0x110000) {
 35+ ret.append(1, codepoint >> 18 & 0x07 | 0xf0);
 36+ ret.append(1, codepoint >> 12 & 0x3f | 0x80);
 37+ ret.append(1, codepoint >> 6 & 0x3f | 0x80);
 38+ ret.append(1, codepoint & 0x3f | 0x80);
 39+ return ret;
 40+ }
 41+
 42+ throw afp::exception("Asked for code outside of range ($codepoint)\n");
 43+}
 44+
 45+std::size_t
 46+utf8_strlen(std::string const &s)
 47+{
 48+std::size_t ret = 0;
 49+ for (std::string::const_iterator it = s.begin(), end = s.end();
 50+ it < end; ++it)
 51+ {
 52+ int skip = 1;
 53+
 54+ skip = U8_LENGTH(*it);
 55+ if (it + skip >= end)
 56+ return ret; /* end of string */
 57+
 58+ it += skip;
 59+ }
 60+
 61+ return ret;
 62+}
 63+
 64+/*
 65+ * This could almost certainly be done in a nicer way.
 66+ */
 67+std::string
 68+utf8_tolower(std::string const &s)
 69+{
 70+ std::vector<UChar> ustring;
 71+ UErrorCode error = U_ZERO_ERROR;
 72+
 73+ for (int i = 0, end = s.size(); i < end; ) {
 74+ UChar32 c;
 75+ U8_NEXT(s.data(), i, end, c);
 76+ ustring.push_back(c);
 77+ }
 78+
 79+ std::vector<UChar> dest;
 80+ u_strToLower(&dest[0], dest.size(), &ustring[0], ustring.size(),
 81+ NULL, &error);
 82+
 83+ if (U_FAILURE(error))
 84+ return s;
 85+
 86+ std::vector<unsigned char> u8string;
 87+ int i, j, end;
 88+ for (i = 0, j = 0, end = dest.size(); i < end; j++) {
 89+ U8_APPEND_UNSAFE(&u8string[0], i, dest[j]);
 90+ }
 91+ return std::string(u8string.begin(), u8string.begin() + i);
 92+}
 93+
 94+
 95+} // namespace utf8
Index: trunk/extensions/AbuseFilter/parser_native/equiv.cpp
@@ -0,0 +1,57 @@
 2+#include <fstream>
 3+#include <string>
 4+
 5+#include <boost/lexical_cast.hpp>
 6+
 7+#include "equiv.h"
 8+#include "aftypes.h"
 9+
 10+#define EQUIVSET_LOC "equivset.txt"
 11+
 12+namespace afp {
 13+
 14+equiv_set::equiv_set()
 15+{
 16+ // Map of codepoint:codepoint
 17+
 18+ std::ifstream eqsFile(EQUIVSET_LOC);
 19+
 20+ if (!eqsFile)
 21+ throw exception( "Unable to open equivalence sets!" );
 22+
 23+ std::string line;
 24+
 25+ while (getline(eqsFile, line)) {
 26+ size_t pos = line.find(':');
 27+
 28+ if (pos != line.npos) try {
 29+ // We have a codepoint:codepoint thing.
 30+ int actual = boost::lexical_cast<int>(line.substr(0, pos));
 31+ int canonical = boost::lexical_cast<int>(line.substr(pos + 1));
 32+
 33+ if (actual != 0 && canonical != 0)
 34+ equivs_[actual] = canonical;
 35+ } catch (boost::bad_lexical_cast &) {}
 36+ }
 37+}
 38+
 39+int
 40+equiv_set::get(int c) const
 41+{
 42+ std::map<int, int>::const_iterator it;
 43+
 44+ if ((it = equivs_.find(c)) == equivs_.end())
 45+ return c;
 46+
 47+ return it->second;
 48+}
 49+
 50+equiv_set const &
 51+equiv_set::instance()
 52+{
 53+ static equiv_set inst;
 54+ return inst;
 55+}
 56+
 57+} // namespace afp
 58+
Index: trunk/extensions/AbuseFilter/parser_native/utf8.h
@@ -0,0 +1,109 @@
 2+#ifndef UTF8_H
 3+#define UTF8_H
 4+
 5+#include <string>
 6+#include <cstddef>
 7+
 8+namespace utf8 {
 9+
 10+int next_utf8_char(std::string::const_iterator &p, std::string::const_iterator &charStart, std::string::const_iterator end);
 11+std::string codepoint_to_utf8(int codepoint);
 12+std::size_t utf8_strlen(std::string const &s);
 13+std::string utf8_tolower(std::string const &s);
 14+
 15+// Weak UTF-8 decoder
 16+// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
 17+// Stolen from wikidiff2 extension by Tim Starling (no point in reinventing the wheel)
 18+template<typename InputIterator>
 19+struct utf8_iterator {
 20+ utf8_iterator(InputIterator begin, InputIterator end)
 21+ : cur_(begin)
 22+ , end_(end)
 23+ , atend_(false)
 24+ {
 25+ advance();
 26+ }
 27+
 28+ utf8_iterator()
 29+ : atend_(true)
 30+ {
 31+ }
 32+
 33+ int operator* (void) const {
 34+ return curval;
 35+ }
 36+
 37+ bool operator==(utf8_iterator<InputIterator> const &other) const {
 38+ if (atend_ || other.atend_)
 39+ return atend_ == other.atend_;
 40+
 41+ return cur_ == other.cur_;
 42+ }
 43+
 44+ utf8_iterator<InputIterator> &operator++(void) {
 45+ advance();
 46+ return *this;
 47+ }
 48+
 49+private:
 50+ int curval;
 51+ InputIterator cur_, end_;
 52+ bool atend_;
 53+
 54+ void advance();
 55+};
 56+
 57+template<typename InputIterator>
 58+void
 59+utf8_iterator<InputIterator>::advance()
 60+{
 61+ int c=0;
 62+ unsigned char byte;
 63+ int bytes = 0;
 64+
 65+ if (cur_ == end_) {
 66+ atend_ = true;
 67+ curval = 0;
 68+ return;
 69+ }
 70+
 71+ do {
 72+ byte = (unsigned char)*cur_;
 73+ if (byte < 0x80) {
 74+ c = byte;
 75+ bytes = 0;
 76+ } else if (byte >= 0xc0) {
 77+ // Start of UTF-8 character
 78+ // If this is unexpected, due to an overshort sequence, we ignore the invalid
 79+ // sequence and resynchronise here
 80+ if (byte < 0xe0) {
 81+ bytes = 1;
 82+ c = byte & 0x1f;
 83+ } else if (byte < 0xf0) {
 84+ bytes = 2;
 85+ c = byte & 0x0f;
 86+ } else {
 87+ bytes = 3;
 88+ c = byte & 7;
 89+ }
 90+ } else if (bytes) {
 91+ c <<= 6;
 92+ c |= byte & 0x3f;
 93+ --bytes;
 94+ } else {
 95+ // Unexpected continuation, ignore
 96+ }
 97+ ++cur_;
 98+ } while (bytes && cur_ != end_);
 99+ curval = c;
 100+}
 101+
 102+template<typename InputIterator>
 103+bool operator!= (utf8_iterator<InputIterator> const &a, utf8_iterator<InputIterator> const &b)
 104+{
 105+ return !(a == b);
 106+}
 107+
 108+} // namespace utf8
 109+
 110+#endif /* !UTF8_H */

Status & tagging log