r38835 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r38834‎ \| r38835 \| r38836 >
Date:	03:46, 8 August 2008
Author:	river
Status:	old
Tags:
Comment:	- add missing files from last commit - convert next_utf8_char into an iterator
Modified paths:	/trunk/extensions/AbuseFilter/parser_native/affunctions.cpp (modified) (history) /trunk/extensions/AbuseFilter/parser_native/equiv.cpp (added) (history) /trunk/extensions/AbuseFilter/parser_native/equiv.h (added) (history) /trunk/extensions/AbuseFilter/parser_native/utf8.cpp (added) (history) /trunk/extensions/AbuseFilter/parser_native/utf8.h (added) (history)

Diff [purge]

Index: trunk/extensions/AbuseFilter/parser_native/equiv.h
—	—	@@ -0,0 +1,23 @@
	2	+#ifndef EQUIV_H
	3	+#define EQUIV_H
	4	+
	5	+#include <map>
	6	+
	7	+#include <boost/noncopyable.hpp>
	8	+
	9	+namespace afp {
	10	+
	11	+struct equiv_set : boost::noncopyable {
	12	+ static equiv_set const &instance();
	13	+
	14	+ int get(int) const;
	15	+
	16	+private:
	17	+ equiv_set();
	18	+
	19	+ std::map<int, int> equivs_;
	20	+};
	21	+
	22	+} // namespace afp
	23	+
	24	+#endif /* !EQUIV_H */
Index: trunk/extensions/AbuseFilter/parser_native/affunctions.cpp
—	—	@@ -45,22 +45,20 @@
46	46
47	47	datum
48	48	af_norm(std::vector<datum> const &args) {
49		~~- if (!args.size()) {~~
	49	+ if (!args.size())
50	50	throw exception( "Not enough arguments to norm" );
51		~~- }~~
52	51
53	52	std::string orig = args[0].toString();
54	53
55		~~- std::string::const_iterator p, charStart, end;~~
56		~~- int chr = 0, lastchr = 0;~~
	54	+ int lastchr = 0;
57	55	equiv_set const &equivs = equiv_set::instance();
58	56	std::string result;
59	57
60		~~- p = orig.begin();~~
61		~~- end = orig.end();~~
62		-
63		~~- while (chr = utf8::next_utf8_char( p, charStart, end )) {~~
64		~~- chr = equivs.get(chr);~~
	58	+ utf8::utf8_iterator<std::string::const_iterator>
	59	+ it(orig.begin(), orig.end()), end;
	60	+
	61	+ for (; it != end; ++it) {
	62	+ int chr = equivs.get(*it);
65	63
66	64	if (chr != lastchr && isalnum(chr))
67	65	result.append(utf8::codepoint_to_utf8(chr));
—	—	@@ -73,18 +71,15 @@
74	72
75	73	std::string
76	74	rmdoubles(std::string const &orig) {
77		~~- std::string::const_iterator p, charStart, end;~~
78		~~- int chr,lastchr = 0;~~
	75	+ int lastchr = 0;
79	76	std::string result;
80	77
81		~~- p = orig.begin();~~
82		~~- end = orig.end();~~
83		~~- while (chr = utf8::next_utf8_char( p, charStart, end )) {~~
84		~~- if (chr != lastchr) {~~
85		~~- result.append(utf8::codepoint_to_utf8(chr));~~
86		~~- }~~
	78	+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
	79	+ for (; it != end; ++it) {
	80	+ if (*it != lastchr)
	81	+ result.append(utf8::codepoint_to_utf8(*it));
87	82
88		~~- lastchr = chr;~~
	83	+ lastchr = *it;
89	84	}
90	85
91	86	return result;
—	—	@@ -92,24 +87,21 @@
93	88
94	89	datum
95	90	af_specialratio(std::vector<datum> const &args) {
96		~~- if (!args.size()) {~~
	91	+ if (!args.size())
97	92	throw exception( "Not enough arguments to specialratio" );
98		~~- }~~
99	93
100	94	std::string orig = args[0].toString();
101		~~- std::string::const_iterator p, charStart, end;~~
102		~~- int chr;~~
	95	+ int len = 0;
103	96	int specialcount = 0;
104	97
105		~~- p = orig.begin();~~
106		~~- end = orig.end();~~
107		~~- while (chr = utf8::next_utf8_char( p, charStart, end )) {~~
108		~~- if (!isalnum(chr)) {~~
	98	+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
	99	+ for (; it != end; ++it) {
	100	+ len++;
	101	+ if (!isalnum(*it))
109	102	specialcount++;
110		~~- }~~
111	103	}
112	104
113		~~- double ratio = (float)(specialcount) / (float)(utf8::utf8_strlen(orig));~~
	105	+ double ratio = (float)specialcount / len;
114	106
115	107	return datum(ratio);
116	108	}
—	—	@@ -125,16 +117,12 @@
126	118
127	119	std::string
128	120	rmspecials(std::string const &orig) {
129		~~- std::string::const_iterator p, charStart, end;~~
130		~~- int chr = 0;~~
131	121	std::string result;
132	122
133		~~- p = orig.begin();~~
134		~~- end = orig.end();~~
135		~~- while (chr = utf8::next_utf8_char( p, charStart, end )) {~~
136		~~- if (isalnum(chr)) {~~
137		~~- result.append(utf8::codepoint_to_utf8(chr));~~
138		~~- }~~
	123	+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
	124	+ for (; it != end; ++it) {
	125	+ if (isalnum(*it))
	126	+ result.append(utf8::codepoint_to_utf8(*it));
139	127	}
140	128
141	129	return result;
—	—	@@ -178,16 +166,12 @@
179	167
180	168	std::string
181	169	confusable_character_normalise(std::string const &orig) {
182		~~- std::string::const_iterator p, charStart, end;~~
183		~~- int chr;~~
184	170	equiv_set const &equivs = equiv_set::instance();
185	171	std::string result;
186	172
187		~~- p = orig.begin();~~
188		~~- end = orig.end();~~
189		-
190		~~- while (chr = utf8::next_utf8_char( p, charStart, end )) {~~
191		~~- chr = equivs.get(chr);~~
	173	+ utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
	174	+ for (; it != end; ++it) {
	175	+ int chr = equivs.get(*it);
192	176	result.append(utf8::codepoint_to_utf8(chr));
193	177	}
194	178
Index: trunk/extensions/AbuseFilter/parser_native/utf8.cpp
—	—	@@ -0,0 +1,94 @@
	2	+#include "utf8.h"
	3	+
	4	+#include <unicode/utf8.h>
	5	+#include <unicode/ustring.h>
	6	+
	7	+#include "aftypes.h"
	8	+
	9	+namespace utf8 {
	10	+
	11	+// Ported from MediaWiki core function in PHP.
	12	+std::string
	13	+codepoint_to_utf8(int codepoint) {
	14	+ std::string ret;
	15	+
	16	+ if(codepoint < 0x80) {
	17	+ ret.append(1, codepoint);
	18	+ return ret;
	19	+ }
	20	+
	21	+ if(codepoint < 0x800) {
	22	+ ret.append(1, codepoint >> 6 & 0x3f \| 0xc0);
	23	+ ret.append(1, codepoint & 0x3f \| 0x80);
	24	+ return ret;
	25	+ }
	26	+
	27	+ if(codepoint < 0x10000) {
	28	+ ret.append(1, codepoint >> 12 & 0x0f \| 0xe0);
	29	+ ret.append(1, codepoint >> 6 & 0x3f \| 0x80);
	30	+ ret.append(1, codepoint & 0x3f \| 0x80);
	31	+ return ret;
	32	+ }
	33	+
	34	+ if(codepoint < 0x110000) {
	35	+ ret.append(1, codepoint >> 18 & 0x07 \| 0xf0);
	36	+ ret.append(1, codepoint >> 12 & 0x3f \| 0x80);
	37	+ ret.append(1, codepoint >> 6 & 0x3f \| 0x80);
	38	+ ret.append(1, codepoint & 0x3f \| 0x80);
	39	+ return ret;
	40	+ }
	41	+
	42	+ throw afp::exception("Asked for code outside of range ($codepoint)\n");
	43	+}
	44	+
	// Count the characters (code points) in a UTF-8 encoded string.
	//
	// Every byte that is not a continuation byte (bit pattern 10xxxxxx)
	// starts a new character, so the length is the number of such bytes.
	// No validation is performed: for malformed input the result is simply
	// the number of non-continuation bytes.
	//
	//   s  UTF-8 encoded text; may be empty
	//
	// Returns the number of characters; 0 for an empty string.
	std::size_t
	utf8_strlen(std::string const &s)
	{
		std::size_t ret = 0;

		for (std::string::const_iterator it = s.begin(), end = s.end();
		     it != end; ++it)
		{
			unsigned char const byte = static_cast<unsigned char>(*it);

			if ((byte & 0xc0) != 0x80)
				++ret;
		}

		return ret;
	}
	63	+
	64	+/*
	65	+ * This could almost certainly be done in a nicer way.
	66	+ */
	67	+std::string
	68	+utf8_tolower(std::string const &s)
	69	+{
	70	+ std::vector<UChar> ustring;
	71	+ UErrorCode error = U_ZERO_ERROR;
	72	+
	73	+ for (int i = 0, end = s.size(); i < end; ) {
	74	+ UChar32 c;
	75	+ U8_NEXT(s.data(), i, end, c);
	76	+ ustring.push_back(c);
	77	+ }
	78	+
	79	+ std::vector<UChar> dest;
	80	+ u_strToLower(&dest[0], dest.size(), &ustring[0], ustring.size(),
	81	+ NULL, &error);
	82	+
	83	+ if (U_FAILURE(error))
	84	+ return s;
	85	+
	86	+ std::vector<unsigned char> u8string;
	87	+ int i, j, end;
	88	+ for (i = 0, j = 0, end = dest.size(); i < end; j++) {
	89	+ U8_APPEND_UNSAFE(&u8string[0], i, dest[j]);
	90	+ }
	91	+ return std::string(u8string.begin(), u8string.begin() + i);
	92	+}
	93	+
	94	+
	95	+} // namespace utf8
Index: trunk/extensions/AbuseFilter/parser_native/equiv.cpp
—	—	@@ -0,0 +1,57 @@
	2	+#include <fstream>
	3	+#include <string>
	4	+
	5	+#include <boost/lexical_cast.hpp>
	6	+
	7	+#include "equiv.h"
	8	+#include "aftypes.h"
	9	+
	10	+#define EQUIVSET_LOC "equivset.txt"
	11	+
	12	+namespace afp {
	13	+
	14	+equiv_set::equiv_set()
	15	+{
	16	+ // Map of codepoint:codepoint
	17	+
	18	+ std::ifstream eqsFile(EQUIVSET_LOC);
	19	+
	20	+ if (!eqsFile)
	21	+ throw exception( "Unable to open equivalence sets!" );
	22	+
	23	+ std::string line;
	24	+
	25	+ while (getline(eqsFile, line)) {
	26	+ size_t pos = line.find(':');
	27	+
	28	+ if (pos != line.npos) try {
	29	+ // We have a codepoint:codepoint thing.
	30	+ int actual = boost::lexical_cast<int>(line.substr(0, pos));
	31	+ int canonical = boost::lexical_cast<int>(line.substr(pos + 1));
	32	+
	33	+ if (actual != 0 && canonical != 0)
	34	+ equivs_[actual] = canonical;
	35	+ } catch (boost::bad_lexical_cast &) {}
	36	+ }
	37	+}
	38	+
	39	+int
	40	+equiv_set::get(int c) const
	41	+{
	42	+ std::map<int, int>::const_iterator it;
	43	+
	44	+ if ((it = equivs_.find(c)) == equivs_.end())
	45	+ return c;
	46	+
	47	+ return it->second;
	48	+}
	49	+
	50	+equiv_set const &
	51	+equiv_set::instance()
	52	+{
	53	+ static equiv_set inst;
	54	+ return inst;
	55	+}
	56	+
	57	+} // namespace afp
	58	+
Index: trunk/extensions/AbuseFilter/parser_native/utf8.h
—	—	@@ -0,0 +1,109 @@
	2	+#ifndef UTF8_H
	3	+#define UTF8_H
	4	+
	5	+#include <string>
	6	+#include <cstddef>
	7	+
	8	+namespace utf8 {
	9	+
	10	+int next_utf8_char(std::string::const_iterator &p, std::string::const_iterator &charStart, std::string::const_iterator end);
	11	+std::string codepoint_to_utf8(int codepoint);
	12	+std::size_t utf8_strlen(std::string const &s);
	13	+std::string utf8_tolower(std::string const &s);
	14	+
	// Weak UTF-8 decoder
	// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
	// Stolen from wikidiff2 extension by Tim Starling (no point in reinventing the wheel)
	//
	// Read-only, forward-only iterator that walks a byte range and yields one
	// decoded code point (as int) per step.
	//
	//   utf8_iterator<It> it(first, last), end;
	//   for (; it != end; ++it) use(*it);
	//
	// A default-constructed iterator is the end marker; it compares equal to
	// any iterator that has run off the end of its range, and dereferences
	// to 0.
	template<typename InputIterator>
	struct utf8_iterator {
		// Start decoding [begin, end); the first character is decoded
		// immediately so that operator* is valid right away.
		utf8_iterator(InputIterator begin, InputIterator end)
			: curval(0)
			, cur_(begin)
			, end_(end)
			, atend_(false)
		{
			advance();
		}

		// End marker.  All members are initialised so that copying or
		// dereferencing the marker never reads an indeterminate value.
		utf8_iterator()
			: curval(0)
			, cur_()
			, end_()
			, atend_(true)
		{
		}

		// Code point decoded by the most recent advance; 0 at the end.
		int operator* (void) const {
			return curval;
		}

		// Two iterators are equal when both are at the end, or when neither
		// is and both have consumed input up to the same position.
		bool operator==(utf8_iterator<InputIterator> const &other) const {
			if (atend_ || other.atend_)
				return atend_ == other.atend_;

			return cur_ == other.cur_;
		}

		// Decode the next character (pre-increment only).
		utf8_iterator<InputIterator> &operator++(void) {
			advance();
			return *this;
		}

	private:
		int curval;			// current code point
		InputIterator cur_, end_;	// next unread byte / end of input
		bool atend_;			// true once input is exhausted

		void advance();
	};

	// Decode one character starting at cur_ into curval and leave cur_ just
	// past it.  When no input is left, mark the iterator as at-end instead.
	template<typename InputIterator>
	void
	utf8_iterator<InputIterator>::advance()
	{
		int c = 0;
		unsigned char byte;
		int bytes = 0;	// continuation bytes still expected

		if (cur_ == end_) {
			atend_ = true;
			curval = 0;
			return;
		}

		do {
			byte = static_cast<unsigned char>(*cur_);
			if (byte < 0x80) {
				// ASCII; also abandons any unfinished sequence
				c = byte;
				bytes = 0;
			} else if (byte >= 0xc0) {
				// Start of UTF-8 character
				// If this is unexpected, due to an overshort sequence, we ignore the invalid
				// sequence and resynchronise here
				if (byte < 0xe0) {
					bytes = 1;
					c = byte & 0x1f;
				} else if (byte < 0xf0) {
					bytes = 2;
					c = byte & 0x0f;
				} else {
					bytes = 3;
					c = byte & 7;
				}
			} else if (bytes) {
				// Expected continuation byte: append its six payload bits
				c <<= 6;
				c |= byte & 0x3f;
				--bytes;
			} else {
				// Unexpected continuation, ignore
			}
			++cur_;
		} while (bytes && cur_ != end_);
		curval = c;
	}

	// Inequality in terms of utf8_iterator::operator==.
	template<typename InputIterator>
	bool operator!= (utf8_iterator<InputIterator> const &a, utf8_iterator<InputIterator> const &b)
	{
		return !(a == b);
	}
	107	+
	108	+} // namespace utf8
	109	+
	110	+#endif /* !UTF8_H */

Status & tagging log

15:29, 12 September 2011 Meno25 (talk | contribs) changed the status of r38835 [removed: ok added: old]