r64581 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r64580 | r64581 | r64582 >
Date:	22:57, 3 April 2010
Author:	siebrand
Status:	deferred
Tags:
Comment:	stylize.php, trailing whitespace removed
Modified paths:	/trunk/extensions/AntiSpoof/AntiSpoof.i18n.php (modified) (history) /trunk/extensions/AntiSpoof/AntiSpoof.php (modified) (history) /trunk/extensions/AntiSpoof/AntiSpoof_body.php (modified) (history) /trunk/extensions/AntiSpoof/SpoofUser.php (modified) (history) /trunk/extensions/AntiSpoof/batchAntiSpoof.php (modified) (history) /trunk/extensions/AntiSpoof/equivset.in (modified) (history) /trunk/extensions/AntiSpoof/generateEquivset.php (modified) (history) /trunk/extensions/AntiSpoof/sql/patch-antispoof.mysql.sql (modified) (history) /trunk/extensions/AntiSpoof/sql/patch-antispoof.postgres.sql (modified) (history)

Diff [purge]

Index: trunk/extensions/AntiSpoof/sql/patch-antispoof.mysql.sql
—	—	@@ -3,19 +3,19 @@
4	4	-- Do a join against user_name to confirm that an account hasn't
5	5	-- been renamed or deleted away.
6	6	su_name VARCHAR(255),
7		-
	7	+
8	8	-- Normalized form of name for similarity-spoofing checks
9	9	su_normalized VARCHAR(255),
10		-
	10	+
11	11	-- ok/not-ok according to the looks-like-a-valid-name check
12	12	su_legal BOOL,
13		-
	13	+
14	14	-- error message that came out of the unicode check, if any
15	15	su_error TEXT,
16	16
17	17	-- unique record per username
18	18	PRIMARY KEY (su_name),
19		-
	19	+
20	20	-- for checking matching possible spoofs
21	21	KEY(su_normalized, su_name)
22	22	) /$wgDBTableOptions/;
Index: trunk/extensions/AntiSpoof/sql/patch-antispoof.postgres.sql
—	—	@@ -14,4 +14,4 @@
15	15	su_error text
16	16	);
17	17
18		~~-CREATE INDEX su_normname_idx ON spoofuser (su_normalized,su_name);~~
\ No newline at end of file
	18	+CREATE INDEX su_normname_idx ON spoofuser (su_normalized,su_name);
Index: trunk/extensions/AntiSpoof/AntiSpoof_body.php
—	—	@@ -1,5 +1,4 @@
2	2	<?php
3		-
4	3	# AntiSpoof.php
5	4	# Username spoofing prevention for MediaWiki
6	5	# Version 0.04
—	—	@@ -30,9 +29,7 @@
31	30	# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
32	31	# USA
33	32
34		-
35	33	class AntiSpoof {
36		-
37	34	# Define script tag codes for various Unicode codepoint ranges
38	35	# If it does not have a code here, it does not have a script assignment
39	36	# NB: Braille is not in this list since it is a transliteration system, not a script;
—	—	@@ -43,93 +40,93 @@
44	41	# are commented out: these are either not in modern use, or only used for specialized
45	42	# religious purposes, or only of literary interest
46	43	private static $script_ranges = array(
47		~~- array( 0x0020, 0x002F, "SCRIPT_ASCII_PUNCTUATION" ), # ASCII Punctuation 1, Hyphen, ASCII Punctuation 2~~
48		~~- array( 0x0030, 0x0039, "SCRIPT_ASCII_DIGITS" ), # ASCII Digits~~
49		~~- array( 0x003A, 0x0040, "SCRIPT_ASCII_PUNCTUATION" ), # Colon, ASCII Punctuation 3~~
50		~~- array( 0x0041, 0x005A, "SCRIPT_LATIN" ), # ASCII Uppercase~~
51		~~- array( 0x005B, 0x0060, "SCRIPT_ASCII_PUNCTUATION" ), # ASCII Punctuation 4, Underscore, ASCII Punctuation 5~~
52		~~- array( 0x0061, 0x007A, "SCRIPT_LATIN" ), # ASCII Lowercase~~
53		~~- array( 0x007B, 0x007E, "SCRIPT_ASCII_PUNCTUATION" ), # ASCII Punctuation 5~~
54		~~- array( 0x00B7, 0x00B7, "SCRIPT_LATIN" ), # Middle Dot~~
55		~~- array( 0x00C0, 0x00D6, "SCRIPT_LATIN" ), # Latin-1 Letters 1~~
56		~~- array( 0x00D8, 0x00F6, "SCRIPT_LATIN" ), # Latin-1 Letters 2~~
57		~~- array( 0x00F8, 0x02AF, "SCRIPT_LATIN" ), # Latin-1 Letters 3, Latin Extended-A, Latin Extended-B, IPA Extensions~~
58		~~- array( 0x0300, 0x036F, "SCRIPT_COMBINING_MARKS" ), # Combining Diacritical Marks~~
59		~~- array( 0x0370, 0x03E1, "SCRIPT_GREEK" ), # Greek and Coptic (Greek)~~
60		~~- array( 0x03E2, 0x03EF, "SCRIPT_COPTIC_EXTRAS" ), # Greek and Coptic (Coptic-unique)~~
61		~~- array( 0x03F0, 0x03FF, "SCRIPT_GREEK" ), # Greek and Coptic (Greek)~~
62		~~- array( 0x0400, 0x052F, "SCRIPT_CYRILLIC" ), # Cyrillic, Cyrillic Supplement~~
63		~~- array( 0x0530, 0x058F, "SCRIPT_ARMENIAN" ), # Armenian~~
64		~~- array( 0x0590, 0x05FF, "SCRIPT_HEBREW" ), # Hebrew~~
65		~~- array( 0x0600, 0x06FF, "SCRIPT_ARABIC" ), # Arabic~~
66		~~- array( 0x0700, 0x074F, "SCRIPT_SYRIAC" ), # Syriac~~
67		~~- array( 0x0750, 0x077F, "SCRIPT_ARABIC" ), # Arabic Supplement~~
68		~~- array( 0x0780, 0x07BF, "SCRIPT_THAANA" ), # Thaana~~
69		~~- array( 0x0900, 0x097F, "SCRIPT_DEVANAGARI" ), # Devanagari~~
70		~~- array( 0x0980, 0x09FF, "SCRIPT_BENGALI" ), # Bengali~~
71		~~- array( 0x0A00, 0x0A7F, "SCRIPT_GURMUKHI" ), # Gurmukhi~~
72		~~- array( 0x0A80, 0x0AFF, "SCRIPT_GUJARATI" ), # Gujarati~~
73		~~- array( 0x0B00, 0x0B7F, "SCRIPT_ORIYA" ), # Oriya~~
74		~~- array( 0x0B80, 0x0BFF, "SCRIPT_TAMIL" ), # Tamil~~
75		~~- array( 0x0C00, 0x0C7F, "SCRIPT_TELUGU" ), # Telugu~~
76		~~- array( 0x0C80, 0x0CFF, "SCRIPT_KANNADA" ), # Kannada~~
77		~~- array( 0x0D00, 0x0D7F, "SCRIPT_MALAYALAM" ), # Malayalam~~
78		~~- array( 0x0D80, 0x0DFF, "SCRIPT_SINHALA" ), # Sinhala~~
79		~~- array( 0x0E00, 0x0E7F, "SCRIPT_THAI" ), # Thai~~
80		~~- array( 0x0E80, 0x0EFF, "SCRIPT_LAO" ), # Lao~~
81		~~- array( 0x0F00, 0x0FFF, "SCRIPT_TIBETAN" ), # Tibetan~~
82		~~- array( 0x1000, 0x109F, "SCRIPT_MYANMAR" ), # Myanmar~~
83		~~- array( 0x10A0, 0x10FF, "SCRIPT_GEORGIAN" ), # Georgian~~
	44	+ array( 0x0020, 0x002F, "SCRIPT_ASCII_PUNCTUATION" ), # ASCII Punctuation 1, Hyphen, ASCII Punctuation 2
	45	+ array( 0x0030, 0x0039, "SCRIPT_ASCII_DIGITS" ), # ASCII Digits
	46	+ array( 0x003A, 0x0040, "SCRIPT_ASCII_PUNCTUATION" ), # Colon, ASCII Punctuation 3
	47	+ array( 0x0041, 0x005A, "SCRIPT_LATIN" ), # ASCII Uppercase
	48	+ array( 0x005B, 0x0060, "SCRIPT_ASCII_PUNCTUATION" ), # ASCII Punctuation 4, Underscore, ASCII Punctuation 5
	49	+ array( 0x0061, 0x007A, "SCRIPT_LATIN" ), # ASCII Lowercase
	50	+ array( 0x007B, 0x007E, "SCRIPT_ASCII_PUNCTUATION" ), # ASCII Punctuation 5
	51	+ array( 0x00B7, 0x00B7, "SCRIPT_LATIN" ), # Middle Dot
	52	+ array( 0x00C0, 0x00D6, "SCRIPT_LATIN" ), # Latin-1 Letters 1
	53	+ array( 0x00D8, 0x00F6, "SCRIPT_LATIN" ), # Latin-1 Letters 2
	54	+ array( 0x00F8, 0x02AF, "SCRIPT_LATIN" ), # Latin-1 Letters 3, Latin Extended-A, Latin Extended-B, IPA Extensions
	55	+ array( 0x0300, 0x036F, "SCRIPT_COMBINING_MARKS" ), # Combining Diacritical Marks
	56	+ array( 0x0370, 0x03E1, "SCRIPT_GREEK" ), # Greek and Coptic (Greek)
	57	+ array( 0x03E2, 0x03EF, "SCRIPT_COPTIC_EXTRAS" ), # Greek and Coptic (Coptic-unique)
	58	+ array( 0x03F0, 0x03FF, "SCRIPT_GREEK" ), # Greek and Coptic (Greek)
	59	+ array( 0x0400, 0x052F, "SCRIPT_CYRILLIC" ), # Cyrillic, Cyrillic Supplement
	60	+ array( 0x0530, 0x058F, "SCRIPT_ARMENIAN" ), # Armenian
	61	+ array( 0x0590, 0x05FF, "SCRIPT_HEBREW" ), # Hebrew
	62	+ array( 0x0600, 0x06FF, "SCRIPT_ARABIC" ), # Arabic
	63	+ array( 0x0700, 0x074F, "SCRIPT_SYRIAC" ), # Syriac
	64	+ array( 0x0750, 0x077F, "SCRIPT_ARABIC" ), # Arabic Supplement
	65	+ array( 0x0780, 0x07BF, "SCRIPT_THAANA" ), # Thaana
	66	+ array( 0x0900, 0x097F, "SCRIPT_DEVANAGARI" ), # Devanagari
	67	+ array( 0x0980, 0x09FF, "SCRIPT_BENGALI" ), # Bengali
	68	+ array( 0x0A00, 0x0A7F, "SCRIPT_GURMUKHI" ), # Gurmukhi
	69	+ array( 0x0A80, 0x0AFF, "SCRIPT_GUJARATI" ), # Gujarati
	70	+ array( 0x0B00, 0x0B7F, "SCRIPT_ORIYA" ), # Oriya
	71	+ array( 0x0B80, 0x0BFF, "SCRIPT_TAMIL" ), # Tamil
	72	+ array( 0x0C00, 0x0C7F, "SCRIPT_TELUGU" ), # Telugu
	73	+ array( 0x0C80, 0x0CFF, "SCRIPT_KANNADA" ), # Kannada
	74	+ array( 0x0D00, 0x0D7F, "SCRIPT_MALAYALAM" ), # Malayalam
	75	+ array( 0x0D80, 0x0DFF, "SCRIPT_SINHALA" ), # Sinhala
	76	+ array( 0x0E00, 0x0E7F, "SCRIPT_THAI" ), # Thai
	77	+ array( 0x0E80, 0x0EFF, "SCRIPT_LAO" ), # Lao
	78	+ array( 0x0F00, 0x0FFF, "SCRIPT_TIBETAN" ), # Tibetan
	79	+ array( 0x1000, 0x109F, "SCRIPT_MYANMAR" ), # Myanmar
	80	+ array( 0x10A0, 0x10FF, "SCRIPT_GEORGIAN" ), # Georgian
84	81	array( 0x1100, 0x11FF, "SCRIPT_HANGUL" ), # Hangul Jamo
85		~~- array( 0x1200, 0x139F, "SCRIPT_ETHIOPIC" ), # Ethiopic, Ethiopic Supplement~~
86		~~- array( 0x13A0, 0x13FF, "SCRIPT_CHEROKEE" ), # Cherokee~~
	82	+ array( 0x1200, 0x139F, "SCRIPT_ETHIOPIC" ), # Ethiopic, Ethiopic Supplement
	83	+ array( 0x13A0, 0x13FF, "SCRIPT_CHEROKEE" ), # Cherokee
87	84	array( 0x1400, 0x167F, "SCRIPT_CANADIAN_ABORIGINAL" ), # Unified Canadian Aboriginal Syllabics
88		~~- # array( 0x1680, 0x169F, "SCRIPT_OGHAM" ), # Ogham~~
89		~~- # array( 0x16A0, 0x16FF, "SCRIPT_RUNIC" ), # Runic~~
90		~~- array( 0x1700, 0x171F, "SCRIPT_TAGALOG" ), # Tagalog~~
91		~~- array( 0x1720, 0x173F, "SCRIPT_HANUNOO" ), # Hanunoo~~
92		~~- array( 0x1740, 0x175F, "SCRIPT_BUHID" ), # Buhid~~
93		~~- array( 0x1760, 0x177F, "SCRIPT_TAGBANWA" ), # Tagbanwa~~
94		~~- array( 0x1780, 0x17FF, "SCRIPT_KHMER" ), # Khmer~~
95		~~- array( 0x1800, 0x18AF, "SCRIPT_MONGOLIAN" ), # Mongolian~~
96		~~- array( 0x1900, 0x194F, "SCRIPT_LIMBU" ), # Limbu~~
97		~~- array( 0x1950, 0x197F, "SCRIPT_TAI_LE" ), # Tai Le~~
98		~~- array( 0x1980, 0x19DF, "SCRIPT_NEW_TAI_LUE" ), # New Tai Lue~~
99		~~- array( 0x1A00, 0x1A1F, "SCRIPT_BUGINESE" ), # Buginese~~
100		~~- array( 0x1E00, 0x1EFF, "SCRIPT_LATIN" ), # Latin Extended Additional~~
101		~~- array( 0x1F00, 0x1FFF, "SCRIPT_GREEK" ), # Greek Extended~~
102		~~- # array( 0x2C00, 0x2C5F, "SCRIPT_GLAGOLITIC" ), # Glagolitic~~
103		~~- array( 0x2C80, 0x2CFF, "SCRIPT_COPTIC" ), # Coptic~~
104		~~- array( 0x2D00, 0x2D2F, "SCRIPT_GEORGIAN" ), # Georgian Supplement~~
105		~~- array( 0x2D30, 0x2D7F, "SCRIPT_TIFINAGH" ), # Tifinagh~~
106		~~- array( 0x2D80, 0x2DDF, "SCRIPT_ETHIOPIC" ), # Ethiopic Extended~~
107		~~- array( 0x2E80, 0x2FDF, "SCRIPT_DEPRECATED" ), # CJK Radicals Supplement, Kangxi Radicals~~
108		~~- array( 0x3040, 0x309F, "SCRIPT_HIRAGANA" ), # Hiragana~~
109		~~- array( 0x30A0, 0x30FF, "SCRIPT_KATAKANA" ), # Katakana~~
110		~~- array( 0x3100, 0x312F, "SCRIPT_BOPOMOFO" ), # Bopomofo~~
	85	+ # array( 0x1680, 0x169F, "SCRIPT_OGHAM" ), # Ogham
	86	+ # array( 0x16A0, 0x16FF, "SCRIPT_RUNIC" ), # Runic
	87	+ array( 0x1700, 0x171F, "SCRIPT_TAGALOG" ), # Tagalog
	88	+ array( 0x1720, 0x173F, "SCRIPT_HANUNOO" ), # Hanunoo
	89	+ array( 0x1740, 0x175F, "SCRIPT_BUHID" ), # Buhid
	90	+ array( 0x1760, 0x177F, "SCRIPT_TAGBANWA" ), # Tagbanwa
	91	+ array( 0x1780, 0x17FF, "SCRIPT_KHMER" ), # Khmer
	92	+ array( 0x1800, 0x18AF, "SCRIPT_MONGOLIAN" ), # Mongolian
	93	+ array( 0x1900, 0x194F, "SCRIPT_LIMBU" ), # Limbu
	94	+ array( 0x1950, 0x197F, "SCRIPT_TAI_LE" ), # Tai Le
	95	+ array( 0x1980, 0x19DF, "SCRIPT_NEW_TAI_LUE" ), # New Tai Lue
	96	+ array( 0x1A00, 0x1A1F, "SCRIPT_BUGINESE" ), # Buginese
	97	+ array( 0x1E00, 0x1EFF, "SCRIPT_LATIN" ), # Latin Extended Additional
	98	+ array( 0x1F00, 0x1FFF, "SCRIPT_GREEK" ), # Greek Extended
	99	+ # array( 0x2C00, 0x2C5F, "SCRIPT_GLAGOLITIC" ), # Glagolitic
	100	+ array( 0x2C80, 0x2CFF, "SCRIPT_COPTIC" ), # Coptic
	101	+ array( 0x2D00, 0x2D2F, "SCRIPT_GEORGIAN" ), # Georgian Supplement
	102	+ array( 0x2D30, 0x2D7F, "SCRIPT_TIFINAGH" ), # Tifinagh
	103	+ array( 0x2D80, 0x2DDF, "SCRIPT_ETHIOPIC" ), # Ethiopic Extended
	104	+ array( 0x2E80, 0x2FDF, "SCRIPT_DEPRECATED" ), # CJK Radicals Supplement, Kangxi Radicals
	105	+ array( 0x3040, 0x309F, "SCRIPT_HIRAGANA" ), # Hiragana
	106	+ array( 0x30A0, 0x30FF, "SCRIPT_KATAKANA" ), # Katakana
	107	+ array( 0x3100, 0x312F, "SCRIPT_BOPOMOFO" ), # Bopomofo
111	108	array( 0x3130, 0x318F, "SCRIPT_HANGUL" ), # Hangul Compatibility Jamo
112		~~- array( 0x31A0, 0x31BF, "SCRIPT_BOPOMOFO" ), # Bopomofo Extended~~
113		~~- array( 0x3400, 0x4DBF, "SCRIPT_HAN" ), # CJK Unified Ideographs Extension A~~
114		~~- array( 0x4E00, 0x9FFF, "SCRIPT_HAN" ), # CJK Unified Ideographs~~
115		~~- array( 0xA000, 0xA4CF, "SCRIPT_YI" ), # Yi Syllables, Yi Radicals~~
116		~~- array( 0xA800, 0xA82F, "SCRIPT_SYLOTI_NAGRI" ), # Syloti Nagri~~
117		~~- array( 0xAC00, 0xD7AF, "SCRIPT_HANGUL" ), # Hangul Syllables~~
118		~~- array( 0xF900, 0xFAFF, "SCRIPT_DEPRECATED" ), # CJK Compatibility Ideographs~~
119		~~- # array( 0x10000, 0x100FF, "SCRIPT_LINEAR_B" ), # Linear B Syllabary, Linear B Ideograms~~
120		~~- # array( 0x10140, 0x1018F, "SCRIPT_GREEK" ), # Ancient Greek Numbers~~
121		~~- # array( 0x10300, 0x1032F, "SCRIPT_OLD_ITALIC" ), # Old Italic~~
122		~~- array( 0x10330, 0x1034F, "SCRIPT_GOTHIC" ), # Gothic~~
123		~~- # array( 0x10380, 0x1039F, "SCRIPT_UGARITIC" ), # Ugaritic~~
124		~~- # array( 0x103A0, 0x103DF, "SCRIPT_OLD_PERSIAN" ), # Old Persian~~
125		~~- # array( 0x10400, 0x1044F, "SCRIPT_DESERET" ), # Deseret~~
126		~~- # array( 0x10450, 0x1047F, "SCRIPT_SHAVIAN" ), # Shavian~~
127		~~- # array( 0x10480, 0x104AF, "SCRIPT_OSMANYA" ), # Osmanya~~
128		~~- # array( 0x10800, 0x1083F, "SCRIPT_CYPRIOT" ), # Cypriot Syllabary~~
129		~~- array( 0x10A00, 0x10A5F, "SCRIPT_KHAROSHTHI" ), # Kharoshthi~~
130		~~- array( 0x20000, 0x2A6DF, "SCRIPT_HAN" ), # CJK Unified Ideographs Extension B~~
131		~~- array( 0x2F800, 0x2FA1F, "SCRIPT_DEPRECATED" ) # CJK Compatibility Ideographs Supplement~~
	109	+ array( 0x31A0, 0x31BF, "SCRIPT_BOPOMOFO" ), # Bopomofo Extended
	110	+ array( 0x3400, 0x4DBF, "SCRIPT_HAN" ), # CJK Unified Ideographs Extension A
	111	+ array( 0x4E00, 0x9FFF, "SCRIPT_HAN" ), # CJK Unified Ideographs
	112	+ array( 0xA000, 0xA4CF, "SCRIPT_YI" ), # Yi Syllables, Yi Radicals
	113	+ array( 0xA800, 0xA82F, "SCRIPT_SYLOTI_NAGRI" ), # Syloti Nagri
	114	+ array( 0xAC00, 0xD7AF, "SCRIPT_HANGUL" ), # Hangul Syllables
	115	+ array( 0xF900, 0xFAFF, "SCRIPT_DEPRECATED" ), # CJK Compatibility Ideographs
	116	+ # array( 0x10000, 0x100FF, "SCRIPT_LINEAR_B" ), # Linear B Syllabary, Linear B Ideograms
	117	+ # array( 0x10140, 0x1018F, "SCRIPT_GREEK" ), # Ancient Greek Numbers
	118	+ # array( 0x10300, 0x1032F, "SCRIPT_OLD_ITALIC" ), # Old Italic
	119	+ array( 0x10330, 0x1034F, "SCRIPT_GOTHIC" ), # Gothic
	120	+ # array( 0x10380, 0x1039F, "SCRIPT_UGARITIC" ), # Ugaritic
	121	+ # array( 0x103A0, 0x103DF, "SCRIPT_OLD_PERSIAN" ), # Old Persian
	122	+ # array( 0x10400, 0x1044F, "SCRIPT_DESERET" ), # Deseret
	123	+ # array( 0x10450, 0x1047F, "SCRIPT_SHAVIAN" ), # Shavian
	124	+ # array( 0x10480, 0x104AF, "SCRIPT_OSMANYA" ), # Osmanya
	125	+ # array( 0x10800, 0x1083F, "SCRIPT_CYPRIOT" ), # Cypriot Syllabary
	126	+ array( 0x10A00, 0x10A5F, "SCRIPT_KHAROSHTHI" ), # Kharoshthi
	127	+ array( 0x20000, 0x2A6DF, "SCRIPT_HAN" ), # CJK Unified Ideographs Extension B
	128	+ array( 0x2F800, 0x2FA1F, "SCRIPT_DEPRECATED" ) # CJK Compatibility Ideographs Supplement
132	129	);
133		-
	130	+
134	131	# Specially naughty characters we don't ever want to see...
135	132	private static $character_blacklist = array(
136	133	0x0337,
—	—	@@ -143,21 +140,21 @@
144	141	0x2AFD,
145	142	0xFF0F
146	143	);
147		-
	144	+
148	145	# Equivalence sets
149	146	private static $equivset = null;
150	147
151	148	static function initEquivSet() {
152	149	if ( is_null( self::$equivset ) ) {
153		~~- self::$equivset = unserialize( file_get_contents(~~
	150	+ self::$equivset = unserialize( file_get_contents(
154	151	dirname( __FILE__ ) . '/equivset.ser' ) );
155	152	}
156	153	}
157	154
158	155	private static function getScriptCode( $ch ) {
159	156	# Linear search: binary chop would be faster...
160		~~- foreach( self::$script_ranges as $range ) {~~
161		~~- if( $ch >= $range[0] && $ch <= $range[1] ) {~~
	157	+ foreach ( self::$script_ranges as $range ) {
	158	+ if ( $ch >= $range[0] && $ch <= $range[1] ) {
162	159	return $range[2];
163	160	}
164	161	}
—	—	@@ -170,19 +167,19 @@
171	168	private static function getScriptTag( $name ) {
172	169	$name = "SCRIPT_" . strtoupper( trim( $name ) );
173	170	# Linear search
174		~~- foreach( self::$script_ranges as $range ) {~~
175		~~- if( $name == $range[2] ) {~~
	171	+ foreach ( self::$script_ranges as $range ) {
	172	+ if ( $name == $range[2] ) {
176	173	return $range[2];
177	174	}
178	175	}
179	176	# Otherwise...
180	177	return null;
181	178	}
182		-
	179	+
183	180	private static function isSubsetOf( $aList, $bList ) {
184	181	return count( array_diff( $aList, $bList ) ) == 0;
185	182	}
186		-
	183	+
187	184	# Is this an allowed script mixture?
188	185	private static function isAllowedScriptCombination( $scriptList ) {
189	186	$allowedScriptCombinations = array(
—	—	@@ -192,14 +189,14 @@
193	190	array( "SCRIPT_HAN", "SCRIPT_HANGUL" ), # Korean
194	191	array( "SCRIPT_HAN", "SCRIPT_KATAKANA", "SCRIPT_HIRAGANA" ) # Japanese
195	192	);
196		~~- foreach( $allowedScriptCombinations as $allowedCombo ) {~~
197		~~- if( self::isSubsetOf( $scriptList, $allowedCombo ) ) {~~
	193	+ foreach ( $allowedScriptCombinations as $allowedCombo ) {
	194	+ if ( self::isSubsetOf( $scriptList, $allowedCombo ) ) {
198	195	return true;
199	196	}
200	197	}
201	198	return false;
202	199	}
203		-
	200	+
204	201	/**
205	202	* Convert string into array of Unicode code points as integers
206	203	*/
—	—	@@ -213,24 +210,24 @@
214	211	}
215	212	return $out;
216	213	}
217		-
	214	+
218	215	public static function listToString( $list ) {
219	216	$out = '';
220		~~- foreach( $list as $cp ) {~~
	217	+ foreach ( $list as $cp ) {
221	218	$out .= codepointToUtf8( $cp );
222	219	}
223	220	return $out;
224	221	}
225		-
	222	+
226	223	private static function hardjoin( $a_list ) {
227	224	return implode( '', $a_list );
228	225	}
229		-
	226	+
230	227	public static function equivString( $testName ) {
231	228	$out = array();
232	229	self::initEquivSet();
233		~~- foreach( $testName as $codepoint ) {~~
234		~~- if( isset( self::$equivset[$codepoint] ) ) {~~
	230	+ foreach ( $testName as $codepoint ) {
	231	+ if ( isset( self::$equivset[$codepoint] ) ) {
235	232	$out[] = self::$equivset[$codepoint];
236	233	} else {
237	234	$out[] = $codepoint;
—	—	@@ -238,11 +235,11 @@
239	236	}
240	237	return $out;
241	238	}
242		-
	239	+
243	240	private static function mergePairs( $text, $pair, $result ) {
244	241	$out = array();
245		~~- for( $i = 0; $i < count( $text ); $i++ ) {~~
246		~~- if( $text[$i] == $pair[0] && @$text[$i+1] == $pair[1] ) {~~
	242	+ for ( $i = 0; $i < count( $text ); $i++ ) {
	243	+ if ( $text[$i] == $pair[0] && @$text[$i + 1] == $pair[1] ) {
247	244	$out[] = $result[0];
248	245	$i++;
249	246	} else {
—	—	@@ -251,75 +248,75 @@
252	249	}
253	250	return $out;
254	251	}
255		-
	252	+
256	253	private static function stripScript( $text, $script ) {
257	254	$scripts = array_map( array( 'AntiSpoof', 'getScriptCode' ), $text );
258	255	$out = array();
259		~~- foreach( $text as $index => $char ) {~~
260		~~- if( $scripts[$index] !== $script ) {~~
	256	+ foreach ( $text as $index => $char ) {
	257	+ if ( $scripts[$index] !== $script ) {
261	258	$out[] = $char;
262	259	}
263	260	}
264	261	return $out;
265	262	}
266		-
	263	+
267	264	# TODO: does too much in one routine, refactor...
268	265	public static function checkUnicodeString( $testName ) {
269	266	wfLoadExtensionMessages( 'AntiSpoof' );
270	267	# Start with some sanity checking
271		~~- if( !is_string( $testName ) ) {~~
272		~~- return array( "ERROR", wfMsg('antispoof-badtype') );~~
	268	+ if ( !is_string( $testName ) ) {
	269	+ return array( "ERROR", wfMsg( 'antispoof-badtype' ) );
273	270	}
274		-
275		~~- if( strlen( $testName ) == 0 ) {~~
276		~~- return array("ERROR", wfMsg('antispoof-empty') );~~
	271	+
	272	+ if ( strlen( $testName ) == 0 ) {
	273	+ return array( "ERROR", wfMsg( 'antispoof-empty' ) );
277	274	}
278		-
279		~~- if( array_intersect( self::stringToList( $testName ), self::$character_blacklist ) ) {~~
280		~~- return array( "ERROR", wfMsg('antispoof-blacklisted') );~~
	275	+
	276	+ if ( array_intersect( self::stringToList( $testName ), self::$character_blacklist ) ) {
	277	+ return array( "ERROR", wfMsg( 'antispoof-blacklisted' ) );
281	278	}
282		-
	279	+
283	280	# Perform Unicode _compatibility_ decomposition
284	281	$testName = UtfNormal::toNFKD( $testName );
285	282	$testChars = self::stringToList( $testName );
286		-
	283	+
287	284	# Be paranoid: check again, just in case Unicode normalization code changes...
288		~~- if( array_intersect( $testChars, self::$character_blacklist ) ) {~~
289		~~- return array( "ERROR", wfMsg('antispoof-blacklisted') );~~
	285	+ if ( array_intersect( $testChars, self::$character_blacklist ) ) {
	286	+ return array( "ERROR", wfMsg( 'antispoof-blacklisted' ) );
290	287	}
291		-
	288	+
292	289	# Check for this: should not happen in any valid Unicode string
293		~~- if( self::getScriptCode( $testChars[0] ) == "SCRIPT_COMBINING_MARKS" ) {~~
294		~~- return array( "ERROR", wfMsg('antispoof-combining') );~~
	290	+ if ( self::getScriptCode( $testChars[0] ) == "SCRIPT_COMBINING_MARKS" ) {
	291	+ return array( "ERROR", wfMsg( 'antispoof-combining' ) );
295	292	}
296		-
	293	+
297	294	# Strip all combining characters in order to crudely strip accents
298	295	# Note: NFKD normalization should have decomposed all accented chars earlier
299	296	$testChars = self::stripScript( $testChars, "SCRIPT_COMBINING_MARKS" );
300		-
	297	+
301	298	$testScripts = array_unique( array_map( array( 'AntiSpoof', 'getScriptCode' ), $testChars ) );
302		~~- if( in_array( "SCRIPT_UNASSIGNED", $testScripts ) || in_array( "SCRIPT_DEPRECATED", $testScripts ) ) {~~
303		~~- return array( "ERROR", wfMsg('antispoof-unassigned') );~~
	299	+ if ( in_array( "SCRIPT_UNASSIGNED", $testScripts ) || in_array( "SCRIPT_DEPRECATED", $testScripts ) ) {
	300	+ return array( "ERROR", wfMsg( 'antispoof-unassigned' ) );
304	301	}
305		-
	302	+
306	303	# We don't mind ASCII punctuation or digits
307	304	$testScripts = array_diff( $testScripts,
308	305	array( "SCRIPT_ASCII_PUNCTUATION", "SCRIPT_ASCII_DIGITS" ) );
309		-
310		~~- if( !$testScripts ) {~~
311		~~- return array( "ERROR", wfMsg('antispoof-noletters') );~~
	306	+
	307	+ if ( !$testScripts ) {
	308	+ return array( "ERROR", wfMsg( 'antispoof-noletters' ) );
312	309	}
313		-
314		~~- if( count( $testScripts ) > 1 && !self::isAllowedScriptCombination( $testScripts ) ) {~~
315		~~- return array( "ERROR", wfMsg('antispoof-mixedscripts') );~~
	310	+
	311	+ if ( count( $testScripts ) > 1 && !self::isAllowedScriptCombination( $testScripts ) ) {
	312	+ return array( "ERROR", wfMsg( 'antispoof-mixedscripts' ) );
316	313	}
317		-
	314	+
318	315	# At this point, we should probably check for BiDi violations if they aren't
319	316	# caught above...
320		-
	317	+
321	318	# Replace characters in confusables set with equivalence chars
322	319	$testChars = self::equivString( $testChars );
323		-
	320	+
324	321	# Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
325	322	# Not exhaustive, but ups the ante...
326	323	# Do this _after_ canonicalization: looks weird, but needed for consistency
—	—	@@ -329,30 +326,29 @@
330	327	$testChars = self::mergePairs( $testChars,
331	328	self::equivString( self::stringToList( "RN" ) ),
332	329	self::equivString( self::stringToList( "M" ) ) );
333		-
	330	+
334	331	# Squeeze out all punctuation chars
335	332	# TODO: almost the same code occurs twice, refactor into own routine
336	333	$testChars = self::stripScript( $testChars, "SCRIPT_ASCII_PUNCTUATION" );
337		-
	334	+
338	335	$testName = self::listToString( $testChars );
339		-
	336	+
340	337	# Remove all remaining spaces, just in case any have snuck through...
341	338	$testName = self::hardjoin( explode( " ", $testName ) );
342		-
	339	+
343	340	# Reduce repeated char sequences to single character
344	341	# BUG: TODO: implement this
345		-
346		~~- if( strlen( $testName ) < 1 ) {~~
347		~~- return array("ERROR", wfMsg('antispoof-tooshort') );~~
	342	+
	343	+ if ( strlen( $testName ) < 1 ) {
	344	+ return array( "ERROR", wfMsg( 'antispoof-tooshort' ) );
348	345	}
349		-
	346	+
350	347	# Don't ASCIIfy: we assume we are UTF-8 capable on output
351		-
	348	+
352	349	# Prepend version string, for futureproofing if this algorithm changes
353	350	$testName = "v2:" . $testName;
354		-
	351	+
355	352	# And return the canonical version of the name
356	353	return array( "OK", $testName );
357	354	}
358		-
359	355	}
Index: trunk/extensions/AntiSpoof/AntiSpoof.i18n.php
—	—	@@ -3,7 +3,7 @@
4	4	* Internationalisation file for extension AntiSpoof.
5	5	*
6	6	* @addtogroup Extensions
7		~~-*/~~
	7	+ */
8	8
9	9	$messages = array();
10	10
—	—	@@ -956,7 +956,7 @@
957	957	'antispoof-desc' => 'Menggak nggawé akun utawa rékening mawa jeneng panganggo aksara campuran, mbingungaké lan sing mèmper',
958	958	'antispoof-conflict-top' => 'Jeneng "$1" mèmper banget karo {{PLURAL:$2|akun sing wis ana|$2 akun iki}}:',
959	959	'antispoof-conflict-bottom' => 'Mangga milih jeneng liya',
960		~~- 'antispoof-name-illegal' => 'Jeneng "$1" ora diidinaké supaya wong ora bingung utawa menggak ngapi-api jeneng panganggo sing wis ana: $2.~~
	960	+ 'antispoof-name-illegal' => 'Jeneng "$1" ora diidinaké supaya wong ora bingung utawa menggak ngapi-api jeneng panganggo sing wis ana: $2.
961	961	Mangga pilihen jeneng liya.',
962	962	'antispoof-badtype' => 'Tipe data salah',
963	963	'antispoof-empty' => 'Data kosong',
—	—	@@ -1331,7 +1331,7 @@
1332	1332	'antispoof-desc' => 'Blokkerer for oppretting av konti med liknande eller forvirrande brukarnamn, eller brukarnamn som inneheld forskjellige alfabettypar',
1333	1333	'antispoof-conflict-top' => 'Namnet «$1» er for likt følgjande {{PLURAL:$2|konto|kontoar}}:',
1334	1334	'antispoof-conflict-bottom' => 'Vel eit anna namn.',
1335		~~- 'antispoof-name-illegal' => 'Namnet «$1» er ikkje tillate for å hindra samanblanding: $2.~~
	1335	+ 'antispoof-name-illegal' => 'Namnet «$1» er ikkje tillate for å hindra samanblanding: $2.
1336	1336	Ver venleg og vel eit anna namn.',
1337	1337	'antispoof-badtype' => 'Ugyldig datatype',
1338	1338	'antispoof-empty' => 'Tom streng',
—	—	@@ -1502,7 +1502,7 @@
1503	1503	'antispoof-desc' => 'Impede a criação de contas com escrita mista, e nomes de utilizador confusos e semelhantes',
1504	1504	'antispoof-conflict-top' => 'O nome "$1" é demasiado semelhante {{PLURAL:$2|ao da seguinte conta já existente|aos das seguintes $2 contas}}',
1505	1505	'antispoof-conflict-bottom' => 'Por favor, escolha outro nome.',
1506		~~- 'antispoof-name-illegal' => 'O nome "$1" não é permitido para prevenir que seja confundido com outro (ou que seja feito algum trocadilho): já existe $2.~~
	1506	+ 'antispoof-name-illegal' => 'O nome "$1" não é permitido para prevenir que seja confundido com outro (ou que seja feito algum trocadilho): já existe $2.
1507	1507	Por favor, escolha outro nome.',
1508	1508	'antispoof-badtype' => 'Formato de dados incorreto',
1509	1509	'antispoof-empty' => 'Linha vazia',
Index: trunk/extensions/AntiSpoof/generateEquivset.php
—	—	@@ -1,7 +1,7 @@
2	2	<?php
3	3
4		~~-require_once ( getenv('MW_INSTALL_PATH') !== false~~
5		~~- ? getenv('MW_INSTALL_PATH')."/maintenance/commandLine.inc"~~
	4	+require_once ( getenv( 'MW_INSTALL_PATH' ) !== false
	5	+ ? getenv( 'MW_INSTALL_PATH' ) . "/maintenance/commandLine.inc"
6	6	: dirname( __FILE__ ) . '/../../maintenance/commandLine.inc' );
7	7
8	8	$dir = dirname( __FILE__ );
Index: trunk/extensions/AntiSpoof/AntiSpoof.php
—	—	@@ -1,4 +1,7 @@
2	2	<?php
	3	+if ( !defined( 'MEDIAWIKI' ) ) {
	4	+ exit( 1 );
	5	+}
3	6
4	7	$wgExtensionCredits['other'][] = array(
5	8	'path' => __FILE__,
—	—	@@ -56,10 +59,10 @@
57	60	global $wgAntiSpoofAccounts, $wgUser, $wgRequest;
58	61	wfLoadExtensionMessages( 'AntiSpoof' );
59	62
60		~~- if( !$wgAntiSpoofAccounts ) {~~
	63	+ if ( !$wgAntiSpoofAccounts ) {
61	64	$mode = 'LOGGING ';
62	65	$active = false;
63		~~- } elseif( $wgRequest->getCheck('wpIgnoreAntiSpoof') &&~~
	66	+ } elseif ( $wgRequest->getCheck( 'wpIgnoreAntiSpoof' ) &&
64	67	$wgUser->isAllowed( 'override-antispoof' ) ) {
65	68	$mode = 'OVERRIDE ';
66	69	$active = false;
—	—	@@ -70,18 +73,18 @@
71	74
72	75	$name = $user->getName();
73	76	$spoof = new SpoofUser( $name );
74		~~- if( $spoof->isLegal() ) {~~
	77	+ if ( $spoof->isLegal() ) {
75	78	$normalized = $spoof->getNormalized();
76	79	$conflicts = $spoof->getConflicts();
77		~~- if( empty($conflicts) ) {~~
	80	+ if ( empty( $conflicts ) ) {
78	81	wfDebugLog( 'antispoof', "{$mode}PASS new account '$name' [$normalized]" );
79	82	} else {
80	83	wfDebugLog( 'antispoof', "{$mode}CONFLICT new account '$name' [$normalized] spoofs " . implode( ',', $conflicts ) );
81		~~- if( $active ) {~~
	84	+ if ( $active ) {
82	85	$numConflicts = count( $conflicts );
83		~~- $message = wfMsgExt( 'antispoof-conflict-top', array('parsemag'), htmlspecialchars( $name ), $numConflicts );~~
	86	+ $message = wfMsgExt( 'antispoof-conflict-top', array( 'parsemag' ), htmlspecialchars( $name ), $numConflicts );
84	87	$message .= '<ul>';
85		~~- foreach( $conflicts as $simUser ) {~~
	88	+ foreach ( $conflicts as $simUser ) {
86	89	$message .= '<li>' . wfMsg( 'antispoof-conflict-item', $simUser ) . '</li>';
87	90	}
88	91	$message .= '</ul>' . wfMsg( 'antispoof-conflict-bottom' );
—	—	@@ -91,7 +94,7 @@
92	95	} else {
93	96	$error = $spoof->getError();
94	97	wfDebugLog( 'antispoof', "{$mode}ILLEGAL new account '$name' $error" );
95		~~- if( $active ) {~~
	98	+ if ( $active ) {
96	99	$message = wfMsg( 'antispoof-name-illegal', $name, $error );
97	100	return false;
98	101	}
—	—	@@ -107,9 +110,9 @@
108	111
109	112	wfLoadExtensionMessages( 'AntiSpoof' );
110	113
111		~~- if( $wgAntiSpoofAccounts && $wgUser->isAllowed( 'override-antispoof' ) )~~
	114	+ if ( $wgAntiSpoofAccounts && $wgUser->isAllowed( 'override-antispoof' ) )
112	115	$template->addInputItem( 'wpIgnoreAntiSpoof',
113		~~- $wgRequest->getCheck('wpIgnoreAntiSpoof'),~~
	116	+ $wgRequest->getCheck( 'wpIgnoreAntiSpoof' ),
114	117	'checkbox', 'antispoof-ignore' );
115	118	return true;
116	119	}
Index: trunk/extensions/AntiSpoof/batchAntiSpoof.php
—	—	@@ -1,5 +1,4 @@
2	2	<?php
3		-
4	3	// Go through all usernames and calculate and record spoof thingies
5	4
6	5	$base = dirname( dirname( dirname( __FILE__ ) ) );
—	—	@@ -13,14 +12,14 @@
14	13
15	14	$result = $dbw->select( 'user', 'user_name', null, 'batchAntiSpoof.php' );
16	15	$n = 0;
17		~~-while( $row = $dbw->fetchObject( $result ) ) {~~
18		~~- if( $n++ % $batchSize == 0 ) {~~
	16	+while ( $row = $dbw->fetchObject( $result ) ) {
	17	+ if ( $n++ % $batchSize == 0 ) {
19	18	echo "$wgDBname $n\n";
20	19	}
21	20
22	21	$items[] = new SpoofUser( $row->user_name );
23	22
24		~~- if( $n % $batchSize == 0 ) {~~
	23	+ if ( $n % $batchSize == 0 ) {
25	24	SpoofUser::batchRecord( $items );
26	25	$items = array();
27	26	}
—	—	@@ -29,4 +28,3 @@
30	29	SpoofUser::batchRecord( $items );
31	30	echo "$wgDBname $n done.\n";
32	31	$dbw->freeResult( $result );
33		-
Index: trunk/extensions/AntiSpoof/equivset.in
—	—	@@ -1,37 +1,36 @@
2		~~-# There is a publically editable copy of this file at~~
	2	+# There is a publically editable copy of this file at
3	3	# http://www.mediawiki.org/wiki/AntiSpoof/Equivalence_sets
4	4
5		-
6	5	# This is the input file for generateEquivset.php
7	6	# The format is:
8	7	#
9	8	# <hexadecimal codepoint> <character> => [<hexadecimal codepoint>] <character>
10	9	#
11		~~-# If the codepoint is given, it must match the character, or else a warning~~
	10	+# If the codepoint is given, it must match the character, or else a warning
12	11	# will be issued and the line will be ignored.
13	12	#
14	13	# The effect of such a line is to conflate the two identified character, i.e.
15		~~-# to put them in the same set. If two sets share a member, then they will be~~
	14	+# to put them in the same set. If two sets share a member, then they will be
16	15	# merged into a single larger set.
17	16	#
18	17	# We have attempted to include the following types of equivalence:
19		~~-# * Case folding. Although letters of different cases are often visually~~
	18	+# * Case folding. Although letters of different cases are often visually
20	19	# distinct, they can easily be confused by people who are familiar with
21		~~-# the alphabet. Two words with a different case may be read as the same~~
	20	+# the alphabet. Two words with a different case may be read as the same
22	21	# word. This is a popular technique for impersonation.
23	22	#
24	23	# * Visually similar characters. Cross-script pairs are included, but these
25		~~-# tend to produce false conflations within scripts, and so should be~~
	24	+# tend to produce false conflations within scripts, and so should be
26	25	# avoided. The software implements a blanket restriction against cross-
27		~~-# script strings, which makes cross-script pairs mostly redundant.~~
	26	+# script strings, which makes cross-script pairs mostly redundant.
28	27	#
29		~~-# * Chinese Simplified/Traditional pairs.~~
	28	+# * Chinese Simplified/Traditional pairs.
30	29	#
31	30	# The list is based on one by Neil Harris, which was derived by unknown methods.
32	31	# That list also contained transliteration pairs, which we considered excessive
33		~~-# and have attempted to remove. For example, the latin E and H were considered~~
34		~~-# equivalent, because the latin transliteration of the cyrillic "Н" (which~~
35		~~-# looks like latin H) is "E".~~
	32	+# and have attempted to remove. For example, the latin E and H were considered
	33	+# equivalent, because the latin transliteration of the cyrillic "Н" (which
	34	+# looks like latin H) is "E".
36	35
37	36	49 I => 31 1
38	37	4C L => 31 1
—	—	@@ -5183,4 +5182,3 @@
5184	5183	FFDA ￚ => 3161 ㅡ
5185	5184	FFDB ￛ => 3162 ㅢ
5186	5185	FFDC ￜ => 3163 ㅣ
5187		-
Index: trunk/extensions/AntiSpoof/SpoofUser.php
—	—	@@ -5,7 +5,7 @@
6	6	$this->mName = strval( $name );
7	7	list( $ok, $normalized ) = AntiSpoof::checkUnicodeString( $this->mName );
8	8	$this->mLegal = ( $ok == 'OK' );
9		~~- if( $this->mLegal ) {~~
	9	+ if ( $this->mLegal ) {
10	10	$this->mNormalized = $normalized;
11	11	$this->mError = null;
12	12	} else {
—	—	@@ -59,7 +59,7 @@
60	60	) );
61	61
62	62	$spoofs = array();
63		~~- while( $row = $dbr->fetchObject( $spoofedUsers ) ) {~~
	63	+ while ( $row = $dbr->fetchObject( $spoofedUsers ) ) {
64	64	array_push( $spoofs, $row->user_name );
65	65	}
66	66	return $spoofs;
—	—	@@ -87,9 +87,9 @@
88	88	* @param $items array of SpoofUser
89	89	*/
90	90	public function batchRecord( $items ) {
91		~~- if( count( $items ) ) {~~
	91	+ if ( count( $items ) ) {
92	92	$fields = array();
93		~~- foreach( $items as $item ) {~~
	93	+ foreach ( $items as $item ) {
94	94	$fields[] = $item->insertFields();
95	95	}
96	96	$dbw = wfGetDB( DB_MASTER );

Status & tagging log

20:04, 23 October 2010 Reedy (talk | contribs) changed the status of r64581 [removed: new added: deferred]