r88408 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r88407 | r88408 | r88409 >
Date:	15:10, 19 May 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	Fixed euclidean distance formula.
Modified paths:	/trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
—	—	@@ -4,7 +4,8 @@
5	5	__version__ = '0.1'
6	6
7	7	from datetime import datetime
8		~~-from math import pow~~
	8	+from math import pow, log, sqrt
	9	+import codecs
9	10	import time
10	11	import sys
11	12	import os
—	—	@@ -32,62 +33,166 @@
33	34
34	35
35	36	def create_sample_b():
36		~~- date = datetime(2010, 6, 30)~~
37		~~- cursor = db.find('reg_date', {'$gte': date})~~
38		~~- return cursor~~
	37	+ editors = []
	38	+ start_date = datetime(2010, 9, 1)
	39	+ end_date = datetime(2010, 11, 1)
	40	+ cursor = db.find('reg_date', {'$gte': start_date, '$lt': end_date})
	41	+ for editor in cursor:
	42	+ editors.append(editor['username'])
	43	+ return editors
39	44
40	45
	46	+def retrieve_variables(obs, username, date):
	47	+ data = db.find_one('username', username)
	48	+ year = str(date.year)
	49	+ month = str(date.month)
	50	+ if data:
	51	+ revert_count = data['revert_count'].get(year, {}).get(month, {}).get('0', 0)
	52	+ character_count = data['character_count'].get(year, {}).get(month, {}).get('0', {}).get('added', 0)
	53	+ reg_date = data.get('reg_date', datetime(2001, 1, 1))
	54	+ #epoch = time.mktime(reg_date.timetuple())
	55	+ cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)
	56	+ cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 0)
	57	+ article_count = data['article_count'].get(year, {}).get(month, {}).get('0', 0)
	58	+
	59	+ if character_count + cum_edit_count_main_ns + cum_edit_count_other_ns + article_count > 0:
	60	+ #print '%s\t%s\t%s\t%s\t%s' % (username, date, revert_count, character_count, reg_date)
	61	+ obs.setdefault(username, {})
	62	+ obs[username]['revert_count'] = float(revert_count)
	63	+ obs[username]['character_count'] = float(character_count)
	64	+ obs[username]['reg_date'] = reg_date #epoch / 86400
	65	+ obs[username]['cum_edit_count_main_ns'] = float(cum_edit_count_main_ns)
	66	+ obs[username]['cum_edit_count_other_ns'] = float(cum_edit_count_other_ns)
	67	+ obs[username]['article_count'] = float(article_count)
	68	+ return obs
	69	+
	70	+
41	71	def create_dataset(editors):
42	72	obs = {}
43		~~- print '%s\t%s\t%s\t%s\t%s' % ('username', 'date', 'number of reverts', 'number of characters added', 'registration date')~~
	73	+ #print '%s\t%s\t%s\t%s\t%s' % ('username', 'date', 'number of reverts', 'number of characters added', 'registration date')
	74	+ dates = [datetime(2010, 11, 30)] #, datetime(2010, 12, 31)]
44	75	for username in editors:
45		~~- for date in editors[username]:~~
46		~~- month = str(date.month)~~
47		~~- year = str(date.year)~~
48		~~- data = db.find_one('username', username)~~
49		~~- if data:~~
50		~~- revert_count = data['revert_count'].get(year, {}).get(month, {}).get('0', 0)~~
51		~~- character_count = data['character_count'].get(year, {}).get(month, {}).get('0', {}).get('added', 0)~~
52		~~- reg_date = data.get('reg_date', datetime(2001, 1, 1))~~
53		~~- epoch = time.mktime(reg_date.timetuple())~~
54		~~- cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)~~
55		~~- cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 0)~~
56		~~- article_count = data['article_count'].get(year, {}).get(month, 0)~~
57		~~- print '%s\t%s\t%s\t%s\t%s' % (username, date, revert_count, character_count, reg_date)~~
58		~~- obs.setdefault(username, {})~~
59		~~- obs[username]['revert_count'] = revert_count~~
60		~~- obs[username]['character_count'] = character_count~~
61		~~- obs[username]['reg_date'] = epoch~~
62		~~- obs[username]['cum_edit_count_main_ns'] = cum_edit_count_main_ns~~
63		~~- obs[username]['cum_edit_count_other_ns'] = cum_edit_count_other_ns~~
64		~~- obs[username]['article_count'] = article_count~~
	76	+ for date in dates:
	77	+ obs = retrieve_variables(obs, username, date)
65	78	return obs
66	79
	80	+
67	81	def euclidean_distance(vars, person1, person2):
68		~~- sum_of_squares = sum([pow(person1[item] - person2[item], 2) for item in vars])~~
69		~~- return 1 / (1 + sum_of_squares)~~
	82	+ #handle the date variable
	83	+ #sum_of_squares = sum([pow(person1[item] - person2[item], 2) for item in vars])
	84	+ sum_of_squares = 0.0
	85	+ for item in vars:
	86	+ if item == 'reg_date' or item == 'revert_count':
	87	+ pass
	88	+# dt = person1[item] - person2[item]
	89	+# dt = log(dt.days) if dt.days > 0 else 0
	90	+# sum_of_squares += pow(dt, 2)
	91	+ else:
	92	+ sum_of_squares += pow(person1[item] - person2[item], 2)
	93	+ return 1 / (1 + sqrt(sum_of_squares))
70	94
71	95
72		~~-def calculate_distance_matrix(obs_a, obs_b):~~
73		~~- vars = ['character_count', 'reg_date', 'cum_edit_count_main_ns', 'cum_edit_count_other_ns', 'article_count']~~
74		~~- matches = {}~~
	96	+def calculate_distance_matrix(vars, obs_a, obs_b):
	97	+ print 'Constructing distance matrix...'
	98	+ distances = {}
75	99	for person1 in obs_a:
76	100	for person2 in obs_b:
77		~~- d = euclidean_distance(vars, person1, person2)~~
78		~~- matches.setdefault(person1, {})~~
79		~~- matches[person1][person2] = d~~
	101	+ if person1 != person2:
	102	+ d = euclidean_distance(vars, obs_a[person1], obs_b[person2])
	103	+ #print obs_a[person1].values(), obs_b[person2].values(), d
	104	+ distances.setdefault(person1, {})
	105	+ distances[person1][person2] = d
	106	+ return distances
	107	+
	108	+
	109	+def normalize_dataset(vars, obs):
	110	+ editors = obs.keys()
	111	+ data = []
	112	+ for var in vars:
	113	+ for editor in editors:
	114	+ data.append(obs[editor][var])
	115	+ sd = standard_deviation(data)
	116	+ for editor in editors:
	117	+ try:
	118	+ obs[editor][var] = obs[editor][var] / sd
	119	+ except ZeroDivisionError:
	120	+ obs[editor][var] = 0
	121	+ return obs
	122	+
	123	+
	124	+def standard_deviation(data):
	125	+ n = len(data)
	126	+ values = sum(data)
	127	+ sq_values = values * values
	128	+ sd = (1.0 / n) * sq_values - (pow((1.0 / n) * values, 2))
	129	+ return sd
	130	+
	131	+
	132	+def inverse_dictionary(data):
	133	+ return dict((v, k) for k, v in data.iteritems())
	134	+
	135	+
	136	+def find_partner(distances):
	137	+ print 'Finding similar partners...'
	138	+ matches = []
	139	+ ppi_editors = distances.keys()
	140	+ for ppi_editor in ppi_editors:
	141	+ data = inverse_dictionary(distances[ppi_editor])
	142	+ min_d = min(data.keys())
	143	+ max_d = max(data.keys())
	144	+ match = data[max_d]
	145	+ matches.append((ppi_editor, match))
	146	+ for editor in distances:
	147	+ try:
	148	+ distances[editor].pop(match)
	149	+ except KeyError:
	150	+ pass
	151	+ print ppi_editor, match, min_d, max_d
80	152	return matches
81	153
82		~~-def find_partner(matches):~~
83		~~- pass~~
84	154
	155	+def write_dataset(vars, matches, obs_a, obs_b):
	156	+ print 'Writing dataset to CSV file...'
	157	+ fh = codecs.open('ppi_quality.csv', 'w', 'utf-8')
	158	+ fh.write('%s\t' % ('editor_a'))
	159	+ fh.write('_a\t'.join(vars))
	160	+ fh.write('\t%s\t' % ('editor_b'))
	161	+ fh.write('_b\t'.join(vars))
	162	+ fh.write('\tdelta registration days\tid\n')
	163	+ for i, match in enumerate(matches):
	164	+ line = []
	165	+ editor_a = match[0]
	166	+ editor_b = match[1]
	167	+ line.append(editor_a)
	168	+ values_a = [str(obs_a[editor_a][v]) for v in vars]
	169	+ values_b = [str(obs_b[editor_b][v]) for v in vars]
	170	+ line.extend(values_a)
	171	+ line.append(editor_b)
	172	+ line.extend(values_b)
	173	+ dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
	174	+ line.append(str(dt.days))
	175	+ line.append(str(i))
	176	+ line.append('\n')
	177	+ print line
	178	+ #line = '\t'.join([str(l).decode('utf-8') for l in line])
	179	+ line = '\t'.join(line)
	180	+ fh.write(line)
	181	+ fh.close()
	182	+
	183	+
85	184	def launcher():
	185	+ print 'Retrieving datasets...'
	186	+ vars = ['character_count', 'reg_date', 'cum_edit_count_main_ns',
	187	+ 'cum_edit_count_other_ns', 'article_count', 'revert_count']
86	188	editors_a = create_sample_a()
87	189	obs_a = create_dataset(editors_a)
	190	+ #obs_a = normalize_dataset(vars, obs_a)
88	191	editors_b = create_sample_b()
89	192	obs_b = create_dataset(editors_b)
90		~~- matches = calculate_distance_matrix(obs_a, obs_b)~~
91		~~- find_partner(matches)~~
	193	+ #obs_b = normalize_dataset(vars, obs_b)
	194	+ distances = calculate_distance_matrix(vars, obs_a, obs_b)
	195	+ matches = find_partner(distances)
	196	+ write_dataset(vars, matches, obs_a, obs_b)
92	197
93	198
94	199	if __name__ == '__main__':