r88408 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r88407 | r88408 | r88409 >
Date:15:10, 19 May 2011
Author:diederik
Status:deferred
Tags:
Comment:
Fixed euclidean distance formula.
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
@@ -4,7 +4,8 @@
55 __version__ = '0.1'
66
77 from datetime import datetime
8 -from math import pow
 8+from math import pow, log, sqrt
 9+import codecs
910 import time
1011 import sys
1112 import os
@@ -32,62 +33,166 @@
3334
3435
3536 def create_sample_b():
36 - date = datetime(2010, 6, 30)
37 - cursor = db.find('reg_date', {'$gte': date})
38 - return cursor
 37+ editors = []
 38+ start_date = datetime(2010, 9, 1)
 39+ end_date = datetime(2010, 11, 1)
 40+ cursor = db.find('reg_date', {'$gte': start_date, '$lt': end_date})
 41+ for editor in cursor:
 42+ editors.append(editor['username'])
 43+ return editors
3944
4045
 46+def retrieve_variables(obs, username, date):
 47+ data = db.find_one('username', username)
 48+ year = str(date.year)
 49+ month = str(date.month)
 50+ if data:
 51+ revert_count = data['revert_count'].get(year, {}).get(month, {}).get('0', 0)
 52+ character_count = data['character_count'].get(year, {}).get(month, {}).get('0', {}).get('added', 0)
 53+ reg_date = data.get('reg_date', datetime(2001, 1, 1))
 54+ #epoch = time.mktime(reg_date.timetuple())
 55+ cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)
 56+ cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 0)
 57+ article_count = data['article_count'].get(year, {}).get(month, {}).get('0', 0)
 58+
 59+ if character_count + cum_edit_count_main_ns + cum_edit_count_other_ns + article_count > 0:
 60+ #print '%s\t%s\t%s\t%s\t%s' % (username, date, revert_count, character_count, reg_date)
 61+ obs.setdefault(username, {})
 62+ obs[username]['revert_count'] = float(revert_count)
 63+ obs[username]['character_count'] = float(character_count)
 64+ obs[username]['reg_date'] = reg_date #epoch / 86400
 65+ obs[username]['cum_edit_count_main_ns'] = float(cum_edit_count_main_ns)
 66+ obs[username]['cum_edit_count_other_ns'] = float(cum_edit_count_other_ns)
 67+ obs[username]['article_count'] = float(article_count)
 68+ return obs
 69+
 70+
4171 def create_dataset(editors):
4272 obs = {}
43 - print '%s\t%s\t%s\t%s\t%s' % ('username', 'date', 'number of reverts', 'number of characters added', 'registration date')
 73+ #print '%s\t%s\t%s\t%s\t%s' % ('username', 'date', 'number of reverts', 'number of characters added', 'registration date')
 74+ dates = [datetime(2010, 11, 30)] #, datetime(2010, 12, 31)]
4475 for username in editors:
45 - for date in editors[username]:
46 - month = str(date.month)
47 - year = str(date.year)
48 - data = db.find_one('username', username)
49 - if data:
50 - revert_count = data['revert_count'].get(year, {}).get(month, {}).get('0', 0)
51 - character_count = data['character_count'].get(year, {}).get(month, {}).get('0', {}).get('added', 0)
52 - reg_date = data.get('reg_date', datetime(2001, 1, 1))
53 - epoch = time.mktime(reg_date.timetuple())
54 - cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)
55 - cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 0)
56 - article_count = data['article_count'].get(year, {}).get(month, 0)
57 - print '%s\t%s\t%s\t%s\t%s' % (username, date, revert_count, character_count, reg_date)
58 - obs.setdefault(username, {})
59 - obs[username]['revert_count'] = revert_count
60 - obs[username]['character_count'] = character_count
61 - obs[username]['reg_date'] = epoch
62 - obs[username]['cum_edit_count_main_ns'] = cum_edit_count_main_ns
63 - obs[username]['cum_edit_count_other_ns'] = cum_edit_count_other_ns
64 - obs[username]['article_count'] = article_count
 76+ for date in dates:
 77+ obs = retrieve_variables(obs, username, date)
6578 return obs
6679
 80+
6781 def euclidean_distance(vars, person1, person2):
68 - sum_of_squares = sum([pow(person1[item] - person2[item], 2) for item in vars])
69 - return 1 / (1 + sum_of_squares)
 82+ #handle the date variable
 83+ #sum_of_squares = sum([pow(person1[item] - person2[item], 2) for item in vars])
 84+ sum_of_squares = 0.0
 85+ for item in vars:
 86+ if item == 'reg_date' or item == 'revert_count':
 87+ pass
 88+# dt = person1[item] - person2[item]
 89+# dt = log(dt.days) if dt.days > 0 else 0
 90+# sum_of_squares += pow(dt, 2)
 91+ else:
 92+ sum_of_squares += pow(person1[item] - person2[item], 2)
 93+ return 1 / (1 + sqrt(sum_of_squares))
7094
7195
72 -def calculate_distance_matrix(obs_a, obs_b):
73 - vars = ['character_count', 'reg_date', 'cum_edit_count_main_ns', 'cum_edit_count_other_ns', 'article_count']
74 - matches = {}
 96+def calculate_distance_matrix(vars, obs_a, obs_b):
 97+ print 'Constructing distance matrix...'
 98+ distances = {}
7599 for person1 in obs_a:
76100 for person2 in obs_b:
77 - d = euclidean_distance(vars, person1, person2)
78 - matches.setdefault(person1, {})
79 - matches[person1][person2] = d
 101+ if person1 != person2:
 102+ d = euclidean_distance(vars, obs_a[person1], obs_b[person2])
 103+ #print obs_a[person1].values(), obs_b[person2].values(), d
 104+ distances.setdefault(person1, {})
 105+ distances[person1][person2] = d
 106+ return distances
 107+
 108+
 109+def normalize_dataset(vars, obs):
 110+ editors = obs.keys()
 111+ data = []
 112+ for var in vars:
 113+ for editor in editors:
 114+ data.append(obs[editor][var])
 115+ sd = standard_deviation(data)
 116+ for editor in editors:
 117+ try:
 118+ obs[editor][var] = obs[editor][var] / sd
 119+ except ZeroDivisionError:
 120+ obs[editor][var] = 0
 121+ return obs
 122+
 123+
 124+def standard_deviation(data):
 125+ n = len(data)
 126+ values = sum(data)
 127+ sq_values = values * values
 128+ sd = (1.0 / n) * sq_values - (pow((1.0 / n) * values, 2))
 129+ return sd
 130+
 131+
 132+def inverse_dictionary(data):
 133+ return dict((v, k) for k, v in data.iteritems())
 134+
 135+
 136+def find_partner(distances):
 137+ print 'Finding similar partners...'
 138+ matches = []
 139+ ppi_editors = distances.keys()
 140+ for ppi_editor in ppi_editors:
 141+ data = inverse_dictionary(distances[ppi_editor])
 142+ min_d = min(data.keys())
 143+ max_d = max(data.keys())
 144+ match = data[max_d]
 145+ matches.append((ppi_editor, match))
 146+ for editor in distances:
 147+ try:
 148+ distances[editor].pop(match)
 149+ except KeyError:
 150+ pass
 151+ print ppi_editor, match, min_d, max_d
80152 return matches
81153
82 -def find_partner(matches):
83 - pass
84154
 155+def write_dataset(vars, matches, obs_a, obs_b):
 156+ print 'Writing dataset to CSV file...'
 157+ fh = codecs.open('ppi_quality.csv', 'w', 'utf-8')
 158+ fh.write('%s\t' % ('editor_a'))
 159+ fh.write('_a\t'.join(vars))
 160+ fh.write('\t%s\t' % ('editor_b'))
 161+ fh.write('_b\t'.join(vars))
 162+ fh.write('\tdelta registration days\tid\n')
 163+ for i, match in enumerate(matches):
 164+ line = []
 165+ editor_a = match[0]
 166+ editor_b = match[1]
 167+ line.append(editor_a)
 168+ values_a = [str(obs_a[editor_a][v]) for v in vars]
 169+ values_b = [str(obs_b[editor_b][v]) for v in vars]
 170+ line.extend(values_a)
 171+ line.append(editor_b)
 172+ line.extend(values_b)
 173+ dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
 174+ line.append(str(dt.days))
 175+ line.append(str(i))
 176+ line.append('\n')
 177+ print line
 178+ #line = '\t'.join([str(l).decode('utf-8') for l in line])
 179+ line = '\t'.join(line)
 180+ fh.write(line)
 181+ fh.close()
 182+
 183+
85184 def launcher():
 185+ print 'Retrieving datasets...'
 186+ vars = ['character_count', 'reg_date', 'cum_edit_count_main_ns',
 187+ 'cum_edit_count_other_ns', 'article_count', 'revert_count']
86188 editors_a = create_sample_a()
87189 obs_a = create_dataset(editors_a)
 190+ #obs_a = normalize_dataset(vars, obs_a)
88191 editors_b = create_sample_b()
89192 obs_b = create_dataset(editors_b)
90 - matches = calculate_distance_matrix(obs_a, obs_b)
91 - find_partner(matches)
 193+ #obs_b = normalize_dataset(vars, obs_b)
 194+ distances = calculate_distance_matrix(vars, obs_a, obs_b)
 195+ matches = find_partner(distances)
 196+ write_dataset(vars, matches, obs_a, obs_b)
92197
93198
94199 if __name__ == '__main__':