Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py |
— | — | @@ -4,7 +4,8 @@ |
5 | 5 | __version__ = '0.1' |
6 | 6 | |
7 | 7 | from datetime import datetime |
8 | | -from math import pow |
| 8 | +from math import pow, log, sqrt |
| 9 | +import codecs |
9 | 10 | import time |
10 | 11 | import sys |
11 | 12 | import os |
— | — | @@ -32,62 +33,166 @@ |
33 | 34 | |
34 | 35 | |
def create_sample_b():
    """Return the usernames of the editors that make up sample B.

    Editors are selected through ``db.find`` on ``reg_date`` using a
    half-open window: 2010-09-01 (inclusive) up to 2010-11-01 (exclusive).
    """
    window = {'$gte': datetime(2010, 9, 1), '$lt': datetime(2010, 11, 1)}
    return [record['username'] for record in db.find('reg_date', window)]
39 | 44 | |
40 | 45 | |
def retrieve_variables(obs, username, date):
    """Add the matching variables of one editor to ``obs`` and return it.

    The editor is looked up with ``db.find_one``. The monthly counters are
    read from the ``'0'`` entry of the year/month given by ``date``
    (presumably namespace 0 -- confirm against the stored schema).

    An entry is only written when the sum of character_count, both
    cumulative edit counts and article_count is greater than zero.
    ``obs`` is modified in place; ``reg_date`` is stored as found, every
    other variable is converted to float.
    """
    record = db.find_one('username', username)
    year_key = str(date.year)
    month_key = str(date.month)
    if not record:
        return obs

    def monthly(field):
        # Dictionary stored for the requested year/month; {} when absent.
        return record[field].get(year_key, {}).get(month_key, {})

    revert_count = monthly('revert_count').get('0', 0)
    character_count = monthly('character_count').get('0', {}).get('added', 0)
    reg_date = record.get('reg_date', datetime(2001, 1, 1))
    cum_edit_count_main_ns = record.get('cum_edit_count_main_ns', 0)
    cum_edit_count_other_ns = record.get('cum_edit_count_other_ns', 0)
    article_count = monthly('article_count').get('0', 0)

    activity = (character_count + cum_edit_count_main_ns
                + cum_edit_count_other_ns + article_count)
    if activity > 0:
        entry = obs.setdefault(username, {})
        entry['revert_count'] = float(revert_count)
        entry['character_count'] = float(character_count)
        entry['reg_date'] = reg_date
        entry['cum_edit_count_main_ns'] = float(cum_edit_count_main_ns)
        entry['cum_edit_count_other_ns'] = float(cum_edit_count_other_ns)
        entry['article_count'] = float(article_count)
    return obs
| 69 | + |
| 70 | + |
def create_dataset(editors):
    """Build the observation dictionary for the given editors.

    ``retrieve_variables`` is called for every username in ``editors`` and
    every snapshot date (currently only 2010-11-30). The result maps each
    username to its dictionary of variables.
    """
    snapshot_dates = [datetime(2010, 11, 30)]
    observations = {}
    for username in editors:
        for snapshot in snapshot_dates:
            observations = retrieve_variables(observations, username,
                                              snapshot)
    return observations
66 | 79 | |
| 80 | + |
def euclidean_distance(vars, person1, person2):
    """Return a similarity score based on the Euclidean distance.

    The distance is computed over the variables named in ``vars``, except
    'reg_date' and 'revert_count', which are left out. The result is
    ``1 / (1 + distance)``: 1.0 for identical observations, approaching 0
    as they grow apart.
    """
    skipped = ('reg_date', 'revert_count')
    squared_diffs = [pow(person1[name] - person2[name], 2)
                     for name in vars if name not in skipped]
    return 1 / (1 + sqrt(sum(squared_diffs, 0.0)))
70 | 94 | |
71 | 95 | |
def calculate_distance_matrix(vars, obs_a, obs_b):
    """Score every editor in ``obs_a`` against every editor in ``obs_b``.

    Returns a nested dictionary ``{editor_a: {editor_b: score}}`` whose
    scores come from ``euclidean_distance``. Pairs with the same username
    on both sides are skipped.
    """
    print('Constructing distance matrix...')
    matrix = {}
    for name_a in obs_a:
        profile_a = obs_a[name_a]
        for name_b in obs_b:
            if name_a == name_b:
                # Never pair an editor with itself.
                continue
            score = euclidean_distance(vars, profile_a, obs_b[name_b])
            matrix.setdefault(name_a, {})[name_b] = score
    return matrix
| 107 | + |
| 108 | + |
def normalize_dataset(vars, obs):
    """Scale every numeric variable in ``obs`` by its standard deviation.

    ``obs`` maps an editor to a dictionary of variables and is modified in
    place (and returned). For each name in ``vars`` the values of all
    editors are divided by the standard deviation of that variable; when
    the standard deviation is zero the values are set to 0.

    Variables holding non-numeric values (for example 'reg_date', which
    stores datetime objects) cannot be scaled and are left untouched.
    """
    import numbers  # local import keeps this function self-contained

    editors = list(obs)
    if not editors:
        # Nothing to normalize; avoids a standard deviation of no values.
        return obs

    for var in vars:
        # Collect the values of *this* variable only. Reusing one list for
        # all variables would mix their values into every deviation.
        values = [obs[editor][var] for editor in editors]
        if not all(isinstance(value, numbers.Number) for value in values):
            continue
        sd = standard_deviation(values)
        for editor in editors:
            try:
                obs[editor][var] = obs[editor][var] / sd
            except ZeroDivisionError:
                obs[editor][var] = 0
    return obs
| 122 | + |
| 123 | + |
def standard_deviation(data):
    """Return the population standard deviation of ``data``.

    ``data`` is a sequence of numbers. An empty sequence yields 0.0 rather
    than raising ZeroDivisionError.
    """
    n = len(data)
    if n == 0:
        return 0.0
    mean = sum(data) / float(n)
    # Average the squared deviations from the mean, then take the root.
    # (Squaring the *sum* of the values does not yield a variance.)
    variance = sum((value - mean) ** 2 for value in data) / n
    return sqrt(variance)
| 130 | + |
| 131 | + |
def inverse_dictionary(data):
    """Return a new dictionary that maps each value of ``data`` to its key.

    When several keys share the same value only one of them is kept (the
    one that is iterated last).
    """
    swapped = {}
    for key, value in data.items():
        swapped[value] = key
    return swapped
| 134 | + |
| 135 | + |
def find_partner(distances):
    """Greedily pair each editor with its best still-available candidate.

    ``distances`` maps an editor to ``{candidate: score}``. For every
    editor the candidate with the highest score is selected, and a
    candidate can be assigned to one editor only. Editors for which no
    unassigned candidate is left are skipped instead of raising
    ValueError.

    Returns a list of ``(editor, candidate)`` tuples. ``distances`` itself
    is not modified.
    """
    print('Finding similar partners...')
    matches = []
    taken = set()  # candidates that already have a partner
    for ppi_editor in distances:
        available = dict((candidate, score)
                         for candidate, score in distances[ppi_editor].items()
                         if candidate not in taken)
        if not available:
            # More editors than candidates: nothing left to match with.
            continue
        # Pick the best candidate by key; going through an inverted
        # {score: candidate} dictionary would merge candidates that tie.
        match = max(available, key=available.get)
        max_d = available[match]
        min_d = min(available.values())
        taken.add(match)
        matches.append((ppi_editor, match))
        print('%s %s %s %s' % (ppi_editor, match, min_d, max_d))
    return matches
84 | 154 | |
def write_dataset(vars, matches, obs_a, obs_b, filename='ppi_quality.csv'):
    """Write the matched editor pairs to a tab-separated UTF-8 file.

    vars     -- names of the variables to export for both editors
    matches  -- sequence of (editor_a, editor_b) pairs
    obs_a    -- observations of the first sample, keyed by editor
    obs_b    -- observations of the second sample, keyed by editor
    filename -- output path (defaults to 'ppi_quality.csv')

    Every row holds editor_a and its variables, editor_b and its
    variables, the difference between the two registration dates in days
    and a running pair id. Both observation dictionaries must contain a
    'reg_date' entry for every matched editor.
    """
    print('Writing dataset to CSV file...')
    # Suffix every variable name, including the last one, so that the
    # columns of the two editors can be told apart.
    header = ['editor_a']
    header.extend('%s_a' % name for name in vars)
    header.append('editor_b')
    header.extend('%s_b' % name for name in vars)
    header.extend(['delta registration days', 'id'])

    # The with-block closes the file even when a row cannot be written.
    with codecs.open(filename, 'w', 'utf-8') as fh:
        fh.write('\t'.join(header) + '\n')
        for i, match in enumerate(matches):
            editor_a = match[0]
            editor_b = match[1]
            line = [editor_a]
            line.extend(str(obs_a[editor_a][v]) for v in vars)
            line.append(editor_b)
            line.extend(str(obs_b[editor_b][v]) for v in vars)
            dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
            line.append(str(dt.days))
            line.append(str(i))
            print(line)
            # Add the newline after joining; joining it as a field would
            # leave a stray tab at the end of every row.
            fh.write('\t'.join(line) + '\n')
| 182 | + |
| 183 | + |
def launcher():
    """Run the whole pipeline: load both samples, pair editors, write CSV."""
    print('Retrieving datasets...')
    variables = [
        'character_count',
        'reg_date',
        'cum_edit_count_main_ns',
        'cum_edit_count_other_ns',
        'article_count',
        'revert_count',
    ]
    # Both samples are used as retrieved; normalize_dataset is not applied.
    obs_a = create_dataset(create_sample_a())
    obs_b = create_dataset(create_sample_b())
    pairs = find_partner(calculate_distance_matrix(variables, obs_a, obs_b))
    write_dataset(variables, pairs, obs_a, obs_b)
92 | 197 | |
93 | 198 | |
94 | 199 | if __name__ == '__main__': |