Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py |
— | — | @@ -0,0 +1,94 @@ |
# Module metadata: authorship, contact address, creation date and version.
__author__ = '''\n'''.join([
    'Diederik van Liere (dvanliere@gmail.com)',
])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-05-17'
__version__ = '0.1'
| 6 | + |
| 7 | +from datetime import datetime |
| 8 | +from math import pow |
| 9 | +import time |
| 10 | +import sys |
| 11 | +import os |
| 12 | +sys.path.append('../../') |
| 13 | + |
| 14 | +from classes import settings |
| 15 | +from classes import storage |
| 16 | + |
# Shared runtime settings object; ``rts.csv_location`` is read by
# create_sample_a() to locate the input CSV.
rts = settings.Settings()

# Module-wide storage handle used by every query in this file.
# NOTE(review): the arguments look like (backend, database name, collection
# name) -- confirm against storage.init_database.
db = storage.init_database(
    'mongo',
    'wikilytics',
    'enwiki_editors_dataset',
)
| 19 | + |
def create_sample_a():
    """Load sample A from ``ppi_editors.csv``.

    The file is looked up under ``rts.csv_location`` and is read as one
    tab-separated record per line with three fields: ``username``,
    ``chars`` and ``date`` (formatted ``YYYY-MM-DD``). Blank lines are
    skipped.

    Returns:
        dict mapping each username to the list of ``datetime`` objects
        parsed for that user, in file order.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields, if ``chars`` is not an integer, or if
            the date does not match ``%Y-%m-%d``.
        IOError: if the file cannot be opened.
    """
    editors = {}
    location = os.path.join(rts.csv_location, 'ppi_editors.csv')
    # ``with`` guarantees the handle is closed even when a malformed line
    # raises part-way through the file.
    with open(location, 'r') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at EOF)
                # instead of failing on the three-way unpack below.
                continue
            username, chars, date = line.split('\t')
            # The character count is not used further; converting it still
            # rejects records whose second column is not an integer.
            int(chars)
            date = datetime.strptime(date, '%Y-%m-%d')
            editors.setdefault(username, []).append(date)
    return editors
| 33 | + |
| 34 | + |
def create_sample_b():
    """Query the storage layer for sample B.

    Calls ``db.find`` on the ``reg_date`` field with a ``$gte`` filter
    for 30 June 2010 and returns the result unchanged.
    NOTE(review): presumably this selects editors registered on or after
    that date -- confirm against the storage wrapper's ``find`` semantics.
    """
    cutoff = datetime(2010, 6, 30)
    criteria = {'$gte': cutoff}
    return db.find('reg_date', criteria)
| 39 | + |
| 40 | + |
def create_dataset(editors):
    """Print and collect per-editor observations for the given editors.

    For every username in ``editors`` the matching record is fetched once
    with ``db.find_one('username', username)``. Users without a record are
    skipped. For each date listed for the user, one tab-separated row
    (username, date, revert count, characters added, registration date) is
    printed to stdout and the user's observation is (re)written.

    Args:
        editors: mapping of username to an iterable of ``datetime``-like
            objects (``.year`` and ``.month`` are read).

    Returns:
        dict mapping username to a dict with the keys ``revert_count``,
        ``character_count``, ``reg_date`` (seconds since the epoch as
        returned by ``time.mktime``), ``cum_edit_count_main_ns``,
        ``cum_edit_count_other_ns`` and ``article_count``.
        When a user has several dates, only the values for the last date
        processed are retained.

    Raises:
        KeyError: if a fetched record lacks ``revert_count``,
            ``character_count`` or ``article_count``.
    """
    obs = {}
    header = ('username', 'date', 'number of reverts',
              'number of characters added', 'registration date')
    # A single pre-formatted argument in parentheses prints identically
    # under the Python 2 print statement and the Python 3 print function.
    print('%s\t%s\t%s\t%s\t%s' % header)
    for username in editors:
        # The record does not depend on the date, so fetch it once per
        # user rather than once per (user, date) pair.
        data = db.find_one('username', username)
        if not data:
            continue

        # Per-user values, independent of the date being processed.
        reg_date = data.get('reg_date', datetime(2001, 1, 1))
        epoch = time.mktime(reg_date.timetuple())
        cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)
        cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 0)

        for date in editors[username]:
            month = str(date.month)
            year = str(date.year)
            # Counters are nested as [year][month][...]; missing levels
            # fall back to zero.
            # NOTE(review): the '0' key presumably denotes the main
            # namespace -- confirm against the dataset schema.
            revert_count = data['revert_count'].get(year, {}).get(month, {}).get('0', 0)
            character_count = data['character_count'].get(year, {}).get(month, {}).get('0', {}).get('added', 0)
            article_count = data['article_count'].get(year, {}).get(month, 0)
            print('%s\t%s\t%s\t%s\t%s' % (username, date, revert_count, character_count, reg_date))
            obs[username] = {
                'revert_count': revert_count,
                'character_count': character_count,
                'reg_date': epoch,
                'cum_edit_count_main_ns': cum_edit_count_main_ns,
                'cum_edit_count_other_ns': cum_edit_count_other_ns,
                'article_count': article_count,
            }
    return obs
| 66 | + |
def euclidean_distance(vars, person1, person2):
    """Score how alike two observations are across the fields in ``vars``.

    Despite the name, the result is a similarity rather than a distance:
    it equals ``1 / (1 + s)``, where ``s`` is the sum of the squared
    differences between ``person1`` and ``person2`` over every field in
    ``vars`` (no square root is taken). Identical observations score 1
    and the score shrinks towards 0 as they diverge.

    Args:
        vars: iterable of keys to compare.
        person1: mapping holding a numeric value for every key in ``vars``.
        person2: mapping holding a numeric value for every key in ``vars``.

    Returns:
        The similarity score described above.

    Raises:
        KeyError: if either mapping lacks one of the keys.
    """
    total = 0
    for key in vars:
        # ``pow`` is math.pow here, so every term is a float.
        total += pow(person1[key] - person2[key], 2)
    return 1 / (1 + total)
| 70 | + |
| 71 | + |
def calculate_distance_matrix(obs_a, obs_b):
    """Score every editor in ``obs_a`` against every editor in ``obs_b``.

    Args:
        obs_a: mapping of username to an observation dict containing the
            compared fields listed in the body.
        obs_b: mapping of the same shape for the second sample.

    Returns:
        Nested dict ``matches[user_a][user_b]`` holding the value returned
        by ``euclidean_distance`` for that pair. Users from ``obs_a`` only
        appear when ``obs_b`` is non-empty.
    """
    fields = ['character_count', 'reg_date', 'cum_edit_count_main_ns', 'cum_edit_count_other_ns', 'article_count']
    matches = {}
    for person1, features1 in obs_a.items():
        for person2, features2 in obs_b.items():
            # Compare the observation dicts, not the usernames: indexing a
            # username string with a field name raises TypeError.
            d = euclidean_distance(fields, features1, features2)
            matches.setdefault(person1, {})[person2] = d
    return matches
| 81 | + |
def find_partner(matches):
    """Placeholder for choosing a partner from ``matches``.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None
| 84 | + |
def launcher():
    """Run the whole ad-hoc pipeline.

    Builds the observations for sample A and then for sample B, scores
    every A/B pair, and passes the resulting matrix to ``find_partner``.
    Nothing is returned.
    """
    # NOTE(review): create_sample_b() returns the result of db.find(),
    # whereas create_dataset() looks up ``editors[username]`` for each item
    # it iterates over -- confirm that object supports this usage.
    obs_a = create_dataset(create_sample_a())
    obs_b = create_dataset(create_sample_b())
    find_partner(calculate_distance_matrix(obs_a, obs_b))
| 92 | + |
| 93 | + |
# Run the pipeline only when this file is executed as a script; importing
# the module does not start it.
if __name__ == '__main__':
    launcher()
Property changes on: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 96 | + native |