r88349 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r88348‎ | r88349 | r88350 >
Date:20:50, 17 May 2011
Author:diederik
Status:deferred
Tags:
Comment:
Compare quality of PPI editors vs. representative sample using a matching algorithm
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
@@ -0,0 +1,94 @@
# Module metadata. A single author needs no join(); the value is the same
# string the original produced by joining a one-element list.
__author__ = 'Diederik van Liere (dvanliere@gmail.com)'
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-05-17'
__version__ = '0.1'
 6+
 7+from datetime import datetime
 8+from math import pow
 9+import time
 10+import sys
 11+import os
 12+sys.path.append('../../')
 13+
 14+from classes import settings
 15+from classes import storage
 16+
# Runtime settings object; this script only reads ``rts.csv_location``.
rts = settings.Settings()

# Handle used for every lookup below (``db.find`` / ``db.find_one``).
# NOTE(review): presumably backend 'mongo', database 'wikilytics' and
# collection 'enwiki_editors_dataset' -- confirm against
# storage.init_database.
db = storage.init_database(
    'mongo',
    'wikilytics',
    'enwiki_editors_dataset'
)
 19+
def create_sample_a():
    """Load the PPI editor sample from ``ppi_editors.csv``.

    The file is read from ``rts.csv_location`` and is expected to hold one
    tab-separated record per line: ``username<TAB>chars<TAB>YYYY-MM-DD``.
    The ``chars`` column is not used by this analysis and is ignored.

    Returns:
        dict mapping each username to the list of ``datetime`` objects
        parsed from its rows, in file order.

    Raises:
        IOError: if the CSV file cannot be opened.
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields or its date is not ``%Y-%m-%d``.
    """
    editors = {}
    location = os.path.join(rts.csv_location, 'ppi_editors.csv')
    # ``with`` guarantees the handle is closed even when a row fails to parse.
    with open(location, 'r') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            username, _chars, date = line.split('\t')
            date = datetime.strptime(date, '%Y-%m-%d')
            editors.setdefault(username, []).append(date)
    return editors
 33+
 34+
def create_sample_b():
    """Fetch the comparison sample from the editors dataset.

    Calls ``db.find('reg_date', {'$gte': <2010-06-30>})`` and returns
    whatever that call yields, unchanged.

    NOTE(review): presumably this selects editors whose registration date
    is on or after 30 June 2010 and returns a database cursor -- confirm
    against the storage wrapper.
    """
    cutoff = datetime(2010, 6, 30)
    return db.find('reg_date', {'$gte': cutoff})
 39+
 40+
def create_dataset(editors):
    """Build one observation per editor and echo a TSV row per (editor, date).

    For every username in ``editors`` the matching document is looked up via
    ``db.find_one('username', username)``. Usernames without a document are
    skipped. For each date of a known editor a tab-separated line is printed
    to stdout, and the editor's observation is (re)written, so when an editor
    has several dates the values of the last date are the ones retained.

    Args:
        editors: mapping of username -> iterable of ``datetime`` objects
            (the shape returned by ``create_sample_a``).

    Returns:
        dict mapping username -> dict with the keys ``revert_count``,
        ``character_count``, ``reg_date`` (seconds since the epoch, via
        ``time.mktime``), ``cum_edit_count_main_ns``,
        ``cum_edit_count_other_ns`` and ``article_count``.

    Raises:
        KeyError: if a document lacks ``revert_count``, ``character_count``
            or ``article_count``.
    """
    obs = {}
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('%s\t%s\t%s\t%s\t%s' % ('username', 'date', 'number of reverts',
                                  'number of characters added',
                                  'registration date'))
    for username in editors:
        # The document does not depend on the date, so fetch it once per
        # editor instead of once per (editor, date) pair.
        data = db.find_one('username', username)
        if not data:
            continue

        # Date-independent fields.
        reg_date = data.get('reg_date', datetime(2001, 1, 1))
        epoch = time.mktime(reg_date.timetuple())
        cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)
        cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 0)

        for date in editors[username]:
            # The per-period counters are keyed by stringified year/month.
            month = str(date.month)
            year = str(date.year)
            # The '0' key is read at the innermost level.
            # NOTE(review): presumably the main namespace -- confirm
            # against the dataset schema.
            revert_count = (data['revert_count']
                            .get(year, {}).get(month, {}).get('0', 0))
            character_count = (data['character_count']
                               .get(year, {}).get(month, {})
                               .get('0', {}).get('added', 0))
            article_count = data['article_count'].get(year, {}).get(month, 0)

            print('%s\t%s\t%s\t%s\t%s' % (username, date, revert_count,
                                          character_count, reg_date))
            # Overwrites any earlier date for this editor: last date wins.
            obs[username] = {
                'revert_count': revert_count,
                'character_count': character_count,
                'reg_date': epoch,
                'cum_edit_count_main_ns': cum_edit_count_main_ns,
                'cum_edit_count_other_ns': cum_edit_count_other_ns,
                'article_count': article_count,
            }
    return obs
 66+
def euclidean_distance(vars, person1, person2):
    """Return a similarity score derived from the squared Euclidean distance.

    Despite the name, the result is ``1 / (1 + sum of squared differences)``
    (no square root is taken): 1.0 when the two observations agree on every
    variable, approaching 0 as they diverge.

    Args:
        vars: iterable of keys to compare.
        person1: mapping of variable name -> numeric value.
        person2: mapping of variable name -> numeric value.

    Returns:
        float in the interval (0, 1].

    Raises:
        KeyError: if either observation lacks one of ``vars``.
    """
    sum_of_squares = sum(pow(person1[item] - person2[item], 2)
                         for item in vars)
    # Float literals keep this a true division on Python 2 even when
    # ``vars`` is empty and the sum is the integer 0.
    return 1.0 / (1.0 + sum_of_squares)


def calculate_distance_matrix(obs_a, obs_b):
    """Score every editor in ``obs_a`` against every editor in ``obs_b``.

    Args:
        obs_a: mapping username -> observation dict (see ``create_dataset``).
        obs_b: mapping username -> observation dict.

    Returns:
        Nested dict ``matches[name_a][name_b] = score`` where ``score`` is
        the value of ``euclidean_distance`` for the two observations.
        An editor of ``obs_a`` only gets an entry when ``obs_b`` is
        non-empty.

    NOTE(review): the variables are compared on their raw scales, so
    ``reg_date`` (seconds since the epoch) will likely dominate the score --
    consider normalising before matching.
    """
    vars = ['character_count', 'reg_date', 'cum_edit_count_main_ns',
            'cum_edit_count_other_ns', 'article_count']
    matches = {}
    for name_a, features_a in obs_a.items():
        for name_b, features_b in obs_b.items():
            # Compare the observation dicts, not the username keys: indexing
            # a username string with a variable name raises TypeError.
            score = euclidean_distance(vars, features_a, features_b)
            matches.setdefault(name_a, {})[name_b] = score
    return matches
 81+
def find_partner(matches):
    """Placeholder for the matching step; not implemented yet.

    Args:
        matches: nested score dict as produced by
            ``calculate_distance_matrix``. Currently ignored.

    Returns:
        None.
    """
    return None
 84+
def launcher():
    """Run the ad-hoc comparison end to end.

    Builds the observations for sample A and then for sample B, scores
    every A/B pair and hands the score matrix to ``find_partner``.

    NOTE(review): ``create_sample_b`` returns the raw result of ``db.find``
    while ``create_dataset`` indexes its argument by username -- confirm
    that this result supports that access pattern.
    """
    # Sample A is built from the CSV file.
    obs_a = create_dataset(create_sample_a())
    # Sample B is built from the database query.
    obs_b = create_dataset(create_sample_b())
    find_partner(calculate_distance_matrix(obs_a, obs_b))


if __name__ == '__main__':
    launcher()
Property changes on: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
___________________________________________________________________
Added: svn:eol-style
196 + native