r88905 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r88904 | r88905 | r88906
Date: 17:15, 26 May 2011
Author: diederik
Status: deferred
Tags:
Comment: Some overdue commits
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (modified)
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified)
  • /trunk/tools/editor_trends/analyses/plugins/ppi_editor_productivity.py (modified)
  • /trunk/tools/editor_trends/classes/analytics.py (modified)
  • /trunk/tools/editor_trends/classes/storage.py (modified)
  • /trunk/tools/editor_trends/etl/extracter.py (modified)
  • /trunk/tools/editor_trends/etl/kaggle.py (replaced)
  • /trunk/tools/editor_trends/etl/transformer.py (modified)
  • /trunk/tools/editor_trends/etl/variables.py (modified)
  • /trunk/tools/editor_trends/js_scripts/ppi_quality.js (added)
  • /trunk/tools/editor_trends/kaggle/training.py (modified)

Diff

Index: trunk/tools/editor_trends/analyses/plugins/ppi_editor_productivity.py
@@ -29,43 +29,34 @@
3030
3131 edits = editor['character_count']
3232 username = editor['username']
33 - x = 0
3433 try:
3534 added = edits['2010']['11']['0']['added']
36 - x += 1
3735 except KeyError:
38 - added = 2
39 -# try:
40 -# removed = edits['2010']['11']['0']['removed']
41 -# x += 1
42 -# except KeyError:
43 -# removed = 0
 36+ added = 0
 37+ try:
 38+ removed = edits['2010']['11']['0']['removed']
 39+ except KeyError:
 40+ removed = 0
4441
45 -
4642 key = datetime(2010, 11, 30)
4743 if added > 0:
4844 var.add(key, added, {'username': username, 'added': 'added'})
49 -# if removed > 0:
50 -# var.add(key, removed, {'username': username, 'removed': 'removed'})
51 -# var.add(key, x, {'username': username, 'total': 'total'})
 45+ if removed > 0:
 46+ var.add(key, removed, {'username': username, 'removed': 'removed'})
5247
53 - y = 0
5448 try:
5549 added = edits['2010']['12']['0']['added']
56 - y += 1
5750 except KeyError:
58 - added = 4
59 -# try:
60 -# removed = edits['2010']['12']['0']['removed']
61 -# y += 1
62 -# except KeyError:
63 -# removed = 0
 51+ added = 0
 52+ try:
 53+ removed = edits['2010']['12']['0']['removed']
 54+ except KeyError:
 55+ removed = 0
6456
6557 key = datetime(2010, 12, 31)
6658 if added > 0:
6759 var.add(key, added, {'username': username, 'added': 'added'})
68 -# if removed > 0:
69 -# var.add(key, removed, {'username': username, 'removed': 'removed'})
70 -# var.add(key, y, {'username': username, 'total': 'total'})
 60+ if removed > 0:
 61+ var.add(key, removed, {'username': username, 'removed': 'removed'})
7162
7263 return var
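Note: the hunk above re-enables the 'removed' branch and drops the x/y debug counters; the same try/except lookup is then repeated for November and December 2010. A minimal sketch of that per-month pattern, factored into hypothetical helpers (the nested edits[year][month]['0'] layout and the var.add() call come from the diff, the helper names are invented):

    from datetime import datetime

    def monthly_value(edits, year, month, field):
        # Return the 'added' or 'removed' character count for one month,
        # falling back to 0 when the editor has no data for that month.
        try:
            return edits[year][month]['0'][field]
        except KeyError:
            return 0

    def record_month(var, edits, username, year, month, last_day):
        # One datapoint per field per month, skipping empty months.
        key = datetime(int(year), int(month), last_day)
        for field in ('added', 'removed'):
            value = monthly_value(edits, year, month, field)
            if value > 0:
                var.add(key, value, {'username': username, field: field})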
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -153,18 +153,20 @@
154154
155155
156156 ppills = rts.number_of_processes
157 - while ppills > 0:
158 - try:
159 - res = result.get()
160 - if res == True:
161 - pbar.update(pbar.currval + 1)
162 - else:
163 - ppills -= 1
164 - var = res
165 - print 'ppills: %s' % ppills
166 - except Empty:
167 - pass
168 -
 157+ while True:
 158+ while ppills > 0:
 159+ try:
 160+ res = result.get()
 161+ if res == True:
 162+ pbar.update(pbar.currval + 1)
 163+ else:
 164+ ppills -= 1
 165+ var = res
 166+ print ppills
 167+ except Empty:
 168+ pass
 169+ break
 170+ print 'Waiting for tasks...'
169171 tasks.join()
170172
171173 var = reconstruct_observations(var)
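Note: the rewritten loop counts down one "poison pill" per worker before joining the task queue; each worker reports True for every finished task and finally puts its aggregated variable, which doubles as the pill. A standalone sketch of that consumer pattern (the queue and progress-bar objects are stand-ins for the module's own):

    from queue import Empty  # Queue.Empty on Python 2

    def collect_results(result, number_of_processes, pbar):
        # Drain the result queue until every worker has sent its final value.
        ppills = number_of_processes
        var = None
        while ppills > 0:
            try:
                res = result.get(timeout=1)
                if res is True:
                    pbar.update(pbar.currval + 1)  # one task finished
                else:
                    ppills -= 1                    # one worker finished
                    var = res
            except Empty:
                pass
        return var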
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
@@ -1,3 +1,17 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
216 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)'])
317 __email__ = 'dvanliere at gmail dot com'
418 __date__ = '2011-05-17'
@@ -106,6 +120,10 @@
107121
108122
109123 def normalize_dataset(vars, obs):
 124+ '''
 125+ This function rescales a dataset by dividing the observation by the standard
 126+ deviation (which results in a Z-score)
 127+ '''
110128 editors = obs.keys()
111129 data = []
112130 for var in vars:
@@ -142,6 +160,7 @@
143161 max_d = max(data.keys())
144162 match = data[max_d]
145163 matches.append((ppi_editor, match))
 164+ #remove match to make sure that every matched pair is unique
146165 for editor in distances:
147166 try:
148167 distances[editor].pop(match)
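Note: the new normalize_dataset docstring describes rescaling observations by the standard deviation to obtain Z-scores. A self-contained illustration of that transformation, independent of the module's data structures (the usual Z-score also subtracts the mean before dividing):

    from math import sqrt

    def zscores(values):
        # Center on the mean and divide by the standard deviation,
        # so the result has mean 0 and unit variance.
        n = len(values)
        mean = sum(values) / float(n)
        sd = sqrt(sum((x - mean) ** 2 for x in values) / float(n))
        return [(x - mean) / sd for x in values]

    print(zscores([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]))
    # -> [-1.5, -0.5, -0.5, -0.5, 0.0, 0.0, 1.0, 2.0]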
Index: trunk/tools/editor_trends/js_scripts/ppi_quality.js
@@ -0,0 +1,126 @@
 2+var editors = new Array();
 3+// Editors from December 2010
 4+editors[0] = "IR393.sae211";
 5+editors[1] = "Marian Sokolski";
 6+editors[2] = "SoAuthentic";
 7+editors[3] = "IR393davis";
 8+editors[4] = "Mulforel";
 9+editors[5] = "IR393ANDRICA";
 10+editors[6] = "Richharriott";
 11+editors[7] = "Prahalika";
 12+editors[8] = "IR393.awc211";
 13+editors[9] = "IR393TheSituation";
 14+editors[10] = "IR393harrisonkatz";
 15+editors[11] = "Kcahlber";
 16+editors[12] = "Elangate";
 17+editors[13] = "GWcontributor";
 18+editors[14] = "Dounoannick";
 19+editors[15] = "IR393DEME";
 20+editors[16] = "Teklegam";
 21+editors[17] = "Lbellows";
 22+editors[18] = "Coogan630";
 23+editors[19] = "Clairestum";
 24+editors[20] = "Hawkinjw";
 25+editors[21] = "Labrador70";
 26+editors[22] = "Seaver02";
 27+editors[23] = "Loflinrm";
 28+editors[24] = "Kcsl";
 29+editors[25] = "Farleyeye";
 30+editors[26] = "Eva YFL";
 31+editors[27] = "IR393Sadar";
 32+editors[28] = "Creyjons";
 33+editors[29] = "Okeland";
 34+editors[30] = "Amongst no roses";
 35+editors[31] = "Legin-gross-drawkcab";
 36+editors[32] = "Air3drew";
 37+editors[33] = "Owesetar";
 38+editors[34] = "Jmsheats";
 39+editors[35] = "Pearycrates";
 40+editors[36] = "Rabesque";
 41+editors[37] = "Hathornt";
 42+editors[38] = "Gsrogers";
 43+editors[39] = "Surfertk";
 44+editors[40] = "Evansza1";
 45+editors[41] = "Snyde2bd";
 46+editors[42] = "Ace of Raves";
 47+editors[43] = "IR393DrewGolding";
 48+editors[44] = "Rothboy2";
 49+editors[45] = "Ironman340";
 50+editors[46] = "Elhugheszete";
 51+editors[47] = "JanTan825";
 52+editors[48] = "New Potato Caboose";
 53+editors[49] = "Irishmeadow";
 54+editors[50] = "Tannerbk";
 55+editors[51] = "Luckbethislady";
 56+editors[52] = "1Ridwan";
 57+editors[53] = "Policydude";
 58+editors[54] = "Bmw 1986";
 59+editors[55] = "IR393ldc211";
 60+editors[56] = "Atb2393";
 61+editors[57] = "Goldstein2020";
 62+editors[58] = "Srayburn";
 63+editors[59] = "Yawloco";
 64+editors[60] = "Al-Jahweri";
 65+editors[61] = "Speon";
 66+editors[62] = "IR393aes";
 67+editors[63] = "Dohanian89";
 68+editors[64] = "Tessitjp";
 69+editors[65] = "Nerdpenguin";
 70+editors[66] = "Mikewuh";
 71+editors[67] = "Naj87";
 72+editors[68] = "Harrimel";
 73+editors[69] = "Contribute10";
 74+editors[70] = "Ramacu";
 75+editors[71] = "Kaloryth";
 76+editors[72] = "Qiaochina";
 77+editors[73] = "CBCrookham";
 78+editors[74] = "Saxa228";
 79+editors[75] = "Adw7";
 80+editors[76] = "Jraytram";
 81+editors[77] = "Indigoandcerise";
 82+editors[78] = "Clafoutis";
 83+editors[79] = "Klhrdy";
 84+editors[80] = "TomSannicandro";
 85+editors[81] = "Wkrantz";
 86+editors[82] = "Padmin22";
 87+editors[83] = "Alexvonzu";
 88+editors[84] = "Klcai";
 89+editors[85] = "Bridoc";
 90+editors[86] = "Feuchtcc";
 91+editors[87] = "Karthik Jagadeesh";
 92+editors[88] = "Smj39";
 93+editors[89] = "Xavier Peniche";
 94+editors[90] = "Ka Yaffa";
 95+editors[91] = "Jysg23";
 96+editors[92] = "Wsko.ko";
 97+editors[93] = "Psyoon";
 98+editors[94] = "Wvhoya";
 99+editors[95] = "Act25";
 100+editors[96] = "Hpl1981";
 101+editors[97] = "Jacqueline F";
 102+editors[98] = "IR393.cfc211";
 103+editors[99] = "Joko123nm";
 104+editors[100] = "Kmac1986";
 105+editors[101] = "Amfarr21";
 106+editors[102] = "IR393Anjan";
 107+
 108+
 109+var date1 = new Date();
 110+date1.setFullYear(2010,12,31);
 111+d = {"date1": date1};
 112+
 113+var editors = {"IR393.sae211": d,"Marian Sokolski": d,"SoAuthentic": d,"IR393davis": d,"Mulforel": d,"IR393ANDRICA": d,"Richharriott": d,"Prahalika": d,"IR393.awc211": d,"IR393TheSituation": d,"IR393harrisonkatz": d,"Kcahlber": d,"Elangate": d,"GWcontributor": d,"Dounoannick": d,"IR393DEME": d,"Teklegam": d,"Lbellows": d,"Coogan630": d,"Clairestum": d,"Hawkinjw": d,"Labrador70": d,"Seaver02": d,"Loflinrm": d,"Kcsl": d,"Farleyeye": d,"Eva YFL": d,"IR393Sadar": d,"Creyjons": d,"Okeland": d,"Amongst no roses": d,"Legin-gross-drawkcab": d,"Air3drew": d,"Owesetar": d,"Jmsheats": d,"Pearycrates": d,"Rabesque": d,"Hathornt": d,"Gsrogers": d,"Surfertk": d,"Evansza1": d,"Snyde2bd": d,"Ace of Raves": d,"IR393DrewGolding": d,"Rothboy2": d,"Ironman340": d,"Elhugheszete": d,"JanTan825": d,"New Potato Caboose": d,"Irishmeadow": d,"Tannerbk": d,"Luckbethislady": d,"1Ridwan": d,"Policydude": d,"Bmw 1986": d,"IR393ldc211": d,"Atb2393": d,"Goldstein2020": d,"Srayburn": d,"Yawloco": d,"Al-Jahweri": d,"Speon": d,"IR393aes": d,"Dohanian89": d,"Tessitjp": d,"Nerdpenguin": d,"Mikewuh": d,"Naj87": d,"Harrimel": d,"Contribute10": d,"Ramacu": d,"Kaloryth": d,"Qiaochina": d,"CBCrookham": d,"Saxa228": d,"Adw7": d,"Jraytram": d,"Indigoandcerise": d,"Clafoutis": d,"Klhrdy": d,"TomSannicandro": d,"Wkrantz": d,"Padmin22": d,"Alexvonzu": d,"Klcai": d,"Bridoc": d,"Feuchtcc": d,"Karthik Jagadeesh": d,"Smj39": d,"Xavier Peniche": d,"Ka Yaffa": d,"Jysg23": d,"Wsko.ko": d,"Psyoon": d,"Wvhoya": d,"Act25": d,"Hpl1981": d,"Jacqueline F": d,"IR393.cfc211": d,"Joko123nm": d,"Kmac1986": d,"Amfarr21": d,"IR393Anjan": d};
 114+
 115+for (var username in editors) {
 116+ for (var obsdate in editors[username]) {
 117+ print(obsdate);
 118+ }
 119+}
 120+ print(username);
 121+ //var reverts = db.enwiki_editors_dataset.findOne({"username": username}, {"revert_count": 1});
 122+ //for (obsdate in editors[username]) {
 123+ //month = date.getMonth();
 124+ //year = date.getYear();
 125+ // print(month, year);
 126+ // }
 127+}
\ No newline at end of file
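Note: the new shell script mainly hard-codes the PPI usernames and observation dates; the commented-out part hints at looking up revert_count per editor in enwiki_editors_dataset. A rough pymongo equivalent of that intent (the database name and connection details are guesses; only the collection name comes from the script):

    from pymongo import MongoClient

    coll = MongoClient()['wikilytics']['enwiki_editors_dataset']  # db name is a guess

    usernames = ['IR393.sae211', 'Marian Sokolski', 'SoAuthentic']  # truncated sample
    for username in usernames:
        doc = coll.find_one({'username': username}, {'revert_count': 1})
        if doc:
            print(username, doc.get('revert_count'))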
Index: trunk/tools/editor_trends/etl/variables.py
@@ -254,6 +254,7 @@
255255 past_revert['reverted_contributor'] = -1
256256 return past_revert
257257
 258+
258259 def is_revision_reverted(hash_cur, hashes):
259260 '''
260261 Determine whether an edit was reverted or not based on md5 hashes
@@ -314,7 +315,7 @@
315316 '''
316317 This function determines the xml_namespace version
317318 '''
318 - for elem in siteinfo :
 319+ for elem in siteinfo:
319320 if elem.tag.endswith('sitename'):
320321 xml_namespace = elem.tag
321322 pos = xml_namespace.find('sitename')
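Note: is_revision_reverted, touched only by whitespace here, flags a revision as reverted when its text hash reappears later in the page history. A minimal sketch of md5-based revert detection (names and data layout are illustrative, not the module's own):

    import hashlib

    def revision_hash(text):
        # Fingerprint of the revision text; identical text gives an identical hash.
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def find_reverts(revision_texts):
        # A later revision that restores previously seen text undoes everything
        # in between, so those intermediate revisions are marked as reverted.
        seen = {}
        reverted = set()
        for i, text in enumerate(revision_texts):
            h = revision_hash(text)
            if h in seen:
                reverted.update(range(seen[h] + 1, i))
            seen[h] = i
        return reverted

    print(find_reverts(['a', 'b', 'c', 'a']))  # -> {1, 2}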
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -133,7 +133,7 @@
134134 '''
135135 This function determines the title of an article and the
136136 namespace to which it belongs. Then, if the namespace is one
137 - which we are interested set parse to True so that we start
 137+ which we are interested in set parse to True so that we start
138138 parsing this article, else it will skip this article.
139139 '''
140140 title = variables.parse_title(elem)
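Note: the corrected docstring describes deciding from an article's title and namespace whether to start parsing it. A toy version of that gate (the namespace prefixes and the "main namespace only" choice are assumptions, not the module's actual configuration):

    NAMESPACE_PREFIXES = {'Talk': 1, 'User': 2, 'User talk': 3}

    def should_parse(title, included_namespaces={0}):
        # True when the title belongs to a namespace we want to parse.
        prefix, sep, _ = title.partition(':')
        ns = NAMESPACE_PREFIXES.get(prefix, 0) if sep else 0
        return ns in included_namespaces

    print(should_parse('Python (programming language)'))       # True
    print(should_parse('Talk:Python (programming language)'))  # False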
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -179,6 +179,7 @@
180180 dc = cleanup_datacontainer(dc, {})
181181 return dc
182182
 183+
183184 def calculate_cum_edits(edits):
184185 cum_edit_count_main_ns = 0
185186 cum_edit_count_other_ns = 0
@@ -191,6 +192,7 @@
192193
193194 return cum_edit_count_main_ns, cum_edit_count_other_ns
194195
 196+
195197 def determine_articles_workedon(edits, first_year, final_year):
196198 '''
197199 This function creates a list of article_ids that an editor has worked on in
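Note: for orientation, calculate_cum_edits tallies an editor's cumulative edit counts for the main namespace versus all other namespaces. A simplified stand-alone version (the shape of edits is an assumption inferred from the function's name and return values):

    def calculate_cum_edits(edits):
        # edits is assumed to map year -> list of {'ns': int, ...} observations.
        cum_main, cum_other = 0, 0
        for year in edits:
            for edit in edits[year]:
                if edit.get('ns') == 0:
                    cum_main += 1
                else:
                    cum_other += 1
        return cum_main, cum_other

    print(calculate_cum_edits({'2010': [{'ns': 0}, {'ns': 0}, {'ns': 4}]}))  # (2, 1)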
Index: trunk/tools/editor_trends/kaggle/training.py
@@ -1,25 +1,83 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-12'
 19+__version__ = '0.1'
 20+
221 import codecs
322 import os
 23+from datetime import datetime
 24+import json
425
 26+location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction'
 27+files = os.listdir(location)
 28+files.reverse()
 29+dataset = codecs.open('training.tsv', 'w', 'utf-8')
 30+t0 = datetime.now()
 31+max_size = 2147483648
 32+titles = {}
 33+ids = set()
 34+size = 0
 35+cnt_obs = 0
 36+max_size_reached = False
537
 38+for filename in files:
 39+ if not filename.startswith('comments') and not filename.startswith('articles'):
 40+ fh = codecs.open(os.path.join(location, filename))
 41+ if max_size_reached == True:
 42+ break
 43+ for line in fh:
 44+ line = line.strip()
 45+ line = line.split('\t')
 46+ if len(line) != 12:
 47+ continue
 48+ if line[10] == '1':
 49+ continue
 50+ username = line[3].lower()
 51+ if username.endswith('bot'):
 52+ #line[10] = '1'
 53+ continue
 54+ cnt_obs += 1
 55+ title_id = line[1]
 56+ ids.add(line[2])
 57+ title = line.pop(5)
 58+ titles[title_id] = title
 59+ line.append('\n')
 60+ line = '\t'.join(line)
 61+ size += len(line)
 62+ if size > max_size:
 63+ max_size_reached = True
 64+ dataset.write(line.decode('utf-8'))
665
7 -location = '/home/diederik/wikimedia/wikilytics/en/wiki/txt'
8 -files = os.listdir(location)
 66+dataset.close()
967
10 -output = codecs.open('training.txt', 'w', 'utf-8')
 68+fh = codecs.open('titles.tsv', 'w', 'utf-8')
 69+for id, title in titles.iteritems():
 70+ fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
 71+fh.close()
1172
12 -for filename in files:
13 - fh = codecs.open(os.path.join(location, filename))
14 - for line in fh:
15 - line = line.strip()
16 - line = line.split('\t')
17 - if len(line) != 13:
18 - continue
19 - username = line[12].lower()
20 - if username.endswith('bot'):
21 - line[5] = 1
22 - line = '\t'.join(line)
23 - output.write(line)
24 -
25 -
26 -output.close()
\ No newline at end of file
 73+fh = codecs.open('ids.json', 'w', 'utf-8')
 74+json.dump(ids, fh)
 75+#for id in ids:
 76+#fh.write('%s\n' % (id.decode('utf-8')))
 77+#fh.write('%s\n' % (json.du)
 78+fh.close()
 79+
 80+t1 = datetime.now()
 81+print 'Descriptives:\n'
 82+print 'Number of editors: %s' % len(ids)
 83+print 'Number of edits: %s' % cnt_obs
 84+print 'It took %s to construct the Kaggle training set' % (t1 - t0)
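Note: the new training-set script collects editor ids in a set and hands that set straight to json.dump, but the standard json module has no encoder for sets, so a list (or sorted list) has to be dumped instead. A minimal sketch of that last step, reusing the file name from the diff:

    import codecs
    import json

    ids = {'1234', '5678'}  # stand-in for the ids collected above

    fh = codecs.open('ids.json', 'w', 'utf-8')
    json.dump(sorted(ids), fh)  # sets are not JSON-serializable, lists are
    fh.close()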
Index: trunk/tools/editor_trends/classes/storage.py
@@ -86,7 +86,7 @@
8787 '''Find multiple observations in a collection'''
8888
8989 @abstractmethod
90 - def find_one(self, key, value):
 90+ def find_one(self, key, value, var=False):
9191 '''Find a single observation in a collection'''
9292
9393 @abstractmethod
@@ -145,7 +145,7 @@
146146
147147 def update(self, key, value, data):
148148 assert isinstance(data, dict), 'You need to feed me dictionaries.'
149 - self.db[self.collection].update({key: value}, data, upsert=True)
 149+ self.db[self.collection].update({key: value}, {'$set': data})
150150
151151 def find(self, key=None, qualifier=None):
152152 if qualifier == 'min':
@@ -154,14 +154,24 @@
155155 elif qualifier == 'max':
156156 return self.db[self.collection].find({
157157 key : {'$ne' : False}}).sort(key, pymongo.DESCENDING).limit(1)[0]
 158+ elif qualifier:
 159+ return self.db[self.collection].find({key : qualifier})
158160 elif key != None:
159161 return self.db[self.collection].find({}, fields=[key])
160162 else:
161163 return self.db[self.collection].find()
162164
163 - def find_one(self, key, value):
164 - return self.db[self.collection].find_one({key: value})
 165+ def find_one(self, key, value, vars=None):
 166+ if vars:
 167+ #if you only want to retrieve a specific variable(s) then you need to
 168+ #specify vars, if vars is None then you will get the entire BSON object
 169+ vars = vars.split(',')
 170+ vars = dict([(var, 1) for var in vars])
 171+ return self.db[self.collection].find_one({key: value}, vars)
 172+ else:
 173+ return self.db[self.collection].find_one({key: value})
165174
 175+
166176 def drop_collection(self):
167177 self.db.drop_collection(self.collection)
168178
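Note: the reworked find_one builds a MongoDB projection from a comma-separated list of variable names, and update now uses $set so only the supplied fields are modified rather than replacing whole documents. A standalone sketch of both calls with modern pymongo (connection details and collection name are placeholders):

    from pymongo import MongoClient

    coll = MongoClient()['wikilytics']['enwiki_editors_dataset']  # placeholder names

    def find_one(key, value, vars=None):
        # With vars='a,b' only fields a and b come back; otherwise the full document.
        if vars:
            projection = dict((v, 1) for v in vars.split(','))
            return coll.find_one({key: value}, projection)
        return coll.find_one({key: value})

    def update(key, value, data):
        # Set only the fields in data on the matching document (no upsert).
        assert isinstance(data, dict), 'You need to feed me dictionaries.'
        coll.update_one({key: value}, {'$set': data})

    # usage:
    # doc = find_one('username', 'Clafoutis', vars='revert_count,character_count')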
Index: trunk/tools/editor_trends/classes/analytics.py
@@ -92,7 +92,6 @@
9393 project and then calls the plugin that does the actual mapping.
9494 '''
9595 db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_dataset)
96 - x = 0
9796 while True:
9897 try:
9998 editor_id = self.tasks.get(block=False)