r88905 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r88904 | r88905 | r88906
Date: 17:15, 26 May 2011
Author: diederik
Status: deferred
Tags:
Comment: Some overdue commits
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (modified)
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified)
  • /trunk/tools/editor_trends/analyses/plugins/ppi_editor_productivity.py (modified)
  • /trunk/tools/editor_trends/classes/analytics.py (modified)
  • /trunk/tools/editor_trends/classes/storage.py (modified)
  • /trunk/tools/editor_trends/etl/extracter.py (modified)
  • /trunk/tools/editor_trends/etl/kaggle.py (replaced)
  • /trunk/tools/editor_trends/etl/transformer.py (modified)
  • /trunk/tools/editor_trends/etl/variables.py (modified)
  • /trunk/tools/editor_trends/js_scripts/ppi_quality.js (added)
  • /trunk/tools/editor_trends/kaggle/training.py (modified)

Diff

Index: trunk/tools/editor_trends/analyses/plugins/ppi_editor_productivity.py
@@ -29,43 +29,34 @@
3030
3131 edits = editor['character_count']
3232 username = editor['username']
33 - x = 0
3433 try:
3534 added = edits['2010']['11']['0']['added']
36 - x += 1
3735 except KeyError:
38 - added = 2
39 -# try:
40 -# removed = edits['2010']['11']['0']['removed']
41 -# x += 1
42 -# except KeyError:
43 -# removed = 0
 36+ added = 0
 37+ try:
 38+ removed = edits['2010']['11']['0']['removed']
 39+ except KeyError:
 40+ removed = 0
4441
45 -
4642 key = datetime(2010, 11, 30)
4743 if added > 0:
4844 var.add(key, added, {'username': username, 'added': 'added'})
49 -# if removed > 0:
50 -# var.add(key, removed, {'username': username, 'removed': 'removed'})
51 -# var.add(key, x, {'username': username, 'total': 'total'})
 45+ if removed > 0:
 46+ var.add(key, removed, {'username': username, 'removed': 'removed'})
5247
53 - y = 0
5448 try:
5549 added = edits['2010']['12']['0']['added']
56 - y += 1
5750 except KeyError:
58 - added = 4
59 -# try:
60 -# removed = edits['2010']['12']['0']['removed']
61 -# y += 1
62 -# except KeyError:
63 -# removed = 0
 51+ added = 0
 52+ try:
 53+ removed = edits['2010']['12']['0']['removed']
 54+ except KeyError:
 55+ removed = 0
6456
6557 key = datetime(2010, 12, 31)
6658 if added > 0:
6759 var.add(key, added, {'username': username, 'added': 'added'})
68 -# if removed > 0:
69 -# var.add(key, removed, {'username': username, 'removed': 'removed'})
70 -# var.add(key, y, {'username': username, 'total': 'total'})
 60+ if removed > 0:
 61+ var.add(key, removed, {'username': username, 'removed': 'removed'})
7162
7263 return var
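Note: the hunk above re-enables the 'removed' branch and drops the x/y debug counters; the same try/except lookup is then repeated for November and December 2010. A minimal sketch of that per-month pattern, factored into hypothetical helpers (the nested edits[year][month]['0'] layout and the var.add() call come from the diff, the helper names are invented):

    from datetime import datetime

    def monthly_value(edits, year, month, field):
        # Return the 'added' or 'removed' character count for one month,
        # falling back to 0 when the editor has no data for that month.
        try:
            return edits[year][month]['0'][field]
        except KeyError:
            return 0

    def record_month(var, edits, username, year, month, last_day):
        # One datapoint per field per month, skipping empty months.
        key = datetime(int(year), int(month), last_day)
        for field in ('added', 'removed'):
            value = monthly_value(edits, year, month, field)
            if value > 0:
                var.add(key, value, {'username': username, field: field})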
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -153,18 +153,20 @@
154154
155155
156156 ppills = rts.number_of_processes
157 - while ppills > 0:
158 - try:
159 - res = result.get()
160 - if res == True:
161 - pbar.update(pbar.currval + 1)
162 - else:
163 - ppills -= 1
164 - var = res
165 - print 'ppills: %s' % ppills
166 - except Empty:
167 - pass
168 -
 157+ while True:
 158+ while ppills > 0:
 159+ try:
 160+ res = result.get()
 161+ if res == True:
 162+ pbar.update(pbar.currval + 1)
 163+ else:
 164+ ppills -= 1
 165+ var = res
 166+ print ppills
 167+ except Empty:
 168+ pass
 169+ break
 170+ print 'Waiting for tasks...'
169171 tasks.join()
170172
171173 var = reconstruct_observations(var)
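Note: the rewritten loop counts down one "poison pill" per worker before joining the task queue; each worker reports True for every finished task and finally puts its aggregated variable, which doubles as the pill. A standalone sketch of that consumer pattern (the queue and progress-bar objects are stand-ins for the module's own):

    from queue import Empty  # Queue.Empty on Python 2

    def collect_results(result, number_of_processes, pbar):
        # Drain the result queue until every worker has sent its final value.
        ppills = number_of_processes
        var = None
        while ppills > 0:
            try:
                res = result.get(timeout=1)
                if res is True:
                    pbar.update(pbar.currval + 1)  # one task finished
                else:
                    ppills -= 1                    # one worker finished
                    var = res
            except Empty:
                pass
        return var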
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
@@ -1,3 +1,17 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
216 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)'])
317 __email__ = 'dvanliere at gmail dot com'
418 __date__ = '2011-05-17'
@@ -106,6 +120,10 @@
107121
108122
109123 def normalize_dataset(vars, obs):
 124+ '''
 125+ This function rescales a dataset by dividing the observation by the standard
 126+ deviation (which results in a Z-score)
 127+ '''
110128 editors = obs.keys()
111129 data = []
112130 for var in vars:
@@ -142,6 +160,7 @@
143161 max_d = max(data.keys())
144162 match = data[max_d]
145163 matches.append((ppi_editor, match))
 164+ #remove match to make sure that every matched pair is unique
146165 for editor in distances:
147166 try:
148167 distances[editor].pop(match)
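Note: the new normalize_dataset docstring describes rescaling observations by the standard deviation to obtain Z-scores. A self-contained illustration of that transformation, independent of the module's data structures (the usual Z-score also subtracts the mean before dividing):

    from math import sqrt

    def zscores(values):
        # Center on the mean and divide by the standard deviation,
        # so the result has mean 0 and unit variance.
        n = len(values)
        mean = sum(values) / float(n)
        sd = sqrt(sum((x - mean) ** 2 for x in values) / float(n))
        return [(x - mean) / sd for x in values]

    print(zscores([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]))
    # -> [-1.5, -0.5, -0.5, -0.5, 0.0, 0.0, 1.0, 2.0]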
Index: trunk/tools/editor_trends/js_scripts/ppi_quality.js
@@ -0,0 +1,126 @@
 2+var editors = new Array();
 3+// Editors from December 2010
 4+editors[0] = "IR393.sae211";
 5+editors[1] = "Marian Sokolski";
 6+editors[2] = "SoAuthentic";
 7+editors[3] = "IR393davis";
 8+editors[4] = "Mulforel";
 9+editors[5] = "IR393ANDRICA";
 10+editors[6] = "Richharriott";
 11+editors[7] = "Prahalika";
 12+editors[8] = "IR393.awc211";
 13+editors[9] = "IR393TheSituation";
 14+editors[10] = "IR393harrisonkatz";
 15+editors[11] = "Kcahlber";
 16+editors[12] = "Elangate";
 17+editors[13] = "GWcontributor";
 18+editors[14] = "Dounoannick";
 19+editors[15] = "IR393DEME";
 20+editors[16] = "Teklegam";
 21+editors[17] = "Lbellows";
 22+editors[18] = "Coogan630";
 23+editors[19] = "Clairestum";
 24+editors[20] = "Hawkinjw";
 25+editors[21] = "Labrador70";
 26+editors[22] = "Seaver02";
 27+editors[23] = "Loflinrm";
 28+editors[24] = "Kcsl";
 29+editors[25] = "Farleyeye";
 30+editors[26] = "Eva YFL";
 31+editors[27] = "IR393Sadar";
 32+editors[28] = "Creyjons";
 33+editors[29] = "Okeland";
 34+editors[30] = "Amongst no roses";
 35+editors[31] = "Legin-gross-drawkcab";
 36+editors[32] = "Air3drew";
 37+editors[33] = "Owesetar";
 38+editors[34] = "Jmsheats";
 39+editors[35] = "Pearycrates";
 40+editors[36] = "Rabesque";
 41+editors[37] = "Hathornt";
 42+editors[38] = "Gsrogers";
 43+editors[39] = "Surfertk";
 44+editors[40] = "Evansza1";
 45+editors[41] = "Snyde2bd";
 46+editors[42] = "Ace of Raves";
 47+editors[43] = "IR393DrewGolding";
 48+editors[44] = "Rothboy2";
 49+editors[45] = "Ironman340";
 50+editors[46] = "Elhugheszete";
 51+editors[47] = "JanTan825";
 52+editors[48] = "New Potato Caboose";
 53+editors[49] = "Irishmeadow";
 54+editors[50] = "Tannerbk";
 55+editors[51] = "Luckbethislady";
 56+editors[52] = "1Ridwan";
 57+editors[53] = "Policydude";
 58+editors[54] = "Bmw 1986";
 59+editors[55] = "IR393ldc211";
 60+editors[56] = "Atb2393";
 61+editors[57] = "Goldstein2020";
 62+editors[58] = "Srayburn";
 63+editors[59] = "Yawloco";
 64+editors[60] = "Al-Jahweri";
 65+editors[61] = "Speon";
 66+editors[62] = "IR393aes";
 67+editors[63] = "Dohanian89";
 68+editors[64] = "Tessitjp";
 69+editors[65] = "Nerdpenguin";
 70+editors[66] = "Mikewuh";
 71+editors[67] = "Naj87";
 72+editors[68] = "Harrimel";
 73+editors[69] = "Contribute10";
 74+editors[70] = "Ramacu";
 75+editors[71] = "Kaloryth";
 76+editors[72] = "Qiaochina";
 77+editors[73] = "CBCrookham";
 78+editors[74] = "Saxa228";
 79+editors[75] = "Adw7";
 80+editors[76] = "Jraytram";
 81+editors[77] = "Indigoandcerise";
 82+editors[78] = "Clafoutis";
 83+editors[79] = "Klhrdy";
 84+editors[80] = "TomSannicandro";
 85+editors[81] = "Wkrantz";
 86+editors[82] = "Padmin22";
 87+editors[83] = "Alexvonzu";
 88+editors[84] = "Klcai";
 89+editors[85] = "Bridoc";
 90+editors[86] = "Feuchtcc";
 91+editors[87] = "Karthik Jagadeesh";
 92+editors[88] = "Smj39";
 93+editors[89] = "Xavier Peniche";
 94+editors[90] = "Ka Yaffa";
 95+editors[91] = "Jysg23";
 96+editors[92] = "Wsko.ko";
 97+editors[93] = "Psyoon";
 98+editors[94] = "Wvhoya";
 99+editors[95] = "Act25";
 100+editors[96] = "Hpl1981";
 101+editors[97] = "Jacqueline F";
 102+editors[98] = "IR393.cfc211";
 103+editors[99] = "Joko123nm";
 104+editors[100] = "Kmac1986";
 105+editors[101] = "Amfarr21";
 106+editors[102] = "IR393Anjan";
 107+
 108+
 109+var date1 = new Date();
 110+date1.setFullYear(2010,12,31);
 111+d = {"date1": date1};
 112+
 113+var editors = {"IR393.sae211": d,"Marian Sokolski": d,"SoAuthentic": d,"IR393davis": d,"Mulforel": d,"IR393ANDRICA": d,"Richharriott": d,"Prahalika": d,"IR393.awc211": d,"IR393TheSituation": d,"IR393harrisonkatz": d,"Kcahlber": d,"Elangate": d,"GWcontributor": d,"Dounoannick": d,"IR393DEME": d,"Teklegam": d,"Lbellows": d,"Coogan630": d,"Clairestum": d,"Hawkinjw": d,"Labrador70": d,"Seaver02": d,"Loflinrm": d,"Kcsl": d,"Farleyeye": d,"Eva YFL": d,"IR393Sadar": d,"Creyjons": d,"Okeland": d,"Amongst no roses": d,"Legin-gross-drawkcab": d,"Air3drew": d,"Owesetar": d,"Jmsheats": d,"Pearycrates": d,"Rabesque": d,"Hathornt": d,"Gsrogers": d,"Surfertk": d,"Evansza1": d,"Snyde2bd": d,"Ace of Raves": d,"IR393DrewGolding": d,"Rothboy2": d,"Ironman340": d,"Elhugheszete": d,"JanTan825": d,"New Potato Caboose": d,"Irishmeadow": d,"Tannerbk": d,"Luckbethislady": d,"1Ridwan": d,"Policydude": d,"Bmw 1986": d,"IR393ldc211": d,"Atb2393": d,"Goldstein2020": d,"Srayburn": d,"Yawloco": d,"Al-Jahweri": d,"Speon": d,"IR393aes": d,"Dohanian89": d,"Tessitjp": d,"Nerdpenguin": d,"Mikewuh": d,"Naj87": d,"Harrimel": d,"Contribute10": d,"Ramacu": d,"Kaloryth": d,"Qiaochina": d,"CBCrookham": d,"Saxa228": d,"Adw7": d,"Jraytram": d,"Indigoandcerise": d,"Clafoutis": d,"Klhrdy": d,"TomSannicandro": d,"Wkrantz": d,"Padmin22": d,"Alexvonzu": d,"Klcai": d,"Bridoc": d,"Feuchtcc": d,"Karthik Jagadeesh": d,"Smj39": d,"Xavier Peniche": d,"Ka Yaffa": d,"Jysg23": d,"Wsko.ko": d,"Psyoon": d,"Wvhoya": d,"Act25": d,"Hpl1981": d,"Jacqueline F": d,"IR393.cfc211": d,"Joko123nm": d,"Kmac1986": d,"Amfarr21": d,"IR393Anjan": d};
 114+
 115+for (var username in editors) {
 116+ for (var obsdate in editors[username]) {
 117+ print(obsdate);
 118+ }
 119+}
 120+ print(username);
 121+ //var reverts = db.enwiki_editors_dataset.findOne({"username": username}, {"revert_count": 1});
 122+ //for (obsdate in editors[username]) {
 123+ //month = date.getMonth();
 124+ //year = date.getYear();
 125+ // print(month, year);
 126+ // }
 127+}
\ No newline at end of file
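Note: the new shell script mainly hard-codes the PPI usernames and observation dates; the commented-out part hints at looking up revert_count per editor in enwiki_editors_dataset. A rough pymongo equivalent of that intent (the database name and connection details are guesses; only the collection name comes from the script):

    from pymongo import MongoClient

    coll = MongoClient()['wikilytics']['enwiki_editors_dataset']  # db name is a guess

    usernames = ['IR393.sae211', 'Marian Sokolski', 'SoAuthentic']  # truncated sample
    for username in usernames:
        doc = coll.find_one({'username': username}, {'revert_count': 1})
        if doc:
            print(username, doc.get('revert_count'))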
Index: trunk/tools/editor_trends/etl/variables.py
@@ -254,6 +254,7 @@
255255 past_revert['reverted_contributor'] = -1
256256 return past_revert
257257
 258+
258259 def is_revision_reverted(hash_cur, hashes):
259260 '''
260261 Determine whether an edit was reverted or not based on md5 hashes
@@ -314,7 +315,7 @@
315316 '''
316317 This function determines the xml_namespace version
317318 '''
318 - for elem in siteinfo :
 319+ for elem in siteinfo:
319320 if elem.tag.endswith('sitename'):
320321 xml_namespace = elem.tag
321322 pos = xml_namespace.find('sitename')
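Note: is_revision_reverted, touched only by whitespace here, flags a revision as reverted when its text hash reappears later in the page history. A minimal sketch of md5-based revert detection (names and data layout are illustrative, not the module's own):

    import hashlib

    def revision_hash(text):
        # Fingerprint of the revision text; identical text gives an identical hash.
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def find_reverts(revision_texts):
        # A later revision that restores previously seen text undoes everything
        # in between, so those intermediate revisions are marked as reverted.
        seen = {}
        reverted = set()
        for i, text in enumerate(revision_texts):
            h = revision_hash(text)
            if h in seen:
                reverted.update(range(seen[h] + 1, i))
            seen[h] = i
        return reverted

    print(find_reverts(['a', 'b', 'c', 'a']))  # -> {1, 2}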
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -133,7 +133,7 @@
134134 '''
135135 This function determines the title of an article and the
136136 namespace to which it belongs. Then, if the namespace is one
137 - which we are interested set parse to True so that we start
 137+ which we are interested in set parse to True so that we start
138138 parsing this article, else it will skip this article.
139139 '''
140140 title = variables.parse_title(elem)
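Note: the corrected docstring describes deciding from an article's title and namespace whether to start parsing it. A toy version of that gate (the namespace prefixes and the "main namespace only" choice are assumptions, not the module's actual configuration):

    NAMESPACE_PREFIXES = {'Talk': 1, 'User': 2, 'User talk': 3}

    def should_parse(title, included_namespaces={0}):
        # True when the title belongs to a namespace we want to parse.
        prefix, sep, _ = title.partition(':')
        ns = NAMESPACE_PREFIXES.get(prefix, 0) if sep else 0
        return ns in included_namespaces

    print(should_parse('Python (programming language)'))       # True
    print(should_parse('Talk:Python (programming language)'))  # False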
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -179,6 +179,7 @@
180180 dc = cleanup_datacontainer(dc, {})
181181 return dc
182182
 183+
183184 def calculate_cum_edits(edits):
184185 cum_edit_count_main_ns = 0
185186 cum_edit_count_other_ns = 0
@@ -191,6 +192,7 @@
192193
193194 return cum_edit_count_main_ns, cum_edit_count_other_ns
194195
 196+
195197 def determine_articles_workedon(edits, first_year, final_year):
196198 '''
197199 This function creates a list of article_ids that an editor has worked on in
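Note: for orientation, calculate_cum_edits tallies an editor's cumulative edit counts for the main namespace versus all other namespaces. A simplified stand-alone version (the shape of edits is an assumption inferred from the function's name and return values):

    def calculate_cum_edits(edits):
        # edits is assumed to map year -> list of {'ns': int, ...} observations.
        cum_main, cum_other = 0, 0
        for year in edits:
            for edit in edits[year]:
                if edit.get('ns') == 0:
                    cum_main += 1
                else:
                    cum_other += 1
        return cum_main, cum_other

    print(calculate_cum_edits({'2010': [{'ns': 0}, {'ns': 0}, {'ns': 4}]}))  # (2, 1)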
Index: trunk/tools/editor_trends/kaggle/training.py
@@ -1,25 +1,83 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-04-12'
 19+__version__ = '0.1'
 20+
221 import codecs
322 import os
 23+from datetime import datetime
 24+import json
425
 26+location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction'
 27+files = os.listdir(location)
 28+files.reverse()
 29+dataset = codecs.open('training.tsv', 'w', 'utf-8')
 30+t0 = datetime.now()
 31+max_size = 2147483648
 32+titles = {}
 33+ids = set()
 34+size = 0
 35+cnt_obs = 0
 36+max_size_reached = False
537
 38+for filename in files:
 39+ if not filename.startswith('comments') and not filename.startswith('articles'):
 40+ fh = codecs.open(os.path.join(location, filename))
 41+ if max_size_reached == True:
 42+ break
 43+ for line in fh:
 44+ line = line.strip()
 45+ line = line.split('\t')
 46+ if len(line) != 12:
 47+ continue
 48+ if line[10] == '1':
 49+ continue
 50+ username = line[3].lower()
 51+ if username.endswith('bot'):
 52+ #line[10] = '1'
 53+ continue
 54+ cnt_obs += 1
 55+ title_id = line[1]
 56+ ids.add(line[2])
 57+ title = line.pop(5)
 58+ titles[title_id] = title
 59+ line.append('\n')
 60+ line = '\t'.join(line)
 61+ size += len(line)
 62+ if size > max_size:
 63+ max_size_reached = True
 64+ dataset.write(line.decode('utf-8'))
665
7 -location = '/home/diederik/wikimedia/wikilytics/en/wiki/txt'
8 -files = os.listdir(location)
 66+dataset.close()
967
10 -output = codecs.open('training.txt', 'w', 'utf-8')
 68+fh = codecs.open('titles.tsv', 'w', 'utf-8')
 69+for id, title in titles.iteritems():
 70+ fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
 71+fh.close()
1172
12 -for filename in files:
13 - fh = codecs.open(os.path.join(location, filename))
14 - for line in fh:
15 - line = line.strip()
16 - line = line.split('\t')
17 - if len(line) != 13:
18 - continue
19 - username = line[12].lower()
20 - if username.endswith('bot'):
21 - line[5] = 1
22 - line = '\t'.join(line)
23 - output.write(line)
24 -
25 -
26 -output.close()
\ No newline at end of file
 73+fh = codecs.open('ids.json', 'w', 'utf-8')
 74+json.dump(ids, fh)
 75+#for id in ids:
 76+#fh.write('%s\n' % (id.decode('utf-8')))
 77+#fh.write('%s\n' % (json.du)
 78+fh.close()
 79+
 80+t1 = datetime.now()
 81+print 'Descriptives:\n'
 82+print 'Number of editors: %s' % len(ids)
 83+print 'Number of edits: %s' % cnt_obs
 84+print 'It took %s to construct the Kaggle training set' % (t1 - t0)
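Note: the new training-set script collects editor ids in a set and hands that set straight to json.dump, but the standard json module has no encoder for sets, so a list (or sorted list) has to be dumped instead. A minimal sketch of that last step, reusing the file name from the diff:

    import codecs
    import json

    ids = {'1234', '5678'}  # stand-in for the ids collected above

    fh = codecs.open('ids.json', 'w', 'utf-8')
    json.dump(sorted(ids), fh)  # sets are not JSON-serializable, lists are
    fh.close()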
Index: trunk/tools/editor_trends/classes/storage.py
@@ -86,7 +86,7 @@
8787 '''Find multiple observations in a collection'''
8888
8989 @abstractmethod
90 - def find_one(self, key, value):
 90+ def find_one(self, key, value, var=False):
9191 '''Find a single observation in a collection'''
9292
9393 @abstractmethod
@@ -145,7 +145,7 @@
146146
147147 def update(self, key, value, data):
148148 assert isinstance(data, dict), 'You need to feed me dictionaries.'
149 - self.db[self.collection].update({key: value}, data, upsert=True)
 149+ self.db[self.collection].update({key: value}, {'$set': data})
150150
151151 def find(self, key=None, qualifier=None):
152152 if qualifier == 'min':
@@ -154,14 +154,24 @@
155155 elif qualifier == 'max':
156156 return self.db[self.collection].find({
157157 key : {'$ne' : False}}).sort(key, pymongo.DESCENDING).limit(1)[0]
 158+ elif qualifier:
 159+ return self.db[self.collection].find({key : qualifier})
158160 elif key != None:
159161 return self.db[self.collection].find({}, fields=[key])
160162 else:
161163 return self.db[self.collection].find()
162164
163 - def find_one(self, key, value):
164 - return self.db[self.collection].find_one({key: value})
 165+ def find_one(self, key, value, vars=None):
 166+ if vars:
 167+ #if you only want to retrieve a specific variable(s) then you need to
 168+ #specify vars, if vars is None then you will get the entire BSON object
 169+ vars = vars.split(',')
 170+ vars = dict([(var, 1) for var in vars])
 171+ return self.db[self.collection].find_one({key: value}, vars)
 172+ else:
 173+ return self.db[self.collection].find_one({key: value})
165174
 175+
166176 def drop_collection(self):
167177 self.db.drop_collection(self.collection)
168178
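Note: the reworked find_one builds a MongoDB projection from a comma-separated list of variable names, and update now uses $set so only the supplied fields are modified rather than replacing whole documents. A standalone sketch of both calls with modern pymongo (connection details and collection name are placeholders):

    from pymongo import MongoClient

    coll = MongoClient()['wikilytics']['enwiki_editors_dataset']  # placeholder names

    def find_one(key, value, vars=None):
        # With vars='a,b' only fields a and b come back; otherwise the full document.
        if vars:
            projection = dict((v, 1) for v in vars.split(','))
            return coll.find_one({key: value}, projection)
        return coll.find_one({key: value})

    def update(key, value, data):
        # Set only the fields in data on the matching document (no upsert).
        assert isinstance(data, dict), 'You need to feed me dictionaries.'
        coll.update_one({key: value}, {'$set': data})

    # usage:
    # doc = find_one('username', 'Clafoutis', vars='revert_count,character_count')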
Index: trunk/tools/editor_trends/classes/analytics.py
@@ -92,7 +92,6 @@
9393 project and then calls the plugin that does the actual mapping.
9494 '''
9595 db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_dataset)
96 - x = 0
9796 while True:
9897 try:
9998 editor_id = self.tasks.get(block=False)