r85344 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r85343‎ | r85344 | r85345 >
Date:17:30, 4 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Updated derived dataset with new variables and optimized the storage of older variables. This should reduce hard drive space requirements. The plugins will break and need to be fixed.
Modified paths:
  • /trunk/tools/editor_trends/etl/shaper.py (modified) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/shaper.py
@@ -34,13 +34,6 @@
3535 return d
3636
3737
38 -def create_clock():
39 - d = {}
40 - for i in xrange(0, 24):
41 - d[i] = 0.0
42 - return d
43 -
44 -
4538 def create_datacontainer(first_year, final_year, datatype='dict'):
4639 '''
4740 This function initializes an empty dictionary with as key the year (starting
@@ -53,6 +46,7 @@
5447 data[str(x)] = add_datatype(datatype)
5548 return data
5649
 50+
5751 def add_windows_to_datacontainer(datacontainer, windows):
5852 for dc in datacontainer:
5953 for w in windows:
@@ -60,6 +54,7 @@
6155
6256 return datacontainer
6357
 58+
6459 def add_months_to_datacontainer(datacontainer, datatype):
6560 for dc in datacontainer:
6661 datacontainer[dc] = {}
@@ -68,12 +63,10 @@
6964
7065 return datacontainer
7166
 67+
7268 def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):
7369 for dc in datacontainer:
7470 datacontainer[dc] = {}
7571 for x in range(first_year, final_year):
7672 datacontainer[dc][x] = datatype
7773 return datacontainer
78 -
79 -
80 -
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -23,6 +23,7 @@
2424 from operator import itemgetter
2525 import datetime
2626 import sys
 27+from copy import deepcopy
2728
2829 from database import db
2930 from utils import file_utils
@@ -67,79 +68,113 @@
6869 return
6970 edits = editor['edits']
7071 username = editor['username']
 72+
7173 first_year, final_year = determine_year_range(edits)
72 - monthly_edits = determine_edits_by_month(edits, first_year, final_year)
73 - monthly_edits = db.stringify_keys(monthly_edits)
7474
75 - edits_by_year = determine_edits_by_year(edits, first_year, final_year)
76 - edits_by_year = db.stringify_keys(edits_by_year)
77 -
7875 last_edit_by_year = determine_last_edit_by_year(edits, first_year, final_year)
79 - last_edit_by_year = db.stringify_keys(last_edit_by_year)
80 -
8176 articles_edited = determine_articles_workedon(edits, first_year, final_year)
 77+ article_count = determine_article_count(articles_edited, first_year, final_year)
8278 articles_edited = db.stringify_keys(articles_edited)
8379
84 - articles_by_year = determine_articles_by_year(articles_edited, first_year, final_year)
85 - articles_by_year = db.stringify_keys(articles_by_year)
86 -
8780 namespaces_edited = determine_namespaces_workedon(edits, first_year, final_year)
88 - namespaces_edited = db.stringify_keys(namespaces_edited)
 81+ character_count = determine_edit_volume(edits, first_year, final_year)
 82+ revert_count = determine_number_reverts(edits, first_year, final_year)
8983
90 - character_counts = determine_edit_volume(edits, first_year, final_year)
91 - character_counts = db.stringify_keys(character_counts)
92 -
93 - count_reverts = determine_number_reverts(edits, first_year, final_year)
94 - count_reverts = db.stringify_keys(count_reverts)
95 -
9684 edits = sort_edits(edits)
9785 edit_count = determine_number_edits(edits, first_year, final_year)
9886
 87+ totals = {}
 88+ counts = shaper.create_datacontainer(first_year, final_year)
 89+ totals = calculate_totals(totals, counts, character_count, 'character_count')
 90+ totals = calculate_totals(totals, counts, revert_count, 'revert_count')
 91+ totals = calculate_totals(totals, counts, article_count, 'article_count')
 92+ totals = calculate_totals(totals, counts, edit_count, 'edit_count')
 93+ totals = db.stringify_keys(totals)
 94+
9995 if len(edits) > cutoff:
10096 new_wikipedian = edits[cutoff]['date']
10197 else:
10298 new_wikipedian = False
10399 first_edit = edits[0]['date']
104100 final_edit = edits[-1]['date']
105 - edits = edits[:cutoff]
106101
107102 self.output_db.insert({'editor': self.id,
108 - 'edits': edits,
109 - 'edits_by_year': edits_by_year,
110 - 'new_wikipedian': new_wikipedian,
111 - 'edit_count': edit_count,
112 - 'final_edit': final_edit,
113 - 'first_edit': first_edit,
114 - 'articles_by_year': articles_by_year,
115 - 'monthly_edits': monthly_edits,
116 - 'last_edit_by_year': last_edit_by_year,
117 - 'username': username,
118 - 'articles_edited': articles_edited,
119 - 'namespaces_edited': namespaces_edited,
120 - 'character_counts': character_counts,
121 - }, safe=True)
 103+ 'username': username,
 104+ 'new_wikipedian': new_wikipedian,
 105+ 'final_edit': final_edit,
 106+ 'first_edit': first_edit,
 107+ 'last_edit_by_year': last_edit_by_year,
 108+ 'articles_edited': articles_edited,
 109+ 'edit_count': edit_count,
 110+ 'namespaces_edited': namespaces_edited,
 111+ 'article_count': article_count,
 112+ 'character_count': character_count,
 113+ 'revert_count': revert_count,
 114+ 'totals': totals,
 115+ },
 116+ safe=True)
122117
123118
 119+def cleanup_datacontainer(dc, variable_type):
 120+ '''
 121+ valid variable_type are either a {}, a [] or 0.
 122+ '''
 123+ years = dc.keys()
 124+ for year in years:
 125+ months = dc[year].keys()
 126+ for month in months:
 127+ if dc[year][month] == variable_type:
 128+ del dc[year][month]
 129+ return dc
 130+
 131+
 132+def calculate_totals(totals, counts, dc, var):
 133+ cnts = deepcopy(counts)
 134+ totals.setdefault(var, {})
 135+ for year in dc:
 136+ for month in dc[year]:
 137+ for ns in dc[year][month]:
 138+ if isinstance(dc[year][month][ns], dict):
 139+ cnts[year].setdefault(ns, {})
 140+ for key in dc[year][month][ns]:
 141+ cnts[year][ns].setdefault(key, 0)
 142+ cnts[year][ns][key] += dc[year][month][ns][key]
 143+ else:
 144+ cnts[year].setdefault(ns, 0)
 145+ #print year, ns, type(ns), dc[year][month][ns]
 146+ cnts[year][ns] += dc[year][month][ns]
 147+ totals[var] = cnts
 148+ return totals
 149+
 150+
124151 def determine_number_edits(edits, first_year, final_year):
125 - count = 0
 152+ dc = shaper.create_datacontainer(first_year, final_year)
 153+ dc = shaper.add_months_to_datacontainer(dc, 'dict')
126154 for edit in edits:
127 - if edit['ns'] == 0:
128 - print edit['ns']
129 - count += 1
130 - return count
 155+ ns = edit['ns']
 156+ year, month = str(edit['date'].year), edit['date'].month
 157+ dc[year][month].setdefault(ns, 0)
 158+ dc[year][month][ns] += 1
 159+ dc = cleanup_datacontainer(dc, {})
 160+ dc = db.stringify_keys(dc)
 161+ return dc
131162
132163
133164 def determine_articles_workedon(edits, first_year, final_year):
134165 dc = shaper.create_datacontainer(first_year, final_year)
135 - dc = shaper.add_months_to_datacontainer(dc, 'set')
 166+ dc = shaper.add_months_to_datacontainer(dc, 'dict')
136167 for year in edits:
137168 for edit in edits[year]:
138169 month = edit['date'].month
139 - dc[year][month].add(edit['article'])
 170+ ns = edit['ns']
 171+ dc[year][month].setdefault(ns, set())
 172+ dc[year][month][ns].add(edit['article'])
140173
141174 for year in dc:
142175 for month in dc[year]:
143 - dc[year][month] = list(dc[year][month])
 176+ for ns in dc[year][month]:
 177+ dc[year][month][ns] = list(dc[year][month][ns])
 178+ dc = cleanup_datacontainer(dc, {})
144179 return dc
145180
146181
@@ -153,21 +188,31 @@
154189 for year in dc:
155190 for month in dc[year]:
156191 dc[year][month] = list(dc[year][month])
 192+ dc = cleanup_datacontainer(dc, [])
 193+ dc = db.stringify_keys(dc)
157194 return dc
158195
159196
160197 def determine_number_reverts(edits, first_year, final_year):
161198 dc = shaper.create_datacontainer(first_year, final_year)
162 - dc = shaper.add_months_to_datacontainer(dc, 0)
 199+ dc = shaper.add_months_to_datacontainer(dc, 'dict')
163200 for year in edits:
164201 for edit in edits[year]:
165202 month = edit['date'].month
 203+ ns = edit['ns']
166204 if edit['revert']:
167 - dc[year][month] += 1
 205+ dc[year][month].setdefault(ns, 0)
 206+ dc[year][month][ns] += 1
 207+ dc = cleanup_datacontainer(dc, {})
 208+ dc = db.stringify_keys(dc)
168209 return dc
169210
170211
171212 def determine_edit_volume(edits, first_year, final_year):
 213+ '''
 214+ This function counts the number of edits by year by month by namespace for
 215+ a particular editor.
 216+ '''
172217 dc = shaper.create_datacontainer(first_year, final_year)
173218 dc = shaper.add_months_to_datacontainer(dc, 'dict')
174219 for year in edits:
@@ -181,6 +226,8 @@
182227 dc[year][month][ns]['removed'] += edit['delta']
183228 elif edit['delta'] > 0:
184229 dc[year][month][ns]['added'] += edit['delta']
 230+ dc = cleanup_datacontainer(dc, {})
 231+ dc = db.stringify_keys(dc)
185232 return dc
186233
187234
@@ -200,42 +247,22 @@
201248 dc[date] = edit
202249 elif dc[date] < edit:
203250 dc[date] = edit
 251+ dc = db.stringify_keys(dc)
204252 return dc
205253
206254
207 -def determine_edits_by_month(edits, first_year, final_year):
208 - dc = shaper.create_datacontainer(first_year, final_year)
209 - dc = shaper.add_months_to_datacontainer(dc, 0.0)
210 - for year in edits:
211 - for edit in edits[year]:
212 - m = edit['date'].month
213 - dc[year][m] += 1
214 - return dc
215 -
216 -
217 -def determine_edits_by_year(edits, first_year, final_year):
 255+def determine_article_count(articles_edited, first_year, final_year):
218256 '''
219 - This function counts the number of edits by year made by a particular editor.
220 - '''
221 - dc = shaper.create_datacontainer(first_year, final_year, 0)
222 - for year in edits:
223 - for edit in edits[year]:
224 - year = str(edit['date'].year)
225 - dc[year] += 1
226 - return dc
227 -
228 -
229 -def determine_articles_by_year(articles_edited, first_year, final_year):
230 - '''
231257 This function counts the number of unique articles by year edited by a
232258 particular editor.
233259 '''
234260 dc = shaper.create_datacontainer(first_year, final_year)
 261+ dc = shaper.add_months_to_datacontainer(dc, 'dict')
235262 for year in articles_edited:
236 - edits = set()
237263 for month in articles_edited[year]:
238 - edits.update(articles_edited[year][month])
239 - dc[year] = len(edits)
 264+ for ns in articles_edited[year][month]:
 265+ dc[year][month][ns] = len(articles_edited[year][month][ns])
 266+ dc = db.stringify_keys(dc)
240267 return dc
241268
242269