Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -34,13 +34,6 @@ |
35 | 35 | return d |
36 | 36 | |
37 | 37 | |
38 | | -def create_clock(): |
39 | | - d = {} |
40 | | - for i in xrange(0, 24): |
41 | | - d[i] = 0.0 |
42 | | - return d |
43 | | - |
44 | | - |
45 | 38 | def create_datacontainer(first_year, final_year, datatype='dict'): |
46 | 39 | ''' |
47 | 40 | This function initializes an empty dictionary with as key the year (starting |
— | — | @@ -53,6 +46,7 @@ |
54 | 47 | data[str(x)] = add_datatype(datatype) |
55 | 48 | return data |
56 | 49 | |
| 50 | + |
57 | 51 | def add_windows_to_datacontainer(datacontainer, windows): |
58 | 52 | for dc in datacontainer: |
59 | 53 | for w in windows: |
— | — | @@ -60,6 +54,7 @@ |
61 | 55 | |
62 | 56 | return datacontainer |
63 | 57 | |
| 58 | + |
64 | 59 | def add_months_to_datacontainer(datacontainer, datatype): |
65 | 60 | for dc in datacontainer: |
66 | 61 | datacontainer[dc] = {} |
— | — | @@ -68,12 +63,10 @@ |
69 | 64 | |
70 | 65 | return datacontainer |
71 | 66 | |
| 67 | + |
def add_years_to_datacontainer(first_year, final_year, datacontainer, datatype):
    '''
    Replace every value of datacontainer with a dictionary keyed by year.

    For each key already present in datacontainer, the current value is
    discarded and replaced by a new dictionary mapping every integer year in
    range(first_year, final_year) to its own copy of datatype. The mapping is
    modified in place and also returned.

    NOTE(review): final_year itself is excluded because range() is half-open;
    confirm against the callers that this is the intended boundary.
    '''
    # Imported locally so the function does not rely on a module-level import.
    from copy import deepcopy

    years = range(first_year, final_year)
    for dc in datacontainer:
        # Every slot receives an independent copy: storing the same mutable
        # object (e.g. {} or []) in each slot would make an update to one
        # year show up in all years of all keys.
        datacontainer[dc] = dict((year, deepcopy(datatype)) for year in years)
    return datacontainer
78 | | - |
79 | | - |
80 | | - |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -23,6 +23,7 @@ |
24 | 24 | from operator import itemgetter |
25 | 25 | import datetime |
26 | 26 | import sys |
| 27 | +from copy import deepcopy |
27 | 28 | |
28 | 29 | from database import db |
29 | 30 | from utils import file_utils |
— | — | @@ -67,79 +68,113 @@ |
68 | 69 | return |
69 | 70 | edits = editor['edits'] |
70 | 71 | username = editor['username'] |
| 72 | + |
71 | 73 | first_year, final_year = determine_year_range(edits) |
72 | | - monthly_edits = determine_edits_by_month(edits, first_year, final_year) |
73 | | - monthly_edits = db.stringify_keys(monthly_edits) |
74 | 74 | |
75 | | - edits_by_year = determine_edits_by_year(edits, first_year, final_year) |
76 | | - edits_by_year = db.stringify_keys(edits_by_year) |
77 | | - |
78 | 75 | last_edit_by_year = determine_last_edit_by_year(edits, first_year, final_year) |
79 | | - last_edit_by_year = db.stringify_keys(last_edit_by_year) |
80 | | - |
81 | 76 | articles_edited = determine_articles_workedon(edits, first_year, final_year) |
| 77 | + article_count = determine_article_count(articles_edited, first_year, final_year) |
82 | 78 | articles_edited = db.stringify_keys(articles_edited) |
83 | 79 | |
84 | | - articles_by_year = determine_articles_by_year(articles_edited, first_year, final_year) |
85 | | - articles_by_year = db.stringify_keys(articles_by_year) |
86 | | - |
87 | 80 | namespaces_edited = determine_namespaces_workedon(edits, first_year, final_year) |
88 | | - namespaces_edited = db.stringify_keys(namespaces_edited) |
| 81 | + character_count = determine_edit_volume(edits, first_year, final_year) |
| 82 | + revert_count = determine_number_reverts(edits, first_year, final_year) |
89 | 83 | |
90 | | - character_counts = determine_edit_volume(edits, first_year, final_year) |
91 | | - character_counts = db.stringify_keys(character_counts) |
92 | | - |
93 | | - count_reverts = determine_number_reverts(edits, first_year, final_year) |
94 | | - count_reverts = db.stringify_keys(count_reverts) |
95 | | - |
96 | 84 | edits = sort_edits(edits) |
97 | 85 | edit_count = determine_number_edits(edits, first_year, final_year) |
98 | 86 | |
| 87 | + totals = {} |
| 88 | + counts = shaper.create_datacontainer(first_year, final_year) |
| 89 | + totals = calculate_totals(totals, counts, character_count, 'character_count') |
| 90 | + totals = calculate_totals(totals, counts, revert_count, 'revert_count') |
| 91 | + totals = calculate_totals(totals, counts, article_count, 'article_count') |
| 92 | + totals = calculate_totals(totals, counts, edit_count, 'edit_count') |
| 93 | + totals = db.stringify_keys(totals) |
| 94 | + |
99 | 95 | if len(edits) > cutoff: |
100 | 96 | new_wikipedian = edits[cutoff]['date'] |
101 | 97 | else: |
102 | 98 | new_wikipedian = False |
103 | 99 | first_edit = edits[0]['date'] |
104 | 100 | final_edit = edits[-1]['date'] |
105 | | - edits = edits[:cutoff] |
106 | 101 | |
107 | 102 | self.output_db.insert({'editor': self.id, |
108 | | - 'edits': edits, |
109 | | - 'edits_by_year': edits_by_year, |
110 | | - 'new_wikipedian': new_wikipedian, |
111 | | - 'edit_count': edit_count, |
112 | | - 'final_edit': final_edit, |
113 | | - 'first_edit': first_edit, |
114 | | - 'articles_by_year': articles_by_year, |
115 | | - 'monthly_edits': monthly_edits, |
116 | | - 'last_edit_by_year': last_edit_by_year, |
117 | | - 'username': username, |
118 | | - 'articles_edited': articles_edited, |
119 | | - 'namespaces_edited': namespaces_edited, |
120 | | - 'character_counts': character_counts, |
121 | | - }, safe=True) |
| 103 | + 'username': username, |
| 104 | + 'new_wikipedian': new_wikipedian, |
| 105 | + 'final_edit': final_edit, |
| 106 | + 'first_edit': first_edit, |
| 107 | + 'last_edit_by_year': last_edit_by_year, |
| 108 | + 'articles_edited': articles_edited, |
| 109 | + 'edit_count': edit_count, |
| 110 | + 'namespaces_edited': namespaces_edited, |
| 111 | + 'article_count': article_count, |
| 112 | + 'character_count': character_count, |
| 113 | + 'revert_count': revert_count, |
| 114 | + 'totals': totals, |
| 115 | + }, |
| 116 | + safe=True) |
122 | 117 | |
123 | 118 | |
def cleanup_datacontainer(dc, variable_type):
    '''
    Remove the untouched months from a year -> month -> value container.

    Every month whose value compares equal (==) to variable_type is deleted
    from its year. Valid values for variable_type are either a {}, a [] or 0,
    i.e. the empty value the months were initialised with. Years are kept,
    even when all of their months have been removed.

    The container is modified in place and also returned.
    '''
    for year in dc:
        months = dc[year]
        # Iterate over a snapshot of the keys: deleting from a dictionary
        # while iterating over its live key view raises a RuntimeError.
        for month in list(months):
            if months[month] == variable_type:
                del months[month]
    return dc
| 130 | + |
| 131 | + |
def calculate_totals(totals, counts, dc, var):
    '''
    Collapse the months of dc into yearly totals and store them in totals.

    dc is a year -> month -> namespace container. A namespace value is either
    a number, or a dictionary of numbers (for instance separate 'added' and
    'removed' figures); in the latter case every key is summed separately.

    counts is the template for the result (year -> dictionary). It is deep
    copied, so the caller's template is never modified and can be reused for
    the next variable. A year that occurs in dc but not in counts is added to
    the result instead of raising a KeyError.

    The yearly totals are stored under totals[var], replacing any previous
    value for var; totals is modified in place and also returned.
    '''
    cnts = deepcopy(counts)
    for year in dc:
        yearly = cnts.setdefault(year, {})
        for month in dc[year]:
            for ns, value in dc[year][month].items():
                if isinstance(value, dict):
                    # Several figures per namespace: sum each one by key.
                    per_ns = yearly.setdefault(ns, {})
                    for key in value:
                        per_ns[key] = per_ns.get(key, 0) + value[key]
                else:
                    yearly[ns] = yearly.get(ns, 0) + value
    totals[var] = cnts
    return totals
| 149 | + |
| 150 | + |
def determine_number_edits(edits, first_year, final_year):
    '''
    Count the edits of an editor by year, by month and by namespace.

    edits is iterated as a flat sequence of edit dictionaries; each one must
    provide a 'ns' value and a 'date' with year and month attributes. The
    year is looked up as a string, the month as an integer.

    Months that still hold an empty dictionary afterwards are removed by
    cleanup_datacontainer, and the result is passed through
    db.stringify_keys before it is returned.

    NOTE(review): this presumes shaper.add_months_to_datacontainer(..., 'dict')
    starts every month off as an empty dictionary -- confirm in shaper.
    '''
    container = shaper.add_months_to_datacontainer(
        shaper.create_datacontainer(first_year, final_year), 'dict')
    for edit in edits:
        ns = edit['ns']
        when = edit['date']
        per_ns = container[str(when.year)][when.month]
        per_ns[ns] = per_ns.get(ns, 0) + 1
    container = cleanup_datacontainer(container, {})
    return db.stringify_keys(container)
131 | 162 | |
132 | 163 | |
def determine_articles_workedon(edits, first_year, final_year):
    '''
    Collect the articles an editor touched by year, by month and by namespace.

    edits maps a year to a sequence of edit dictionaries; each edit must
    provide 'date' (with a month attribute), 'ns' and 'article'. Articles are
    gathered in a set per namespace, so an article edited several times in the
    same month and namespace is listed once. The sets are turned into lists
    before returning, and months left as an empty dictionary are removed by
    cleanup_datacontainer.

    NOTE(review): this presumes shaper.add_months_to_datacontainer(..., 'dict')
    starts every month off as an empty dictionary -- confirm in shaper.
    '''
    container = shaper.create_datacontainer(first_year, final_year)
    container = shaper.add_months_to_datacontainer(container, 'dict')

    for year in edits:
        for edit in edits[year]:
            month, ns = edit['date'].month, edit['ns']
            per_ns = container[year][month]
            if ns not in per_ns:
                per_ns[ns] = set()
            per_ns[ns].add(edit['article'])

    # Replace each set by a list; only existing keys are reassigned, so the
    # dictionaries keep their size while they are being iterated.
    for year in container:
        for month in container[year]:
            per_ns = container[year][month]
            for ns in per_ns:
                per_ns[ns] = list(per_ns[ns])

    return cleanup_datacontainer(container, {})
145 | 180 | |
146 | 181 | |
— | — | @@ -153,21 +188,31 @@ |
154 | 189 | for year in dc: |
155 | 190 | for month in dc[year]: |
156 | 191 | dc[year][month] = list(dc[year][month]) |
| 192 | + dc = cleanup_datacontainer(dc, []) |
| 193 | + dc = db.stringify_keys(dc) |
157 | 194 | return dc |
158 | 195 | |
159 | 196 | |
def determine_number_reverts(edits, first_year, final_year):
    '''
    Count the edits flagged as reverts by year, by month and by namespace.

    edits maps a year to a sequence of edit dictionaries; each edit must
    provide 'date' (with a month attribute), 'ns' and 'revert'. Only edits
    with a truthy 'revert' value are counted.

    Months left as an empty dictionary are removed by cleanup_datacontainer,
    and the result is passed through db.stringify_keys before it is returned.

    NOTE(review): this presumes shaper.add_months_to_datacontainer(..., 'dict')
    starts every month off as an empty dictionary -- confirm in shaper.
    '''
    container = shaper.create_datacontainer(first_year, final_year)
    container = shaper.add_months_to_datacontainer(container, 'dict')
    for year in edits:
        for edit in edits[year]:
            # Month and namespace are read before the revert check, so an
            # edit lacking either field fails regardless of its revert flag.
            month, ns = edit['date'].month, edit['ns']
            if not edit['revert']:
                continue
            per_ns = container[year][month]
            per_ns[ns] = per_ns.get(ns, 0) + 1
    container = cleanup_datacontainer(container, {})
    return db.stringify_keys(container)
169 | 210 | |
170 | 211 | |
171 | 212 | def determine_edit_volume(edits, first_year, final_year): |
| 213 | + ''' |
| 214 | + This function counts the number of edits by year by month by namespace for |
| 215 | + a particular editor. |
| 216 | + ''' |
172 | 217 | dc = shaper.create_datacontainer(first_year, final_year) |
173 | 218 | dc = shaper.add_months_to_datacontainer(dc, 'dict') |
174 | 219 | for year in edits: |
— | — | @@ -181,6 +226,8 @@ |
182 | 227 | dc[year][month][ns]['removed'] += edit['delta'] |
183 | 228 | elif edit['delta'] > 0: |
184 | 229 | dc[year][month][ns]['added'] += edit['delta'] |
| 230 | + dc = cleanup_datacontainer(dc, {}) |
| 231 | + dc = db.stringify_keys(dc) |
185 | 232 | return dc |
186 | 233 | |
187 | 234 | |
— | — | @@ -200,42 +247,22 @@ |
201 | 248 | dc[date] = edit |
202 | 249 | elif dc[date] < edit: |
203 | 250 | dc[date] = edit |
| 251 | + dc = db.stringify_keys(dc) |
204 | 252 | return dc |
205 | 253 | |
206 | 254 | |
207 | | -def determine_edits_by_month(edits, first_year, final_year): |
208 | | - dc = shaper.create_datacontainer(first_year, final_year) |
209 | | - dc = shaper.add_months_to_datacontainer(dc, 0.0) |
210 | | - for year in edits: |
211 | | - for edit in edits[year]: |
212 | | - m = edit['date'].month |
213 | | - dc[year][m] += 1 |
214 | | - return dc |
215 | | - |
216 | | - |
217 | | -def determine_edits_by_year(edits, first_year, final_year): |
def determine_article_count(articles_edited, first_year, final_year):
    '''
    Count the articles an editor worked on by year, by month and by namespace.

    articles_edited is a year -> month -> namespace container whose innermost
    values are collections of articles; each collection is replaced by its
    length. The result is passed through db.stringify_keys before it is
    returned.

    NOTE(review): months that are absent from articles_edited keep whatever
    initial value shaper.add_months_to_datacontainer gave them; unlike the
    other counters in this module, no cleanup_datacontainer pass is run here.
    '''
    container = shaper.add_months_to_datacontainer(
        shaper.create_datacontainer(first_year, final_year), 'dict')
    for year in articles_edited:
        for month in articles_edited[year]:
            per_ns = articles_edited[year][month]
            for ns in per_ns:
                container[year][month][ns] = len(per_ns[ns])
    return db.stringify_keys(container)
241 | 268 | |
242 | 269 | |