r84667 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r84666‎ | r84667 | r84668 >
Date:06:57, 24 March 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Added reporting for arbitrary interval sizes over fundraiser data. Report drafts queries of minutely snapshots of the data and plots the results.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/query_store.py (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py
@@ -21,6 +21,7 @@
2222 import MySQLdb
2323 import pylab
2424 import HTML
 25+import math
2526
2627 import query_store as qs
2728 import miner_help as mh
@@ -30,16 +31,67 @@
3132
3233 """
3334
34 - Takes a list of timestamps as input and converts it to a set of hours counting back from 0
 35+ Takes a list of timestamps as input and converts it to a set of days,hours, or minutes counting back from 0
3536
36 - time_lists - a list of timestamp lists
37 - time_norm - a dictionary of normalized times
 37+ time_lists - a list of datetime objects
3838
 39+ time_unit - an integer indicating what unit to measure time in (0 = day, 1 = hour, 2 = minute)
 40+
 41+ RETURN:
 42+ time_norm - a dictionary of normalized times
3943 """
40 - def normalize_timestamps(self, time_lists):
 44+ def normalize_timestamps(self, time_lists, count_back, time_unit):
4145
42 - # Convert lists into dictionaries before processing
43 - # it is assumed that lists are composed of only simple types
 46+ time_lists, isList = self.timestamps_to_dict(time_lists)
 47+
 48+
 49+ """ Depending on args set the start date """
 50+ if count_back:
 51+ start_date_obj = self.find_latest_date_in_list(time_lists)
 52+ else:
 53+ start_date_obj = self.find_earliest_date_in_list(time_lists)
 54+
 55+ start_day = start_date_obj.day
 56+ start_hr = start_date_obj.hour
 57+ start_mte = start_date_obj.minute
 58+
 59+ # Normalize dates
 60+ time_norm = mh.AutoVivification()
 61+ for key in time_lists.keys():
 62+ for date_obj in time_lists[key]:
 63+
 64+ day = date_obj.day
 65+ hr = date_obj.hour
 66+ mte = date_obj.minute
 67+
 68+ if time_unit == 0:
 69+ elem = (day - start_day)
 70+ elif time_unit == 1:
 71+ elem = (day - start_day) * 24 + (hr - start_hr)
 72+ elif time_unit == 2:
 73+ elem = (day - start_day) * 24 * 60 + (hr - start_hr) * 60 + (mte - start_mte)
 74+
 75+ try:
 76+ time_norm[key].append(elem)
 77+ except:
 78+ time_norm[key] = list()
 79+ time_norm[key].append(elem)
 80+
 81+ # If the original argument was a list put it back in that form
 82+ if isList:
 83+ time_norm = time_norm[key]
 84+
 85+ return time_norm
 86+
 87+ """
 88+
 89+ HELPER METHOD for normalize_timestamps
 90+
 91+ Convert lists into dictionaries before processing it is assumed that lists are composed of only simple types
 92+
 93+ """
 94+ def timestamps_to_dict(self, time_lists):
 95+
4496 isList = 0
4597 if type(time_lists) is list:
4698 isList = 1
@@ -52,47 +104,53 @@
53105
54106 for i in range(len(old_list)):
55107 time_lists[key].append(old_list[i])
 108+
 109+ return [time_lists, isList]
 110+
 111+ """
 112+
 113+ HELPER METHOD for normalize_timestamps
56114
 115+ Find the latest time stamp in a list
 116+
 117+ """
 118+ def find_latest_date_in_list(self, time_lists):
57119
58 - # Find the earliest date
59 - max_i = 0
 120+ date_max = datetime.datetime(1000,1,1,0,0,0)
60121
61122 for key in time_lists.keys():
62 - for date_str in time_lists[key]:
63 - day_int = int(date_str[8:10])
64 - hr_int = int(date_str[11:13])
65 - date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
66 - if date_int > max_i:
67 - max_i = date_int
68 - max_day = day_int
69 - max_hr = hr_int
 123+ for date_obj in time_lists[key]:
 124+ if date_int > date_min:
 125+ date_min = date_obj
 126+
 127+ return date_max
 128+
 129+ """
 130+
 131+ HELPER METHOD for normalize_timestamps
70132
 133+ Find the earliest time stamp in a list
 134+
 135+ """
 136+ def find_earliest_date_in_list(self, time_lists):
71137
72 - # Normalize dates
73 - time_norm = mh.AutoVivification()
 138+ date_min = datetime.datetime(3000,1,1,0,0,0)
 139+
74140 for key in time_lists.keys():
75 - for date_str in time_lists[key]:
76 - day = int(date_str[8:10])
77 - hr = int(date_str[11:13])
78 - # date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
79 - elem = (day - max_day) * 24 + (hr - max_hr)
80 - try:
81 - time_norm[key].append(elem)
82 - except:
83 - time_norm[key] = list()
84 - time_norm[key].append(elem)
85 -
86 - # If the original argument was a list put it back in that form
87 - if isList:
88 - time_norm = time_norm[key]
89 -
90 - return time_norm
91 -
 141+ for date_obj in time_lists[key]:
 142+ if date_obj < date_min:
 143+ date_min = date_obj
 144+
 145+ return date_min
 146+
92147 """
 148+ Takes datetime objects and converts them to a string format YYYYMMDDHHmmSS for SQL processing by the hour
93149
94 - input - python datetime object
95 - returns - formatted datetime strings
 150+ now - datetime object
 151+ hours_back - number of hours between start and end time
96152
 153+ returns formatted datetime start_time and end_time strings
 154+
97155 """
98156 def gen_date_strings_hr(self, now, hours_back):
99157
@@ -138,10 +196,13 @@
139197 return [start_time, end_time]
140198
141199 """
 200+ Takes datetime objects and converts them to a string format YYYYMMDDHHmmSS for SQL processing by day
142201
143 - input - python datetime object
144 - returns - formatted datetime strings
 202+ now - datetime object
 203+ days_back - number of days between start and end time
145204
 205+ returns formatted datetime start_time and end_time strings
 206+
146207 """
147208 def gen_date_strings_day(self, now, days_back):
148209
@@ -179,17 +240,17 @@
180241
181242 """
182243
183 - Formats date string to match the form of civicrm.civicrm_contribution.recieve_date
 244+ Given a datetime object produce a timestamp a number of hours in the past and according to a particular format
184245
185246 format 1 - 20080101000606
186247 format 2 - 2008-01-01 00:06:06
187248
188249 input:
189250
190 - now - python datetime object
 251+ now - datetime object
191252 hours_back - the amount of time the
192253 format - the format of the returned timestamp strings
193 - resolution - the
 254+ resolution - the resolution detail of the timestamp (e.g. down to the minute, down to the hour, ...)
194255
195256
196257 returns - formatted datetime strings
@@ -212,21 +273,10 @@
213274
214275 """
215276
216 - Formats date string to match the form of civicrm.civicrm_contribution.recieve_date
 277+ Convert datetime objects to a timestamp of a given format
 278+
 279+ HELPER METHOD for gen_date_strings
217280
218 - format 1 - 20080101000606
219 - format 2 - 2008-01-01 00:06:06
220 -
221 - input:
222 -
223 - time_obj - python datetime object
224 - hours_back - the amount of time the
225 - format - the format of the returned timestamp strings
226 - resolution -
227 -
228 -
229 - returns - formatted datetime strings
230 -
231281 """
232282 def timestamp_from_obj(self, time_obj, format, resolution):
233283
@@ -277,19 +327,83 @@
278328 return timestamp
279329
280330
 331+ """
 332+
 333+ Convert timestamp to a datetime object of a given format
 334+
 335+ """
 336+ def timestamp_to_obj(self, timestamp, format):
 337+
 338+ if format == 1:
 339+ time_obj = datetime.datetime(int(timestamp[0:4]), int(timestamp[4:6]), int(timestamp[6:8]), \
 340+ int(timestamp[8:10]), int(timestamp[10:12]), int(timestamp[12:14]))
 341+
 342+ elif format == 2:
 343+ time_obj = datetime.datetime(int(timestamp[0:4]), int(timestamp[5:7]), int(timestamp[8:10]), \
 344+ int(timestamp[11:13]), int(timestamp[14:16]), int(timestamp[17:19]))
 345+
 346+ return time_obj
 347+
 348+
 349+ """
 350+
 351+ Inserts missing interval points into the time and metric lists
 352+
 353+ Assumptions:
 354+ _metrics_ and _times_ are lists of the same length
 355+ there must be a data point at each interval
 356+ Some data points may be missed
 357+ where there is no metric data the metric takes on the value 0.0
 358+
 359+ e.g. when _interval_ = 10
 360+ times = [0 10 30 50], metrics = [1 1 1 1] ==> [0 10 30 40 50], [1 1 0 1 0 1]
 361+
 362+ """
 363+ def normalize_intervals(self, times, metrics, interval):
 364+
 365+ current_time = 0.0
 366+ index = 0
 367+ iterations = 0
 368+ max_elems = math.ceil((times[-1] - times[0]) / interval) # there should be no more elements in the list than this
 369+
 370+ new_times = list()
 371+ new_metrics = list()
 372+
 373+ """ Iterate through the time list """
 374+ while index < len(times):
 375+
 376+ """ TEMPORARY SOLUTION: break out of the loop if more than the maximum number of elements is reached """
 377+ if iterations > max_elems:
 378+ break;
 379+
 380+ new_times.append(current_time)
 381+
 382+ """ If the current time is not in the current list then add it and a metric value of 0.0
 383+ otherwise add the existing elements to the new lists """
 384+ if current_time != times[index]:
 385+ new_metrics.append(0.0)
 386+
 387+ else:
 388+ new_metrics.append(metrics[index])
 389+ index = index + 1
 390+
 391+ current_time = current_time + interval
 392+
 393+ iterations = iterations + 1
 394+
 395+ return [new_times, new_metrics]
 396+
281397 """
282398
283399 CLASS :: ^FundraiserReporting^
284400
285401 Base class for reporting fundraiser analytics. Methods that are intended to be extended in derived classes include:
286402
287 -run_query()
 403+run_query() - format and execute the query to obtain data
 404+gen_plot() - plots the results of the report
 405+write_to_html_table() - writes the results to an HTML table
288406 run()
289 -gen_plot()
290 -publish_google_sheet()
291 -write_to_html_table()
292407
293 -
294408 """
295409 class FundraiserReporting(TimestampProcesser):
296410
@@ -300,8 +414,8 @@
301415 def init_db(self):
302416 """ Establish connection """
303417 #db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner')
304 - #self.db = MySQLdb.connect(host='127.0.0.1', user='rfaulk', db='faulkner', port=3307)
305 - self.db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
 418+ self.db = MySQLdb.connect(host='127.0.0.1', user='rfaulk', db='faulkner', port=3307)
 419+ #self.db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
306420
307421 """ Create cursor """
308422 self.cur = self.db.cursor()
@@ -1051,43 +1165,41 @@
10521166
10531167 """
10541168
1055 -INCOMPLETE -
1056 -CLASS :: ^ConfidenceReporting^
 1169+CLASS :: ^IntervalReporting^
10571170
1058 -To be called primarily for reporting
 1171+Performs queries that take timestamps, query, and an interval as arguments. Data for a single metric
 1172+is generated for each time interval in the time period defined by the start and end timestamps.
10591173
 1174+Types of queries supported:
 1175+
 1176+report_banner_metrics_minutely
 1177+report_LP_metrics_minutely
 1178+
10601179 """
10611180
1062 -class ConfidenceReporting(FundraiserReporting):
 1181+class IntervalReporting(FundraiserReporting):
10631182
1064 - def __init__(self, query_name, cmpgn_1, cmpgn_2, item_1, item_2, start_time , end_time, metric):
1065 - self.query_name = query_name
1066 - self.cmpgn_1 = cmpgn_1
1067 - self.cmpgn_2 = cmpgn_2
1068 - self.item_1 = item_1
1069 - self.item_2 = item_2
1070 - self.start_time = start_time
1071 - self.end_time = end_time
1072 - self.metric = metric
1073 -
1074 -
1075 - def run_query(self):
 1183+ def run_query(self, start_time, end_time, interval, query_name, metric_name, campaign):
10761184
10771185 self.init_db()
 1186+
10781187 query_obj = qs.query_store()
 1188+
 1189+ metrics = mh.AutoVivification()
 1190+ times = mh.AutoVivification()
 1191+ times_norm = mh.AutoVivification()
10791192
1080 - metric_list_1 = mh.AutoVivification()
1081 - metric_list_2 = mh.AutoVivification()
1082 - time_list = mh.AutoVivification()
1083 -
10841193 # Load the SQL File & Format
1085 - filename = './sql/' + self.query_name + '.sql'
 1194+ filename = './sql/' + query_name + '.sql'
10861195 sql_stmnt = mh.read_sql(filename)
10871196
1088 - sql_stmnt = query_obj.format_query(self.query_name, sql_stmnt, [self.start_time, self.end_time, self.cmpgn_1, self.item_1])
 1197+ sql_stmnt = query_obj.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval])
 1198+ #print sql_stmnt
10891199
 1200+ # Get Indexes into Query
 1201+ key_index = query_obj.get_banner_index(query_name)
 1202+ metric_index = query_obj.get_metric_index(query_name, metric_name)
10901203 time_index = query_obj.get_time_index(query_name)
1091 - metric_index = query_obj.get_metric_index(query_name, metric_name)
10921204
10931205 # Composes the data for each banner
10941206 try:
@@ -1097,29 +1209,119 @@
10981210 results = self.cur.fetchall()
10991211
11001212 for row in results:
1101 -
 1213+
11021214 key_name = row[key_index]
1103 -
 1215+ time_obj = self.timestamp_to_obj(row[time_index], 1) # format = 1, 14-digit TS
 1216+
 1217+ """ For each new dictionary index by key name start a new list if its not already there """
11041218 try:
1105 - metric_lists[key_name].append(row[metric_index])
1106 - time_lists[key_name].append(row[time_index])
 1219+ metrics[key_name].append(row[metric_index])
 1220+ times[key_name].append(time_obj)
11071221 except:
1108 - metric_lists[key_name] = list()
1109 - time_lists[key_name] = list()
1110 -
1111 - metric_lists[key_name].append(row[metric_index])
1112 - time_lists[key_name].append(row[time_index])
1113 -
 1222+ metrics[key_name] = list()
 1223+ times[key_name] = list()
 1224+
 1225+ metrics[key_name].append(row[metric_index])
 1226+ times[key_name].append(time_obj)
 1227+
11141228 except:
11151229 self.db.rollback()
11161230 sys.exit("Database Interface Exception:\n" + err_msg)
11171231
 1232+ """ Convert Times to Integers that indicate relative times AND normalize the intervals in case any are missing """
 1233+ for key in times.keys():
 1234+ times_norm[key] = self.normalize_timestamps(times[key], False, 2)
 1235+ times_norm[key], metrics[key] = self.normalize_intervals(times_norm[key], metrics[key], interval)
 1236+
11181237 self.close_db()
11191238
1120 - def gen_plot(self,counts, times, title, xlabel, ylabel, ranges, subplot_index, fname):
1121 - return
 1239+ return [metrics, times_norm]
 1240+
11221241
1123 - def run(self):
1124 - self.run_query()
 1242+ def gen_plot(self, metrics, times, title, xlabel, ylabel, ranges, subplot_index, fname):
 1243+
 1244+ pylab.subplot(subplot_index)
 1245+ pylab.figure(num=None,figsize=[26,14])
 1246+
 1247+ line_types = ['b-o','g-o','r-o','c-o','m-o','k-o','y-o','b--d','g--d','r--d','c--d','m--d','k--d','y--d','b-.s','g-.s','r-.s','c-.s','m-.s','k-.s','y-.s']
 1248+
 1249+ count = 0
 1250+ for key in metrics.keys():
 1251+ pylab.plot(times[key], metrics[key], line_types[count])
 1252+ count = count + 1
 1253+
 1254+ pylab.grid()
 1255+ pylab.xlim(ranges[0], ranges[1])
 1256+ pylab.legend(metrics.keys(),loc=2)
 1257+
 1258+ pylab.xlabel(xlabel)
 1259+ pylab.ylabel(ylabel)
 1260+
 1261+ pylab.title(title)
 1262+ pylab.savefig(fname, format='png')
 1263+
 1264+
 1265+
 1266+ """
 1267+
 1268+ start_time, end_time -- start and end timestamps
 1269+
 1270+ """
 1271+ def run(self, start_time, end_time, interval, query_name, metric_name, campaign):
 1272+
 1273+ query_obj = qs.query_store()
 1274+
 1275+ print '\nGenerating ' + query_name +', start and end times are: ' + start_time + ' - ' + end_time +' ... \n'
 1276+
 1277+ # Run Query
 1278+ return_val = self.run_query(start_time, end_time, interval, query_name, metric_name, campaign)
 1279+ counts = return_val[0]
 1280+ times = return_val[1]
 1281+
 1282+ # Normalize times
 1283+ min_time = min(times)
 1284+ ranges = [min_time, 0]
 1285+
 1286+ xlabel = 'Hours'
 1287+ subplot_index = 111
 1288+ fname = query_name + '.png'
 1289+
 1290+ title = query_obj.get_plot_title(query_name)
 1291+ title = title + ' -- ' + start_time + ' - ' + end_time
 1292+ ylabel = query_obj.get_plot_ylabel(query_name)
 1293+
 1294+ # Convert counts to float (from Decimal) to prevent exception when bar plotting
 1295+ # Bbox::update_numerix_xy expected numerix array
 1296+ for key in counts.keys():
 1297+ counts_new = list()
 1298+ for i in range(len(counts[key])):
 1299+ counts_new.append(float(counts[key][i]))
 1300+ counts[key] = counts_new
 1301+
 1302+ """ Determine List maximums """
 1303+ times_max = 0
 1304+ metrics_max = 0
 1305+
 1306+ for key in counts.keys():
 1307+ list_max = max(counts[key])
 1308+ if list_max > metrics_max:
 1309+ metrics_max = list_max
 1310+
 1311+ for key in times.keys():
 1312+ list_max = max(times[key])
 1313+ if list_max > times_max:
 1314+ times_max = list_max
 1315+
 1316+ print times_max
 1317+ print metrics_max
 1318+ ranges = list()
 1319+ ranges.append(0)
 1320+ ranges.append(times_max * 1.1)
 1321+ ranges.append(0)
 1322+ ranges.append(metrics_max * 1.1)
 1323+
 1324+
 1325+ # Generate Histogram
11251326 self.gen_plot(counts, times, title, xlabel, ylabel, ranges, subplot_index, fname)
1126 -
\ No newline at end of file
 1327+
 1328+
Index: trunk/fundraiser-statistics/fundraiser-scripts/query_store.py
@@ -126,7 +126,22 @@
127127
128128 elif query_name == 'report_ecomm_by_contact':
129129 where_str = args[0]
130 - sql_stmnt = sql_stmnt % ('%', '%', '%', '%', where_str)
 130+ sql_stmnt = sql_stmnt % ('%', '%', '%', '%', where_str)
 131+
 132+ elif query_name == 'report_banner_metrics_minutely':
 133+ start_time = args[0]
 134+ end_time = args[1]
 135+ campaign = args[2]
 136+ interval = args[3]
 137+ sql_stmnt = sql_stmnt % ('%', '%', '%', '%', interval, interval, start_time, end_time, '%', '%', '%', '%', interval, interval, start_time, end_time, campaign, \
 138+ '%', '%', '%', '%', interval, interval, start_time, end_time, campaign)
 139+
 140+ elif query_name == 'report_LP_metrics_minutely':
 141+ start_time = args[0]
 142+ end_time = args[1]
 143+ campaign = args[2]
 144+ interval = args[3]
 145+ sql_stmnt = sql_stmnt % ('%', '%', '%', '%', interval, interval, start_time, end_time, campaign, '%', '%', '%', '%', interval, interval, start_time, end_time, campaign)
131146
132147 else:
133148 return 'no such table\n'
@@ -151,6 +166,13 @@
152167 else:
153168 return 'no such table'
154169
 170+ """ Returns the index of the key for the query data """
 171+ def get_key_index(self, query_name):
 172+ if query_name == 'report_banner_metrics_minutely':
 173+ return 1
 174+ elif query_name == 'report_LP_metrics_minutely':
 175+ return 1
 176+
155177 def get_count_index(self, query_name):
156178 if query_name == 'report_lp_views_by_hour':
157179 return 1
@@ -176,6 +198,10 @@
177199 return 0
178200 elif query_name == 'report_lp_views_by_hour':
179201 return 0
 202+ elif query_name == 'report_banner_metrics_minutely':
 203+ return 0
 204+ elif query_name == 'report_LP_metrics_minutely':
 205+ return 0
180206 else:
181207 return -1
182208
@@ -202,6 +228,8 @@
203229 return 1
204230 elif query_name == 'report_bannerLP_metrics':
205231 return 1
 232+ elif query_name == 'report_banner_metrics_minutely':
 233+ return 1
206234 else:
207235 return -1
208236
@@ -216,6 +244,8 @@
217245 return 2
218246 elif query_name == 'report_bannerLP_metrics':
219247 return 1
 248+ elif query_name == 'report_LP_metrics_minutely':
 249+ return 1
220250 else:
221251 return -1
222252
@@ -317,9 +347,24 @@
318348 return 15
319349 else:
320350 return -1
 351+ elif query_name == 'report_LP_metrics_minutely':
 352+ if metric_name == 'don_per_view':
 353+ return 7
 354+ elif metric_name == 'amt50_per_view':
 355+ return 9
 356+ else:
 357+ return -1
 358+ elif query_name == 'report_banner_metrics_minutely':
 359+ if metric_name == 'don_per_imp':
 360+ return 10
 361+ elif metric_name == 'amt50_per_imp':
 362+ return 12
 363+ else:
 364+ return -1
321365 else:
322366 return 'no such table'
323367
 368+
324369 def get_plot_title(self, query_name):
325370 if query_name == 'report_banner_impressions_by_hour':
326371 return 'Banner Impressions Over the Past 24 Hours'

Status & tagging log