r86602 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86601‎ | r86602 | r86603 >
Date:07:56, 21 April 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Functionality decoupled and added to reporting classes.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/compute_confidence.py (deleted) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/compute_confidence.py
@@ -1,622 +0,0 @@
2 -
3 -
4 -"""
5 -
6 -This module defines reporting and analysis for determining the statistical confidence
 7 -of choice metrics over time.
8 -
9 -!!MODIFY!!
10 -
11 -- This should extend DataReporting.DataReporting
12 -- The querying functionality should be exported to DataLoader
13 -
14 -"""
15 -
16 -__author__ = "Ryan Faulkner"
17 -__revision__ = "$Rev$"
18 -__date__ = "January 11th, 2011"
19 -
20 -import sys
21 -sys.path.append('../')
22 -
23 -import math
24 -import datetime as dt
25 -import MySQLdb
26 -import pylab
27 -import matplotlib
28 -
29 -import miner_help as mh
30 -import QueryData as QD
31 -import DataLoader as DL
32 -import TimestampProcessor as TP
33 -
34 -matplotlib.use('Agg')
35 -
36 -
37 -"""
38 -
39 - CLASS :: ConfidenceTest
40 -
41 -
42 - METHODS:
43 - query_tables
44 - get_time_lists
45 - gen_plot
46 - run_test
47 - compute_parameters
48 - print_metrics
49 -
50 -"""
51 -class ConfidenceTest(object):
52 -
53 - _data_loader_ = None
54 -
55 - """
56 - """
57 - def __init__(self):
58 - self._data_loader_ = DL.DataLoader()
59 -
60 - """
61 - ConfidenceTesting :: query_tables
62 - """
63 - def query_tables(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
64 -
65 - ret = self.get_time_lists(start_time, end_time, interval, num_samples)
66 - times = ret[0]
67 - times_indices = ret[1]
68 -
69 - self._data_loader_.init_db()
70 -
71 - filename = '../sql/' + query_name + '.sql'
72 - sql_stmnt = mh.read_sql(filename)
73 -
74 - metric_index = QD.get_metric_index(query_name, metric_name)
75 - metrics_1 = []
76 - metrics_2 = []
77 -
78 - for i in range(len(times) - 1):
79 -
80 - # print '\nExecuting number ' + str(i) + ' batch of of data.'
81 - t1 = times[i]
82 - t2 = times[i+1]
83 -
84 - formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
85 - formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
86 -
87 - try:
88 - err_msg = formatted_sql_stmnt_1
89 -
90 - self._data_loader_._cur_.execute(formatted_sql_stmnt_1)
91 - results_1 = self._data_loader_._cur_.fetchone() # there should only be a single row
92 -
93 - err_msg = formatted_sql_stmnt_2
94 -
95 - self._data_loader_._cur_.execute(formatted_sql_stmnt_2)
96 - results_2 = self._data_loader_._cur_.fetchone() # there should only be a single row
97 -
98 - except Exception as inst:
99 - print type(inst) # the exception instance
100 - print inst.args # arguments stored in .args
101 - print inst # __str__ allows args to printed directly
102 -
103 - self._data_loader_._db_.rollback()
104 - sys.exit("Database Interface Exception:\n" + err_msg)
105 -
106 - """ If no results are returned in this set the sample value is 0.0 """
107 - try:
108 - metrics_1.append(results_1[metric_index])
109 - except TypeError:
110 - metrics_1.append(0.0)
111 - try:
112 - metrics_2.append(results_2[metric_index])
113 - except TypeError:
114 - metrics_2.append(0.0)
115 -
116 - #print metrics_1
117 - #print metrics_2
118 -
119 - self._data_loader_.close_db()
120 -
121 - # return the metric values at each time
122 - return [metrics_1, metrics_2, times_indices]
123 -
124 -
125 -
126 - """
127 - ConfidenceTesting :: get_time_lists
128 -
129 - num_samples is the
130 - interval - intervals at which samples are drawn within the range, units = minutes
131 - start_time, end_time - timestamps 'yyyymmddhhmmss'
132 - """
133 - def get_time_lists(self, start_time, end_time, interval, num_samples):
134 -
135 - # range must be divisible by interval - convert to hours
136 - range = float(interval * num_samples) / 60
137 -
138 - # Compose times
139 - start_datetime = dt.datetime(int(start_time[0:4]), int(start_time[4:6]), int(start_time[6:8]), int(start_time[8:10]), int(start_time[10:12]), int(start_time[12:14]))
140 - end_datetime = dt.datetime(int(end_time[0:4]), int(end_time[4:6]), int(end_time[6:8]), int(end_time[8:10]), int(end_time[10:12]), int(end_time[12:14]))
141 -
142 - # current timestamp and hour index
143 - curr_datetime = start_datetime
144 - curr_timestamp = start_time
145 - curr_hour_index = 0.0
146 -
147 - # lists to store timestamps and indices
148 - times = []
149 - time_indices = []
150 -
151 - sample_count = 1
152 -
153 - # build a list of timestamps and time indices for plotting
154 - # increment the time
155 - while curr_datetime < end_datetime:
156 -
157 - # for timestamp formatting
158 - month_str_fill = ''
159 - day_str_fill = ''
160 - hour_str_fill = ''
161 - minute_str_fill = ''
162 - if curr_datetime.month < 10:
163 - month_str_fill = '0'
164 - if curr_datetime.day < 10:
165 - month_str_fill = '0'
166 - if curr_datetime.hour < 10:
167 - hour_str_fill = '0'
168 - if curr_datetime.minute < 10:
169 - minute_str_fill = '0'
170 -
171 - curr_timestamp = str(curr_datetime.year) + month_str_fill + str(curr_datetime.month) + day_str_fill + str(curr_datetime.day) + hour_str_fill+ str(curr_datetime.hour) + minute_str_fill+ str(curr_datetime.minute) + '00'
172 - times.append(curr_timestamp)
173 -
174 - # increment curr_hour_index if the
175 - if sample_count == num_samples:
176 -
177 - time_indices.append(curr_hour_index + range / 2)
178 - curr_hour_index = curr_hour_index + range
179 - sample_count = 1
180 - else:
181 - sample_count = sample_count + 1
182 -
183 -
184 - # increment the time by interval minutes
185 - td = dt.timedelta(minutes=interval)
186 - curr_datetime = curr_datetime + td
187 -
188 - # append the last items onto time lists
189 - times.append(end_time)
190 - # added_index = float(end_datetime.hour - curr_datetime.hour) + float(end_datetime.minute - curr_datetime.minute) / 60
191 - # curr_hour_index = float(curr_hour_index) + range / 2
192 - # time_indices.append(curr_hour_index)
193 -
194 - return [times, time_indices]
195 - # compute parameters for each sample range (mean, standard deviation)
196 -
197 -
198 -
199 - """
200 - ConfidenceTesting :: gen_plot
201 -
202 - plot the test results with errorbars
203 - """
204 - def gen_plot(self,means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname):
205 -
206 - file_format = 'png'
207 -
208 - pylab.subplot(subplot_index)
209 - pylab.figure(num=None,figsize=[26,14])
210 -
211 - e1 = pylab.errorbar(times_indices, means_1, yerr=std_devs_1, fmt='xb-')
212 - e2 = pylab.errorbar(times_indices, means_2, yerr=std_devs_2, fmt='dr-')
213 - # pylab.hist(counts, times)
214 -
215 - """ Set the figure and font size """
216 - fig_width_pt = 246.0 # Get this from LaTeX using \showthe\columnwidth
217 - inches_per_pt = 1.0/72.27 # Convert pt to inch
218 - golden_mean = (math.sqrt(5)-1.0)/2.0 # Aesthetic ratio
219 - fig_width = fig_width_pt*inches_per_pt # width in inches
220 - fig_height = fig_width*golden_mean # height in inches
221 - fig_size = [fig_width,fig_height]
222 -
223 - font_size = 20
224 -
225 - params = { 'axes.labelsize': font_size,
226 - 'text.fontsize': font_size,
227 - 'xtick.labelsize': font_size,
228 - 'ytick.labelsize': font_size,
229 - 'legend.pad': 0.1, # empty space around the legend box
230 - 'legend.fontsize': font_size,
231 - 'font.size': font_size,
232 - 'text.usetex': False,
233 - 'figure.figsize': fig_size}
234 -
235 - pylab.rcParams.update(params)
236 -
237 - pylab.grid()
238 - pylab.ylim(ranges[2], ranges[3])
239 - pylab.xlim(ranges[0], ranges[1])
240 - pylab.legend([e1[0], e2[0]], labels,loc=2)
241 -
242 - pylab.xlabel(xlabel)
243 - pylab.ylabel(ylabel)
244 -
245 - pylab.title(title)
246 - pylab.savefig(fname + '.' + file_format, format=file_format)
247 -
248 -
249 - """
250 - ConfidenceTesting :: run_test
251 -
252 - Executes the confidence test - prints and plots the results
253 - """
254 - def run_test(self, test_name, query_name, metric_name, campaign, items, start_time, end_time, interval, num_samples):
255 -
256 - """ TEMPORARY - map items and labels, this should be more generalized """
257 - counter = 1
258 - for key in items.keys():
259 - if counter == 1:
260 - item_1 = items[key]
261 - label_1 = key
262 - elif counter == 2:
263 - item_2 = items[key]
264 - label_2 = key
265 - counter += 1
266 -
267 - """ Retrieve values from database """
268 - ret = self.query_tables(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples)
269 - metrics_1 = ret[0]
270 - metrics_2 = ret[1]
271 - times_indices = ret[2]
272 -
273 - """ run the confidence test """
274 - ret = self.confidence_test(metrics_1, metrics_2, num_samples)
275 - means_1 = ret[0]
276 - means_2 = ret[1]
277 - std_devs_1 = ret[2]
278 - std_devs_2 = ret[3]
279 - confidence = ret[4]
280 -
281 - """ plot the results """
282 - xlabel = 'Hours'
283 - subplot_index = 111
284 - fname = './tests/' + campaign + '_conf_' + metric_name
285 -
286 - title = confidence + '\n\n' + test_name + ' -- ' + TP.timestamp_convert_format(start_time,1,2) + ' - ' + TP.timestamp_convert_format(end_time,1,2)
287 -
288 - max_mean = max(max(means_1),max(means_2))
289 - max_sd = max(max(std_devs_1),max(std_devs_2))
290 - max_y = float(max_mean) + float(max_sd)
291 - max_y = max_y + 0.1 * max_y
292 - max_x = max(times_indices) + min(times_indices)
293 - ranges = [0.0, max_x, 0, max_y]
294 -
295 - ylabel = metric_name
296 - labels = [label_1, label_2]
297 -
298 - self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname)
299 -
300 - """ Print out results """
301 - test_call = "run_test('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \
302 - item_1 + "', '" + item_2 + "', '" + start_time + "', '" + end_time + "', " + str(interval) + ", " + str(num_samples) + ")"
303 - self.print_metrics(fname, title, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call)
304 -
305 - return
306 -
307 -
308 - """
309 - assess the confidence of the winner - define in subclass
310 - """
311 - def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples):
312 - return
313 -
314 -
315 - """
316 - assess the confidence of the winner - define in subclass
317 - """
318 - def compute_parameters(self, metrics_1, metrics_2, num_samples):
319 -
320 - # A trial represents a group of samples over which parameters are computed
321 - num_trials = int(math.ceil(len(metrics_1) / num_samples))
322 -
323 - means_1 = []
324 - means_2 = []
325 - vars_1 = []
326 - vars_2 = []
327 -
328 - m_tot = 0
329 - sd_tot = 0
330 -
331 - # Compute the mean and variance for each group across all trials
332 - for i in range(num_trials):
333 -
334 - m1 = 0.0 # mean of group 1
335 - m2 = 0.0 # mean of group 2
336 - var1 = 0.0 # variance of group 1
337 - var2 = 0.0 # variance of group 2
338 -
339 - for j in range(num_samples):
340 - index = i * num_samples + j
341 -
342 - # Compute mean for each group
343 - m1 = m1 + float(metrics_1[index])
344 - m2 = m2 + float(metrics_2[index])
345 -
346 - m1 = m1 / num_samples
347 - m2 = m2 / num_samples
348 -
349 - # Compute Sample Variance for each group
350 - for j in range(num_samples):
351 - index = i + j
352 -
353 - var1 = var1 + math.pow((float(metrics_1[i]) - m1), 2)
354 - var2 = var2 + math.pow((float(metrics_2[i]) - m2), 2)
355 -
356 - means_1.append(float(m1))
357 - means_2.append(float(m2))
358 - vars_1.append(var1 / num_samples)
359 - vars_2.append(var2 / num_samples)
360 -
361 - return [num_trials, means_1, means_2, vars_1, vars_2]
362 -
363 -
364 - """ Print in Tabular form the means and standard deviation of each group over each interval """
365 - def print_metrics(self, filename, metric_name, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call):
366 -
367 - filename += '.txt'
368 - file = open(filename, 'w')
369 -
370 - """ Compute % increase and report """
371 - av_means_1 = sum(means_1) / len(means_1)
372 - av_means_2 = sum(means_2) / len(means_2)
373 - percent_increase = math.fabs(av_means_1 - av_means_2) / min(av_means_1,av_means_2) * 100.0
374 -
375 - """ Compute the average standard deviations """
376 - av_std_dev_1 = 0
377 - av_std_dev_2 = 0
378 -
379 - for i in range(len(std_devs_1)):
380 - av_std_dev_1 = av_std_dev_1 + math.pow(std_devs_1[i], 2)
381 - av_std_dev_2 = av_std_dev_2 + math.pow(std_devs_2[i], 2)
382 -
383 - av_std_dev_1 = math.pow(av_std_dev_1 / len(std_devs_1), 0.5)
384 - av_std_dev_2 = math.pow(av_std_dev_2 / len(std_devs_1), 0.5)
385 -
386 - """ Assign the winner """
387 - if av_means_1 > av_means_2:
388 - winner = labels[0]
389 - else:
390 - winner = labels[1]
391 -
392 - win_str = "\nThe winner " + winner + " had a %.2f%s increase."
393 - win_str = win_str % (percent_increase, '%')
394 -
395 - print '\nCOMMAND = ' + test_call
396 - file.write('\nCOMMAND = ' + test_call)
397 -
398 -
399 - print '\n\n' + metric_name
400 - print '\nitem 1 = ' + labels[0]
401 - print 'item 2 = ' + labels[1]
402 - print win_str
403 - print '\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
404 - file.write('\n\n' + metric_name)
405 - file.write('\n\nitem 1 = ' + labels[0] + '\n')
406 - file.write('item 2 = ' + labels[1] + '\n')
407 - file.write(win_str)
408 - file.write('\n\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n\n')
409 -
410 -
411 - """ Print out the parameters for each interval """
412 -
413 - for i in range(len(times_indices)):
414 - line_args = str(i) + '\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
415 - line_str = line_args % (means_1[i], means_2[i], std_devs_1[i], std_devs_2[i])
416 - print line_str
417 - file.write(line_str)
418 -
419 - """ Print out the averaged parameters """
420 - line_args = '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
421 - line_str = line_args % (av_means_1, av_means_2, av_std_dev_1, av_std_dev_2)
422 -
423 - print '\n\nOverall Parameters -- the confidence test was run with these parameters:\n'
424 - print '\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
425 - print line_str
426 -
427 - file.write('\n\nOverall Parameters:\n')
428 - file.write('\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n')
429 - file.write(line_str)
430 -
431 -
432 - file.close()
433 -
434 -
435 -"""
436 -
437 -Implements a Wald test where the distribution of donations over a given period are assumed to be normal
438 -
439 -http://en.wikipedia.org/wiki/Wald_test
440 -
441 -"""
442 -class WaldTest(ConfidenceTest):
443 -
444 - def confidence_test(self, metrics_1, metrics_2, num_samples):
445 -
446 - ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
447 - num_trials = ret[0]
448 - means_1 = ret[1]
449 - means_2 = ret[2]
450 - vars_1 = ret[3]
451 - vars_2 = ret[4]
452 -
453 - """ Compute std devs """
454 - std_devs_1 = []
455 - std_devs_2 = []
456 - for i in range(len(vars_1)):
457 - std_devs_1.append(math.pow(vars_1[i], 0.5))
458 - std_devs_2.append(math.pow(vars_2[i], 0.5))
459 -
460 - m_tot = 0
461 - sd_tot = 0
462 -
463 - # Compute the parameters for the Wald test
464 - # The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
465 - # where X{1,2} is the random variable corresponding to the group {1,2}
466 - for i in range(num_trials):
467 -
468 - # Perform wald - compose W = X1 - X2 for each trial
469 - sd = math.pow(vars_1[i] + vars_2[i], 0.5)
470 - m = math.fabs(means_1[i] - means_2[i])
471 -
472 - m_tot = m_tot + m
473 - sd_tot = sd_tot + sd
474 -
475 -
476 - W = m_tot / sd_tot
477 - # print W
478 -
479 - # determine the probability that the
480 - if (W >= 1.9):
481 - conf_str = '95% confident about the winner.'
482 - P = 0.95
483 - elif (W >= 1.6):
484 - conf_str = '89% confident about the winner.'
485 - P = 0.89
486 - elif (W >= 1.3):
487 - conf_str = '81% confident about the winner.'
488 - P = 0.81
489 - elif (W >= 1.0):
490 - conf_str = '73% confident about the winner.'
491 - P = 0.73
492 - elif (W >= 0.9):
493 - conf_str = '68% confident about the winner.'
494 - P = 0.68
495 - elif (W >= 0.8):
496 - conf_str = '63% confident about the winner.'
497 - P = 0.63
498 - elif (W >= 0.7):
499 - conf_str = '52% confident about the winner.'
500 - P = 0.52
501 - elif (W >= 0.6):
502 - conf_str = '45% confident about the winner.'
503 - P = 0.45
504 - elif (W >= 0.5):
505 - conf_str = '38% confident about the winner.'
506 - P = 0.38
507 - elif (W >= 0.4):
508 - conf_str = '31% confident about the winner.'
509 - P = 0.31
510 - elif (W >= 0.3):
511 - conf_str = '24% confident about the winner.'
512 - P = 0.24
513 - elif (W >= 0.2):
514 - conf_str = '16% confident about the winner.'
515 - P = 0.16
516 - elif (W >= 0.1):
517 - conf_str = '8% confident about the winner.'
518 - P = 0.08
519 - else:
520 - conf_str = 'There is no clear winner.'
521 - P = 0.08
522 -
523 -
524 - return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
525 -
526 -
527 -"""
528 -
529 -Implements a Student's T test where the distribution of donations over a given period are assumed to resemble those of a students t distribution
530 -
531 -http://en.wikipedia.org/wiki/Student%27s_t-test
532 -
533 -"""
534 -class TTest(ConfidenceTest):
535 -
536 - def confidence_test(self, metrics_1, metrics_2, num_samples):
537 -
538 - """ retrieve means and variances """
539 - ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
540 - num_trials = ret[0]
541 - means_1 = ret[1]
542 - means_2 = ret[2]
543 - vars_1 = ret[3]
544 - vars_2 = ret[4]
545 -
546 - """ Compute std devs """
547 - std_devs_1 = []
548 - std_devs_2 = []
549 - for i in range(len(vars_1)):
550 - std_devs_1.append(math.pow(vars_1[i], 0.5))
551 - std_devs_2.append(math.pow(vars_2[i], 0.5))
552 -
553 - m_tot = 0
554 - var_1_tot = 0
555 - var_2_tot = 0
556 -
557 - """ Compute the parameters for the student's t-test
558 - The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
559 - where X{1,2} is the random variable corresponding to the group {1,2} """
560 - for i in range(num_trials):
561 -
562 - m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
563 - var_1_tot = var_1_tot + vars_1[i]
564 - var_2_tot = var_2_tot + vars_2[i]
565 -
566 - m = m_tot / num_trials
567 - s_1 = var_1_tot / num_trials
568 - s_2 = var_2_tot / num_trials
569 -
570 - total_samples = len(metrics_1)
571 -
572 - t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
573 - degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * (total_samples - 1)
574 -
575 -
576 - """ lookup confidence """
577 - # get t and df
578 - degrees_of_freedom = math.ceil(degrees_of_freedom)
579 - if degrees_of_freedom > 30:
580 - degrees_of_freedom = 99
581 -
582 - select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)
583 -
584 - self._data_loader_.init_db()
585 -
586 - try:
587 - self._data_loader_._cur_.execute(select_stmnt)
588 - results = self._data_loader_._cur_.fetchone()
589 -
590 - if results[0] != None:
591 - p = float(results[0])
592 - else:
593 - p = .0005
594 - except:
595 - self._data_loader_._db_.rollback()
596 - self._data_loader_._db_.close()
597 - sys.exit('Could not execute: ' + select_stmnt)
598 -
599 - #print p
600 - self._data_loader_._db_.close()
601 -
602 - probs = [0.400000, 0.250000, 0.100000, 0.050000, 0.025000, 0.010000, 0.005000, 0.000500]
603 - prob_diffs = [math.fabs(i-p) for i in probs]
604 - min_index = min((n, i) for i, n in enumerate(prob_diffs))[1]
605 -
606 - if min_index > 0:
607 - lower_p = probs[min_index - 1]
608 -
609 - conf_str = 'Between ' + str((1 - lower_p) * 100) + '% and ' + str((1 - p) * 100) + '% confident about the winner.'
610 -
611 - return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
612 -
613 -"""
614 -
615 -Implements a Chi Square test where the distribution of donations over a given period are assumed to resemble those of a students t distribution
616 -
617 -http://en.wikipedia.org/wiki/Chi-square_test
618 -
619 -"""
620 -class ChiSquareTest(ConfidenceTest):
621 - def confidence_test(self, metrics_1, metrics_2, num_samples):
622 - return
623 -
\ No newline at end of file

Status & tagging log