Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/compute_confidence.py
@@ -1,622 +0,0 @@
-
-
-"""
-
-This module defines reporting and analysis for determining the statistical confidence
-of choice metrics over time.
-
-!!MODIFY!!
-
-- This should extend DataReporting.DataReporting
-- The querying functionality should be exported to DataLoader
-
-"""
-
-__author__ = "Ryan Faulkner"
-__revision__ = "$Rev$"
-__date__ = "January 11th, 2011"
-
-import sys
-sys.path.append('../')
-
-import math
-import datetime as dt
-import MySQLdb
-
-import matplotlib
-matplotlib.use('Agg')  # select a non-interactive backend; must precede the pylab import to take effect
-import pylab
-
-import miner_help as mh
-import QueryData as QD
-import DataLoader as DL
-import TimestampProcessor as TP
-
-
-"""
-
-    CLASS :: ConfidenceTest
-
-
-    METHODS:
-        query_tables
-        get_time_lists
-        gen_plot
-        run_test
-        compute_parameters
-        print_metrics
-
-"""
-class ConfidenceTest(object):
-
-    _data_loader_ = None
-
-    """
-    """
-    def __init__(self):
-        self._data_loader_ = DL.DataLoader()
-
-    """
-        ConfidenceTest :: query_tables
-
-        Queries the metric of interest for each of the two items over consecutive
-        sample intervals between start_time and end_time, returning one sample
-        value per interval for each item.
-    """
-    def query_tables(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
-
-        ret = self.get_time_lists(start_time, end_time, interval, num_samples)
-        times = ret[0]
-        times_indices = ret[1]
-
-        self._data_loader_.init_db()
-
-        filename = '../sql/' + query_name + '.sql'
-        sql_stmnt = mh.read_sql(filename)
-
-        metric_index = QD.get_metric_index(query_name, metric_name)
-        metrics_1 = []
-        metrics_2 = []
-
-        for i in range(len(times) - 1):
-
-            # print '\nExecuting number ' + str(i) + ' batch of data.'
-            t1 = times[i]
-            t2 = times[i+1]
-
-            formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
-            formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
-
-            try:
-                err_msg = formatted_sql_stmnt_1
-
-                self._data_loader_._cur_.execute(formatted_sql_stmnt_1)
-                results_1 = self._data_loader_._cur_.fetchone()  # there should only be a single row
-
-                err_msg = formatted_sql_stmnt_2
-
-                self._data_loader_._cur_.execute(formatted_sql_stmnt_2)
-                results_2 = self._data_loader_._cur_.fetchone()  # there should only be a single row
-
-            except Exception as inst:
-                print type(inst)  # the exception instance
-                print inst.args   # arguments stored in .args
-                print inst        # __str__ allows args to be printed directly
-
-                self._data_loader_._db_.rollback()
-                sys.exit("Database Interface Exception:\n" + err_msg)
-
-            """ If no results are returned in this set the sample value is 0.0 """
-            try:
-                metrics_1.append(results_1[metric_index])
-            except TypeError:
-                metrics_1.append(0.0)
-            try:
-                metrics_2.append(results_2[metric_index])
-            except TypeError:
-                metrics_2.append(0.0)
-
-        # print metrics_1
-        # print metrics_2
-
-        self._data_loader_.close_db()
-
-        # return the metric values at each time
-        return [metrics_1, metrics_2, times_indices]
-
-
-
-    """
-        ConfidenceTest :: get_time_lists
-
-        num_samples - the number of interval samples that make up one trial (sample group)
-        interval - interval at which samples are drawn within the range, units = minutes
-        start_time, end_time - timestamps 'yyyymmddhhmmss'
-    """
-    def get_time_lists(self, start_time, end_time, interval, num_samples):
-
-        # range must be divisible by interval - convert to hours
-        sample_range = float(interval * num_samples) / 60
-
-        # Compose times
-        start_datetime = dt.datetime(int(start_time[0:4]), int(start_time[4:6]), int(start_time[6:8]), int(start_time[8:10]), int(start_time[10:12]), int(start_time[12:14]))
-        end_datetime = dt.datetime(int(end_time[0:4]), int(end_time[4:6]), int(end_time[6:8]), int(end_time[8:10]), int(end_time[10:12]), int(end_time[12:14]))
-
-        # current timestamp and hour index
-        curr_datetime = start_datetime
-        curr_timestamp = start_time
-        curr_hour_index = 0.0
-
-        # lists to store timestamps and indices
-        times = []
-        time_indices = []
-
-        sample_count = 1
-
-        # build a list of timestamps and time indices for plotting
-        # increment the time
-        while curr_datetime < end_datetime:
-
-            # for timestamp formatting - zero-pad single digit fields
-            month_str_fill = ''
-            day_str_fill = ''
-            hour_str_fill = ''
-            minute_str_fill = ''
-            if curr_datetime.month < 10:
-                month_str_fill = '0'
-            if curr_datetime.day < 10:
-                day_str_fill = '0'
-            if curr_datetime.hour < 10:
-                hour_str_fill = '0'
-            if curr_datetime.minute < 10:
-                minute_str_fill = '0'
-
-            curr_timestamp = str(curr_datetime.year) + month_str_fill + str(curr_datetime.month) + day_str_fill + str(curr_datetime.day) + hour_str_fill + str(curr_datetime.hour) + minute_str_fill + str(curr_datetime.minute) + '00'
-            times.append(curr_timestamp)
-
-            # increment curr_hour_index once a full trial of samples has been collected
-            if sample_count == num_samples:
-
-                time_indices.append(curr_hour_index + sample_range / 2)
-                curr_hour_index = curr_hour_index + sample_range
-                sample_count = 1
-            else:
-                sample_count = sample_count + 1
-
-
-            # increment the time by interval minutes
-            td = dt.timedelta(minutes=interval)
-            curr_datetime = curr_datetime + td
-
-        # append the last items onto time lists
-        times.append(end_time)
-        # added_index = float(end_datetime.hour - curr_datetime.hour) + float(end_datetime.minute - curr_datetime.minute) / 60
-        # curr_hour_index = float(curr_hour_index) + sample_range / 2
-        # time_indices.append(curr_hour_index)
-
-        return [times, time_indices]
-    # compute parameters for each sample range (mean, standard deviation)
-
-
-
-    """
-        ConfidenceTest :: gen_plot
-
-        plot the test results with errorbars
-    """
-    def gen_plot(self, means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname):
-
-        file_format = 'png'
-
-        # create the figure before selecting the subplot so the axes land on it
-        pylab.figure(num=None, figsize=[26, 14])
-        pylab.subplot(subplot_index)
-
-        e1 = pylab.errorbar(times_indices, means_1, yerr=std_devs_1, fmt='xb-')
-        e2 = pylab.errorbar(times_indices, means_2, yerr=std_devs_2, fmt='dr-')
-        # pylab.hist(counts, times)
-
-        """ Set the figure and font size """
-        fig_width_pt = 246.0                    # Get this from LaTeX using \showthe\columnwidth
-        inches_per_pt = 1.0/72.27               # Convert pt to inch
-        golden_mean = (math.sqrt(5)-1.0)/2.0    # Aesthetic ratio
-        fig_width = fig_width_pt*inches_per_pt  # width in inches
-        fig_height = fig_width*golden_mean      # height in inches
-        fig_size = [fig_width, fig_height]
-
-        font_size = 20
-
-        params = {'axes.labelsize': font_size,
-                  'text.fontsize': font_size,
-                  'xtick.labelsize': font_size,
-                  'ytick.labelsize': font_size,
-                  'legend.pad': 0.1,  # empty space around the legend box
-                  'legend.fontsize': font_size,
-                  'font.size': font_size,
-                  'text.usetex': False,
-                  'figure.figsize': fig_size}
-
-        pylab.rcParams.update(params)
-
-        pylab.grid()
-        pylab.ylim(ranges[2], ranges[3])
-        pylab.xlim(ranges[0], ranges[1])
-        pylab.legend([e1[0], e2[0]], labels, loc=2)
-
-        pylab.xlabel(xlabel)
-        pylab.ylabel(ylabel)
-
-        pylab.title(title)
-        pylab.savefig(fname + '.' + file_format, format=file_format)
-
-
-    """
-        ConfidenceTest :: run_test
-
-        Executes the confidence test - prints and plots the results
-    """
-    def run_test(self, test_name, query_name, metric_name, campaign, items, start_time, end_time, interval, num_samples):
-
-        """ TEMPORARY - map items and labels, this should be more generalized """
-        counter = 1
-        for key in items.keys():
-            if counter == 1:
-                item_1 = items[key]
-                label_1 = key
-            elif counter == 2:
-                item_2 = items[key]
-                label_2 = key
-            counter += 1
-
-        """ Retrieve values from database """
-        ret = self.query_tables(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples)
-        metrics_1 = ret[0]
-        metrics_2 = ret[1]
-        times_indices = ret[2]
-
-        """ run the confidence test """
-        ret = self.confidence_test(metrics_1, metrics_2, num_samples)
-        means_1 = ret[0]
-        means_2 = ret[1]
-        std_devs_1 = ret[2]
-        std_devs_2 = ret[3]
-        confidence = ret[4]
-
-        """ plot the results """
-        xlabel = 'Hours'
-        subplot_index = 111
-        fname = './tests/' + campaign + '_conf_' + metric_name
-
-        title = confidence + '\n\n' + test_name + ' -- ' + TP.timestamp_convert_format(start_time, 1, 2) + ' - ' + TP.timestamp_convert_format(end_time, 1, 2)
-
-        max_mean = max(max(means_1), max(means_2))
-        max_sd = max(max(std_devs_1), max(std_devs_2))
-        max_y = float(max_mean) + float(max_sd)
-        max_y = max_y + 0.1 * max_y
-        max_x = max(times_indices) + min(times_indices)
-        ranges = [0.0, max_x, 0, max_y]
-
-        ylabel = metric_name
-        labels = [label_1, label_2]
-
-        self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname)
-
-        """ Print out results """
-        test_call = "run_test('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \
-            item_1 + "', '" + item_2 + "', '" + start_time + "', '" + end_time + "', " + str(interval) + ", " + str(num_samples) + ")"
-        self.print_metrics(fname, title, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call)
-
-        return
-
-
-    """
-        assess the confidence of the winner - defined in subclass
-    """
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-        return
-
-
-    """
-        compute the sample mean and variance of each group for every trial
-    """
-    def compute_parameters(self, metrics_1, metrics_2, num_samples):
-
-        # A trial represents a group of samples over which parameters are computed
-        # (an incomplete trailing group of samples is dropped)
-        num_trials = int(math.floor(float(len(metrics_1)) / num_samples))
-
-        means_1 = []
-        means_2 = []
-        vars_1 = []
-        vars_2 = []
-
-        m_tot = 0
-        sd_tot = 0
-
-        # Compute the mean and variance for each group across all trials
-        for i in range(num_trials):
-
-            m1 = 0.0    # mean of group 1
-            m2 = 0.0    # mean of group 2
-            var1 = 0.0  # variance of group 1
-            var2 = 0.0  # variance of group 2
-
-            for j in range(num_samples):
-                index = i * num_samples + j
-
-                # Compute mean for each group
-                m1 = m1 + float(metrics_1[index])
-                m2 = m2 + float(metrics_2[index])
-
-            m1 = m1 / num_samples
-            m2 = m2 / num_samples
-
-            # Compute Sample Variance for each group
-            for j in range(num_samples):
-                index = i * num_samples + j
-
-                var1 = var1 + math.pow((float(metrics_1[index]) - m1), 2)
-                var2 = var2 + math.pow((float(metrics_2[index]) - m2), 2)
-
-            means_1.append(float(m1))
-            means_2.append(float(m2))
-            vars_1.append(var1 / num_samples)
-            vars_2.append(var2 / num_samples)
-
-        return [num_trials, means_1, means_2, vars_1, vars_2]
-
-
-    """ Print in tabular form the means and standard deviations of each group over each interval """
-    def print_metrics(self, filename, metric_name, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call):
-
-        filename += '.txt'
-        file = open(filename, 'w')
-
-        """ Compute % increase and report """
-        av_means_1 = sum(means_1) / len(means_1)
-        av_means_2 = sum(means_2) / len(means_2)
-        percent_increase = math.fabs(av_means_1 - av_means_2) / min(av_means_1, av_means_2) * 100.0
-
-        """ Compute the average standard deviations """
-        av_std_dev_1 = 0
-        av_std_dev_2 = 0
-
-        for i in range(len(std_devs_1)):
-            av_std_dev_1 = av_std_dev_1 + math.pow(std_devs_1[i], 2)
-            av_std_dev_2 = av_std_dev_2 + math.pow(std_devs_2[i], 2)
-
-        av_std_dev_1 = math.pow(av_std_dev_1 / len(std_devs_1), 0.5)
-        av_std_dev_2 = math.pow(av_std_dev_2 / len(std_devs_2), 0.5)
-
-        """ Assign the winner """
-        if av_means_1 > av_means_2:
-            winner = labels[0]
-        else:
-            winner = labels[1]
-
-        win_str = "\nThe winner " + winner + " had a %.2f%s increase."
-        win_str = win_str % (percent_increase, '%')
-
-        print '\nCOMMAND = ' + test_call
-        file.write('\nCOMMAND = ' + test_call)
-
-        print '\n\n' + metric_name
-        print '\nitem 1 = ' + labels[0]
-        print 'item 2 = ' + labels[1]
-        print win_str
-        print '\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
-        file.write('\n\n' + metric_name)
-        file.write('\n\nitem 1 = ' + labels[0] + '\n')
-        file.write('item 2 = ' + labels[1] + '\n')
-        file.write(win_str)
-        file.write('\n\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n\n')
-
-        """ Print out the parameters for each interval """
-        for i in range(len(times_indices)):
-            line_args = str(i) + '\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
-            line_str = line_args % (means_1[i], means_2[i], std_devs_1[i], std_devs_2[i])
-            print line_str
-            file.write(line_str)
-
-        """ Print out the averaged parameters """
-        line_args = '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
-        line_str = line_args % (av_means_1, av_means_2, av_std_dev_1, av_std_dev_2)
-
-        print '\n\nOverall Parameters -- the confidence test was run with these parameters:\n'
-        print '\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
-        print line_str
-
-        file.write('\n\nOverall Parameters:\n')
-        file.write('\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n')
-        file.write(line_str)
-
-        file.close()
-
-
-"""
-
-Implements a Wald test where the distribution of donations over a given period is assumed to be normal
-
-http://en.wikipedia.org/wiki/Wald_test
-
-"""
-class WaldTest(ConfidenceTest):
-
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-
-        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
-        num_trials = ret[0]
-        means_1 = ret[1]
-        means_2 = ret[2]
-        vars_1 = ret[3]
-        vars_2 = ret[4]
-
-        """ Compute std devs """
-        std_devs_1 = []
-        std_devs_2 = []
-        for i in range(len(vars_1)):
-            std_devs_1.append(math.pow(vars_1[i], 0.5))
-            std_devs_2.append(math.pow(vars_2[i], 0.5))
-
-        m_tot = 0
-        sd_tot = 0
-
-        # Compute the parameters for the Wald test
-        # The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
-        # where X{1,2} is the random variable corresponding to the group {1,2}
-        for i in range(num_trials):
-
-            # Perform wald - compose W = X1 - X2 for each trial
-            sd = math.pow(vars_1[i] + vars_2[i], 0.5)
-            m = math.fabs(means_1[i] - means_2[i])
-
-            m_tot = m_tot + m
-            sd_tot = sd_tot + sd
-
-        W = m_tot / sd_tot
-        # print W
-
-        # determine the confidence in a winner from the magnitude of W
-        if (W >= 1.9):
-            conf_str = '95% confident about the winner.'
-            P = 0.95
-        elif (W >= 1.6):
-            conf_str = '89% confident about the winner.'
-            P = 0.89
-        elif (W >= 1.3):
-            conf_str = '81% confident about the winner.'
-            P = 0.81
-        elif (W >= 1.0):
-            conf_str = '73% confident about the winner.'
-            P = 0.73
-        elif (W >= 0.9):
-            conf_str = '68% confident about the winner.'
-            P = 0.68
-        elif (W >= 0.8):
-            conf_str = '63% confident about the winner.'
-            P = 0.63
-        elif (W >= 0.7):
-            conf_str = '52% confident about the winner.'
-            P = 0.52
-        elif (W >= 0.6):
-            conf_str = '45% confident about the winner.'
-            P = 0.45
-        elif (W >= 0.5):
-            conf_str = '38% confident about the winner.'
-            P = 0.38
-        elif (W >= 0.4):
-            conf_str = '31% confident about the winner.'
-            P = 0.31
-        elif (W >= 0.3):
-            conf_str = '24% confident about the winner.'
-            P = 0.24
-        elif (W >= 0.2):
-            conf_str = '16% confident about the winner.'
-            P = 0.16
-        elif (W >= 0.1):
-            conf_str = '8% confident about the winner.'
-            P = 0.08
-        else:
-            conf_str = 'There is no clear winner.'
-            P = 0.08
-
-        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
-
-
-"""
-
-Implements a Student's t-test where the distribution of donations over a given period is assumed to resemble that of a Student's t distribution
-
-http://en.wikipedia.org/wiki/Student%27s_t-test
-
-"""
-class TTest(ConfidenceTest):
-
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-
-        """ retrieve means and variances """
-        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
-        num_trials = ret[0]
-        means_1 = ret[1]
-        means_2 = ret[2]
-        vars_1 = ret[3]
-        vars_2 = ret[4]
-
-        """ Compute std devs """
-        std_devs_1 = []
-        std_devs_2 = []
-        for i in range(len(vars_1)):
-            std_devs_1.append(math.pow(vars_1[i], 0.5))
-            std_devs_2.append(math.pow(vars_2[i], 0.5))
-
-        m_tot = 0
-        var_1_tot = 0
-        var_2_tot = 0
-
-        """ Compute the parameters for the Student's t-test
-            The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
-            where X{1,2} is the random variable corresponding to the group {1,2} """
-        for i in range(num_trials):
-
-            m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
-            var_1_tot = var_1_tot + vars_1[i]
-            var_2_tot = var_2_tot + vars_2[i]
-
-        m = m_tot / num_trials
-        s_1 = var_1_tot / num_trials
-        s_2 = var_2_tot / num_trials
-
-        total_samples = len(metrics_1)
-
-        t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
-        degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * (total_samples - 1)
-
-        """ lookup confidence """
-        # get t and df
-        degrees_of_freedom = math.ceil(degrees_of_freedom)
-        if degrees_of_freedom > 30:
-            degrees_of_freedom = 99
-
-        select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)
-
-        self._data_loader_.init_db()
-
-        try:
-            self._data_loader_._cur_.execute(select_stmnt)
-            results = self._data_loader_._cur_.fetchone()
-
-            if results[0] is not None:
-                p = float(results[0])
-            else:
-                p = .0005
-        except Exception:
-            self._data_loader_._db_.rollback()
-            self._data_loader_._db_.close()
-            sys.exit('Could not execute: ' + select_stmnt)
-
-        # print p
-        self._data_loader_._db_.close()
-
-        probs = [0.400000, 0.250000, 0.100000, 0.050000, 0.025000, 0.010000, 0.005000, 0.000500]
-        prob_diffs = [math.fabs(i - p) for i in probs]
-        min_index = min((n, i) for i, n in enumerate(prob_diffs))[1]
-
-        if min_index > 0:
-            lower_p = probs[min_index - 1]
-        else:
-            lower_p = probs[0]
-
-        conf_str = 'Between ' + str((1 - lower_p) * 100) + '% and ' + str((1 - p) * 100) + '% confident about the winner.'
-
-        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
-
-"""
-
-Implements a Chi-square test of the distribution of donations over a given period (stub - not yet implemented)
-
-http://en.wikipedia.org/wiki/Chi-square_test
-
-"""
-class ChiSquareTest(ConfidenceTest):
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-        return
-
\ No newline at end of file
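
For reference, a minimal sketch of how this deleted module was presumably driven, based only on the run_test signature and the file paths used above. The campaign ID, banner names, query name, metric name, and timestamps below are hypothetical placeholders, not values taken from the repository, and a live database reachable through DataLoader is assumed.

    # Hypothetical usage sketch of the deleted compute_confidence module.
    # All identifiers marked "hypothetical" are illustrative only.
    import compute_confidence as cc

    # label -> item (e.g. banner) mapping; run_test currently supports exactly two items
    items = {'control': 'B11_control_banner', 'test': 'B11_test_banner'}  # hypothetical

    test = cc.WaldTest()            # or cc.TTest(); ChiSquareTest is an empty stub
    test.run_test(
        'control vs test',          # test_name, used in the plot title
        'banner_metrics',           # query_name, resolved to ../sql/banner_metrics.sql (hypothetical)
        'don_per_imp',              # metric_name, looked up via QueryData.get_metric_index (hypothetical)
        'C_2011_01_07',             # campaign (hypothetical)
        items,
        '20110107000000',           # start_time, 'yyyymmddhhmmss'
        '20110107060000',           # end_time
        10,                         # interval between samples, in minutes
        6)                          # num_samples per trial (here: one trial per hour)
    # Output: an errorbar plot under ./tests/ and a .txt report written by print_metrics.

The same call works for TTest, which additionally queries a t_test lookup table for p-values; ChiSquareTest returns nothing until its confidence_test is implemented.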