Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/compute_confidence.py
@@ -1,622 +0,0 @@
-
-
-"""
-
-This module defines reporting and analysis for determining the statistical confidence
-of choice metrics over time.
-
-!!MODIFY!!
-
-- This should extend DataReporting.DataReporting
-- The querying functionality should be exported to DataLoader
-
-"""
-
-__author__ = "Ryan Faulkner"
-__revision__ = "$Rev$"
-__date__ = "January 11th, 2011"
-
-import sys
-sys.path.append('../')
-
-import math
-import datetime as dt
-import MySQLdb
-
-import matplotlib
-matplotlib.use('Agg')  # select a non-interactive backend; must precede the pylab import to take effect
-import pylab
-
-import miner_help as mh
-import QueryData as QD
-import DataLoader as DL
-import TimestampProcessor as TP
-
-
-"""
-
-    CLASS :: ConfidenceTest
-
-
-    METHODS:
-        query_tables
-        get_time_lists
-        gen_plot
-        run_test
-        compute_parameters
-        print_metrics
-
-"""
-class ConfidenceTest(object):
-
-    _data_loader_ = None
-
-    """
-    """
-    def __init__(self):
-        self._data_loader_ = DL.DataLoader()
-
-    """
-        ConfidenceTest :: query_tables
-
-        Queries the metric of interest for each of the two items over consecutive
-        sample intervals between start_time and end_time, returning one sample
-        value per interval for each item.
-    """
-    def query_tables(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
-
-        ret = self.get_time_lists(start_time, end_time, interval, num_samples)
-        times = ret[0]
-        times_indices = ret[1]
-
-        self._data_loader_.init_db()
-
-        filename = '../sql/' + query_name + '.sql'
-        sql_stmnt = mh.read_sql(filename)
-
-        metric_index = QD.get_metric_index(query_name, metric_name)
-        metrics_1 = []
-        metrics_2 = []
-
-        for i in range(len(times) - 1):
-
-            # print '\nExecuting number ' + str(i) + ' batch of data.'
-            t1 = times[i]
-            t2 = times[i+1]
-
-            formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
-            formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
-
-            try:
-                err_msg = formatted_sql_stmnt_1
-
-                self._data_loader_._cur_.execute(formatted_sql_stmnt_1)
-                results_1 = self._data_loader_._cur_.fetchone()  # there should only be a single row
-
-                err_msg = formatted_sql_stmnt_2
-
-                self._data_loader_._cur_.execute(formatted_sql_stmnt_2)
-                results_2 = self._data_loader_._cur_.fetchone()  # there should only be a single row
-
-            except Exception as inst:
-                print type(inst)  # the exception instance
-                print inst.args   # arguments stored in .args
-                print inst        # __str__ allows args to be printed directly
-
-                self._data_loader_._db_.rollback()
-                sys.exit("Database Interface Exception:\n" + err_msg)
-
-            """ If no results are returned in this set the sample value is 0.0 """
-            try:
-                metrics_1.append(results_1[metric_index])
-            except TypeError:
-                metrics_1.append(0.0)
-            try:
-                metrics_2.append(results_2[metric_index])
-            except TypeError:
-                metrics_2.append(0.0)
-
-        # print metrics_1
-        # print metrics_2
-
-        self._data_loader_.close_db()
-
-        # return the metric values at each time
-        return [metrics_1, metrics_2, times_indices]
-
-
-
-    """
-        ConfidenceTest :: get_time_lists
-
-        num_samples - the number of interval samples that make up one trial (sample group)
-        interval - interval at which samples are drawn within the range, units = minutes
-        start_time, end_time - timestamps 'yyyymmddhhmmss'
-    """
-    def get_time_lists(self, start_time, end_time, interval, num_samples):
-
-        # range must be divisible by interval - convert to hours
-        sample_range = float(interval * num_samples) / 60
-
-        # Compose times
-        start_datetime = dt.datetime(int(start_time[0:4]), int(start_time[4:6]), int(start_time[6:8]), int(start_time[8:10]), int(start_time[10:12]), int(start_time[12:14]))
-        end_datetime = dt.datetime(int(end_time[0:4]), int(end_time[4:6]), int(end_time[6:8]), int(end_time[8:10]), int(end_time[10:12]), int(end_time[12:14]))
-
-        # current timestamp and hour index
-        curr_datetime = start_datetime
-        curr_timestamp = start_time
-        curr_hour_index = 0.0
-
-        # lists to store timestamps and indices
-        times = []
-        time_indices = []
-
-        sample_count = 1
-
-        # build a list of timestamps and time indices for plotting
-        # increment the time
-        while curr_datetime < end_datetime:
-
-            # for timestamp formatting - zero-pad single digit fields
-            month_str_fill = ''
-            day_str_fill = ''
-            hour_str_fill = ''
-            minute_str_fill = ''
-            if curr_datetime.month < 10:
-                month_str_fill = '0'
-            if curr_datetime.day < 10:
-                day_str_fill = '0'
-            if curr_datetime.hour < 10:
-                hour_str_fill = '0'
-            if curr_datetime.minute < 10:
-                minute_str_fill = '0'
-
-            curr_timestamp = str(curr_datetime.year) + month_str_fill + str(curr_datetime.month) + day_str_fill + str(curr_datetime.day) + hour_str_fill + str(curr_datetime.hour) + minute_str_fill + str(curr_datetime.minute) + '00'
-            times.append(curr_timestamp)
-
-            # increment curr_hour_index once a full trial of samples has been collected
-            if sample_count == num_samples:
-
-                time_indices.append(curr_hour_index + sample_range / 2)
-                curr_hour_index = curr_hour_index + sample_range
-                sample_count = 1
-            else:
-                sample_count = sample_count + 1
-
-
-            # increment the time by interval minutes
-            td = dt.timedelta(minutes=interval)
-            curr_datetime = curr_datetime + td
-
-        # append the last items onto time lists
-        times.append(end_time)
-        # added_index = float(end_datetime.hour - curr_datetime.hour) + float(end_datetime.minute - curr_datetime.minute) / 60
-        # curr_hour_index = float(curr_hour_index) + sample_range / 2
-        # time_indices.append(curr_hour_index)
-
-        return [times, time_indices]
-    # compute parameters for each sample range (mean, standard deviation)
-
-
-
-    """
-        ConfidenceTest :: gen_plot
-
-        plot the test results with errorbars
-    """
-    def gen_plot(self, means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname):
-
-        file_format = 'png'
-
-        # create the figure before selecting the subplot so the axes land on it
-        pylab.figure(num=None, figsize=[26, 14])
-        pylab.subplot(subplot_index)
-
-        e1 = pylab.errorbar(times_indices, means_1, yerr=std_devs_1, fmt='xb-')
-        e2 = pylab.errorbar(times_indices, means_2, yerr=std_devs_2, fmt='dr-')
-        # pylab.hist(counts, times)
-
-        """ Set the figure and font size """
-        fig_width_pt = 246.0                    # Get this from LaTeX using \showthe\columnwidth
-        inches_per_pt = 1.0/72.27               # Convert pt to inch
-        golden_mean = (math.sqrt(5)-1.0)/2.0    # Aesthetic ratio
-        fig_width = fig_width_pt*inches_per_pt  # width in inches
-        fig_height = fig_width*golden_mean      # height in inches
-        fig_size = [fig_width, fig_height]
-
-        font_size = 20
-
-        params = {'axes.labelsize': font_size,
-                  'text.fontsize': font_size,
-                  'xtick.labelsize': font_size,
-                  'ytick.labelsize': font_size,
-                  'legend.pad': 0.1,  # empty space around the legend box
-                  'legend.fontsize': font_size,
-                  'font.size': font_size,
-                  'text.usetex': False,
-                  'figure.figsize': fig_size}
-
-        pylab.rcParams.update(params)
-
-        pylab.grid()
-        pylab.ylim(ranges[2], ranges[3])
-        pylab.xlim(ranges[0], ranges[1])
-        pylab.legend([e1[0], e2[0]], labels, loc=2)
-
-        pylab.xlabel(xlabel)
-        pylab.ylabel(ylabel)
-
-        pylab.title(title)
-        pylab.savefig(fname + '.' + file_format, format=file_format)
-
-
-    """
-        ConfidenceTest :: run_test
-
-        Executes the confidence test - prints and plots the results
-    """
-    def run_test(self, test_name, query_name, metric_name, campaign, items, start_time, end_time, interval, num_samples):
-
-        """ TEMPORARY - map items and labels, this should be more generalized """
-        counter = 1
-        for key in items.keys():
-            if counter == 1:
-                item_1 = items[key]
-                label_1 = key
-            elif counter == 2:
-                item_2 = items[key]
-                label_2 = key
-            counter += 1
-
-        """ Retrieve values from database """
-        ret = self.query_tables(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples)
-        metrics_1 = ret[0]
-        metrics_2 = ret[1]
-        times_indices = ret[2]
-
-        """ run the confidence test """
-        ret = self.confidence_test(metrics_1, metrics_2, num_samples)
-        means_1 = ret[0]
-        means_2 = ret[1]
-        std_devs_1 = ret[2]
-        std_devs_2 = ret[3]
-        confidence = ret[4]
-
-        """ plot the results """
-        xlabel = 'Hours'
-        subplot_index = 111
-        fname = './tests/' + campaign + '_conf_' + metric_name
-
-        title = confidence + '\n\n' + test_name + ' -- ' + TP.timestamp_convert_format(start_time, 1, 2) + ' - ' + TP.timestamp_convert_format(end_time, 1, 2)
-
-        max_mean = max(max(means_1), max(means_2))
-        max_sd = max(max(std_devs_1), max(std_devs_2))
-        max_y = float(max_mean) + float(max_sd)
-        max_y = max_y + 0.1 * max_y
-        max_x = max(times_indices) + min(times_indices)
-        ranges = [0.0, max_x, 0, max_y]
-
-        ylabel = metric_name
-        labels = [label_1, label_2]
-
-        self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname)
-
-        """ Print out results """
-        test_call = "run_test('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \
-            item_1 + "', '" + item_2 + "', '" + start_time + "', '" + end_time + "', " + str(interval) + ", " + str(num_samples) + ")"
-        self.print_metrics(fname, title, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call)
-
-        return
-
-
-    """
-        assess the confidence of the winner - defined in subclass
-    """
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-        return
-
-
-    """
-        compute the sample mean and variance of each group for every trial
-    """
-    def compute_parameters(self, metrics_1, metrics_2, num_samples):
-
-        # A trial represents a group of samples over which parameters are computed
-        # (an incomplete trailing group of samples is dropped)
-        num_trials = int(math.floor(float(len(metrics_1)) / num_samples))
-
-        means_1 = []
-        means_2 = []
-        vars_1 = []
-        vars_2 = []
-
-        m_tot = 0
-        sd_tot = 0
-
-        # Compute the mean and variance for each group across all trials
-        for i in range(num_trials):
-
-            m1 = 0.0    # mean of group 1
-            m2 = 0.0    # mean of group 2
-            var1 = 0.0  # variance of group 1
-            var2 = 0.0  # variance of group 2
-
-            for j in range(num_samples):
-                index = i * num_samples + j
-
-                # Compute mean for each group
-                m1 = m1 + float(metrics_1[index])
-                m2 = m2 + float(metrics_2[index])
-
-            m1 = m1 / num_samples
-            m2 = m2 / num_samples
-
-            # Compute Sample Variance for each group
-            for j in range(num_samples):
-                index = i * num_samples + j
-
-                var1 = var1 + math.pow((float(metrics_1[index]) - m1), 2)
-                var2 = var2 + math.pow((float(metrics_2[index]) - m2), 2)
-
-            means_1.append(float(m1))
-            means_2.append(float(m2))
-            vars_1.append(var1 / num_samples)
-            vars_2.append(var2 / num_samples)
-
-        return [num_trials, means_1, means_2, vars_1, vars_2]
-
-
-    """ Print in tabular form the means and standard deviations of each group over each interval """
-    def print_metrics(self, filename, metric_name, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call):
-
-        filename += '.txt'
-        file = open(filename, 'w')
-
-        """ Compute % increase and report """
-        av_means_1 = sum(means_1) / len(means_1)
-        av_means_2 = sum(means_2) / len(means_2)
-        percent_increase = math.fabs(av_means_1 - av_means_2) / min(av_means_1, av_means_2) * 100.0
-
-        """ Compute the average standard deviations """
-        av_std_dev_1 = 0
-        av_std_dev_2 = 0
-
-        for i in range(len(std_devs_1)):
-            av_std_dev_1 = av_std_dev_1 + math.pow(std_devs_1[i], 2)
-            av_std_dev_2 = av_std_dev_2 + math.pow(std_devs_2[i], 2)
-
-        av_std_dev_1 = math.pow(av_std_dev_1 / len(std_devs_1), 0.5)
-        av_std_dev_2 = math.pow(av_std_dev_2 / len(std_devs_2), 0.5)
-
-        """ Assign the winner """
-        if av_means_1 > av_means_2:
-            winner = labels[0]
-        else:
-            winner = labels[1]
-
-        win_str = "\nThe winner " + winner + " had a %.2f%s increase."
-        win_str = win_str % (percent_increase, '%')
-
-        print '\nCOMMAND = ' + test_call
-        file.write('\nCOMMAND = ' + test_call)
-
-        print '\n\n' + metric_name
-        print '\nitem 1 = ' + labels[0]
-        print 'item 2 = ' + labels[1]
-        print win_str
-        print '\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
-        file.write('\n\n' + metric_name)
-        file.write('\n\nitem 1 = ' + labels[0] + '\n')
-        file.write('item 2 = ' + labels[1] + '\n')
-        file.write(win_str)
-        file.write('\n\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n\n')
-
-        """ Print out the parameters for each interval """
-        for i in range(len(times_indices)):
-            line_args = str(i) + '\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
-            line_str = line_args % (means_1[i], means_2[i], std_devs_1[i], std_devs_2[i])
-            print line_str
-            file.write(line_str)
-
-        """ Print out the averaged parameters """
-        line_args = '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
-        line_str = line_args % (av_means_1, av_means_2, av_std_dev_1, av_std_dev_2)
-
-        print '\n\nOverall Parameters -- the confidence test was run with these parameters:\n'
-        print '\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
-        print line_str
-
-        file.write('\n\nOverall Parameters:\n')
-        file.write('\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n')
-        file.write(line_str)
-
-        file.close()
-
-
-"""
-
-Implements a Wald test where the distribution of donations over a given period is assumed to be normal
-
-http://en.wikipedia.org/wiki/Wald_test
-
-"""
-class WaldTest(ConfidenceTest):
-
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-
-        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
-        num_trials = ret[0]
-        means_1 = ret[1]
-        means_2 = ret[2]
-        vars_1 = ret[3]
-        vars_2 = ret[4]
-
-        """ Compute std devs """
-        std_devs_1 = []
-        std_devs_2 = []
-        for i in range(len(vars_1)):
-            std_devs_1.append(math.pow(vars_1[i], 0.5))
-            std_devs_2.append(math.pow(vars_2[i], 0.5))
-
-        m_tot = 0
-        sd_tot = 0
-
-        # Compute the parameters for the Wald test
-        # The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
-        # where X{1,2} is the random variable corresponding to the group {1,2}
-        for i in range(num_trials):
-
-            # Perform wald - compose W = X1 - X2 for each trial
-            sd = math.pow(vars_1[i] + vars_2[i], 0.5)
-            m = math.fabs(means_1[i] - means_2[i])
-
-            m_tot = m_tot + m
-            sd_tot = sd_tot + sd
-
-        W = m_tot / sd_tot
-        # print W
-
-        # determine the confidence in a winner from the magnitude of W
-        if (W >= 1.9):
-            conf_str = '95% confident about the winner.'
-            P = 0.95
-        elif (W >= 1.6):
-            conf_str = '89% confident about the winner.'
-            P = 0.89
-        elif (W >= 1.3):
-            conf_str = '81% confident about the winner.'
-            P = 0.81
-        elif (W >= 1.0):
-            conf_str = '73% confident about the winner.'
-            P = 0.73
-        elif (W >= 0.9):
-            conf_str = '68% confident about the winner.'
-            P = 0.68
-        elif (W >= 0.8):
-            conf_str = '63% confident about the winner.'
-            P = 0.63
-        elif (W >= 0.7):
-            conf_str = '52% confident about the winner.'
-            P = 0.52
-        elif (W >= 0.6):
-            conf_str = '45% confident about the winner.'
-            P = 0.45
-        elif (W >= 0.5):
-            conf_str = '38% confident about the winner.'
-            P = 0.38
-        elif (W >= 0.4):
-            conf_str = '31% confident about the winner.'
-            P = 0.31
-        elif (W >= 0.3):
-            conf_str = '24% confident about the winner.'
-            P = 0.24
-        elif (W >= 0.2):
-            conf_str = '16% confident about the winner.'
-            P = 0.16
-        elif (W >= 0.1):
-            conf_str = '8% confident about the winner.'
-            P = 0.08
-        else:
-            conf_str = 'There is no clear winner.'
-            P = 0.08
-
-        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
-
-
-"""
-
-Implements a Student's t-test where the distribution of donations over a given period is assumed to resemble that of a Student's t distribution
-
-http://en.wikipedia.org/wiki/Student%27s_t-test
-
-"""
-class TTest(ConfidenceTest):
-
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-
-        """ retrieve means and variances """
-        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
-        num_trials = ret[0]
-        means_1 = ret[1]
-        means_2 = ret[2]
-        vars_1 = ret[3]
-        vars_2 = ret[4]
-
-        """ Compute std devs """
-        std_devs_1 = []
-        std_devs_2 = []
-        for i in range(len(vars_1)):
-            std_devs_1.append(math.pow(vars_1[i], 0.5))
-            std_devs_2.append(math.pow(vars_2[i], 0.5))
-
-        m_tot = 0
-        var_1_tot = 0
-        var_2_tot = 0
-
-        """ Compute the parameters for the Student's t-test
-            The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
-            where X{1,2} is the random variable corresponding to the group {1,2} """
-        for i in range(num_trials):
-
-            m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
-            var_1_tot = var_1_tot + vars_1[i]
-            var_2_tot = var_2_tot + vars_2[i]
-
-        m = m_tot / num_trials
-        s_1 = var_1_tot / num_trials
-        s_2 = var_2_tot / num_trials
-
-        total_samples = len(metrics_1)
-
-        t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
-        degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * (total_samples - 1)
-
-        """ lookup confidence """
-        # get t and df
-        degrees_of_freedom = math.ceil(degrees_of_freedom)
-        if degrees_of_freedom > 30:
-            degrees_of_freedom = 99
-
-        select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)
-
-        self._data_loader_.init_db()
-
-        try:
-            self._data_loader_._cur_.execute(select_stmnt)
-            results = self._data_loader_._cur_.fetchone()
-
-            if results[0] is not None:
-                p = float(results[0])
-            else:
-                p = .0005
-        except Exception:
-            self._data_loader_._db_.rollback()
-            self._data_loader_._db_.close()
-            sys.exit('Could not execute: ' + select_stmnt)
-
-        # print p
-        self._data_loader_._db_.close()
-
-        probs = [0.400000, 0.250000, 0.100000, 0.050000, 0.025000, 0.010000, 0.005000, 0.000500]
-        prob_diffs = [math.fabs(i - p) for i in probs]
-        min_index = min((n, i) for i, n in enumerate(prob_diffs))[1]
-
-        if min_index > 0:
-            lower_p = probs[min_index - 1]
-        else:
-            lower_p = probs[0]
-
-        conf_str = 'Between ' + str((1 - lower_p) * 100) + '% and ' + str((1 - p) * 100) + '% confident about the winner.'
-
-        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
-
-"""
-
-Implements a Chi-square test of the distribution of donations over a given period (stub - not yet implemented)
-
-http://en.wikipedia.org/wiki/Chi-square_test
-
-"""
-class ChiSquareTest(ConfidenceTest):
-    def confidence_test(self, metrics_1, metrics_2, num_samples):
-        return
-
\ No newline at end of file
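
For reference, a minimal sketch of how this deleted module was presumably driven, based only on the run_test signature and the file paths used above. The campaign ID, banner names, query name, metric name, and timestamps below are hypothetical placeholders, not values taken from the repository, and a live database reachable through DataLoader is assumed.

    # Hypothetical usage sketch of the deleted compute_confidence module.
    # All identifiers marked "hypothetical" are illustrative only.
    import compute_confidence as cc

    # label -> item (e.g. banner) mapping; run_test currently supports exactly two items
    items = {'control': 'B11_control_banner', 'test': 'B11_test_banner'}  # hypothetical

    test = cc.WaldTest()            # or cc.TTest(); ChiSquareTest is an empty stub
    test.run_test(
        'control vs test',          # test_name, used in the plot title
        'banner_metrics',           # query_name, resolved to ../sql/banner_metrics.sql (hypothetical)
        'don_per_imp',              # metric_name, looked up via QueryData.get_metric_index (hypothetical)
        'C_2011_01_07',             # campaign (hypothetical)
        items,
        '20110107000000',           # start_time, 'yyyymmddhhmmss'
        '20110107060000',           # end_time
        10,                         # interval between samples, in minutes
        6)                          # num_samples per trial (here: one trial per hour)
    # Output: an errorbar plot under ./tests/ and a .txt report written by print_metrics.

The same call works for TTest, which additionally queries a t_test lookup table for p-values; ChiSquareTest returns nothing until its confidence_test is implemented.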