r86284 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r86283‎ \| r86284 \| r86285 >
Date:	21:10, 17 April 2011
Author:	rfaulk
Status:	deferred
Tags:
Comment:	This module is designed to abstract and decouple hypothesis testing logic from reporting and data handling.
Modified paths:	/trunk/fundraiser-statistics/fundraiser-scripts/classes/HypothesisTest.py (added) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/HypothesisTest.py
—	—	@@ -0,0 +1,349 @@
	2	+
	3	+"""
	4	+
	5	+This module defines the hypothesis tests used to compare two groups of metrics. The base class
	6	+HypothesisTest outlines the general interface of a test and computes the per-trial means and
	7	+variances of each group; the subclasses WaldTest, TTest and ChiSquareTest are meant to supply the
	8	+confidence test itself, keeping the testing logic decoupled from reporting and data handling.
	9	+
	10	+The TTest class obtains p-values through a DataLoader helper object (DL.TTestLoaderHelp).
	11	+
	12	+"""
	13	+
	14	+__author__ = "Ryan Faulkner"
	15	+__revision__ = "$Rev$"
	16	+__date__ = "April 16th, 2011"
	17	+
	18	+
	19	+import sys
	20	+sys.path.append('../')
	21	+
	22	+import math
	23	+import datetime as dt
	24	+import MySQLdb
	25	+import pylab
	26	+import matplotlib
	27	+
	28	+import miner_help as mh
	29	+import QueryData as QD
	30	+import DataLoader as DL
	31	+import TimestampProcessor as TP
	32	+
	33	+matplotlib.use('Agg')
	34	+
	35	+
	36	+"""
	37	+
	38	+ CLASS :: HypothesisTest
	39	+
	40	+
	41	+ METHODS:
	42	+ confidence_test - defined in subclasses, performs the test
	43	+ compute_parameters - computes parameters of models for test
	44	+
	45	+"""
	46	+class HypothesisTest(object):
	47	+
	48	+ """
	49	+ Assess the confidence of the winner - define in subclass
	50	+
	51	+ INPUT:
	52	+
	53	+ RETURN:
	54	+
	55	+ """
	56	+ def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples):
	57	+ return
	58	+
	59	+
	60	+ """
	61	+ Determine the parameters of the distribution
	62	+
	63	+ INPUT:
	64	+
	65	+ RETURN:
	66	+
	67	+ """
	68	+ def compute_parameters(self, metrics_1, metrics_2, num_samples):
	69	+
	70	+ # A trial represents a group of samples over which parameters are computed
	71	+ num_trials = int(math.ceil(len(metrics_1) / num_samples))
	72	+
	73	+ means_1 = []
	74	+ means_2 = []
	75	+ vars_1 = []
	76	+ vars_2 = []
	77	+
	78	+ m_tot = 0
	79	+ sd_tot = 0
	80	+
	81	+ # Compute the mean and variance for each group across all trials
	82	+ for i in range(num_trials):
	83	+
	84	+ m1 = 0 # mean of group 1
	85	+ m2 = 0 # mean of group 2
	86	+ var1 = 0 # variance of group 1
	87	+ var2 = 0 # variance of group 2
	88	+
	89	+ for j in range(num_samples):
	90	+ index = i * num_samples + j
	91	+
	92	+ # Compute mean for each group
	93	+ m1 = m1 + metrics_1[index]
	94	+ m2 = m2 + metrics_2[index]
	95	+
	96	+ m1 = m1 / num_samples
	97	+ m2 = m2 / num_samples
	98	+
	99	+ # Compute Sample Variance for each group
	100	+ for j in range(num_samples):
	101	+ index = i + j
	102	+
	103	+ var1 = var1 + math.pow((metrics_1[i] - m1), 2)
	104	+ var2 = var2 + math.pow((metrics_2[i] - m2), 2)
	105	+
	106	+ means_1.append(float(m1))
	107	+ means_2.append(float(m2))
	108	+ vars_1.append(var1 / num_samples)
	109	+ vars_2.append(var2 / num_samples)
	110	+
	111	+ return [num_trials, means_1, means_2, vars_1, vars_2]
	112	+
	113	+
	114	+
	115	+
	116	+"""
	117	+
	118	+Class :: WaldTest
	119	+
	120	+Implements a Wald test where the distribution of donations over a given period is
	121	+assumed to be normal
	122	+
	123	+http://en.wikipedia.org/wiki/Wald_test
	124	+
	125	+"""
	126	+class WaldTest(HypothesisTest):
	127	+
	128	+ """
	129	+ <description>
	130	+
	131	+ INPUT:
	132	+
	133	+ RETURN:
	134	+
	135	+ """
	136	+ def confidence_test(self, metrics_1, metrics_2, num_samples):
	137	+
	138	+ ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
	139	+ num_trials = ret[0]
	140	+ means_1 = ret[1]
	141	+ means_2 = ret[2]
	142	+ vars_1 = ret[3]
	143	+ vars_2 = ret[4]
	144	+
	145	+ """ Compute std devs """
	146	+ std_devs_1 = []
	147	+ std_devs_2 = []
	148	+ for i in range(len(vars_1)):
	149	+ std_devs_1.append(math.pow(vars_1[i], 0.5))
	150	+ std_devs_2.append(math.pow(vars_2[i], 0.5))
	151	+
	152	+ m_tot = 0
	153	+ sd_tot = 0
	154	+
	155	+ # Compute the parameters for the Wald test
	156	+ # The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
	157	+ # where X{1,2} is the random variable corresponding to the group {1,2}
	158	+ for i in range(num_trials):
	159	+
	160	+ # Perform wald - compose W = X1 - X2 for each trial
	161	+ sd = math.pow(vars_1[i] + vars_2[i], 0.5)
	162	+ m = math.fabs(means_1[i] - means_2[i])
	163	+
	164	+ m_tot = m_tot + m
	165	+ sd_tot = sd_tot + sd
	166	+
	167	+
	168	+ W = m_tot / sd_tot
	169	+ # print W
	170	+
	171	+ # determine the probability that the
	172	+ if (W >= 1.9):
	173	+ conf_str = '95% confident about the winner.'
	174	+ P = 0.95
	175	+ elif (W >= 1.6):
	176	+ conf_str = '89% confident about the winner.'
	177	+ P = 0.89
	178	+ elif (W >= 1.3):
	179	+ conf_str = '81% confident about the winner.'
	180	+ P = 0.81
	181	+ elif (W >= 1.0):
	182	+ conf_str = '73% confident about the winner.'
	183	+ P = 0.73
	184	+ elif (W >= 0.9):
	185	+ conf_str = '68% confident about the winner.'
	186	+ P = 0.68
	187	+ elif (W >= 0.8):
	188	+ conf_str = '63% confident about the winner.'
	189	+ P = 0.63
	190	+ elif (W >= 0.7):
	191	+ conf_str = '52% confident about the winner.'
	192	+ P = 0.52
	193	+ elif (W >= 0.6):
	194	+ conf_str = '45% confident about the winner.'
	195	+ P = 0.45
	196	+ elif (W >= 0.5):
	197	+ conf_str = '38% confident about the winner.'
	198	+ P = 0.38
	199	+ elif (W >= 0.4):
	200	+ conf_str = '31% confident about the winner.'
	201	+ P = 0.31
	202	+ elif (W >= 0.3):
	203	+ conf_str = '24% confident about the winner.'
	204	+ P = 0.24
	205	+ elif (W >= 0.2):
	206	+ conf_str = '16% confident about the winner.'
	207	+ P = 0.16
	208	+ elif (W >= 0.1):
	209	+ conf_str = '8% confident about the winner.'
	210	+ P = 0.08
	211	+ else:
	212	+ conf_str = 'There is no clear winner.'
	213	+ P = 0.08
	214	+
	215	+
	216	+ return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
	217	+
	218	+
	219	+"""
	220	+
	221	+Class :: TTest
	222	+
	223	+Implements a Student's t-test where the distribution of donations over a given period is
	224	+assumed to resemble a Student's t-distribution
	225	+
	226	+http://en.wikipedia.org/wiki/Student%27s_t-test
	227	+
	228	+"""
	229	+class TTest(HypothesisTest):
	230	+
	231	+ _data_loader_ = None
	232	+
	233	+ """
	234	+ <description>
	235	+
	236	+ INPUT:
	237	+
	238	+ RETURN:
	239	+
	240	+ """
	241	+ def __init__(self):
	242	+ _data_loader_ = DL.TTestLoaderHelp()
	243	+
	244	+ """
	245	+ <description>
	246	+
	247	+ INPUT:
	248	+
	249	+ RETURN:
	250	+
	251	+ """
	252	+ def confidence_test(self, metrics_1, metrics_2, num_samples):
	253	+
	254	+ """ retrieve means and variances """
	255	+ ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
	256	+ num_trials = ret[0]
	257	+ means_1 = ret[1]
	258	+ means_2 = ret[2]
	259	+ vars_1 = ret[3]
	260	+ vars_2 = ret[4]
	261	+
	262	+ """ Compute std devs """
	263	+ std_devs_1 = []
	264	+ std_devs_2 = []
	265	+ for i in range(len(vars_1)):
	266	+ std_devs_1.append(math.pow(vars_1[i], 0.5))
	267	+ std_devs_2.append(math.pow(vars_2[i], 0.5))
	268	+
	269	+ m_tot = 0
	270	+ var_1_tot = 0
	271	+ var_2_tot = 0
	272	+
	273	+ """ Compute the parameters for the student's t-test
	274	+ The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
	275	+ where X{1,2} is the random variable corresponding to the group {1,2} """
	276	+ for i in range(num_trials):
	277	+
	278	+ m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
	279	+ var_1_tot = var_1_tot + vars_1[i]
	280	+ var_2_tot = var_2_tot + vars_2[i]
	281	+
	282	+ m = m_tot / num_trials
	283	+ s_1 = var_1_tot / num_trials
	284	+ s_2 = var_2_tot / num_trials
	285	+
	286	+ total_samples = len(metrics_1)
	287	+
	288	+ t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
	289	+ degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * total_samples
	290	+
	291	+
	292	+ """ lookup confidence """
	293	+
	294	+ # get t and df
	295	+ degrees_of_freedom = math.ceil(degrees_of_freedom)
	296	+ if degrees_of_freedom > 30:
	297	+ degrees_of_freedom = 99
	298	+
	299	+
	300	+ """
	301	+ select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)
	302	+
	303	+ self._data_loader_.init_db()
	304	+
	305	+ try:
	306	+ self._data_loader_._cur_.execute(select_stmnt)
	307	+ results = self._data_loader_._cur_.fetchone()
	308	+
	309	+ if results[0] != None:
	310	+ p = float(results[0])
	311	+ else:
	312	+ p = .0005
	313	+ except:
	314	+ self._data_loader_._db_.rollback()
	315	+ self._data_loader_._db_.close()
	316	+ sys.exit('Could not execute: ' + select_stmnt)
	317	+
	318	+ #print p
	319	+ self._data_loader_._db_.close()
	320	+ """
	321	+
	322	+ p = _data_loader_.get_pValue(degrees_of_freedom, t)
	323	+
	324	+ conf_str = str((1 - p) * 100) + '% confident about the winner.'
	325	+
	326	+ return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
	327	+
	328	+"""
	329	+
	330	+Class :: ChiSquareTest
	331	+
	332	+Implements a Chi Square test where the distribution of donations over a given period is
	333	+assumed to resemble a chi-square distribution
	334	+
	335	+http://en.wikipedia.org/wiki/Chi-square_test
	336	+
	337	+"""
	338	+class ChiSquareTest(HypothesisTest):
	339	+
	340	+ """
	341	+ <description>
	342	+
	343	+ INPUT:
	344	+
	345	+ RETURN:
	346	+
	347	+ """
	348	+ def confidence_test(self, metrics_1, metrics_2, num_samples):
	349	+ return
	350	+
\ No newline at end of file

Status & tagging log

09:00, 18 April 2011 Reedy (talk | contribs) changed the status of r86284 [removed: new added: deferred]