Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/HypothesisTest.py
@@ -0,0 +1,349 @@

"""

This module defines the hypothesis testing methodologies used to compare donation metrics
between two test groups. The base class HypothesisTest outlines the general functionality of
a test: compute_parameters estimates the per-trial means and variances of each group, while
confidence_test is implemented in the subclasses (WaldTest, TTest, ChiSquareTest) to assess
the confidence with which a winner can be declared.

The DataLoader class decouples the data access of the tests (here, the t-distribution lookup)
using the Adapter structural pattern.

"""

__author__ = "Ryan Faulkner"
__revision__ = "$Rev$"
__date__ = "April 16th, 2011"

import sys
sys.path.append('../')

import math
import datetime as dt
import MySQLdb

import matplotlib
matplotlib.use('Agg')   # select a non-interactive backend before pylab is imported
import pylab

import miner_help as mh
import QueryData as QD
import DataLoader as DL
import TimestampProcessor as TP


"""

    CLASS :: HypothesisTest

    Base class for the hypothesis tests defined in this module.

    METHODS:
        confidence_test     - defined in subclasses, performs the test
        compute_parameters  - computes the means and variances used by the test

"""
class HypothesisTest(object):

    """
        Assess the confidence of the winner - defined in subclasses

        INPUT:
                metrics_1, metrics_2    - lists of metric values for each test group
                num_samples             - number of samples per trial

        RETURN:
                defined in subclasses
    """
    def confidence_test(self, metrics_1, metrics_2, num_samples):
        return

    """
        Determine the per-trial parameters (means and variances) of each group

        INPUT:
                metrics_1, metrics_2    - lists of metric values for each test group
                num_samples             - number of samples per trial

        RETURN:
                num_trials              - the number of trials processed
                means_1, means_2        - per-trial means for each group
                vars_1, vars_2          - per-trial variances for each group
    """
    def compute_parameters(self, metrics_1, metrics_2, num_samples):

        # A trial represents a group of samples over which parameters are computed.
        # Any incomplete trailing trial is dropped so that indexing stays within the metric lists.
        num_trials = int(math.floor(float(len(metrics_1)) / num_samples))

        means_1 = []
        means_2 = []
        vars_1 = []
        vars_2 = []

        # Compute the mean and variance for each group across all trials
        for i in range(num_trials):

            m1 = 0.0    # mean of group 1
            m2 = 0.0    # mean of group 2
            var1 = 0.0  # variance of group 1
            var2 = 0.0  # variance of group 2

            for j in range(num_samples):
                index = i * num_samples + j

                # Accumulate the sample totals used to compute the mean of each group
                m1 = m1 + metrics_1[index]
                m2 = m2 + metrics_2[index]

            m1 = m1 / num_samples
            m2 = m2 / num_samples

            # Compute the variance of each group over the samples in this trial
            for j in range(num_samples):
                index = i * num_samples + j

                var1 = var1 + math.pow((metrics_1[index] - m1), 2)
                var2 = var2 + math.pow((metrics_2[index] - m2), 2)

            means_1.append(float(m1))
            means_2.append(float(m2))
            vars_1.append(var1 / num_samples)
            vars_2.append(var2 / num_samples)

        return [num_trials, means_1, means_2, vars_1, vars_2]


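# Illustrative usage sketch (not part of the original revision).  The metric values below are
# invented and simply show how compute_parameters groups the samples of each list into trials
# of size num_samples, returning per-trial means and variances.
def _example_compute_parameters():
    test = HypothesisTest()
    metrics_1 = [10.0, 12.0, 11.0, 13.0, 9.0, 14.0]
    metrics_2 = [10.5, 11.5, 12.0, 12.5, 10.0, 13.0]
    # Two samples per trial over six samples -> three trials:
    # [3, [11.0, 12.0, 11.5], [11.0, 12.25, 11.5], [1.0, 1.0, 6.25], [0.25, 0.0625, 2.25]]
    return test.compute_parameters(metrics_1, metrics_2, 2)

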
"""

Class :: WaldTest

Implements a Wald test where the distribution of donations over a given period is
assumed to be normal

http://en.wikipedia.org/wiki/Wald_test

"""
class WaldTest(HypothesisTest):

    """
        Performs a Wald test over the two groups of metrics

        INPUT:
                metrics_1, metrics_2    - lists of metric values for each test group
                num_samples             - number of samples per trial

        RETURN:
                means_1, means_2        - per-trial means for each group
                std_devs_1, std_devs_2  - per-trial standard deviations for each group
                conf_str                - a string describing the confidence in the winner
    """
    def confidence_test(self, metrics_1, metrics_2, num_samples):

        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
        num_trials = ret[0]
        means_1 = ret[1]
        means_2 = ret[2]
        vars_1 = ret[3]
        vars_2 = ret[4]

        """ Compute std devs """
        std_devs_1 = []
        std_devs_2 = []
        for i in range(len(vars_1)):
            std_devs_1.append(math.pow(vars_1[i], 0.5))
            std_devs_2.append(math.pow(vars_2[i], 0.5))

        m_tot = 0
        sd_tot = 0

        # Compute the parameters for the Wald test.
        # The difference of the means and the sum of the variances is used to compose the
        # random variable W = X1 - X2 for each trial, where X{1,2} is the random variable
        # corresponding to group {1,2}
        for i in range(num_trials):

            # Perform wald - compose W = X1 - X2 for each trial
            sd = math.pow(vars_1[i] + vars_2[i], 0.5)
            m = math.fabs(means_1[i] - means_2[i])

            m_tot = m_tot + m
            sd_tot = sd_tot + sd

        W = m_tot / sd_tot
        # print W

        # Determine the probability that one group is truly the winner given the value of W
        if (W >= 1.9):
            conf_str = '95% confident about the winner.'
            P = 0.95
        elif (W >= 1.6):
            conf_str = '89% confident about the winner.'
            P = 0.89
        elif (W >= 1.3):
            conf_str = '81% confident about the winner.'
            P = 0.81
        elif (W >= 1.0):
            conf_str = '73% confident about the winner.'
            P = 0.73
        elif (W >= 0.9):
            conf_str = '68% confident about the winner.'
            P = 0.68
        elif (W >= 0.8):
            conf_str = '63% confident about the winner.'
            P = 0.63
        elif (W >= 0.7):
            conf_str = '52% confident about the winner.'
            P = 0.52
        elif (W >= 0.6):
            conf_str = '45% confident about the winner.'
            P = 0.45
        elif (W >= 0.5):
            conf_str = '38% confident about the winner.'
            P = 0.38
        elif (W >= 0.4):
            conf_str = '31% confident about the winner.'
            P = 0.31
        elif (W >= 0.3):
            conf_str = '24% confident about the winner.'
            P = 0.24
        elif (W >= 0.2):
            conf_str = '16% confident about the winner.'
            P = 0.16
        elif (W >= 0.1):
            conf_str = '8% confident about the winner.'
            P = 0.08
        else:
            conf_str = 'There is no clear winner.'
            P = 0.08

        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]


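# Illustrative usage sketch (not part of the original revision).  The metric lists are
# invented; they stand in for per-interval donation totals of two test groups.
def _example_wald_test():
    wald = WaldTest()
    metrics_1 = [10.0, 12.0, 11.0, 13.0, 9.0, 14.0]
    metrics_2 = [16.0, 18.0, 17.0, 19.0, 15.0, 20.0]
    # Returns the per-trial means and standard deviations of both groups together with a
    # confidence string such as '95% confident about the winner.'
    return wald.confidence_test(metrics_1, metrics_2, 2)

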
"""

Class :: TTest

Implements a Student's t-test where the distribution of donations over a given period is
assumed to resemble a Student's t distribution

http://en.wikipedia.org/wiki/Student%27s_t-test

"""
class TTest(HypothesisTest):

    _data_loader_ = None

    """
        Initializes the data loader used to look up p-values of the t distribution

        INPUT:

        RETURN:

    """
    def __init__(self):
        self._data_loader_ = DL.TTestLoaderHelp()

    """
        Performs a Student's t-test over the two groups of metrics

        INPUT:
                metrics_1, metrics_2    - lists of metric values for each test group
                num_samples             - number of samples per trial

        RETURN:
                means_1, means_2        - per-trial means for each group
                std_devs_1, std_devs_2  - per-trial standard deviations for each group
                conf_str                - a string describing the confidence in the winner
    """
    def confidence_test(self, metrics_1, metrics_2, num_samples):

        """ retrieve means and variances """
        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
        num_trials = ret[0]
        means_1 = ret[1]
        means_2 = ret[2]
        vars_1 = ret[3]
        vars_2 = ret[4]

        """ Compute std devs """
        std_devs_1 = []
        std_devs_2 = []
        for i in range(len(vars_1)):
            std_devs_1.append(math.pow(vars_1[i], 0.5))
            std_devs_2.append(math.pow(vars_2[i], 0.5))

        m_tot = 0
        var_1_tot = 0
        var_2_tot = 0

        """ Compute the parameters for the Student's t-test
            The average absolute difference of the trial means and the average trial variance
            of each group are used to form the t statistic """
        for i in range(num_trials):

            m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
            var_1_tot = var_1_tot + vars_1[i]
            var_2_tot = var_2_tot + vars_2[i]

        m = m_tot / num_trials
        s_1 = var_1_tot / num_trials
        s_2 = var_2_tot / num_trials

        total_samples = len(metrics_1)

        t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
        # Welch-Satterthwaite style estimate of the degrees of freedom for unpooled variances
        degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * total_samples

        """ lookup confidence """

        # get t and df - degrees of freedom above 30 default to 99 for the lookup
        degrees_of_freedom = math.ceil(degrees_of_freedom)
        if degrees_of_freedom > 30:
            degrees_of_freedom = 99

        """
        select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)

        self._data_loader_.init_db()

        try:
            self._data_loader_._cur_.execute(select_stmnt)
            results = self._data_loader_._cur_.fetchone()

            if results[0] != None:
                p = float(results[0])
            else:
                p = .0005
        except:
            self._data_loader_._db_.rollback()
            self._data_loader_._db_.close()
            sys.exit('Could not execute: ' + select_stmnt)

        #print p
        self._data_loader_._db_.close()
        """

        p = self._data_loader_.get_pValue(degrees_of_freedom, t)

        conf_str = str((1 - p) * 100) + '% confident about the winner.'

        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]

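
# Illustrative sketch (not part of the original revision): reproduces, for a single set of
# trial-averaged parameters, the t statistic and degrees-of-freedom estimate computed above.
# The parameter values are invented.
def _example_t_statistic():
    m = 6.0             # average absolute difference of the trial means
    s_1 = 2.0           # average trial variance of group 1
    s_2 = 2.5           # average trial variance of group 2
    total_samples = 6   # total number of samples per group

    t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
    degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) /
        (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * total_samples

    return t, math.ceil(degrees_of_freedom)

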
"""

Class :: ChiSquareTest

Implements a Chi-square test comparing the distribution of donations between the two test
groups over a given period

http://en.wikipedia.org/wiki/Chi-square_test

"""
class ChiSquareTest(HypothesisTest):

    """
        Performs a Chi-square test over the two groups of metrics - not yet implemented

        INPUT:
                metrics_1, metrics_2    - lists of metric values for each test group
                num_samples             - number of samples per trial

        RETURN:
    """
    def confidence_test(self, metrics_1, metrics_2, num_samples):
        return
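

# Minimal sketch (not part of the original revision) of how a Chi-square statistic could be
# formed from the two metric lists, assuming each pair of observations shares an expected
# value equal to the pair average under the null hypothesis of no difference between groups.
# This is an illustration only, not the project's implementation.
def _example_chi_square_statistic(metrics_1, metrics_2):
    chi_sq = 0.0
    for observed_1, observed_2 in zip(metrics_1, metrics_2):
        expected = (observed_1 + observed_2) / 2.0
        if expected > 0:
            chi_sq = chi_sq + math.pow(observed_1 - expected, 2) / expected
            chi_sq = chi_sq + math.pow(observed_2 - expected, 2) / expected
    return chi_sq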