r86284 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86283‎ | r86284 | r86285 >
Date:21:10, 17 April 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
This module is designed to abstract and decouple hypothesis testing logic from reporting and data handling.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/HypothesisTest.py (added) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/HypothesisTest.py
@@ -0,0 +1,349 @@
 2+
 3+"""
 4+
 5+This module is used to define the hypothesis testing methodologies applied to two groups of metrics. The base class
 6+HypothesisTest is defined to outline the general functionality of a test, which includes computing the per-trial
 7+means and variances of each group and assessing the confidence in the winner. The subclasses WaldTest, TTest and
 8+ChiSquareTest define the individual tests, keeping the testing logic decoupled from reporting and data handling.
 9+
 10+The DataLoader class decouples the data access of the reports using the Adapter structural pattern.
 11+
 12+"""
 13+
 14+__author__ = "Ryan Faulkner"
 15+__revision__ = "$Rev$"
 16+__date__ = "April 16th, 2011"
 17+
 18+
 19+import sys
 20+sys.path.append('../')
 21+
 22+import math
 23+import datetime as dt
 24+import MySQLdb
 25+import pylab
 26+import matplotlib
 27+
 28+import miner_help as mh
 29+import QueryData as QD
 30+import DataLoader as DL
 31+import TimestampProcessor as TP
 32+
 33+matplotlib.use('Agg')
 34+
 35+
 36+"""
 37+
 38+ CLASS :: HypothesisTest
 39+
 40+
 41+ METHODS:
 42+ confidence_test - defined in subclasses, performs the test
 43+ compute_parameters - computes parameters of models for test
 44+
 45+"""
class HypothesisTest(object):
    """
    Base class for hypothesis tests that compare two groups of metrics.

    METHODS:
        confidence_test     - defined in subclasses, performs the test
        compute_parameters  - computes per-trial means and variances of both groups
    """

    def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples):
        """
        Assess the confidence of the winner - define in subclass.

        The base implementation performs no test.

        INPUT:
            metrics_1    - metric values of group 1
            metrics_2    - metric values of group 2
            time_indices - not used by the base implementation
            num_samples  - number of samples per trial

        RETURN:
            None
        """
        return

    def compute_parameters(self, metrics_1, metrics_2, num_samples):
        """
        Determine the parameters of the distribution.

        The metrics are split into consecutive trials of num_samples samples
        each and the mean and variance of every trial is computed for both
        groups. Trailing samples that do not fill a complete trial are ignored.

        INPUT:
            metrics_1   - indexable sequence of numeric values for group 1
            metrics_2   - indexable sequence of numeric values for group 2; must
                          hold at least as many complete trials as metrics_1
            num_samples - positive integer, number of samples per trial

        RETURN:
            [num_trials, means_1, means_2, vars_1, vars_2] where the last four
            entries are lists of floats with one element per trial

        RAISES:
            ValueError - num_samples is smaller than 1
            IndexError - metrics_2 is too short for the trials of metrics_1
        """
        if num_samples < 1:
            raise ValueError('num_samples must be at least 1')

        # A trial represents a group of samples over which parameters are computed.
        # Floor division keeps only complete trials, so no index can run past the
        # end of the data when the length is not a multiple of num_samples.
        num_trials = len(metrics_1) // num_samples

        means_1 = []
        means_2 = []
        vars_1 = []
        vars_2 = []

        # Compute the mean and variance for each group across all trials
        for trial in range(num_trials):
            first = trial * num_samples
            indices = range(first, first + num_samples)

            for metrics, means, variances in ((metrics_1, means_1, vars_1),
                                              (metrics_2, means_2, vars_2)):

                # Convert to float up front so that integer metrics are not
                # truncated by integer division when averaging
                group = [float(metrics[k]) for k in indices]
                mean = sum(group) / num_samples

                # Variance over the samples of this trial only, normalised by
                # the number of samples (no n - 1 correction is applied)
                variance = sum([math.pow(x - mean, 2) for x in group]) / num_samples

                means.append(mean)
                variances.append(variance)

        return [num_trials, means_1, means_2, vars_1, vars_2]
 112+
 113+
 114+
 115+
 116+"""
 117+
 118+Class :: WaldTest
 119+
 120+Implements a Wald test where the distribution of donations over a given period is
 121+assumed to be normal
 122+
 123+http://en.wikipedia.org/wiki/Wald_test
 124+
 125+"""
class WaldTest(HypothesisTest):
    """
    Wald-style test on the per-trial means and variances of two groups.
    """

    # (minimum value of W, confidence percentage), highest threshold first
    _CONFIDENCE_LEVELS_ = (
        (1.9, 95),
        (1.6, 89),
        (1.3, 81),
        (1.0, 73),
        (0.9, 68),
        (0.8, 63),
        (0.7, 52),
        (0.6, 45),
        (0.5, 38),
        (0.4, 31),
        (0.3, 24),
        (0.2, 16),
        (0.1, 8),
    )

    def confidence_test(self, metrics_1, metrics_2, num_samples):
        """
        Estimate the confidence that one group outperforms the other.

        For each trial the random variable W = X1 - X2 is composed from the
        difference of the group means and the sum of the group variances. The
        absolute mean differences and the standard deviations are summed over
        all trials and their ratio is mapped onto a confidence statement.

        INPUT:
            metrics_1   - metric values of group 1
            metrics_2   - metric values of group 2
            num_samples - number of samples per trial

        RETURN:
            [means_1, means_2, std_devs_1, std_devs_2, conf_str] - per-trial
            means and standard deviations of both groups and the confidence
            statement as a string
        """
        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
        num_trials = ret[0]
        means_1 = ret[1]
        means_2 = ret[2]
        vars_1 = ret[3]
        vars_2 = ret[4]

        # Compute std devs
        std_devs_1 = [math.sqrt(v) for v in vars_1]
        std_devs_2 = [math.sqrt(v) for v in vars_2]

        m_tot = 0.0
        sd_tot = 0.0

        # Perform wald - compose W = X1 - X2 for each trial
        for i in range(num_trials):
            m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
            sd_tot = sd_tot + math.sqrt(vars_1[i] + vars_2[i])

        if sd_tot == 0:
            # No trials, or no variance at all: avoid dividing by zero. Without
            # any spread a non-zero difference is treated as decisive, while no
            # difference (or no data) leaves no winner.
            if m_tot > 0:
                W = float('inf')
            else:
                W = 0.0
        else:
            W = m_tot / sd_tot

        # Map W onto the first confidence level whose threshold it reaches
        conf_str = 'There is no clear winner.'
        for threshold, percent in self._CONFIDENCE_LEVELS_:
            if W >= threshold:
                conf_str = str(percent) + '% confident about the winner.'
                break

        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
 217+
 218+
 219+"""
 220+
 221+Class :: TTest
 222+
 223+Implements a Student's t-test where the distribution of donations over a given period is
 224+assumed to follow a Student's t-distribution
 225+
 226+http://en.wikipedia.org/wiki/Student%27s_t-test
 227+
 228+"""
class TTest(HypothesisTest):
    """
    Student's t-test on the per-trial means and variances of two groups.

    The p-value is obtained from a DL.TTestLoaderHelp instance via its
    get_pValue(degrees_of_freedom, t) method.
    """

    # Helper used to look up p-values; replaced per instance in __init__
    _data_loader_ = None

    def __init__(self):
        """
        Create the loader used to look up p-values.
        """
        # Bind to the instance - a bare assignment would only create a local
        # variable and leave the attribute set to None
        self._data_loader_ = DL.TTestLoaderHelp()

    def confidence_test(self, metrics_1, metrics_2, num_samples):
        """
        Estimate the confidence that one group outperforms the other.

        INPUT:
            metrics_1   - metric values of group 1
            metrics_2   - metric values of group 2
            num_samples - number of samples per trial

        RETURN:
            [means_1, means_2, std_devs_1, std_devs_2, conf_str] - per-trial
            means and standard deviations of both groups and the confidence
            statement as a string

        RAISES:
            ZeroDivisionError - there is no complete trial, or the averaged
                                variances of both groups are zero
        """
        # retrieve means and variances
        ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
        num_trials = ret[0]
        means_1 = ret[1]
        means_2 = ret[2]
        vars_1 = ret[3]
        vars_2 = ret[4]

        # Compute std devs
        std_devs_1 = [math.pow(v, 0.5) for v in vars_1]
        std_devs_2 = [math.pow(v, 0.5) for v in vars_2]

        m_tot = 0
        var_1_tot = 0
        var_2_tot = 0

        # Compute the parameters for the student's t-test
        # The absolute difference of the means and the variances of each group
        # are accumulated over all trials and then averaged
        for i in range(num_trials):
            m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
            var_1_tot = var_1_tot + vars_1[i]
            var_2_tot = var_2_tot + vars_2[i]

        m = m_tot / num_trials
        s_1 = var_1_tot / num_trials
        s_2 = var_2_tot / num_trials

        total_samples = len(metrics_1)

        t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
        degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * total_samples

        # lookup confidence

        # Round up; anything above 30 degrees of freedom is looked up as 99
        # (presumably the catch-all row of the lookup table - TODO confirm)
        degrees_of_freedom = math.ceil(degrees_of_freedom)
        if degrees_of_freedom > 30:
            degrees_of_freedom = 99

        p = self._data_loader_.get_pValue(degrees_of_freedom, t)

        conf_str = str((1 - p) * 100) + '% confident about the winner.'

        return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
 327+
 328+"""
 329+
 330+Class :: ChiSquareTest
 331+
 332+Placeholder for a Chi Square test on the distribution of donations over a given period;
 333+the test itself is not implemented yet
 334+
 335+http://en.wikipedia.org/wiki/Chi-square_test
 336+
 337+"""
class ChiSquareTest(HypothesisTest):
    """
    Chi Square test - the test itself is not implemented.
    """

    def confidence_test(self, metrics_1, metrics_2, num_samples):
        """
        Stub: performs no test.

        INPUT:
            metrics_1   - metric values of group 1 (unused)
            metrics_2   - metric values of group 2 (unused)
            num_samples - number of samples per trial (unused)

        RETURN:
            None
        """
        return None
 350+
\ No newline at end of file

Status & tagging log