r80596 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80595‎ | r80596 | r80597 >
Date:22:59, 19 January 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Added T-Test functionality to the confidence computation. Also added a method to print out the distribution parameters at each interval.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/compute_confidence.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/sql/report_LP_confidence.sql (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/sql/report_banner_confidence.sql (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/compute_confidence.py
@@ -262,6 +262,9 @@
263263 labels = [item_1, item_2]
264264 self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname)
265265
 266+ """ Print out results """
 267+ self.print_metrics(title, means_1, means_2, std_devs_1, std_devs_2, times_indices)
 268+
266269 return
267270
268271
@@ -270,33 +273,32 @@
271274 """
272275 def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples):
273276 return
274 -
275 -"""
276 -
277 -Implements a Wald test where the distribution of donations over a given period are assumed to be normal
278 -
279 -"""
280 -class WaldTest(ConfidenceTest):
 277+
281278
282 - def confidence_test(self, metrics_1, metrics_2, num_samples):
 279+ """
 280+ compute the mean and variance of each group for every trial - shared by the subclasses' confidence tests
 281+ """
 282+ def compute_parameters(self, metrics_1, metrics_2, num_samples):
283283
284 - # Partition over different
285 - num_trials = math.ceil(len(metrics_1) / num_samples)
 284+ # A trial represents a group of samples over which parameters are computed
 285+ num_trials = int(math.ceil(len(metrics_1) / num_samples))
 286+
286287 means_1 = []
287 - std_devs_1 = []
288288 means_2 = []
289 - std_devs_2 = []
 289+ vars_1 = []
 290+ vars_2 = []
290291
291292 m_tot = 0
292293 sd_tot = 0
293294
294 - # print num_trials
295 - for i in range(int(num_trials)):
 295+
 296+ # Compute the mean and variance for each group across all trials
 297+ for i in range(num_trials):
296298
297 - m1 = 0
298 - m2 = 0
299 - sd1 = 0
300 - sd2 = 0
 299+ m1 = 0 # mean of group 1
 300+ m2 = 0 # mean of group 2
 301+ var1 = 0 # variance of group 1
 302+ var2 = 0 # variance of group 2
301303
302304 for j in range(num_samples):
303305 index = i + j
@@ -308,29 +310,74 @@
309311 m1 = m1 / num_samples
310312 m2 = m2 / num_samples
311313
 314+ # Compute Sample Variance for each group
312315 for j in range(num_samples):
313316 index = i + j
314317
315 - # Compute standard deviation
316 - sd1 = sd1 + math.pow((metrics_1[i] - m1), 2)
317 - sd2 = sd2 + math.pow((metrics_2[i] - m2), 2)
 318+ var1 = var1 + math.pow((metrics_1[i] - m1), 2)
 319+ var2 = var2 + math.pow((metrics_2[i] - m2), 2)
318320
319 - # Perform wald
320 - sd = math.pow(sd1 / num_samples + sd2 / num_samples, 0.5)
321 - m = math.fabs(m1 - m2)
322 -
323321 means_1.append(float(m1))
324322 means_2.append(float(m2))
325 - std_devs_1.append(math.pow(sd1 / num_samples, 0.5))
326 - std_devs_2.append(math.pow(sd2 / num_samples, 0.5))
 323+ vars_1.append(var1 / num_samples)
 324+ vars_2.append(var2 / num_samples)
327325
 326+
 327+ return [num_trials, means_1, means_2, vars_1, vars_2]
 328+
 329+
 330+ """ Print in tabular form the means and standard deviations of each group over each interval """
 331+ def print_metrics(self, metric_name, means_1, means_2, std_devs_1, std_devs_2, times_indices):
 332+
 333+ print '\n\n' + metric_name
 334+ print '\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
 335+
 336+ for i in range(1,len(times_indices) - 1):
 337+ line_args = str(i) + '\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
 338+ line_str = line_args % (means_1[i], means_2[i], std_devs_1[i], std_devs_2[i])
 339+ print line_str
 340+
 341+
 342+"""
 343+
 344+Implements a Wald test where the distribution of donations over a given period is assumed to be normal
 345+
 346+http://en.wikipedia.org/wiki/Wald_test
 347+
 348+"""
 349+class WaldTest(ConfidenceTest):
 350+
 351+ def confidence_test(self, metrics_1, metrics_2, num_samples):
 352+
 353+ ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
 354+ num_trials = ret[0]
 355+ means_1 = ret[1]
 356+ means_2 = ret[2]
 357+ vars_1 = ret[3]
 358+ vars_2 = ret[4]
 359+
 360+ """ Compute std devs """
 361+ std_devs_1 = []
 362+ std_devs_2 = []
 363+ for i in range(len(vars_1)):
 364+ std_devs_1.append(math.pow(vars_1[i], 0.5))
 365+ std_devs_2.append(math.pow(vars_2[i], 0.5))
 366+
 367+ m_tot = 0
 368+ sd_tot = 0
 369+
 370+ # Compute the parameters for the Wald test
 371+ # The difference of the means and the sum of the variances are used to compose the random variable W = X1 - X2 for each trial
 372+ # where X{1,2} is the random variable corresponding to the group {1,2}
 373+ for i in range(num_trials):
 374+
 375+ # Perform wald - compose W = X1 - X2 for each trial
 376+ sd = math.pow(vars_1[i] + vars_2[i], 0.5)
 377+ m = math.fabs(means_1[i] - means_2[i])
 378+
328379 m_tot = m_tot + m
329380 sd_tot = sd_tot + sd
330381
331 - # print m1
332 - # print m2
333 - # print m
334 - # print sd
335382
336383 W = m_tot / sd_tot
337384 # print W
@@ -387,37 +434,91 @@
388435
389436 Implements a Student's t-test where the distribution of donations over a given period is assumed to resemble a Student's t-distribution
390437
 438+http://en.wikipedia.org/wiki/Student%27s_t-test
 439+
391440 """
392441 class TTest(ConfidenceTest):
393442
394 - def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples):
 443+ def confidence_test(self, metrics_1, metrics_2, num_samples):
395444
396 - # Partition over different
 445+ # retrieve means and variances
 446+ ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
 447+ num_trials = ret[0]
 448+ means_1 = ret[1]
 449+ means_2 = ret[2]
 450+ vars_1 = ret[3]
 451+ vars_2 = ret[4]
397452
 453+ """ Compute std devs """
 454+ std_devs_1 = []
 455+ std_devs_2 = []
 456+ for i in range(len(vars_1)):
 457+ std_devs_1.append(math.pow(vars_1[i], 0.5))
 458+ std_devs_2.append(math.pow(vars_2[i], 0.5))
 459+
 460+ m_tot = 0
 461+ var_1_tot = 0
 462+ var_2_tot = 0
398463
399 - # Compute mean for each group
400 - m1 = sum(metrics_1) / len(metrics_1)
401 - m2 = sum(metrics_2) / len(metrics_2)
 464+ # Accumulate the parameters for the t-test
 465+ # The absolute difference of the means and the variance of each group are summed over all trials
 466+ # and then averaged over the number of trials, where group {1,2} corresponds to the random variable X{1,2}
 467+ for i in range(num_trials):
402468
403 - # Compute standard deviation
404 - sd1 = 0
405 - sd2 = 0
406 - for i in range(len(metrics_1)):
407 - sd1 = sd1 + math.pow((metrics_1[i] - m1), 2)
408 - sd2 = sd2 + math.pow((metrics_2[i] - m2), 2)
 469+ m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
 470+ var_1_tot = var_1_tot + vars_1[i]
 471+ var_2_tot = var_2_tot + vars_2[i]
409472
410 - sd1 = pow(sd1 / len(metrics_1), 0.5)
411 - sd2 = pow(sd2 / len(metrics_2), 0.5)
 473+ m = m_tot / num_trials
 474+ s_1 = var_1_tot / num_trials
 475+ s_2 = var_2_tot / num_trials
412476
413 - # degrees of freedom
 477+ total_samples = len(metrics_1)
414478
415 - # Perform wald
416 - m1
 479+ t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
 480+ degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * total_samples
 481+
417482
418 - print m1
419 - print m2
420 - print sd1
421 - print sd2
 483+ """ lookup confidence """
422484
 485+ print ''
 486+ print t
 487+ print degrees_of_freedom
 488+
 489+ # get t and df
 490+ degrees_of_freedom = math.ceil(degrees_of_freedom)
 491+ if degrees_of_freedom > 30:
 492+ degrees_of_freedom = 99
 493+
 494+ select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)
 495+
 496+ self.init_db()
 497+
 498+ try:
 499+ self.cur.execute(select_stmnt)
 500+ results = self.cur.fetchone()
 501+
 502+ p = float(results[0])
 503+ except:
 504+ self.db.rollback()
 505+ self.db.close()
 506+ sys.exit('Could not execute: ' + select_stmnt)
 507+
 508+ print p
 509+ self.db.close()
 510+
 511+ conf_str = str((1 - p) * 100) + '% confident about the winner.'
 512+
 513+ return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
 514+
 515+"""
 516+
 517+Implements a Chi Square test where the distribution of donations over a given period are assumed to resemble those of a students t distribution
 518+
 519+http://en.wikipedia.org/wiki/Chi-square_test
 520+
 521+"""
 522+class ChiSquareTest(ConfidenceTest):
 523+ def confidence_test(self, metrics_1, metrics_2, num_samples):
423524 return
424 -
\ No newline at end of file
 525+
\ No newline at end of file
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_LP_confidence.sql
@@ -3,7 +3,7 @@
44
55 select
66
7 -lp.utm_source,
 7+lp.landing_page,
88 views as views,
99 total_clicks as clicks,
1010 donations as donations,
@@ -15,19 +15,21 @@
1616
1717 from
1818
19 -select
 19+(select
2020 landing_page,
 21+utm_campaign,
2122 count(*) as views
2223 from landing_page
2324 where request_time >= '%s' and request_time < '%s'
2425 and utm_campaign REGEXP '%s'
2526 and landing_page REGEXP '%s'
26 -group by 1) as lp
 27+group by 1,2) as lp
2728
2829 join
2930
3031 (select
3132 SUBSTRING_index(substring_index(utm_source, '.', 2),'.',-1) as landing_page,
 33+utm_campaign,
3234 count(*) as total_clicks,
3335 sum(not isnull(contribution_tracking.contribution_id)) as donations,
3436 sum(converted_amount) AS amount
@@ -37,7 +39,7 @@
3840 where ts >= '%s' and ts < '%s'
3941 and utm_campaign REGEXP '%s'
4042 and SUBSTRING_index(substring_index(utm_source, '.', 2),'.',-1) REGEXP '%s'
41 -group by 1) as ecomm
 43+group by 1,2) as ecomm
4244
4345 on ecomm.landing_page = lp.landing_page and ecomm.utm_campaign = lp.utm_campaign
4446
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_banner_confidence.sql
@@ -59,4 +59,4 @@
6060 on ecomm.banner = lp.utm_source
6161
6262 group by 1
63 -having impressions > 100000 and donations > 10;
 63+-- having impressions > 100000 and donations > 10;

Status & tagging log