Index: trunk/fundraiser-statistics/fundraiser-scripts/compute_confidence.py |
— | — | @@ -262,6 +262,9 @@ |
263 | 263 | labels = [item_1, item_2] |
264 | 264 | self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname) |
265 | 265 | |
| 266 | + """ Print out results """ |
| 267 | + self.print_metrics(title, means_1, means_2, std_devs_1, std_devs_2, times_indices) |
| 268 | + |
266 | 269 | return |
267 | 270 | |
268 | 271 | |
— | — | @@ -270,33 +273,32 @@ |
271 | 274 | """ |
272 | 275 | def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples): |
273 | 276 | return |
274 | | - |
275 | | -""" |
276 | | - |
277 | | -Implements a Wald test where the distribution of donations over a given period are assumed to be normal |
278 | | - |
279 | | -""" |
280 | | -class WaldTest(ConfidenceTest): |
| 277 | + |
281 | 278 | |
282 | | - def confidence_test(self, metrics_1, metrics_2, num_samples): |
| 279 | + """ |
| 280 | + compute the mean and variance of each group for every trial - shared by the subclasses |
| 281 | + """ |
| 282 | + def compute_parameters(self, metrics_1, metrics_2, num_samples): |
283 | 283 | |
284 | | - # Partition over different |
285 | | - num_trials = math.ceil(len(metrics_1) / num_samples) |
| 284 | + # A trial represents a group of samples over which parameters are computed |
| 285 | + num_trials = int(math.ceil(len(metrics_1) / num_samples)) |
| 286 | + |
286 | 287 | means_1 = [] |
287 | | - std_devs_1 = [] |
288 | 288 | means_2 = [] |
289 | | - std_devs_2 = [] |
| 289 | + vars_1 = [] |
| 290 | + vars_2 = [] |
290 | 291 | |
291 | 292 | m_tot = 0 |
292 | 293 | sd_tot = 0 |
293 | 294 | |
294 | | - # print num_trials |
295 | | - for i in range(int(num_trials)): |
| 295 | + |
| 296 | + # Compute the mean and variance for each group across all trials |
| 297 | + for i in range(num_trials): |
296 | 298 | |
297 | | - m1 = 0 |
298 | | - m2 = 0 |
299 | | - sd1 = 0 |
300 | | - sd2 = 0 |
| 299 | + m1 = 0 # mean of group 1 |
| 300 | + m2 = 0 # mean of group 2 |
| 301 | + var1 = 0 # variance of group 1 |
| 302 | + var2 = 0 # variance of group 2 |
301 | 303 | |
302 | 304 | for j in range(num_samples): |
303 | 305 | index = i + j |
— | — | @@ -308,29 +310,74 @@ |
309 | 311 | m1 = m1 / num_samples |
310 | 312 | m2 = m2 / num_samples |
311 | 313 | |
| 314 | + # Compute Sample Variance for each group |
312 | 315 | for j in range(num_samples): |
313 | 316 | index = i + j |
314 | 317 | |
315 | | - # Compute standard deviation |
316 | | - sd1 = sd1 + math.pow((metrics_1[i] - m1), 2) |
317 | | - sd2 = sd2 + math.pow((metrics_2[i] - m2), 2) |
| 318 | + var1 = var1 + math.pow((metrics_1[i] - m1), 2) |
| 319 | + var2 = var2 + math.pow((metrics_2[i] - m2), 2) |
318 | 320 | |
319 | | - # Perform wald |
320 | | - sd = math.pow(sd1 / num_samples + sd2 / num_samples, 0.5) |
321 | | - m = math.fabs(m1 - m2) |
322 | | - |
323 | 321 | means_1.append(float(m1)) |
324 | 322 | means_2.append(float(m2)) |
325 | | - std_devs_1.append(math.pow(sd1 / num_samples, 0.5)) |
326 | | - std_devs_2.append(math.pow(sd2 / num_samples, 0.5)) |
| 323 | + vars_1.append(var1 / num_samples) |
| 324 | + vars_2.append(var2 / num_samples) |
327 | 325 | |
| 326 | + |
| 327 | + return [num_trials, means_1, means_2, vars_1, vars_2] |
| 328 | + |
| 329 | + |
| 330 | + """ Print in Tabular form the means and standard deviation of each group over each interval """ |
| 331 | + def print_metrics(self, metric_name, means_1, means_2, std_devs_1, std_devs_2, times_indices): |
| 332 | + |
| 333 | + print '\n\n' + metric_name |
| 334 | + print '\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n' |
| 335 | + |
| 336 | + for i in range(1,len(times_indices) - 1): |
| 337 | + line_args = str(i) + '\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n' |
| 338 | + line_str = line_args % (means_1[i], means_2[i], std_devs_1[i], std_devs_2[i]) |
| 339 | + print line_str |
| 340 | + |
| 341 | + |
| 342 | +""" |
| 343 | + |
| 344 | +Implements a Wald test where the distribution of donations over a given period are assumed to be normal |
| 345 | + |
| 346 | +http://en.wikipedia.org/wiki/Wald_test |
| 347 | + |
| 348 | +""" |
| 349 | +class WaldTest(ConfidenceTest): |
| 350 | + |
| 351 | + def confidence_test(self, metrics_1, metrics_2, num_samples): |
| 352 | + |
| 353 | + ret = self.compute_parameters(metrics_1, metrics_2, num_samples) |
| 354 | + num_trials = ret[0] |
| 355 | + means_1 = ret[1] |
| 356 | + means_2 = ret[2] |
| 357 | + vars_1 = ret[3] |
| 358 | + vars_2 = ret[4] |
| 359 | + |
| 360 | + """ Compute std devs """ |
| 361 | + std_devs_1 = [] |
| 362 | + std_devs_2 = [] |
| 363 | + for i in range(len(vars_1)): |
| 364 | + std_devs_1.append(math.pow(vars_1[i], 0.5)) |
| 365 | + std_devs_2.append(math.pow(vars_2[i], 0.5)) |
| 366 | + |
| 367 | + m_tot = 0 |
| 368 | + sd_tot = 0 |
| 369 | + |
| 370 | + # Compute the parameters for the Wald test |
| 371 | + # The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial |
| 372 | + # where X{1,2} is the random variable corresponding to the group {1,2} |
| 373 | + for i in range(num_trials): |
| 374 | + |
| 375 | + # Perform wald - compose W = X1 - X2 for each trial |
| 376 | + sd = math.pow(vars_1[i] + vars_2[i], 0.5) |
| 377 | + m = math.fabs(means_1[i] - means_2[i]) |
| 378 | + |
328 | 379 | m_tot = m_tot + m |
329 | 380 | sd_tot = sd_tot + sd |
330 | 381 | |
331 | | - # print m1 |
332 | | - # print m2 |
333 | | - # print m |
334 | | - # print sd |
335 | 382 | |
336 | 383 | W = m_tot / sd_tot |
337 | 384 | # print W |
— | — | @@ -387,37 +434,91 @@ |
388 | 435 | |
389 | 436 | Implements a Student's T test where the distribution of donations over a given period are assumed to resemble those of a students t distribution |
390 | 437 | |
| 438 | +http://en.wikipedia.org/wiki/Student%27s_t-test |
| 439 | + |
391 | 440 | """ |
392 | 441 | class TTest(ConfidenceTest): |
393 | 442 | |
394 | | - def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples): |
| 443 | + def confidence_test(self, metrics_1, metrics_2, num_samples): |
395 | 444 | |
396 | | - # Partition over different |
| 445 | + # retrieve means and variances |
| 446 | + ret = self.compute_parameters(metrics_1, metrics_2, num_samples) |
| 447 | + num_trials = ret[0] |
| 448 | + means_1 = ret[1] |
| 449 | + means_2 = ret[2] |
| 450 | + vars_1 = ret[3] |
| 451 | + vars_2 = ret[4] |
397 | 452 | |
| 453 | + """ Compute std devs """ |
| 454 | + std_devs_1 = [] |
| 455 | + std_devs_2 = [] |
| 456 | + for i in range(len(vars_1)): |
| 457 | + std_devs_1.append(math.pow(vars_1[i], 0.5)) |
| 458 | + std_devs_2.append(math.pow(vars_2[i], 0.5)) |
| 459 | + |
| 460 | + m_tot = 0 |
| 461 | + var_1_tot = 0 |
| 462 | + var_2_tot = 0 |
398 | 463 | |
399 | | - # Compute mean for each group |
400 | | - m1 = sum(metrics_1) / len(metrics_1) |
401 | | - m2 = sum(metrics_2) / len(metrics_2) |
| 464 | + # Compute the parameters for the Student's t test |
| 465 | + # The absolute difference of the means and the variances of each group are accumulated over all trials and then averaged |
| 466 | + # where group {1,2} corresponds to metrics_{1,2} |
| 467 | + for i in range(num_trials): |
402 | 468 | |
403 | | - # Compute standard deviation |
404 | | - sd1 = 0 |
405 | | - sd2 = 0 |
406 | | - for i in range(len(metrics_1)): |
407 | | - sd1 = sd1 + math.pow((metrics_1[i] - m1), 2) |
408 | | - sd2 = sd2 + math.pow((metrics_2[i] - m2), 2) |
| 469 | + m_tot = m_tot + math.fabs(means_1[i] - means_2[i]) |
| 470 | + var_1_tot = var_1_tot + vars_1[i] |
| 471 | + var_2_tot = var_2_tot + vars_2[i] |
409 | 472 | |
410 | | - sd1 = pow(sd1 / len(metrics_1), 0.5) |
411 | | - sd2 = pow(sd2 / len(metrics_2), 0.5) |
| 473 | + m = m_tot / num_trials |
| 474 | + s_1 = var_1_tot / num_trials |
| 475 | + s_2 = var_2_tot / num_trials |
412 | 476 | |
413 | | - # degrees of freedom |
| 477 | + total_samples = len(metrics_1) |
414 | 478 | |
415 | | - # Perform wald |
416 | | - m1 |
| 479 | + t = m / math.pow((s_1 + s_2) / total_samples, 0.5) |
| 480 | + degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * total_samples |
| 481 | + |
417 | 482 | |
418 | | - print m1 |
419 | | - print m2 |
420 | | - print sd1 |
421 | | - print sd2 |
| 483 | + """ lookup confidence """ |
422 | 484 | |
| 485 | + print '' |
| 486 | + print t |
| 487 | + print degrees_of_freedom |
| 488 | + |
| 489 | + # get t and df |
| 490 | + degrees_of_freedom = math.ceil(degrees_of_freedom) |
| 491 | + if degrees_of_freedom > 30: |
| 492 | + degrees_of_freedom = 99 |
| 493 | + |
| 494 | + select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t) |
| 495 | + |
| 496 | + self.init_db() |
| 497 | + |
| 498 | + try: |
| 499 | + self.cur.execute(select_stmnt) |
| 500 | + results = self.cur.fetchone() |
| 501 | + |
| 502 | + p = float(results[0]) |
| 503 | + except: |
| 504 | + self.db.rollback() |
| 505 | + self.db.close() |
| 506 | + sys.exit('Could not execute: ' + select_stmnt) |
| 507 | + |
| 508 | + print p |
| 509 | + self.db.close() |
| 510 | + |
| 511 | + conf_str = str((1 - p) * 100) + '% confident about the winner.' |
| 512 | + |
| 513 | + return [means_1, means_2, std_devs_1, std_devs_2, conf_str] |
| 514 | + |
| 515 | +""" |
| 516 | + |
| 517 | +Implements a Chi Square test where the test statistic computed from donations over a given period is assumed to follow a chi-square distribution |
| 518 | + |
| 519 | +http://en.wikipedia.org/wiki/Chi-square_test |
| 520 | + |
| 521 | +""" |
| 522 | +class ChiSquareTest(ConfidenceTest): |
| 523 | + def confidence_test(self, metrics_1, metrics_2, num_samples): |
423 | 524 | return |
424 | | - |
\ No newline at end of file |
| 525 | + |
\ No newline at end of file |
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_LP_confidence.sql |
— | — | @@ -3,7 +3,7 @@ |
4 | 4 | |
5 | 5 | select |
6 | 6 | |
7 | | -lp.utm_source, |
| 7 | +lp.landing_page, |
8 | 8 | views as views, |
9 | 9 | total_clicks as clicks, |
10 | 10 | donations as donations, |
— | — | @@ -15,19 +15,21 @@ |
16 | 16 | |
17 | 17 | from |
18 | 18 | |
19 | | -select |
| 19 | +(select |
20 | 20 | landing_page, |
| 21 | +utm_campaign, |
21 | 22 | count(*) as views |
22 | 23 | from landing_page |
23 | 24 | where request_time >= '%s' and request_time < '%s' |
24 | 25 | and utm_campaign REGEXP '%s' |
25 | 26 | and landing_page REGEXP '%s' |
26 | | -group by 1) as lp |
| 27 | +group by 1,2) as lp |
27 | 28 | |
28 | 29 | join |
29 | 30 | |
30 | 31 | (select |
31 | 32 | SUBSTRING_index(substring_index(utm_source, '.', 2),'.',-1) as landing_page, |
| 33 | +utm_campaign, |
32 | 34 | count(*) as total_clicks, |
33 | 35 | sum(not isnull(contribution_tracking.contribution_id)) as donations, |
34 | 36 | sum(converted_amount) AS amount |
— | — | @@ -37,7 +39,7 @@ |
38 | 40 | where ts >= '%s' and ts < '%s' |
39 | 41 | and utm_campaign REGEXP '%s' |
40 | 42 | and SUBSTRING_index(substring_index(utm_source, '.', 2),'.',-1) REGEXP '%s' |
41 | | -group by 1) as ecomm |
| 43 | +group by 1,2) as ecomm |
42 | 44 | |
43 | 45 | on ecomm.landing_page = lp.landing_page and ecomm.utm_campaign = lp.utm_campaign |
44 | 46 | |
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_banner_confidence.sql |
— | — | @@ -59,4 +59,4 @@ |
60 | 60 | on ecomm.banner = lp.utm_source |
61 | 61 | |
62 | 62 | group by 1 |
63 | | -having impressions > 100000 and donations > 10; |
| 63 | +-- having impressions > 100000 and donations > 10; |