r80596 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r80595‎ \| r80596 \| r80597 >
Date:	22:59, 19 January 2011
Author:	rfaulk
Status:	deferred
Tags:
Comment:	added T-Test functionality to confidence computation. Also added a method to print out the distribution parameters at each interval.
Modified paths:	/trunk/fundraiser-statistics/fundraiser-scripts/compute_confidence.py (modified) (history) /trunk/fundraiser-statistics/fundraiser-scripts/sql/report_LP_confidence.sql (modified) (history) /trunk/fundraiser-statistics/fundraiser-scripts/sql/report_banner_confidence.sql (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/compute_confidence.py
—	—	@@ -262,6 +262,9 @@
263	263	labels = [item_1, item_2]
264	264	self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname)
265	265
	266	+ """ Print out results """
	267	+ self.print_metrics(title, means_1, means_2, std_devs_1, std_devs_2, times_indices)
	268	+
266	269	return
267	270
268	271
—	—	@@ -270,33 +273,32 @@
271	274	"""
272	275	def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples):
273	276	return
274		-
275		~~-"""~~
276		-
277		~~-Implements a Wald test where the distribution of donations over a given period are assumed to be normal~~
278		-
279		~~-"""~~
280		~~-class WaldTest(ConfidenceTest):~~
	277	+
281	278
282		~~- def confidence_test(self, metrics_1, metrics_2, num_samples):~~
	279	+ """
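	280	+ compute the mean and variance of each group over every trial (a trial is a group of num_samples samples) - returns [num_trials, means_1, means_2, vars_1, vars_2]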
	281	+ """
	282	+ def compute_parameters(self, metrics_1, metrics_2, num_samples):
283	283
284		~~- # Partition over different~~
285		~~- num_trials = math.ceil(len(metrics_1) / num_samples)~~
	284	+ # A trial represents a group of samples over which parameters are computed
	285	+ num_trials = int(math.ceil(len(metrics_1) / num_samples))
	286	+
286	287	means_1 = []
287		~~- std_devs_1 = []~~
288	288	means_2 = []
289		~~- std_devs_2 = []~~
	289	+ vars_1 = []
	290	+ vars_2 = []
290	291
291	292	m_tot = 0
292	293	sd_tot = 0
293	294
294		~~- # print num_trials~~
295		~~- for i in range(int(num_trials)):~~
	295	+
	296	+ # Compute the mean and variance for each group across all trials
	297	+ for i in range(num_trials):
296	298
297		~~- m1 = 0~~
298		~~- m2 = 0~~
299		~~- sd1 = 0~~
300		~~- sd2 = 0~~
	299	+ m1 = 0 # mean of group 1
	300	+ m2 = 0 # mean of group 2
	301	+ var1 = 0 # variance of group 1
	302	+ var2 = 0 # variance of group 2
301	303
302	304	for j in range(num_samples):
303	305	index = i + j
—	—	@@ -308,29 +310,74 @@
309	311	m1 = m1 / num_samples
310	312	m2 = m2 / num_samples
311	313
	314	+ # Compute Sample Variance for each group
312	315	for j in range(num_samples):
313	316	index = i + j
314	317
315		~~- # Compute standard deviation~~
316		~~- sd1 = sd1 + math.pow((metrics_1[i] - m1), 2)~~
317		~~- sd2 = sd2 + math.pow((metrics_2[i] - m2), 2)~~
	318	+ var1 = var1 + math.pow((metrics_1[i] - m1), 2)
	319	+ var2 = var2 + math.pow((metrics_2[i] - m2), 2)
318	320
319		~~- # Perform wald~~
320		~~- sd = math.pow(sd1 / num_samples + sd2 / num_samples, 0.5)~~
321		~~- m = math.fabs(m1 - m2)~~
322		-
323	321	means_1.append(float(m1))
324	322	means_2.append(float(m2))
325		~~- std_devs_1.append(math.pow(sd1 / num_samples, 0.5))~~
326		~~- std_devs_2.append(math.pow(sd2 / num_samples, 0.5))~~
	323	+ vars_1.append(var1 / num_samples)
	324	+ vars_2.append(var2 / num_samples)
327	325
	326	+
	327	+ return [num_trials, means_1, means_2, vars_1, vars_2]
	328	+
	329	+
	330	+ """ Print in Tabular form the means and standard deviation of each group over each interval """
	331	+ def print_metrics(self, metric_name, means_1, means_2, std_devs_1, std_devs_2, times_indices):
	332	+
	333	+ print '\n\n' + metric_name
	334	+ print '\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
	335	+
	336	+ for i in range(1,len(times_indices) - 1):
	337	+ line_args = str(i) + '\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
	338	+ line_str = line_args % (means_1[i], means_2[i], std_devs_1[i], std_devs_2[i])
	339	+ print line_str
	340	+
	341	+
	342	+"""
	343	+
	344	+Implements a Wald test where the distribution of donations over a given period is assumed to be normal
	345	+
	346	+http://en.wikipedia.org/wiki/Wald_test
	347	+
	348	+"""
	349	+class WaldTest(ConfidenceTest):
	350	+
	351	+ def confidence_test(self, metrics_1, metrics_2, num_samples):
	352	+
	353	+ ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
	354	+ num_trials = ret[0]
	355	+ means_1 = ret[1]
	356	+ means_2 = ret[2]
	357	+ vars_1 = ret[3]
	358	+ vars_2 = ret[4]
	359	+
	360	+ """ Compute std devs """
	361	+ std_devs_1 = []
	362	+ std_devs_2 = []
	363	+ for i in range(len(vars_1)):
	364	+ std_devs_1.append(math.pow(vars_1[i], 0.5))
	365	+ std_devs_2.append(math.pow(vars_2[i], 0.5))
	366	+
	367	+ m_tot = 0
	368	+ sd_tot = 0
	369	+
	370	+ # Compute the parameters for the Wald test
	371	+ # The difference of the means and the sum of the variances is used to compose the random variable W = X1 - X2 for each trial
	372	+ # where X{1,2} is the random variable corresponding to the group {1,2}
	373	+ for i in range(num_trials):
	374	+
	375	+ # Perform wald - compose W = X1 - X2 for each trial
	376	+ sd = math.pow(vars_1[i] + vars_2[i], 0.5)
	377	+ m = math.fabs(means_1[i] - means_2[i])
	378	+
328	379	m_tot = m_tot + m
329	380	sd_tot = sd_tot + sd
330	381
331		~~- # print m1~~
332		~~- # print m2~~
333		~~- # print m~~
334		~~- # print sd~~
335	382
336	383	W = m_tot / sd_tot
337	384	# print W
—	—	@@ -387,37 +434,91 @@
388	435
389	436	Implements a Student's t-test where the distribution of donations over a given period is assumed to resemble a Student's t distribution
390	437
	438	+http://en.wikipedia.org/wiki/Student%27s_t-test
	439	+
391	440	"""
392	441	class TTest(ConfidenceTest):
393	442
394		~~- def confidence_test(self, metrics_1, metrics_2, time_indices, num_samples):~~
	443	+ def confidence_test(self, metrics_1, metrics_2, num_samples):
395	444
396		~~- # Partition over different~~
	445	+ # retrieve means and variances
	446	+ ret = self.compute_parameters(metrics_1, metrics_2, num_samples)
	447	+ num_trials = ret[0]
	448	+ means_1 = ret[1]
	449	+ means_2 = ret[2]
	450	+ vars_1 = ret[3]
	451	+ vars_2 = ret[4]
397	452
	453	+ """ Compute std devs """
	454	+ std_devs_1 = []
	455	+ std_devs_2 = []
	456	+ for i in range(len(vars_1)):
	457	+ std_devs_1.append(math.pow(vars_1[i], 0.5))
	458	+ std_devs_2.append(math.pow(vars_2[i], 0.5))
	459	+
	460	+ m_tot = 0
	461	+ var_1_tot = 0
	462	+ var_2_tot = 0
398	463
399		~~- # Compute mean for each group~~
400		~~- m1 = sum(metrics_1) / len(metrics_1)~~
401		~~- m2 = sum(metrics_2) / len(metrics_2)~~
	464	+ # Compute the parameters for the t-test
	465	+ # The absolute difference of the means and the variance of each group are accumulated over all trials
	466	+ # and then averaged over the number of trials, where group {1,2} corresponds to metrics_{1,2}
	467	+ for i in range(num_trials):
402	468
403		~~- # Compute standard deviation~~
404		~~- sd1 = 0~~
405		~~- sd2 = 0~~
406		~~- for i in range(len(metrics_1)):~~
407		~~- sd1 = sd1 + math.pow((metrics_1[i] - m1), 2)~~
408		~~- sd2 = sd2 + math.pow((metrics_2[i] - m2), 2)~~
	469	+ m_tot = m_tot + math.fabs(means_1[i] - means_2[i])
	470	+ var_1_tot = var_1_tot + vars_1[i]
	471	+ var_2_tot = var_2_tot + vars_2[i]
409	472
410		~~- sd1 = pow(sd1 / len(metrics_1), 0.5)~~
411		~~- sd2 = pow(sd2 / len(metrics_2), 0.5)~~
	473	+ m = m_tot / num_trials
	474	+ s_1 = var_1_tot / num_trials
	475	+ s_2 = var_2_tot / num_trials
412	476
413		~~- # degrees of freedom~~
	477	+ total_samples = len(metrics_1)
414	478
415		~~- # Perform wald~~
416		~~- m1~~
	479	+ t = m / math.pow((s_1 + s_2) / total_samples, 0.5)
	480	+ degrees_of_freedom = (math.pow(s_1 / total_samples + s_2 / total_samples, 2) / (math.pow(s_1 / total_samples, 2) + math.pow(s_2 / total_samples, 2))) * total_samples
	481	+
417	482
418		~~- print m1~~
419		~~- print m2~~
420		~~- print sd1~~
421		~~- print sd2~~
	483	+ """ lookup confidence """
422	484
	485	+ print ''
	486	+ print t
	487	+ print degrees_of_freedom
	488	+
	489	+ # get t and df
	490	+ degrees_of_freedom = math.ceil(degrees_of_freedom)
	491	+ if degrees_of_freedom > 30:
	492	+ degrees_of_freedom = 99
	493	+
	494	+ select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)
	495	+
	496	+ self.init_db()
	497	+
	498	+ try:
	499	+ self.cur.execute(select_stmnt)
	500	+ results = self.cur.fetchone()
	501	+
	502	+ p = float(results[0])
	503	+ except:
	504	+ self.db.rollback()
	505	+ self.db.close()
	506	+ sys.exit('Could not execute: ' + select_stmnt)
	507	+
	508	+ print p
	509	+ self.db.close()
	510	+
	511	+ conf_str = str((1 - p) * 100) + '% confident about the winner.'
	512	+
	513	+ return [means_1, means_2, std_devs_1, std_devs_2, conf_str]
	514	+
	515	+"""
	516	+
	517	+Implements a Chi Square test where the distribution of donations over a given period is assumed to resemble a chi-square distribution (not yet implemented - confidence_test only returns)
	518	+
	519	+http://en.wikipedia.org/wiki/Chi-square_test
	520	+
	521	+"""
	522	+class ChiSquareTest(ConfidenceTest):
	523	+ def confidence_test(self, metrics_1, metrics_2, num_samples):
423	524	return
424		-
\ No newline at end of file
	525	+
\ No newline at end of file
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_LP_confidence.sql
—	—	@@ -3,7 +3,7 @@
4	4
5	5	select
6	6
7		~~-lp.utm_source,~~
	7	+lp.landing_page,
8	8	views as views,
9	9	total_clicks as clicks,
10	10	donations as donations,
—	—	@@ -15,19 +15,21 @@
16	16
17	17	from
18	18
19		~~-select~~
	19	+(select
20	20	landing_page,
	21	+utm_campaign,
21	22	count(*) as views
22	23	from landing_page
23	24	where request_time >= '%s' and request_time < '%s'
24	25	and utm_campaign REGEXP '%s'
25	26	and landing_page REGEXP '%s'
26		~~-group by 1) as lp~~
	27	+group by 1,2) as lp
27	28
28	29	join
29	30
30	31	(select
31	32	SUBSTRING_index(substring_index(utm_source, '.', 2),'.',-1) as landing_page,
	33	+utm_campaign,
32	34	count(*) as total_clicks,
33	35	sum(not isnull(contribution_tracking.contribution_id)) as donations,
34	36	sum(converted_amount) AS amount
—	—	@@ -37,7 +39,7 @@
38	40	where ts >= '%s' and ts < '%s'
39	41	and utm_campaign REGEXP '%s'
40	42	and SUBSTRING_index(substring_index(utm_source, '.', 2),'.',-1) REGEXP '%s'
41		~~-group by 1) as ecomm~~
	43	+group by 1,2) as ecomm
42	44
43	45	on ecomm.landing_page = lp.landing_page and ecomm.utm_campaign = lp.utm_campaign
44	46
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_banner_confidence.sql
—	—	@@ -59,4 +59,4 @@
60	60	on ecomm.banner = lp.utm_source
61	61
62	62	group by 1
63		~~-having impressions > 100000 and donations > 10;~~
	63	+-- having impressions > 100000 and donations > 10;

Status & tagging log

13:39, 20 January 2011 Reedy (talk | contribs) changed the status of r80596 [removed: new added: deferred]