r110201 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r110200 | r110201 | r110202 >
Date:	01:57, 28 January 2012
Author:	rfaulk
Status:	deferred
Tags:
Comment:	added methods to munge data for chi-squared goodness of fit tests; added script to use helper methods and execute chi-square tests on resulting distributions
Modified paths:	/trunk/tools/wsor/message_templates/R/R_helper_functions.R (added) (history) /trunk/tools/wsor/message_templates/R/huggle3_analysis.R (modified) (history) /trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R (added) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R
—	—	@@ -4,6 +4,10 @@
5	5	# Comparison of edit counts for Huggle 3 test among templates z64 (http://en.wikipedia.org/wiki/Template:Uw-error1-default) / z65 (http://en.wikipedia.org/wiki/Template:Uw-error1-short)
6	6	#
7	7
	8	+
	9	+source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
	10	+
	11	+
8	12	# Read aggregated results for z64
9	13
10	14	metrics_ec_z64 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
—	—	@@ -16,16 +20,8 @@
17	21	metrics_blocks_z65 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_blocks.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
18	22
19	23
20		~~-# Compute max edit counts~~
	24	+# Compute the change in edits after the template
21	25
22		~~-max_ec_z64_ns_0 = max(append(metrics_ec_z64['ns_0_revisions_before'][[1]], metrics_ec_z64['ns_0_revisions_after'][[1]]))~~
23		~~-max_ec_z64_ns_3 = max(append(metrics_ec_z64['ns_3_revisions_before'][[1]], metrics_ec_z64['ns_3_revisions_after'][[1]]))~~
24		~~-max_ec_z65_ns_0 = max(append(metrics_ec_z65['ns_0_revisions_before'][[1]], metrics_ec_z65['ns_0_revisions_after'][[1]]))~~
25		~~-max_ec_z65_ns_3 = max(append(metrics_ec_z65['ns_3_revisions_before'][[1]], metrics_ec_z65['ns_3_revisions_after'][[1]]))~~
26		-
27		-
28		~~-# Compute the increase~~
29		-
30	26	z64_ns0 = (metrics_ec_z64$ns_0_revisions_after - metrics_ec_z64$ns_0_revisions_before) / metrics_ec_z64$ns_0_revisions_before
31	27	z65_ns0 = (metrics_ec_z65$ns_0_revisions_after - metrics_ec_z65$ns_0_revisions_before / metrics_ec_z65$ns_0_revisions_before
32	28
Index: trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R
—	—	@@ -0,0 +1,76 @@
	2	+
	3	+# Ryan Faulkner, January 25th 2012
	4	+#
	5	+# Comparison of metrics for Huggle 3 using a chi-square goodness of fit test
	6	+#
	7	+
	8	+
	9	+source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
	10	+
	11	+
	12	+# MAIN EXECUTION
	13	+# ==============
	14	+
	15	+# Read aggregated results
	16	+
	17	+metrics_test = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
	18	+metrics_control = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
	19	+
	20	+
	21	+# Compute the change in edits after the template
	22	+# ===============================================
	23	+
	24	+
	25	+test_samples <- c()
	26	+control_samples <- c()
	27	+
	28	+for (i in 1:length(metrics_test$ns_0_revisions_before))
	29	+ if (metrics_test$ns_0_revisions_before[i] != 0)
	30	+ test_samples <- c(test_samples,
	31	+ (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i])
	32	+
	33	+for (i in 1:length(metrics_control$ns_0_revisions_before))
	34	+ if (metrics_control$ns_0_revisions_before[i] != 0)
	35	+ control_samples <- c(control_samples,
	36	+ (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i])
	37	+
	38	+
	39	+
	40	+# Construct a distribution (Normal) using parameters computed from metric count data
	41	+# This will be used as the model distribution - do symmetrically (i.e. for both ways)
	42	+# ====================================================================================
	43	+
	44	+
	45	+# Number of samples for each template
	46	+
	47	+n_test <- length(test_samples)
	48	+n_control <- length(control_samples)
	49	+
	50	+
	51	+# Produce probabilities for normal to be fit
	52	+# build data frames
	53	+
	54	+lower_bound_range <- trunc(min(min(c(control_samples, test_samples))) - 1)
	55	+upper_bound_range <- trunc(max(max(c(control_samples, test_samples))) + 1)
	56	+bins <- sort(lower_bound_range : upper_bound_range)
	57	+
	58	+probs_control <- get_normal_bins(bins, control_samples)
	59	+probs_test <- get_normal_bins(bins, test_samples)
	60	+
	61	+probs_control <- data.frame(values=bins, counts=probs_control)
	62	+probs_test <- data.frame(values=bins, counts=probs_test)
	63	+
	64	+counts_test <- get_bin_counts(bins, test_samples)
	65	+counts_control <- get_bin_counts(bins, control_samples)
	66	+
	67	+counts_test <- pad_counts(bins, counts_test)
	68	+counts_control <- pad_counts(bins, counts_control)
	69	+
	70	+
	71	+# Get chi-squared test results
	72	+chisq_res_1 = chisq.test(counts_test$counts, p=probs_control$counts)
	73	+chisq_res_2 = chisq.test(counts_control$counts, p=probs_test$counts)
	74	+
	75	+
	76	+
	77	+
Index: trunk/tools/wsor/message_templates/R/R_helper_functions.R
—	—	@@ -0,0 +1,149 @@
	2	+# Ryan Faulkner, January 26th 2012
	3	+#
	4	+# Comparison of metrics for Huggle 3 using a chi-square goodness of fit test
	5	+#
	6	+
	7	+
	8	+
	9	+# FUNCTION
	10	+#
	11	+# Given a set of data compute a normal distribution and the probabilities of falling on each bin
	12	+#
	13	+# bins --
	14	+# data --
	15	+#
	16	+
	17	+get_normal_bins <- function(bins, data) {
	18	+
	19	+ sample_sd <- sd(data)
	20	+ sample_mean <- mean(data)
	21	+
	22	+ # vector to store bucket probabilities
	23	+ probs <- c()
	24	+ num_bins <- length(bins)
	25	+
	26	+ # Compute the probabilities
	27	+
	28	+ for (i in 1:num_bins)
	29	+ {
	30	+ if (i == 1) {
	31	+ upper <- bins[1] + ((bins[2] - bins[1]) / 2)
	32	+ lower <- bins[1] - ((bins[2] - bins[1]) / 2)
	33	+ } else if (i == num_bins) {
	34	+ upper <- bins[num_bins] + ((bins[num_bins] - bins[num_bins-1]) / 2)
	35	+ lower <- bins[num_bins] - ((bins[num_bins] - bins[num_bins-1]) / 2)
	36	+ } else {
	37	+ ip1 <- i + 1
	38	+ im1 <- i - 1
	39	+ upper <- bins[i] + ((bins[ip1] - bins[i]) / 2)
	40	+ lower <- bins[i] - ((bins[i] - bins[im1]) / 2)
	41	+ }
	42	+
	43	+ p = pnorm(upper, mean = sample_mean, sd = sample_sd, log = FALSE) - pnorm(lower, mean = sample_mean, sd = sample_sd, log = FALSE)
	44	+ probs <- c(probs, p)
	45	+ }
	46	+
	47	+ probs <- probs / sum(probs) # normalize the probabilities
	48	+ probs
	49	+}
	50	+
	51	+
	52	+
	53	+# FUNCTION
	54	+#
	55	+#
	56	+# Given a set of bins and a value, return the bin that is closest to the value
	57	+#
	58	+# bins --
	59	+# value --
	60	+#
	61	+
	62	+find_bin <- function(bins, value) {
	63	+ distances <- abs(bins - value)
	64	+ index <- order(sapply(distances, min))[1]
	65	+ bins[index]
	66	+}
	67	+
	68	+
	69	+# FUNCTION :: get_bin_counts
	70	+#
	71	+# Given a set of data break it into bins and return the counts with the bin index
	72	+#
	73	+
	74	+get_bin_counts <- function(bins, data) {
	75	+
	76	+ new_data <- c()
	77	+ for (i in 1:length(data))
	78	+ {
	79	+ bin <- find_bin(bins, data[i])
	80	+ new_data <- c(new_data, bin)
	81	+ }
	82	+
	83	+ tab <- table(new_data)
	84	+ xu <- as.numeric(names(tab))
	85	+ xn <- as.vector(tab)
	86	+ data.frame(values=xu, counts=xn)
	87	+}
	88	+
	89	+
	90	+# FUNCTION :: construct_probs
	91	+#
	92	+# Extract the probabilities corresponding to the samples
	93	+#
	94	+
	95	+construct_probs <- function(values, full_probs) {
	96	+
	97	+ sample_probs <- c()
	98	+
	99	+ for (i in 1:length(values))
	100	+ {
	101	+ val <- values[i]
	102	+ bin <- find_bin(full_probs$values, val)
	103	+ index <- which(full_probs$values == bin)[1]
	104	+ sample_probs <- c(sample_probs, full_probs$counts[index])
	105	+ }
	106	+
	107	+ sample_probs
	108	+}
	109	+
	110	+
	111	+# FUNCTION :: convert_to_bins
	112	+#
	113	+# Maps counts from a data frame (values, counts) to a pre-defined set of bins
	114	+#
	115	+
	116	+convert_to_bins <- function(bins, samples) {
	117	+
	118	+ for (i in 1:length(samples$values))
	119	+ samples$values[i] <- find_bin(bins, samples$values[i])
	120	+
	121	+ samples
	122	+}
	123	+
	124	+
	125	+# FUNCTION :: pad_counts
	126	+#
	127	+# Pad counts from a data frame (values, counts) in a given range to contain 0 values where a bin is missing
	128	+#
	129	+
	130	+pad_counts <- function(bin_range, samples) {
	131	+
	132	+ new_values <- c()
	133	+ new_counts <- c()
	134	+
	135	+ for (i in bin_range)
	136	+ {
	137	+ if (i %in% samples$values)
	138	+ {
	139	+ index <- which(samples$values == i)[1]
	140	+ new_values <- c(new_values, i)
	141	+ new_counts <- c(new_counts, samples$counts[index])
	142	+
	143	+ } else {
	144	+ new_values <- c(new_values, i)
	145	+ new_counts <- c(new_counts, 0)
	146	+ }
	147	+ }
	148	+
	149	+ data.frame(values=new_values, counts=new_counts)
	150	+}
\ No newline at end of file

Status & tagging log

02:35, 28 January 2012 😂 (talk | contribs) changed the status of r110201 [removed: new added: deferred]