r110201 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r110200‎ | r110201 | r110202 >
Date:01:57, 28 January 2012
Author:rfaulk
Status:deferred
Tags:
Comment:
added methods to munge data for chi-squared goodness of fit tests
added script to use helper methods and execute chi-square tests on resulting distributions
Modified paths:
  • /trunk/tools/wsor/message_templates/R/R_helper_functions.R (added) (history)
  • /trunk/tools/wsor/message_templates/R/huggle3_analysis.R (modified) (history)
  • /trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R (added) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R
@@ -4,6 +4,10 @@
55 # Comparison of edit counts for Huggle 3 test among templates z64 (http://en.wikipedia.org/wiki/Template:Uw-error1-default) / z65 (http://en.wikipedia.org/wiki/Template:Uw-error1-short)
66 #
77
 8+
 9+source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
 10+
 11+
812 # Read aggregated results for z64
913
1014 metrics_ec_z64 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
@@ -16,16 +20,8 @@
1721 metrics_blocks_z65 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_blocks.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
1822
1923
20 -# Compute max edit counts
 24+# Compute the change in edits after the template
2125
22 -max_ec_z64_ns_0 = max(append(metrics_ec_z64['ns_0_revisions_before'][[1]], metrics_ec_z64['ns_0_revisions_after'][[1]]))
23 -max_ec_z64_ns_3 = max(append(metrics_ec_z64['ns_3_revisions_before'][[1]], metrics_ec_z64['ns_3_revisions_after'][[1]]))
24 -max_ec_z65_ns_0 = max(append(metrics_ec_z65['ns_0_revisions_before'][[1]], metrics_ec_z65['ns_0_revisions_after'][[1]]))
25 -max_ec_z65_ns_3 = max(append(metrics_ec_z65['ns_3_revisions_before'][[1]], metrics_ec_z65['ns_3_revisions_after'][[1]]))
26 -
27 -
28 -# Compute the increase
29 -
# Relative change in namespace-0 edit counts after the template was delivered:
# (after - before) / before, one value per row of each metrics table.
# Rows with zero edits before the template yield Inf or NaN here.
z64_ns0 <- (metrics_ec_z64$ns_0_revisions_after - metrics_ec_z64$ns_0_revisions_before) / metrics_ec_z64$ns_0_revisions_before
# The closing parenthesis of the numerator was missing on this line, which
# left the expression unterminated.
z65_ns0 <- (metrics_ec_z65$ns_0_revisions_after - metrics_ec_z65$ns_0_revisions_before) / metrics_ec_z65$ns_0_revisions_before
3228
Index: trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R
@@ -0,0 +1,76 @@
 2+
 3+# Ryan Faulkner, January 25th 2012
 4+#
 5+# Comparison of metrics for Huggle 3 using a chi-square goodness of fit test
 6+#
 7+
 8+
 9+source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
 10+
 11+
 12+# MAIN EXECUTION
 13+# ==============
 14+
 15+# Read aggregated results
 16+
 17+metrics_test = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
 18+metrics_control = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
 19+
 20+
 21+# Compute the change in edits after the template
 22+# ===============================================
 23+
 24+
 25+test_samples <- c()
 26+control_samples <- c()
 27+
 28+for (i in 1:length(metrics_test$ns_0_revisions_before))
 29+ if (metrics_test$ns_0_revisions_before[i] != 0)
 30+ test_samples <- c(test_samples,
 31+ (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i])
 32+
 33+for (i in 1:length(metrics_control$ns_0_revisions_before))
 34+ if (metrics_control$ns_0_revisions_before[i] != 0)
 35+ control_samples <- c(control_samples,
 36+ (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i])
 37+
 38+
 39+
 40+# Construct a distribution (Normal) using parameters computed from metric count data
 41+# This will be used as the model distribution - do symetrically (ie. fot both ways)
 42+# ====================================================================================
 43+
 44+
 45+# Number of samples for each template
 46+
 47+n_test <- length(test_samples)
 48+n_control <- length(control_samples)
 49+
 50+
 51+# Produce probabilities for normal to be fit
 52+# build data frames
 53+
 54+lower_bound_range <- trunc(min(min(c(control_samples, test_samples))) - 1)
 55+upper_bound_range <- trunc(max(max(c(control_samples, test_samples))) + 1)
 56+bins <- sort(lower_bound_range : upper_bound_range)
 57+
 58+probs_control <- get_normal_bins(bins, control_samples)
 59+probs_test <- get_normal_bins(bins, test_samples)
 60+
 61+probs_control <- data.frame(values=bins, counts=probs_control)
 62+probs_test <- data.frame(values=bins, counts=probs_test)
 63+
 64+counts_test <- get_bin_counts(bins, test_samples)
 65+counts_control <- get_bin_counts(bins, control_samples)
 66+
 67+counts_test <- pad_counts(bins, counts_test)
 68+counts_control <- pad_counts(bins, counts_control)
 69+
 70+
 71+# Get chi-squared test results
 72+chisq_res_1 = chisq.test(counts_test$counts, p=probs_control$counts)
 73+chisq_res_2 = chisq.test(counts_control$counts, p=probs_test$counts)
 74+
 75+
 76+
 77+
Index: trunk/tools/wsor/message_templates/R/R_helper_functions.R
@@ -0,0 +1,149 @@
 2+# Ryan Faulkner, January 26th 2012
 3+#
 4+# Comparison of metrics for Huggle 3 using a chi-square goodness of fit test
 5+#
 6+
 7+
 8+
 9+# FUNCTION
 10+#
 11+# Given a set of data compute a normal distribution and the probabilities of falling on each bin
 12+#
 13+# bins --
 14+# data --
 15+#
 16+
 17+get_normal_bins <- function(bins, data) {
 18+
 19+ sample_sd <- sd(data)
 20+ sample_mean <- mean(data)
 21+
 22+ # vector to store bucket probabilities
 23+ probs <- c()
 24+ num_bins <- length(bins)
 25+
 26+ # Compute the probabilities
 27+
 28+ for (i in 1:num_bins)
 29+ {
 30+ if (i == 1) {
 31+ upper <- bins[1] + ((bins[2] - bins[1]) / 2)
 32+ lower <- bins[1] - ((bins[2] - bins[1]) / 2)
 33+ } else if (i == num_bins) {
 34+ upper <- bins[num_bins] + ((bins[num_bins] - bins[num_bins-1]) / 2)
 35+ lower <- bins[num_bins] - ((bins[num_bins] - bins[num_bins-1]) / 2)
 36+ } else {
 37+ ip1 <- i + 1
 38+ im1 <- i - 1
 39+ upper <- bins[i] + ((bins[ip1] - bins[i]) / 2)
 40+ lower <- bins[i] - ((bins[i] - bins[im1]) / 2)
 41+ }
 42+
 43+ p = pnorm(upper, mean = sample_mean, sd = sample_sd, log = FALSE) - pnorm(lower, mean = sample_mean, sd = sample_sd, log = FALSE)
 44+ probs <- c(probs, p)
 45+ }
 46+
 47+ probs <- probs / sum(probs) # normalize the probabilities
 48+ probs
 49+}
 50+
 51+
 52+
 53+# FUNCTION
 54+#
 55+#
 56+# Given a set of data compute a normal distribution and the probabilities of falling on each bin
 57+#
 58+# bins --
 59+# value --
 60+#
 61+
 62+find_bin <- function(bins, value) {
 63+ distances <- abs(bins - value)
 64+ index <- order(sapply(distances, min))[1]
 65+ bins[index]
 66+}
 67+
 68+
 69+# FUNCTION :: get_bin_counts
 70+#
 71+# Given a set of data break it into bins and return the counts with the bin index
 72+#
 73+
 74+get_bin_counts <- function(bins, data) {
 75+
 76+ new_data <- c()
 77+ for (i in 1:length(data))
 78+ {
 79+ bin <- find_bin(bins, data[i])
 80+ new_data <- c(new_data, bin)
 81+ }
 82+
 83+ tab <- table(new_data)
 84+ xu <- as.numeric(names(tab))
 85+ xn <- as.vector(tab)
 86+ data.frame(values=xu, counts=xn)
 87+}
 88+
 89+
 90+# FUNCTION :: construct_probs
 91+#
 92+# Extract the probabilities corresponding to the samples
 93+#
 94+
 95+construct_probs <- function(values, full_probs) {
 96+
 97+ sample_probs <- c()
 98+
 99+ for (i in 1:length(values))
 100+ {
 101+ val <- values[i]
 102+ bin <- find_bin(full_probs$values, val)
 103+ index <- which(full_probs$values == bin)[1]
 104+ sample_probs <- c(sample_probs, full_probs$counts[index])
 105+ }
 106+
 107+ sample_probs
 108+}
 109+
 110+
 111+# FUNCTION :: convert_to_bins
 112+#
 113+# Maps counts from a data frame (values, counts) to a pre-defined set of bins
 114+#
 115+
 116+convert_to_bins <- function(bins, samples) {
 117+
 118+ for (i in 1:length(samples$values))
 119+ samples$values[i] <- find_bin(bins, samples$values[i])
 120+
 121+ samples
 122+}
 123+
 124+
 125+# FUNCTION :: pad_counts
 126+#
 127+# Pad counts from a data frame (values, counts) in a given range to contain 0 values where a bin is missing
 128+#
 129+
 130+pad_counts <- function(bin_range, samples) {
 131+
 132+ new_values <- c()
 133+ new_counts <- c()
 134+
 135+ for (i in bin_range)
 136+ {
 137+ if (i %in% samples$values)
 138+ {
 139+ index <- which(samples$values == i)[1]
 140+ new_values <- c(new_values, i)
 141+ new_counts <- c(new_counts, samples$counts[index])
 142+
 143+ } else {
 144+ new_values <- c(new_values, i)
 145+ new_counts <- c(new_counts, 0)
 146+ }
 147+ }
 148+
 149+ data.frame(values=new_values, counts=new_counts)
 150+}
\ No newline at end of file

Status & tagging log