Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R |
— | — | @@ -4,6 +4,10 @@ |
5 | 5 | # Comparison of edit counts for Huggle 3 test among templates z64 (http://en.wikipedia.org/wiki/Template:Uw-error1-default) / z65 (http://en.wikipedia.org/wiki/Template:Uw-error1-short) |
6 | 6 | # |
7 | 7 | |
| 8 | + |
| 9 | +source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R') |
| 10 | + |
| 11 | + |
8 | 12 | # Read aggregated results for z64 |
9 | 13 | |
10 | 14 | metrics_ec_z64 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T) |
— | — | @@ -16,16 +20,8 @@ |
17 | 21 | metrics_blocks_z65 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_blocks.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T) |
18 | 22 | |
19 | 23 | |
20 | | -# Compute max edit counts |
| 24 | +# Compute the change in edits after the template |
21 | 25 | |
22 | | -max_ec_z64_ns_0 = max(append(metrics_ec_z64['ns_0_revisions_before'][[1]], metrics_ec_z64['ns_0_revisions_after'][[1]])) |
23 | | -max_ec_z64_ns_3 = max(append(metrics_ec_z64['ns_3_revisions_before'][[1]], metrics_ec_z64['ns_3_revisions_after'][[1]])) |
24 | | -max_ec_z65_ns_0 = max(append(metrics_ec_z65['ns_0_revisions_before'][[1]], metrics_ec_z65['ns_0_revisions_after'][[1]])) |
25 | | -max_ec_z65_ns_3 = max(append(metrics_ec_z65['ns_3_revisions_before'][[1]], metrics_ec_z65['ns_3_revisions_after'][[1]])) |
26 | | - |
27 | | - |
28 | | -# Compute the increase |
29 | | - |
30 | 26 | z64_ns0 = (metrics_ec_z64$ns_0_revisions_after - metrics_ec_z64$ns_0_revisions_before) / metrics_ec_z64$ns_0_revisions_before |
31 | 27 | z65_ns0 = (metrics_ec_z65$ns_0_revisions_after - metrics_ec_z65$ns_0_revisions_before) / metrics_ec_z65$ns_0_revisions_before |
32 | 28 | |
Index: trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R |
— | — | @@ -0,0 +1,76 @@ |
| 2 | + |
| 3 | +# Ryan Faulkner, January 25th 2012 |
| 4 | +# |
| 5 | +# Comparison of metrics for Huggle 3 using a chi-square goodness of fit test |
| 6 | +# |
| 7 | + |
| 8 | + |
| 9 | +source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R') |
| 10 | + |
| 11 | + |
| 12 | +# MAIN EXECUTION |
| 13 | +# ============== |
| 14 | + |
| 15 | +# Read aggregated results |
| 16 | + |
| 17 | +metrics_test = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T) |
| 18 | +metrics_control = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T) |
| 19 | + |
| 20 | + |
| 21 | +# Compute the change in edits after the template |
| 22 | +# =============================================== |
| 23 | + |
| 24 | + |
| 25 | +test_samples <- c() |
| 26 | +control_samples <- c() |
| 27 | + |
| 28 | +for (i in 1:length(metrics_test$ns_0_revisions_before)) |
| 29 | + if (metrics_test$ns_0_revisions_before[i] != 0) |
| 30 | + test_samples <- c(test_samples, |
| 31 | + (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i]) |
| 32 | + |
| 33 | +for (i in 1:length(metrics_control$ns_0_revisions_before)) |
| 34 | + if (metrics_control$ns_0_revisions_before[i] != 0) |
| 35 | + control_samples <- c(control_samples, |
| 36 | + (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i]) |
| 37 | + |
| 38 | + |
| 39 | + |
| 40 | +# Construct a distribution (Normal) using parameters computed from metric count data |
| 41 | +# This will be used as the model distribution - do symmetrically (i.e. for both ways) |
| 42 | +# ==================================================================================== |
| 43 | + |
| 44 | + |
| 45 | +# Number of samples for each template |
| 46 | + |
| 47 | +n_test <- length(test_samples) |
| 48 | +n_control <- length(control_samples) |
| 49 | + |
| 50 | + |
| 51 | +# Produce probabilities for normal to be fit |
| 52 | +# build data frames |
| 53 | + |
| 54 | +lower_bound_range <- trunc(min(min(c(control_samples, test_samples))) - 1) |
| 55 | +upper_bound_range <- trunc(max(max(c(control_samples, test_samples))) + 1) |
| 56 | +bins <- sort(lower_bound_range : upper_bound_range) |
| 57 | + |
| 58 | +probs_control <- get_normal_bins(bins, control_samples) |
| 59 | +probs_test <- get_normal_bins(bins, test_samples) |
| 60 | + |
| 61 | +probs_control <- data.frame(values=bins, counts=probs_control) |
| 62 | +probs_test <- data.frame(values=bins, counts=probs_test) |
| 63 | + |
| 64 | +counts_test <- get_bin_counts(bins, test_samples) |
| 65 | +counts_control <- get_bin_counts(bins, control_samples) |
| 66 | + |
| 67 | +counts_test <- pad_counts(bins, counts_test) |
| 68 | +counts_control <- pad_counts(bins, counts_control) |
| 69 | + |
| 70 | + |
| 71 | +# Get chi-squared test results |
| 72 | +chisq_res_1 = chisq.test(counts_test$counts, p=probs_control$counts) |
| 73 | +chisq_res_2 = chisq.test(counts_control$counts, p=probs_test$counts) |
| 74 | + |
| 75 | + |
| 76 | + |
| 77 | + |
Index: trunk/tools/wsor/message_templates/R/R_helper_functions.R |
— | — | @@ -0,0 +1,149 @@ |
| 2 | +# Ryan Faulkner, January 26th 2012 |
| 3 | +# |
| 4 | +# Helper functions shared by the Huggle 3 analyses: nearest-bin lookup, normal-fit bin probabilities, bin counts, and count padding |
| 5 | +# |
| 6 | + |
| 7 | + |
| 8 | + |
| 9 | +# FUNCTION |
| 10 | +# |
| 11 | +# Given a set of data, fit a normal distribution (mean/sd estimated from the data) and return the probability of falling in each bin |
| 12 | +# |
| 13 | +# bins -- numeric vector of bin centres (assumed sorted, ascending) |
| 14 | +# data -- numeric sample vector used to estimate the normal's mean and sd |
| 15 | +# |
| 16 | + |
| 17 | +get_normal_bins <- function(bins, data) { |
| 18 | + |
| 19 | + sample_sd <- sd(data) |
| 20 | + sample_mean <- mean(data) |
| 21 | + |
| 22 | + # vector to store bucket probabilities |
| 23 | + probs <- c() |
| 24 | + num_bins <- length(bins) |
| 25 | + |
| 26 | + # Compute the probabilities |
| 27 | + |
| 28 | + for (i in 1:num_bins) |
| 29 | + { |
| 30 | + if (i == 1) { |
| 31 | + upper <- bins[1] + ((bins[2] - bins[1]) / 2) |
| 32 | + lower <- bins[1] - ((bins[2] - bins[1]) / 2) |
| 33 | + } else if (i == num_bins) { |
| 34 | + upper <- bins[num_bins] + ((bins[num_bins] - bins[num_bins-1]) / 2) |
| 35 | + lower <- bins[num_bins] - ((bins[num_bins] - bins[num_bins-1]) / 2) |
| 36 | + } else { |
| 37 | + ip1 <- i + 1 |
| 38 | + im1 <- i - 1 |
| 39 | + upper <- bins[i] + ((bins[ip1] - bins[i]) / 2) |
| 40 | + lower <- bins[i] - ((bins[i] - bins[im1]) / 2) |
| 41 | + } |
| 42 | + |
| 43 | + p = pnorm(upper, mean = sample_mean, sd = sample_sd, log = FALSE) - pnorm(lower, mean = sample_mean, sd = sample_sd, log = FALSE) |
| 44 | + probs <- c(probs, p) |
| 45 | + } |
| 46 | + |
| 47 | + probs <- probs / sum(probs) # normalize the probabilities |
| 48 | + probs |
| 49 | +} |
| 50 | + |
| 51 | + |
| 52 | + |
| 53 | +# FUNCTION :: find_bin |
| 54 | +# |
| 55 | +# |
| 56 | +# Return the element of `bins` closest to `value` (nearest-bin lookup) |
| 57 | +# |
| 58 | +# bins -- numeric vector of candidate bin centres |
| 59 | +# value -- scalar to be mapped to its nearest bin |
| 60 | +# |
| 61 | + |
| 62 | +find_bin <- function(bins, value) { |
| 63 | + distances <- abs(bins - value) |
| 64 | + index <- order(sapply(distances, min))[1] |
| 65 | + bins[index] |
| 66 | +} |
| 67 | + |
| 68 | + |
| 69 | +# FUNCTION :: get_bin_counts |
| 70 | +# |
| 71 | +# Given a set of data break it into bins and return the counts with the bin index |
| 72 | +# |
| 73 | + |
| 74 | +get_bin_counts <- function(bins, data) { |
| 75 | + |
| 76 | + new_data <- c() |
| 77 | + for (i in 1:length(data)) |
| 78 | + { |
| 79 | + bin <- find_bin(bins, data[i]) |
| 80 | + new_data <- c(new_data, bin) |
| 81 | + } |
| 82 | + |
| 83 | + tab <- table(new_data) |
| 84 | + xu <- as.numeric(names(tab)) |
| 85 | + xn <- as.vector(tab) |
| 86 | + data.frame(values=xu, counts=xn) |
| 87 | +} |
| 88 | + |
| 89 | + |
| 90 | +# FUNCTION :: construct_probs |
| 91 | +# |
| 92 | +# Extract the probabilities corresponding to the samples |
| 93 | +# |
| 94 | + |
| 95 | +construct_probs <- function(values, full_probs) { |
| 96 | + |
| 97 | + sample_probs <- c() |
| 98 | + |
| 99 | + for (i in 1:length(values)) |
| 100 | + { |
| 101 | + val <- values[i] |
| 102 | + bin <- find_bin(full_probs$values, val) |
| 103 | + index <- which(full_probs$values == bin)[1] |
| 104 | + sample_probs <- c(sample_probs, full_probs$counts[index]) |
| 105 | + } |
| 106 | + |
| 107 | + sample_probs |
| 108 | +} |
| 109 | + |
| 110 | + |
| 111 | +# FUNCTION :: convert_to_bins |
| 112 | +# |
| 113 | +# Maps counts from a data frame (values, counts) to a pre-defined set of bins |
| 114 | +# |
| 115 | + |
| 116 | +convert_to_bins <- function(bins, samples) { |
| 117 | + |
| 118 | + for (i in 1:length(samples$values)) |
| 119 | + samples$values[i] <- find_bin(bins, samples$values[i]) |
| 120 | + |
| 121 | + samples |
| 122 | +} |
| 123 | + |
| 124 | + |
| 125 | +# FUNCTION :: pad_counts |
| 126 | +# |
| 127 | +# Pad counts from a data frame (values, counts) in a given range to contain 0 values where a bin is missing |
| 128 | +# |
| 129 | + |
| 130 | +pad_counts <- function(bin_range, samples) { |
| 131 | + |
| 132 | + new_values <- c() |
| 133 | + new_counts <- c() |
| 134 | + |
| 135 | + for (i in bin_range) |
| 136 | + { |
| 137 | + if (i %in% samples$values) |
| 138 | + { |
| 139 | + index <- which(samples$values == i)[1] |
| 140 | + new_values <- c(new_values, i) |
| 141 | + new_counts <- c(new_counts, samples$counts[index]) |
| 142 | + |
| 143 | + } else { |
| 144 | + new_values <- c(new_values, i) |
| 145 | + new_counts <- c(new_counts, 0) |
| 146 | + } |
| 147 | + } |
| 148 | + |
| 149 | + data.frame(values=new_values, counts=new_counts) |
| 150 | +} |
\ No newline at end of file |