r112747 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r112746‎ | r112747 | r112748 >
Date:00:42, 1 March 2012
Author:rfaulk
Status:deferred
Tags:
Comment:
moved analysis source to generic script for testing templates
Modified paths:
  • /trunk/tools/wsor/message_templates/R/template_analysis.R (added) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/template_analysis.R
@@ -0,0 +1,184 @@
 2+# source('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/template_analysis.R')
 3+#
 4+# Ryan Faulkner, January 23rd 2012
 5+#
 6+# Process template posting aggregate data for analysis
 7+#
 8+
 9+# Import helper methods - GLOBAL
 10+
 11+home_dir <- "/home/rfaulkner/trunk/projects/WSOR/message_templates/"
 12+# home_dir <- "/home/rfaulk/trunk/projects/WSOR/message_templates/"
 13+
 14+helper_import <- paste(home_dir,"R/R_helper_functions.R",sep="")
 15+source(helper_import)
 16+
 17+
 18+# FUNCTION :: import.experimental.metrics.data
 19+#
 20+# Import the template data and build data frames from it
 21+#
 22+
 23+import.experimental.metrics.data <- function(template_indices_test, template_indices_control, fname_first_part) {
 24+
 25+ # Read aggregated results for the template
 26+
 27+ fname_last_part_edits <- "_editcounts.tsv"
 28+ fname_last_part_blocks <- "_blocks.tsv"
 29+ fname_last_part_warn <- "_warnings.tsv"
 30+
 31+ warn_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_warn, string_frames=c(1))
 32+ warn_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_warn, string_frames=c(1))
 33+
 34+ blocks_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_blocks, string_frames=c(1))
 35+ blocks_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_blocks, string_frames=c(1))
 36+
 37+ edits_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_edits, string_frames=c(1))
 38+ edits_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_edits, string_frames=c(1))
 39+
 40+}
 41+
 42+
 43+
 44+# FUNCTION :: process.data.frames
 45+#
 46+# Given a set of data frames containing template test metrics per user posting combine and generate summary metric frames
 47+#
 48+# GLOBALS assumed to exist: warn_test, warn_control, blocks_test, blocks_control, edits_test, edits_control
 49+#
 50+
 51+process.data.frames <- function(min_edits_before=0, min_deleted_edits_before=0, max_edits_before=Inf, max_deleted_edits_before=Inf) {
 52+
 53+ # MERGE THE METRICS AND ADD TEMPLATE COLS
 54+
 55+ # print("Merge Data..")
 56+
 57+ merged_test <<- merge(edits_test, blocks_test, by=intersect(names(edits_test),names(blocks_test)), all=TRUE)
 58+ merged_control <<- merge(edits_control, blocks_control, by=intersect(names(edits_control),names(blocks_control)), all=TRUE)
 59+
 60+ merged_test <<- merge(merged_test, warn_test, by=intersect(names(merged_test),names(warn_test)), all=TRUE)
 61+ merged_control <<- merge(merged_control, warn_control, by=intersect(names(merged_control),names(warn_control)), all=TRUE)
 62+
 63+ merged_test$template <<- 1
 64+ merged_control$template <<- 0
 65+
 66+
 67+ # FILTER DATA
 68+
 69+ # print("Filter Data..")
 70+
 71+ maximum_warns_before <- 0
 72+
 73+ IP_regex <- "^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
 74+ IP_regex_not <- '.*[a-zA-z].*'
 75+
 76+ condition_1 <- TRUE # merged_test$blocks_before > 0
 77+ condition_2 <- merged_test$blocks_after == 0
 78+ condition_3 <- merged_test$ns_0_revisions_before >= min_edits_before & merged_test$ns_0_revisions_before <= max_edits_before
 79+ condition_4 <- merged_test$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_test$ns_0_revisions_deleted_before <= max_deleted_edits_before
 80+ condition_5 <- merged_test$warns_before <= maximum_warns_before
 81+ condition_6 <- filter.list.by.regex(IP_regex_not, merged_test$recipient_name)
 82+ condition_7 <- merged_test$ns_0_revisions_after_0_3 > 0
 83+
 84+ indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
 85+ merged_test <<- merged_test[indices,]
 86+
 87+ condition_1 <- TRUE # merged_control$blocks_before > 0
 88+ condition_2 <- merged_control$blocks_after == 0
 89+ condition_3 <- merged_control$ns_0_revisions_before >= min_edits_before & merged_control$ns_0_revisions_before <= max_edits_before
 90+ condition_4 <- merged_control$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_control$ns_0_revisions_deleted_before <= max_deleted_edits_before
 91+ condition_5 <- merged_control$warns_before <= maximum_warns_before
 92+ condition_6 <- filter.list.by.regex(IP_regex_not, merged_control$recipient_name)
 93+ condition_7 <- merged_control$ns_0_revisions_after_0_3 > 0
 94+
 95+ indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
 96+ merged_control <<- merged_control[indices,]
 97+
 98+
 99+ # ADD DERIVED COLS
 100+
 101+ # print("Add derived columns..")
 102+
 103+ merged_test$edits_decrease <<- (merged_test$ns_0_revisions_before - merged_test$ns_0_revisions_after_0_3) / (merged_test$ns_0_revisions_before)
 104+ merged_control$edits_decrease <<- (merged_control$ns_0_revisions_before - merged_control$ns_0_revisions_after_0_3) / (merged_control$ns_0_revisions_before)
 105+
 106+ # merged_test$edits_del_decrease <<- (merged_test$ns_0_revisions_deleted_before - (merged_test$ns_0_revisions_deleted_after_0_3)) / (merged_test$ns_0_revisions_deleted_before)
 107+ # merged_control$edits_del_decrease <<- (merged_control$ns_0_revisions_deleted_before - (merged_control$ns_0_revisions_deleted_after_0_3)) / (merged_control$ns_0_revisions_deleted_before)
 108+
 109+}
 110+
 111+# FUNCTION :: execute.chi.square.test
 112+#
 113+# Construct a distribution (Normal) using parameters computed from metric count data
 114+# This will be used as the model distribution - do symetrically (ie. fot both ways)
 115+#
 116+
 117+execute.chi.square.test <- function(test_samples, control_samples) {
 118+
 119+ # Number of samples for each template
 120+ n_test <- length(test_samples)
 121+ n_control <- length(control_samples)
 122+
 123+
 124+ # Produce probabilities for normal to be fit
 125+
 126+ lower_bound_range <- trunc(min(min(c(control_samples, test_samples))) - 1)
 127+ upper_bound_range <- trunc(max(max(c(control_samples, test_samples))) + 1)
 128+ bins <- sort(lower_bound_range : upper_bound_range)
 129+
 130+ probs_control <- get_normal_bins(bins, control_samples)
 131+ probs_test <- get_normal_bins(bins, test_samples)
 132+
 133+ probs_control <- data.frame(values=bins, counts=probs_control)
 134+ probs_test <- data.frame(values=bins, counts=probs_test)
 135+
 136+ counts_test <- get_bin_counts(bins, test_samples)
 137+ counts_control <- get_bin_counts(bins, control_samples)
 138+
 139+ counts_test <- pad_counts(bins, counts_test)
 140+ counts_control <- pad_counts(bins, counts_control)
 141+
 142+ # Get chi-squared test results
 143+ chisq_res_test <<- chisq.test(counts_test$counts, p=probs_control$counts)
 144+ chisq_res_control <<- chisq.test(counts_control$counts, p=probs_test$counts)
 145+}
 146+
 147+
 148+# FUNCTION :: execute.main
 149+#
 150+# A pseudo main method to allow the script to be executed as a batch
 151+#
 152+
 153+execute.main <- function(test_samples, control_samples) {
 154+
 155+ # IMPORT DATA
 156+
 157+ template_indices_control <- c(78,81) # c(84, 0) # c(107,109,111,113,115) # c(1,4) # c(84,99,101,103,105) # c(60,62,64,66,68,70,72,74,76)
 158+ template_indices_test <- c(79,82) # c(86, 0) # c(108,110,114,116) # c(2,3) # c(85,86,100,102,104,106) # c(61,63,65,67,69,71,73,75,77)
 159+ fname_first_part <- paste(home_dir,"output/metrics_1109_1209_z",sep="") # paste(home_dir,"output/metrics_1108_1202_z",sep="") # paste(home_dir,"output/metrics_1122_1222_z",sep="") # paste(home_dir,"output/metrics_pt_z",sep="") # paste(home_dir,"output/metrics_1018_1119_z",sep="") # "/home/rfaulk/WSOR/message_templates/output/metrics_pt_z"
 160+
 161+ # import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part)
 162+
 163+
 164+
 165+ # PROCESS DATA
 166+
 167+ # print("")
 168+ # print("Processing data frames.")
 169+ process.data.frames(1,0,Inf,Inf)
 170+
 171+
 172+
 173+ # HYPOTHESIS TESTING
 174+
 175+ # t_result <- t.test(x=merged_test$edits_decrease, y=merged_control$edits_decrease, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
 176+ # chi_result <- execute.chi.square.test(merged_test$edits_decrease, merged_control$edits_decrease)
 177+
 178+
 179+ # LOGISTIC REGRESSION MODELLING:
 180+
 181+ all_data <- append.data.frames(merged_test, merged_control)
 182+ # summary(glm(template ~ edits_decrease, data=all_data, family=binomial(link="logit")))
 183+ # summary(glm(template ~ edits_del_decrease, data=all_data, family=binomial(link="logit")))
 184+
 185+}