r112523 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r112522‎ | r112523 | r112524 >
Date:21:50, 27 February 2012
Author:rfaulk
Status:deferred
Tags:
Comment:
added some modularity to the implementation for ease of use at the R command prompt
Modified paths:
  • /trunk/tools/wsor/message_templates/R/huggle3_analysis.R (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R
@@ -1,52 +1,154 @@
2 -
 2+# source('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/huggle3_analysis.R')
 3+#
34 # Ryan Faulkner, January 23rd 2012
45 #
56 # Comparison of edit counts for Huggle 3 test among templates z64 (http://en.wikipedia.org/wiki/Template:Uw-error1-default) / z65 (http://en.wikipedia.org/wiki/Template:Uw-error1-short)
67 #
78
 9+# Import helper methods - GLOBAL
810
9 -source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
 11+home_dir <- "/home/rfaulkner/trunk/projects/WSOR/message_templates/"
 12+# home_dir <- "/home/rfaulk/trunk/projects/WSOR/message_templates/"
1013
 14+helper_import <- paste(home_dir,"R/R_helper_functions.R",sep="")
 15+source(helper_import)
1116
12 -# Read aggregated results for the template
1317
14 -template_indices_control <- c(60,62,64,66,68,70,72,74,76)
15 -template_indices_test <- c(61,63,65,67,69,71,73,75,77)
 18+# FUNCTION :: import.experimental.metrics.data
 19+#
 20+# Import the template data and build data frames from it
 21+#
1622
17 -fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z"
18 -fname_last_part <- "_editcounts.tsv"
 23+import.experimental.metrics.data <- function(template_indices_test, template_indices_control, fname_first_part) {
 24+
 25+ # Read aggregated results for the template
 26+
 27+ fname_last_part_edits <- "_editcounts.tsv"
 28+ fname_last_part_blocks <- "_blocks.tsv"
 29+ fname_last_part_warn <- "_warnings.tsv"
 30+
 31+ warn_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_warn, string_frames=c(1))
 32+ warn_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_warn, string_frames=c(1))
 33+
 34+ blocks_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_blocks, string_frames=c(1))
 35+ blocks_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_blocks, string_frames=c(1))
 36+
 37+ edits_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_edits, string_frames=c(1))
 38+ edits_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_edits, string_frames=c(1))
 39+
 40+}
1941
2042
21 -# MAIN EXECUTION
22 -# ==============
2343
24 -# BUILD THE DATA FRAMES
 44+# FUNCTION :: process.data.frames
 45+#
 46+# Given a set of data frames containing template test metrics per user posting, combine them and generate summary metric frames
 47+#
 48+# GLOBALS assumed to exist: warn_test, warn_control, blocks_test, blocks_control, edits_test, edits_control
 49+#
2550
26 -metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part)
27 -metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part)
 51+process.data.frames <- function() {
 52+
 53+ # MERGE THE METRICS AND ADD TEMPLATE COLS
2854
 55+ print("Merge Data..")
 56+
 57+ merged_test <<- merge(edits_test, blocks_test, by=intersect(names(edits_test),names(blocks_test)), all=TRUE)
 58+ merged_control <<- merge(edits_control, blocks_control, by=intersect(names(edits_control),names(blocks_control)), all=TRUE)
 59+
 60+ merged_test <<- merge(merged_test, warn_test, by=intersect(names(merged_test),names(warn_test)), all=TRUE)
 61+ merged_control <<- merge(merged_control, warn_control, by=intersect(names(merged_control),names(warn_control)), all=TRUE)
 62+
 63+ merged_test$template <<- 1
 64+ merged_control$template <<- 0
 65+
 66+
 67+ # FILTER DATA
2968
30 -# Compute the change in edits after the template -- default to namespace 0
31 -# User Talk namespace does not necessarily have edits before - in this case omit the result (it could be the case that templates stimulate user talk edits but that should be tested separately)
32 -# Only append non-zero results - do this for just namespace 3 since it has zero entries for 'ns_3_revisions_before'
 69+ print("Filter Data..")
 70+ min_edits_before <- 5
 71+ min_deleted_edits_before <- 0
 72+
 73+ max_edits_before <- Inf
 74+ max_deleted_edits_before <- Inf
 75+
 76+ maximum_warns_before <- 0
 77+
 78+ IP_regex <- "^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
 79+ IP_regex_not <- '.*[a-zA-z].*'
 80+
 81+ condition_1 <- TRUE # merged_test$blocks_before > 0
 82+ condition_2 <- merged_test$blocks_after == 0
 83+ condition_3 <- merged_test$ns_0_revisions_before >= min_edits_before & merged_test$ns_0_revisions_before <= max_edits_before
 84+ condition_4 <- merged_test$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_test$ns_0_revisions_deleted_before <= max_deleted_edits_before
 85+ condition_5 <- merged_test$warns_before <= maximum_warns_before
 86+ condition_6 <- filter.list.by.regex(IP_regex_not, merged_test$recipient_name)
 87+ condition_7 <- merged_test$ns_0_revisions_after_0_3 > 0
 88+
 89+ indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
 90+ merged_test <<- merged_test[indices,]
 91+
 92+ condition_1 <- TRUE # merged_control$blocks_before > 0
 93+ condition_2 <- merged_control$blocks_after == 0
 94+ condition_3 <- merged_control$ns_0_revisions_before >= min_edits_before & merged_control$ns_0_revisions_before <= max_edits_before
 95+ condition_4 <- merged_control$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_control$ns_0_revisions_deleted_before <= max_deleted_edits_before
 96+ condition_5 <- merged_control$warns_before <= maximum_warns_before
 97+ condition_6 <- filter.list.by.regex(IP_regex_not, merged_control$recipient_name)
 98+ condition_7 <- merged_control$ns_0_revisions_after_0_3 > 0
 99+
 100+ indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
 101+ merged_control <<- merged_control[indices,]
 102+
 103+
 104+ # ADD DERIVED COLS
 105+
 106+ print("Add derived columns..")
 107+
 108+ merged_test$edits_decrease <<- (merged_test$ns_0_revisions_before - merged_test$ns_0_revisions_after_0_3) / (merged_test$ns_0_revisions_before)
 109+ merged_control$edits_decrease <<- (merged_control$ns_0_revisions_before - merged_control$ns_0_revisions_after_0_3) / (merged_control$ns_0_revisions_before)
 110+
 111+ merged_test$edits_del_decrease <<- (merged_test$ns_0_revisions_deleted_before - (merged_test$ns_0_revisions_deleted_after_0_3)) / (merged_test$ns_0_revisions_deleted_before + 1)
 112+ merged_control$edits_del_decrease <<- (merged_control$ns_0_revisions_deleted_before - (merged_control$ns_0_revisions_deleted_after_0_3)) / (merged_control$ns_0_revisions_deleted_before + 1)
 113+
 114+}
33115
34 -z_test <- c()
35 -z_control <- c()
36116
37 -for (i in 1:length(metrics_test$ns_0_revisions_before))
38 - if (metrics_test$ns_0_revisions_before[i] != 0)
39 - z_test <- c(z_test,
40 - (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i])
41117
42 -for (i in 1:length(metrics_control['ns_0_revisions_before'][[1]]))
43 - if (metrics_control$ns_0_revisions_before[i] != 0)
44 - z_control <- c(z_control,
45 - (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i])
 118+# IMPORT DATA
46119
 120+template_indices_control <- c(84, 0) # c(107,109,111,113,115) # c(1,4) # c(84,99,101,103,105) # c(60,62,64,66,68,70,72,74,76)
 121+template_indices_test <- c(86, 0) # c(108,110,114,116) # c(2,3) # c(85,86,100,102,104,106) # c(61,63,65,67,69,71,73,75,77)
 122+fname_first_part <- paste(home_dir,"output/metrics_1108_1202_z",sep="") # paste(home_dir,"output/metrics_1122_1222_z",sep="") # paste(home_dir,"output/metrics_pt_z",sep="") # paste(home_dir,"output/metrics_1018_1119_z",sep="") # "/home/rfaulk/WSOR/message_templates/output/metrics_pt_z"
47123
48 -# Generate results:
 124+# import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part)
49125
50 -t_result = t.test(x=z_test, y=z_control, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
51 -# t_result_ns3 = t.test(x=z64_ns3, y=z65_ns3, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
52126
53127
 128+# PROCESS DATA
 129+
 130+# print("")
 131+# print("Processing data frames.")
 132+process.data.frames()
 133+
 134+
 135+
 136+# HYPOTHESIS TESTING
 137+
 138+#test_edits <- get.decrease.in.edits.after.template(edits_test$ns_0_revisions_before, edits_test$ns_0_revisions_after_3_30,lower_bound_rev_before=200,lower_bound_rev_after=0)
 139+#control_edits <- get.decrease.in.edits.after.template(edits_control$ns_0_revisions_before, edits_control$ns_0_revisions_after_3_30,lower_bound_rev_before=200, lower_bound_rev_after=0)
 140+
 141+#test_blocks <- get.change.in.blocks(blocks_test$blocks_before, blocks_test$blocks_after)
 142+#control_blocks <- get.change.in.blocks(blocks_control$blocks_before, blocks_control$blocks_after)
 143+
 144+#t_result_edits = t.test(x=test_edits, y=control_edits, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
 145+#t_result_blocks = t.test(x=test_blocks, y=control_blocks, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
 146+
 147+
 148+
 149+# LOGISTIC REGRESSION MODELLING:
 150+
 151+all_data <- append.data.frames(merged_test, merged_control)
 152+summary(glm(template ~ edits_decrease, data=all_data, family=binomial(link="logit")))
 153+# summary(glm(template ~ edits_del_decrease, data=all_data, family=binomial(link="logit")))
 154+
 155+