r114040 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r114039 | r114040 | r114041 >
Date:22:19, 16 March 2012
Author:rfaulk
Status:new
Tags:
Comment:
updated scripts to allow for more flexibility in handling parameters
Modified paths:
  • /trunk/tools/wsor/message_templates/R/template_analysis.R (modified) (history)
  • /trunk/tools/wsor/message_templates/R/visualize_edits_decrease.R (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/template_analysis.R
@@ -7,7 +7,7 @@
88
99 # Import helper methods - GLOBAL
1010
11 -home_dir <- "/home/rfaulkner/trunk/projects/WSOR/message_templates/"
 11+home_dir <<- "/home/rfaulkner/trunk/projects/WSOR/message_templates/"
1212 # home_dir <- "/home/rfaulk/trunk/projects/WSOR/message_templates/"
1313
1414 helper_import <- paste(home_dir,"R/R_helper_functions.R",sep="")
@@ -26,15 +26,15 @@
2727 fname_last_part_edits <- "_editcounts.tsv"
2828 fname_last_part_blocks <- "_blocks.tsv"
2929 fname_last_part_warn <- "_warnings.tsv"
30 -
31 - warn_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_warn, string_frames=c(1))
32 - warn_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_warn, string_frames=c(1))
 30+
 31+ warn_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_warn, home_dir, string_frames=c(1))
 32+ warn_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_warn, home_dir, string_frames=c(1))
3333
34 - blocks_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_blocks, string_frames=c(1))
35 - blocks_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_blocks, string_frames=c(1))
 34+ blocks_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_blocks, home_dir, string_frames=c(1))
 35+ blocks_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_blocks, home_dir, string_frames=c(1))
3636
37 - edits_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_edits, string_frames=c(1))
38 - edits_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_edits, string_frames=c(1))
 37+ edits_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_edits, home_dir, string_frames=c(1))
 38+ edits_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_edits, home_dir, string_frames=c(1))
3939
4040 }
4141
@@ -47,7 +47,7 @@
4848 # GLOBALS assumed to exist: warn_test, warn_control, blocks_test, blocks_control, edits_test, edits_control
4949 #
5050
51 -process.data.frames <- function(min_edits_before=0, min_deleted_edits_before=0, max_edits_before=Inf, max_deleted_edits_before=Inf, min_revisions_after = 0, registered=TRUE) {
 51+process.data.frames <- function(min_edits_before=0, min_deleted_edits_before=0, max_edits_before=Inf, max_deleted_edits_before=Inf, min_edits_after = 0, registered=TRUE) {
5252
5353 # MERGE THE METRICS AND ADD TEMPLATE COLS
5454
@@ -80,7 +80,7 @@
8181 condition_4 <- merged_test$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_test$ns_0_revisions_deleted_before <= max_deleted_edits_before
8282 condition_5 <- merged_test$warns_before <= maximum_warns_before
8383 condition_6 <- filter.list.by.regex(IP_regex, merged_test$recipient_name)
84 - condition_7 <- merged_test$ns_0_revisions_after_0_3 >= min_revisions_after
 84+ condition_7 <- merged_test$ns_0_revisions_after_0_3 >= min_edits_after
8585
8686 indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
8787 merged_test <<- merged_test[indices,]
@@ -91,7 +91,7 @@
9292 condition_4 <- merged_control$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_control$ns_0_revisions_deleted_before <= max_deleted_edits_before
9393 condition_5 <- merged_control$warns_before <= maximum_warns_before
9494 condition_6 <- filter.list.by.regex(IP_regex, merged_control$recipient_name)
95 - condition_7 <- merged_control$ns_0_revisions_after_0_3 >= min_revisions_after
 95+ condition_7 <- merged_control$ns_0_revisions_after_0_3 >= min_edits_after
9696
9797 indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
9898 merged_control <<- merged_control[indices,]
@@ -101,12 +101,20 @@
102102
103103 # print("Add derived columns..")
104104
105 - merged_test$edits_decrease <<- (merged_test$ns_0_revisions_before - merged_test$ns_0_revisions_after_0_3) / (merged_test$ns_0_revisions_before)
 105+ merged_test$edits_decrease <<- (merged_test$ns_0_revisions_before - (merged_test$ns_0_revisions_after_0_3)) / (merged_test$ns_0_revisions_before)
106106 merged_control$edits_decrease <<- (merged_control$ns_0_revisions_before - merged_control$ns_0_revisions_after_0_3) / (merged_control$ns_0_revisions_before)
107107
108 - # merged_test$edits_del_decrease <<- (merged_test$ns_0_revisions_deleted_before - (merged_test$ns_0_revisions_deleted_after_0_3)) / (merged_test$ns_0_revisions_deleted_before)
109 - # merged_control$edits_del_decrease <<- (merged_control$ns_0_revisions_deleted_before - (merged_control$ns_0_revisions_deleted_after_0_3)) / (merged_control$ns_0_revisions_deleted_before)
 108+ merged_test$edit_counts_0_3 <<- merged_test$ns_0_revisions_after_0_3
 109+ merged_control$edit_counts_0_3 <<- merged_control$ns_0_revisions_after_0_3
110110
 111+ merged_test$edits_del_decrease <<- (merged_test$ns_0_revisions_deleted_before - (merged_test$ns_0_revisions_deleted_after_0_3)) / (merged_test$ns_0_revisions_deleted_before)
 112+ merged_control$edits_del_decrease <<- (merged_control$ns_0_revisions_deleted_before - (merged_control$ns_0_revisions_deleted_after_0_3)) / (merged_control$ns_0_revisions_deleted_before)
 113+
 114+ merged_test$edit_del_counts_0_3 <<- merged_test$ns_0_revisions_deleted_after_0_3
 115+ merged_control$edit_del_counts_0_3 <<- merged_control$ns_0_revisions_deleted_after_0_3
 116+
 117+ merged_test$edit_del_counts <<- ceiling(merged_test$ns_0_revisions_deleted_after_0_3 / max(merged_test$ns_0_revisions_deleted_after_0_3))
 118+ merged_control$edit_del_counts <<- ceiling(merged_control$ns_0_revisions_deleted_after_0_3 / max(merged_control$ns_0_revisions_deleted_after_0_3))
111119 }
112120
113121 # FUNCTION :: execute.chi.square.test
@@ -151,27 +159,32 @@
152160 # A pseudo main method to allow the script to be executed as a batch
153161 #
154162
155 -execute.main <- function() {
 163+execute.main <- function(min_edits_before = 0, max_edits_before = Inf, min_edits_after = 0, min_deleted_edits_before = 0, max_deleted_edits_before = Inf,
 164+load_metrics = FALSE, load_file = "", import_metrics = FALSE, registered = FALSE) {
156165
157166 # IMPORT DATA
158167
159 - template_indices_control <- c(60,62,66,76) # c(107,109,111,113,115) # c(78,81) # c(84, 0) # c(1,4) # c(84,99,101,103,105) # c(60,62,64,66,68,70,72,74,76)
160 - template_indices_test <- c(61,63,67,77) # c(108,110,114,116) # c(79,82) # c(86, 0) # c(2,3) # c(85,86,100,102,104,106) # c(61,63,65,67,69,71,73,75,77)
161 - fname_first_part <- paste(home_dir,"output/metrics_1018_1119_z",sep="") # paste(home_dir,"output/metrics_1122_1222_z",sep="") # paste(home_dir,"output/metrics_1109_1209_z",sep="") # paste(home_dir,"output/metrics_1108_1202_z",sep="") # paste(home_dir,"output/metrics_pt_z",sep="") # paste(home_dir,"output/metrics_1018_1119_z",sep="")
 168+ # c(60,62,66,76) # c(107,109,111,113,115) # TWINKLE c(78,81) # c(84, 0) # c(1,4) # c(84,99,101,103,105) # c(60,62,64,66,68,70,72,74,76) # CORENSEARCH c(118, 120, 122, 124, 126, 128) # IMAGETAG c(132, 133, 135, 136, 138, 139, 141, 142)
 169+ # c(61,63,67,77) # c(108,110,114,116) # TWINKLE c(79,82) # c(86, 0) # c(2,3) # c(85,86,100,102,104,106) # c(61,63,65,67,69,71,73,75,77) # CORENSEARCH c(117, 119, 121, 123, 125, 127) # IMAGETAG c(131, 134, 137, 140)
 170+ # paste(home_dir,"output/metrics_1018_1119_z",sep="") # paste(home_dir,"output/metrics_1122_1222_z",sep="") # paste(home_dir,"output/metrics_1109_1209_z",sep="")
 171+ # paste(home_dir,"output/metrics_1108_1202_z",sep="") # paste(home_dir,"output/metrics_pt_z",sep="") # paste(home_dir,"output/metrics_1018_1119_z",sep="") # paste(home_dir,"output/metrics_z",sep="")
162172
163 - # import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part)
 173+ template_indices_control <- c(81,0)
 174+ template_indices_test <- c(82,0)
 175+ fname_first_part <- "output/metrics_1109_1209_z"
164176
 177+ if (import_metrics)
 178+ import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part)
165179
 180+ if (load_metrics)
 181+ load(load_file)
166182
167 - # PROCESS DATA
168183
169 - # print("")
170 - # print("Processing data frames.")
171 - registered = TRUE
172 - process.data.frames(3,0,Inf,Inf,registered)
 184+ # PROCESS DATA
 185+ process.data.frames(min_edits_before = min_edits_before, max_edits_before = max_edits_before, min_edits_after = min_edits_after,
 186+ min_deleted_edits_before = min_deleted_edits_before, max_deleted_edits_before = max_deleted_edits_before, registered = registered)
173187
174188
175 -
176189 # HYPOTHESIS TESTING
177190
178191 # t_result <- t.test(x=merged_test$edits_decrease, y=merged_control$edits_decrease, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
@@ -181,7 +194,10 @@
182195 # LOGISTIC REGRESSION MODELLING:
183196
184197 all_data <<- append.data.frames(merged_test, merged_control)
 198+
185199 # summary(glm(template ~ edits_decrease, data=all_data, family=binomial(link="logit")))
 200+ # summary(glm(template ~ edit_counts_0_3, data=all_data, family=binomial(link="logit")))
186201 # summary(glm(template ~ edits_del_decrease, data=all_data, family=binomial(link="logit")))
187 -
 202+ # summary(glm(template ~ edit_del_counts_0_3, data=all_data, family=binomial(link="logit")))
 203+ # summary(glm(template ~ edit_del_counts, data=all_data, family=binomial(link="logit")))
188204 }
Index: trunk/tools/wsor/message_templates/R/visualize_edits_decrease.R
@@ -64,23 +64,28 @@
6565 # save_plot - saves plot if TRUE
6666 # registered - look at registered editors if TRUE (non-registered otherwise)
6767 # error_bars - display error bars if TRUE
 68+# plot_samples - plots the sample sizes used for each data point
6869 #
6970
70 -line.plot.results <- function(edit_count_min_lower = 1, edit_count_min_upper = 10, import_metrics = FALSE, save_plot = TRUE, filename = 'ggplot_out_', registered = TRUE, error_bars = FALSE)
 71+line.plot.results <- function(edit_count_min_lower = 1, edit_count_min_upper = 10, rev_count_after_min = 0, import_metrics = FALSE, plot_width = 10,
 72+save_plot = FALSE, filename = 'ggplot_out_', registered = FALSE, error_bars = FALSE, plot_title = "Huggle Experiments", load_metrics = FALSE, load_file = "", plot_samples = FALSE,
 73+x_scale = "Minimum Edits before Template Posting", y_scale = "Sample Size", plot_title_metric = "Metric Description")
7174 {
7275 # IMPORT DATA
7376
74 - # c(78,81) c(1,4) c(60,62,64,66,68,70,72,74,76) c(60,62,66,76) c(107,109,111,113,115) c(84,99,101,103,105)
75 - # c(79,82) c(2,3) c(61,63,65,67,69,71,73,75,77) c(61,63,67,77) c(108,110,114,116) c(85,86,100,102,104,106)
76 - # paste(home_dir,"output/metrics_1109_1209_z",sep="") paste(home_dir,"output/metrics_pt_z",sep="") paste(home_dir,"output/metrics_1018_1119_z",sep="") paste(home_dir,"output/metrics_1122_1222_z",sep="")
 77+ # c(84, 0) c(78,81) c(1,4) c(60,62,64,66,68,70,72,74,76) c(60,62,66,76) c(107,109,111,113,115) c(84,99,101,103,105)
 78+ # c(85, 0) c(79,82) c(2,3) c(61,63,65,67,69,71,73,75,77) c(61,63,67,77) c(108,110,114,116) c(85,86,100,102,104,106)
 79+ # paste(home_dir,"output/metrics_1108_1202_z",sep="") paste(home_dir,"output/metrics_1109_1209_z",sep="") paste(home_dir,"output/metrics_pt_z",sep="") paste(home_dir,"output/metrics_1018_1119_z",sep="") paste(home_dir,"output/metrics_1122_1222_z",sep="")
7780
78 - template_indices_control <- c(84, 0)
79 - template_indices_test <- c(85, 0)
 81+ template_indices_control <- c(84, 0)
 82+ template_indices_test <- c(85, 0)
8083 fname_first_part <- paste(home_dir,"output/metrics_1108_1202_z",sep="")
8184
8285 if (import_metrics)
8386 import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part)
8487
 88+ if (load_metrics)
 89+ load(load_file)
8590
8691
8792 # PROCESS DATA
@@ -90,11 +95,11 @@
9196 data_counts_test <<- c()
9297 data_counts_control <<- c()
9398
94 - edit_decrease_means_test <<- c()
95 - edit_decrease_means_control <<- c()
 99+ means_test <<- c()
 100+ means_control <<- c()
96101
97 - edit_decrease_sd_test <<- c()
98 - edit_decrease_sd_control <<- c()
 102+ sd_test <<- c()
 103+ sd_control <<- c()
99104
100105
101106 if (registered)
@@ -104,31 +109,58 @@
105110
106111 for (i in edit_count_before_filter)
107112 {
108 - process.data.frames(i,0,Inf,Inf,registered=registered,min_revisions_after=0)
 113+ process.data.frames(min_deleted_edits_before = i, max_deleted_edits_before = Inf, registered=registered, min_edits_after=rev_count_after_min)
 114+
 115+ means_test <<- c(means_test, mean(merged_test$edit_del_counts_0_3))
 116+ means_control <<- c(means_control, mean(merged_control$edit_del_counts_0_3))
109117
110 - edit_decrease_means_test <<- c(edit_decrease_means_test, mean(merged_test$edits_decrease) * 100)
111 - edit_decrease_means_control <<- c(edit_decrease_means_control, mean(merged_control$edits_decrease) * 100)
 118+ sd_test <<- c(sd_test, sd(merged_test$edit_del_counts_0_3))
 119+ sd_control <<- c(sd_control, sd(merged_control$edit_del_counts_0_3))
112120
113 - edit_decrease_sd_test <<- c(edit_decrease_sd_test, sd(merged_test$edits_decrease * 100))
114 - edit_decrease_sd_control <<- c(edit_decrease_sd_control, sd(merged_control$edits_decrease * 100))
115 -
116 - data_counts_test <<- c(data_counts_test, length(merged_test$edits_decrease))
117 - data_counts_control <<- c(data_counts_control, length(merged_control$edits_decrease))
 121+ data_counts_test <<- c(data_counts_test, length(merged_test$edit_del_counts_0_3))
 122+ data_counts_control <<- c(data_counts_control, length(merged_control$edit_del_counts_0_3))
118123 }
119124
120 - # PLOT DATA
121125
122 - plot_title = paste("Huggle Short 1 & 2 Experiment (", reg_str, ") - Decrease in Editor Activity", sep="")
 126+ # PLOT - Decrease in Editor Activity
123127
124 - df <- data.frame(x=1:length(edit_decrease_means_test), y_test=edit_decrease_means_test, y_ctrl=edit_decrease_means_control, y_test_sd=edit_decrease_sd_test, y_ctrl_sd=edit_decrease_sd_control)
 128+ # plot_title_full = paste(plot_title, "(", reg_str, ") - Decrease in Editor Activity", sep="")
 129+ plot_title_full = paste(plot_title, "(", reg_str, ") - ", plot_title_metric, sep="")
 130+
 131+ df <- data.frame(x=1:length(means_test), y_test=means_test, y_ctrl=means_control, y_test_sd=sd_test, y_ctrl_sd=sd_control)
125132 p <- ggplot(df,aes(x)) + geom_line(aes(y=y_test,colour="Test")) + geom_line(aes(y=y_ctrl,colour="Control"))
126133
127134 if (error_bars)
128135 p <- p + geom_errorbar(aes(ymin = y_test - y_test_sd, ymax = y_test + y_test_sd, colour="Test"), width=0.2) + geom_errorbar(aes(ymin = y_ctrl - y_ctrl_sd, ymax = y_ctrl + y_ctrl_sd, colour="Control"), width=0.2)
129136
130 - p <- p + scale_x_continuous('Minimum Edits before Template Posting') + scale_y_continuous('Mean % Decrease in Edit Activity') + opts(title = plot_title, legend.title = theme_blank())
 137+ # Add axes labels and titles
 138+ p <- p + scale_x_continuous(x_scale) + scale_y_continuous(y_scale) + opts(title = plot_title_full, legend.title = theme_blank())
131139
132140 if (save_plot)
133 - ggsave(paste('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/plots/',filename,reg_str,'.png',sep=""),width=8)
 141+ ggsave(paste('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/plots/',filename,reg_str,'.png',sep=""), width=plot_width)
 142+
 143+
 144+
 145+ # PLOT - Sample Sizes
 146+
 147+ if (plot_samples)
 148+ {
 149+ plot_title_full = paste(plot_title, "(", reg_str, ") - Sample Sizes", sep="")
 150+ bins <- 1:length(data_counts_test)
 151+
 152+ test_samples <- counts.to.samples(bins, data_counts_test)
 153+ control_samples <- counts.to.samples(bins, data_counts_control)
 154+
 155+ labels <- c(test_samples * 0, control_samples / control_samples)
 156+ labels[labels == 0] = "Test"
 157+ labels[labels == 1] = "Control"
 158+
 159+ df <- data.frame(x=c(test_samples, control_samples), labels=labels)
 160+ p <- ggplot(df, aes(x, fill=labels)) + geom_bar(binwidth=0.4, position="dodge")
 161+ p <- p + scale_x_continuous(x_scale) + scale_y_continuous('Sample Size') + opts(title = plot_title_full, legend.title = theme_blank())
 162+
 163+ if (save_plot)
 164+ ggsave(paste('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/plots/',filename,"samples_",reg_str,'.png',sep=""), width=plot_width)
 165+ }
134166 }
135167

Status & tagging log