r110688 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r110687‎ | r110688 | r110689 >
Date:23:06, 3 February 2012
Author:rfaulk
Status:deferred
Tags:
Comment:
added functionality to do aggregate analysis of different templates
Modified paths:
  • /trunk/tools/wsor/message_templates/R/R_helper_functions.R (modified) (history)
  • /trunk/tools/wsor/message_templates/R/huggle3_analysis.R (modified) (history)
  • /trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R
@@ -8,49 +8,45 @@
99 source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
1010
1111
12 -# Read aggregated results for z64
 12+# Read aggregated results for the template
1313
14 -metrics_ec_z64 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
15 -metrics_blocks_z64 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_blocks.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
 14+template_indices_control <- c(60,62,64,66,68,70,72,74,76)
 15+template_indices_test <- c(61,63,65,67,69,71,73,75,77)
1616
 17+fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z"
 18+fname_last_part <- "_editcounts.tsv"
1719
18 -# Read aggregated results for z65
1920
20 -metrics_ec_z65 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
21 -metrics_blocks_z65 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_blocks.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
 21+# MAIN EXECUTION
 22+# ==============
2223
 24+# BUILD THE DATA FRAMES
2325
24 -# Compute the change in edits after the template
 26+metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part)
 27+metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part)
2528
26 -z64_ns0 = (metrics_ec_z64$ns_0_revisions_after - metrics_ec_z64$ns_0_revisions_before) / metrics_ec_z64$ns_0_revisions_before
27 -z65_ns0 = (metrics_ec_z65$ns_0_revisions_after - metrics_ec_z65$ns_0_revisions_before / metrics_ec_z65$ns_0_revisions_before
2829
29 -
 30+# Compute the change in edits after the template -- default to namespace 0
3031 # User Talk namespace does not necessarily have edits before - in this case omit the result (it could be the case that templates stimulate user talk edits but that should be tested separately)
3132 # Only append non-zero results - do this for just namespace 3 since it has zero entries for 'ns_3_revisions_before'
3233
33 -z64_ns3 <- c()
34 -z65_ns3 <- c()
 34+z_test <- c()
 35+z_control <- c()
3536
36 -for (i in 1:length(metrics_ec_z64['ns_3_revisions_before'][[1]]))
37 - if (metrics_ec_z64['ns_3_revisions_before'][[1]][i] != 0)
38 - z64_ns3 <- c(z64_ns3,
39 - (metrics_ec_z64['ns_3_revisions_before'][[1]][i] - metrics_ec_z64['ns_3_revisions_after'][[1]][i]) / metrics_ec_z64['ns_3_revisions_before'][[1]][i])
 37+for (i in 1:length(metrics_test$ns_0_revisions_before))
 38+ if (metrics_test$ns_0_revisions_before[i] != 0)
 39+ z_test <- c(z_test,
 40+ (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i])
4041
41 -for (i in 1:length(metrics_ec_z65['ns_3_revisions_before'][[1]]))
42 - if (metrics_ec_z65['ns_3_revisions_before'][[1]][i] != 0)
43 - z65_ns3 <- c(z65_ns3,
44 - (metrics_ec_z65['ns_3_revisions_before'][[1]][i] - metrics_ec_z65['ns_3_revisions_after'][[1]][i]) / metrics_ec_z65['ns_3_revisions_before'][[1]][i])
 42+for (i in 1:length(metrics_control['ns_0_revisions_before'][[1]]))
 43+ if (metrics_control$ns_0_revisions_before[i] != 0)
 44+ z_control <- c(z_control,
 45+ (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i])
4546
4647
4748 # Generate results:
4849
49 -summary(z65_ns0)
50 -summary(z64_ns0)
51 -summary(z65_ns3)
52 -summary(z64_ns3)
 50+t_result = t.test(x=z_test, y=z_control, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
 51+# t_result_ns3 = t.test(x=z64_ns3, y=z65_ns3, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
5352
54 -t_result_ns0 = t.test(x=z64_ns0, y=z65_ns0, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
55 -t_result_ns3 = t.test(x=z64_ns3, y=z65_ns3, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
5653
57 -
Index: trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R
@@ -7,14 +7,22 @@
88
99 source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
1010
 11+# Read aggregated results for the template
1112
 13+template_indices_control <- c(60,62,64,66,68,70,72,74,76)
 14+template_indices_test <- c(61,63,65,67,69,71,73,75,77)
 15+
 16+fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z"
 17+fname_last_part <- "_editcounts.tsv"
 18+
 19+
1220 # MAIN EXECUTION
1321 # ==============
1422
15 -# Read aggregated results
 23+# BUILD THE DATA FRAMES
1624
17 -metrics_test = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
18 -metrics_control = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
 25+metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part)
 26+metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part)
1927
2028
2129 # Compute the change in edits after the template
Index: trunk/tools/wsor/message_templates/R/R_helper_functions.R
@@ -17,7 +17,8 @@
1818
1919 sample_sd <- sd(data)
2020 sample_mean <- mean(data)
21 -
 21+ df <- length(data) - 1
 22+
2223 # vector to store bucket probabilities
2324 probs <- c()
2425 num_bins <- length(bins)
@@ -39,7 +40,8 @@
4041 lower <- bins[i] - ((bins[i] - bins[im1]) / 2)
4142 }
4243
43 - p = pnorm(upper, mean = sample_mean, sd = sample_sd, log = FALSE) - pnorm(lower, mean = sample_mean, sd = sample_sd, log = FALSE)
 44+ # p = pnorm(upper, mean = sample_mean, sd = sample_sd, log = FALSE) - pnorm(lower, mean = sample_mean, sd = sample_sd, log = FALSE)
 45+ p = pt(upper - sample_mean, df) - pnorm(lower - sample_mean, df)
4446 probs <- c(probs, p)
4547 }
4648
@@ -146,4 +148,76 @@
147149 }
148150
149151 data.frame(values=new_values, counts=new_counts)
 152+}
 153+
 154+
 155+# FUNCTION :: append.data.frames
 156+#
 157+# Given two data frames append the second to the first
 158+#
 159+# Assumes: the two data frames have the same column names
 160+#
 161+
 162+append.data.frames <- function(df_1, df_2) {
 163+
 164+ df_cols <- length(colnames(df_1))
 165+ df_rows_1 <- length(df_1[[1]])
 166+ df_rows_2 <- length(df_2[[1]])
 167+
 168+ new_rows <- df_rows_1 + df_rows_2
 169+ df_return <- data.frame(matrix(nrow=new_rows, ncol=df_cols))
 170+
 171+ for (i in 1:df_cols)
 172+ for (j in 1:df_rows_1)
 173+ df_return[colnames(df_return)[i]][[1]][j] <- df_1[colnames(df_1)[i]][[1]][j]
 174+
 175+ for (i in 1:df_cols)
 176+ for (j in 1:df_rows_2)
 177+ {
 178+ row_index <- j + df_rows_1
 179+ df_return[colnames(df_return)[i]][[1]][row_index] <- df_2[colnames(df_1)[i]][[1]][j]
 180+ }
 181+
 182+ # create the new data list
 183+ for (i in 1:df_cols)
 184+ {
 185+ colname <- colnames(df_1)[i]
 186+ colnames(df_return)[i] <- colname
 187+ }
 188+
 189+ df_return
 190+}
 191+
 192+
 193+# FUNCTION :: build.data.frames
 194+#
 195+# Constructs a concatenated data.frame from files
 196+#
 197+
 198+build.data.frames <- function(template_indices, fname_first_part, fname_last_part) {
 199+
 200+ # Initialize the data frame
 201+
 202+ filename <- paste(fname_first_part, template_indices[1], fname_last_part, sep="")
 203+ metrics = read.table(filename, na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
 204+
 205+ output <- paste("Processing data from",filename,"....")
 206+ print(output)
 207+
 208+ # Extend the data frames
 209+
 210+ for (i in 2:length(template_indices_test))
 211+ {
 212+
 213+ index <- template_indices[i]
 214+ filename <- paste(fname_first_part, index, fname_last_part, sep="")
 215+
 216+ output <- paste("Processing data from",filename,"....")
 217+ print(output)
 218+
 219+ temp_frame = read.table(filename, na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
 220+ metrics <- append.data.frames(metrics, temp_frame)
 221+ }
 222+
 223+ metrics
150224 }
\ No newline at end of file

Status & tagging log