r110688 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r110687 | r110688 | r110689 >
Date:	23:06, 3 February 2012
Author:	rfaulk
Status:	deferred
Tags:
Comment:	added functionality to do aggregate analysis of different templates
Modified paths:	/trunk/tools/wsor/message_templates/R/R_helper_functions.R (modified) (history) /trunk/tools/wsor/message_templates/R/huggle3_analysis.R (modified) (history) /trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R
—	—	@@ -8,49 +8,45 @@
9	9	source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
10	10
11	11
12		~~-# Read aggregated results for z64~~
	12	+# Read aggregated results for the template
13	13
14		~~-metrics_ec_z64 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)~~
15		~~-metrics_blocks_z64 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_blocks.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)~~
	14	+template_indices_control <- c(60,62,64,66,68,70,72,74,76)
	15	+template_indices_test <- c(61,63,65,67,69,71,73,75,77)
16	16
	17	+fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z"
	18	+fname_last_part <- "_editcounts.tsv"
17	19
18		~~-# Read aggregated results for z65~~
19	20
20		~~-metrics_ec_z65 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)~~
21		~~-metrics_blocks_z65 = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_blocks.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)~~
	21	+# MAIN EXECUTION
	22	+# ==============
22	23
	24	+# BUILD THE DATA FRAMES
23	25
24		~~-# Compute the change in edits after the template~~
	26	+metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part)
	27	+metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part)
25	28
26		~~-z64_ns0 = (metrics_ec_z64$ns_0_revisions_after - metrics_ec_z64$ns_0_revisions_before) / metrics_ec_z64$ns_0_revisions_before~~
27		~~-z65_ns0 = (metrics_ec_z65$ns_0_revisions_after - metrics_ec_z65$ns_0_revisions_before / metrics_ec_z65$ns_0_revisions_before~~
28	29
29		-
	30	+# Compute the change in edits after the template -- default to namespace 0
30	31	# User Talk namespace does not necessarily have edits before - in this case omit the result (it could be the case that templates stimulate user talk edits but that should be tested separately)
31	32	# Only append non-zero results - do this for just namespace 3 since it has zero entries for 'ns_3_revisions_before'
32	33
33		~~-z64_ns3 <- c()~~
34		~~-z65_ns3 <- c()~~
	34	+z_test <- c()
	35	+z_control <- c()
35	36
36		~~-for (i in 1:length(metrics_ec_z64['ns_3_revisions_before'][[1]]))~~
37		~~- if (metrics_ec_z64['ns_3_revisions_before'][[1]][i] != 0)~~
38		~~- z64_ns3 <- c(z64_ns3,~~
39		~~- (metrics_ec_z64['ns_3_revisions_before'][[1]][i] - metrics_ec_z64['ns_3_revisions_after'][[1]][i]) / metrics_ec_z64['ns_3_revisions_before'][[1]][i])~~
	37	+for (i in 1:length(metrics_test$ns_0_revisions_before))
	38	+ if (metrics_test$ns_0_revisions_before[i] != 0)
	39	+ z_test <- c(z_test,
	40	+ (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i])
40	41
41		~~-for (i in 1:length(metrics_ec_z65['ns_3_revisions_before'][[1]]))~~
42		~~- if (metrics_ec_z65['ns_3_revisions_before'][[1]][i] != 0)~~
43		~~- z65_ns3 <- c(z65_ns3,~~
44		~~- (metrics_ec_z65['ns_3_revisions_before'][[1]][i] - metrics_ec_z65['ns_3_revisions_after'][[1]][i]) / metrics_ec_z65['ns_3_revisions_before'][[1]][i])~~
	42	+for (i in 1:length(metrics_control['ns_0_revisions_before'][[1]]))
	43	+ if (metrics_control$ns_0_revisions_before[i] != 0)
	44	+ z_control <- c(z_control,
	45	+ (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i])
45	46
46	47
47	48	# Generate results:
48	49
49		~~-summary(z65_ns0)~~
50		~~-summary(z64_ns0)~~
51		~~-summary(z65_ns3)~~
52		~~-summary(z64_ns3)~~
	50	+t_result = t.test(x=z_test, y=z_control, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
	51	+# t_result_ns3 = t.test(x=z64_ns3, y=z65_ns3, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
53	52
54		~~-t_result_ns0 = t.test(x=z64_ns0, y=z65_ns0, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)~~
55		~~-t_result_ns3 = t.test(x=z64_ns3, y=z65_ns3, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)~~
56	53
57		-
Index: trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R
—	—	@@ -7,14 +7,22 @@
8	8
9	9	source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
10	10
	11	+# Read aggregated results for the template
11	12
	13	+template_indices_control <- c(60,62,64,66,68,70,72,74,76)
	14	+template_indices_test <- c(61,63,65,67,69,71,73,75,77)
	15	+
	16	+fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z"
	17	+fname_last_part <- "_editcounts.tsv"
	18	+
	19	+
12	20	# MAIN EXECUTION
13	21	# ==============
14	22
15		~~-# Read aggregated results~~
	23	+# BUILD THE DATA FRAMES
16	24
17		~~-metrics_test = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z70_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)~~
18		~~-metrics_control = read.table("/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z71_editcounts.tsv", na.strings="\\N", sep="\t", comment.char="", quote="", header=T)~~
	25	+metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part)
	26	+metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part)
19	27
20	28
21	29	# Compute the change in edits after the template
Index: trunk/tools/wsor/message_templates/R/R_helper_functions.R
—	—	@@ -17,7 +17,8 @@
18	18
19	19	sample_sd <- sd(data)
20	20	sample_mean <- mean(data)
21		-
	21	+ df <- length(data) - 1
	22	+
22	23	# vector to store bucket probabilities
23	24	probs <- c()
24	25	num_bins <- length(bins)
—	—	@@ -39,7 +40,8 @@
40	41	lower <- bins[i] - ((bins[i] - bins[im1]) / 2)
41	42	}
42	43
43		~~- p = pnorm(upper, mean = sample_mean, sd = sample_sd, log = FALSE) - pnorm(lower, mean = sample_mean, sd = sample_sd, log = FALSE)~~
	44	+ # p = pnorm(upper, mean = sample_mean, sd = sample_sd, log = FALSE) - pnorm(lower, mean = sample_mean, sd = sample_sd, log = FALSE)
	45	+ p = pt(upper - sample_mean, df) - pnorm(lower - sample_mean, df)
44	46	probs <- c(probs, p)
45	47	}
46	48
—	—	@@ -146,4 +148,76 @@
147	149	}
148	150
149	151	data.frame(values=new_values, counts=new_counts)
	152	+}
	153	+
	154	+
	155	+# FUNCTION :: append.data.frames
	156	+#
	157	+# Given two data frames append the second to the first
	158	+#
	159	+# Assumes: the two data frames have the same column names
	160	+#
	161	+
	162	+append.data.frames <- function(df_1, df_2) {
	163	+
	164	+ df_cols <- length(colnames(df_1))
	165	+ df_rows_1 <- length(df_1[[1]])
	166	+ df_rows_2 <- length(df_2[[1]])
	167	+
	168	+ new_rows <- df_rows_1 + df_rows_2
	169	+ df_return <- data.frame(matrix(nrow=new_rows, ncol=df_cols))
	170	+
	171	+ for (i in 1:df_cols)
	172	+ for (j in 1:df_rows_1)
	173	+ df_return[colnames(df_return)[i]][[1]][j] <- df_1[colnames(df_1)[i]][[1]][j]
	174	+
	175	+ for (i in 1:df_cols)
	176	+ for (j in 1:df_rows_2)
	177	+ {
	178	+ row_index <- j + df_rows_1
	179	+ df_return[colnames(df_return)[i]][[1]][row_index] <- df_2[colnames(df_1)[i]][[1]][j]
	180	+ }
	181	+
	182	+ # create the new data list
	183	+ for (i in 1:df_cols)
	184	+ {
	185	+ colname <- colnames(df_1)[i]
	186	+ colnames(df_return)[i] <- colname
	187	+ }
	188	+
	189	+ df_return
	190	+}
	191	+
	192	+
	193	+# FUNCTION :: build.data.frames
	194	+#
	195	+# Constructs a concatenated data.frame from files
	196	+#
	197	+
	198	+build.data.frames <- function(template_indices, fname_first_part, fname_last_part) {
	199	+
	200	+ # Initialize the data frame
	201	+
	202	+ filename <- paste(fname_first_part, template_indices[1], fname_last_part, sep="")
	203	+ metrics = read.table(filename, na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
	204	+
	205	+ output <- paste("Processing data from",filename,"....")
	206	+ print(output)
	207	+
	208	+ # Extend the data frames
	209	+
	210	+ for (i in 2:length(template_indices_test))
	211	+ {
	212	+
	213	+ index <- template_indices[i]
	214	+ filename <- paste(fname_first_part, index, fname_last_part, sep="")
	215	+
	216	+ output <- paste("Processing data from",filename,"....")
	217	+ print(output)
	218	+
	219	+ temp_frame = read.table(filename, na.strings="\\N", sep="\t", comment.char="", quote="", header=T)
	220	+ metrics <- append.data.frames(metrics, temp_frame)
	221	+ }
	222	+
	223	+ metrics
150	224	}
\ No newline at end of file

Status & tagging log

14:22, 4 February 2012 Reedy (talk | contribs) changed the status of r110688 [removed: new added: deferred]