Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R |
— | — | @@ -1,154 +0,0 @@ |
2 | | -# source('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/huggle3_analysis.R') |
3 | | -# |
4 | | -# Ryan Faulkner, January 23rd 2012 |
5 | | -# |
6 | | -# Comparison of edit counts for Huggle 3 test among templates z64 (http://en.wikipedia.org/wiki/Template:Uw-error1-default) / z65 (http://en.wikipedia.org/wiki/Template:Uw-error1-short) |
7 | | -# |
8 | | - |
9 | | -# Import helper methods - GLOBAL |
10 | | - |
11 | | -home_dir <- "/home/rfaulkner/trunk/projects/WSOR/message_templates/" |
12 | | -# home_dir <- "/home/rfaulk/trunk/projects/WSOR/message_templates/" |
13 | | - |
14 | | -helper_import <- paste(home_dir,"R/R_helper_functions.R",sep="") |
15 | | -source(helper_import) |
16 | | - |
17 | | - |
18 | | -# FUNCTION :: import.experimental.metrics.data |
19 | | -# |
20 | | -# Import the template data and build data frames from it |
21 | | -# |
22 | | - |
23 | | -import.experimental.metrics.data <- function(template_indices_test, template_indices_control, fname_first_part) { |
24 | | - |
25 | | - # Read aggregated results for the template |
26 | | - |
27 | | - fname_last_part_edits <- "_editcounts.tsv" |
28 | | - fname_last_part_blocks <- "_blocks.tsv" |
29 | | - fname_last_part_warn <- "_warnings.tsv" |
30 | | - |
31 | | - warn_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_warn, string_frames=c(1)) |
32 | | - warn_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_warn, string_frames=c(1)) |
33 | | - |
34 | | - blocks_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_blocks, string_frames=c(1)) |
35 | | - blocks_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_blocks, string_frames=c(1)) |
36 | | - |
37 | | - edits_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_edits, string_frames=c(1)) |
38 | | - edits_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_edits, string_frames=c(1)) |
39 | | - |
40 | | -} |
41 | | - |
42 | | - |
43 | | - |
44 | | -# FUNCTION :: process.data.frames |
45 | | -# |
46 | | -# Given a set of data frames containing template test metrics per user posting, combine them and generate summary metric frames |
47 | | -# |
48 | | -# GLOBALS assumed to exist: warn_test, warn_control, blocks_test, blocks_control, edits_test, edits_control |
49 | | -# |
50 | | - |
51 | | -process.data.frames <- function() { |
52 | | - |
53 | | - # MERGE THE METRICS AND ADD TEMPLATE COLS |
54 | | - |
55 | | - print("Merge Data..") |
56 | | - |
57 | | - merged_test <<- merge(edits_test, blocks_test, by=intersect(names(edits_test),names(blocks_test)), all=TRUE) |
58 | | - merged_control <<- merge(edits_control, blocks_control, by=intersect(names(edits_control),names(blocks_control)), all=TRUE) |
59 | | - |
60 | | - merged_test <<- merge(merged_test, warn_test, by=intersect(names(merged_test),names(warn_test)), all=TRUE) |
61 | | - merged_control <<- merge(merged_control, warn_control, by=intersect(names(merged_control),names(warn_control)), all=TRUE) |
62 | | - |
63 | | - merged_test$template <<- 1 |
64 | | - merged_control$template <<- 0 |
65 | | - |
66 | | - |
67 | | - # FILTER DATA |
68 | | - |
69 | | - print("Filter Data..") |
70 | | - min_edits_before <- 5 |
71 | | - min_deleted_edits_before <- 0 |
72 | | - |
73 | | - max_edits_before <- Inf |
74 | | - max_deleted_edits_before <- Inf |
75 | | - |
76 | | - maximum_warns_before <- 0 |
77 | | - |
78 | | - IP_regex <- "^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$" |
79 | | - IP_regex_not <- '.*[a-zA-z].*' |
80 | | - |
81 | | - condition_1 <- TRUE # merged_test$blocks_before > 0 |
82 | | - condition_2 <- merged_test$blocks_after == 0 |
83 | | - condition_3 <- merged_test$ns_0_revisions_before >= min_edits_before & merged_test$ns_0_revisions_before <= max_edits_before |
84 | | - condition_4 <- merged_test$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_test$ns_0_revisions_deleted_before <= max_deleted_edits_before |
85 | | - condition_5 <- merged_test$warns_before <= maximum_warns_before |
86 | | - condition_6 <- filter.list.by.regex(IP_regex_not, merged_test$recipient_name) |
87 | | - condition_7 <- merged_test$ns_0_revisions_after_0_3 > 0 |
88 | | - |
89 | | - indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7 |
90 | | - merged_test <<- merged_test[indices,] |
91 | | - |
92 | | - condition_1 <- TRUE # merged_control$blocks_before > 0 |
93 | | - condition_2 <- merged_control$blocks_after == 0 |
94 | | - condition_3 <- merged_control$ns_0_revisions_before >= min_edits_before & merged_control$ns_0_revisions_before <= max_edits_before |
95 | | - condition_4 <- merged_control$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_control$ns_0_revisions_deleted_before <= max_deleted_edits_before |
96 | | - condition_5 <- merged_control$warns_before <= maximum_warns_before |
97 | | - condition_6 <- filter.list.by.regex(IP_regex_not, merged_control$recipient_name) |
98 | | - condition_7 <- merged_control$ns_0_revisions_after_0_3 > 0 |
99 | | - |
100 | | - indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7 |
101 | | - merged_control <<- merged_control[indices,] |
102 | | - |
103 | | - |
104 | | - # ADD DERIVED COLS |
105 | | - |
106 | | - print("Add derived columns..") |
107 | | - |
108 | | - merged_test$edits_decrease <<- (merged_test$ns_0_revisions_before - merged_test$ns_0_revisions_after_0_3) / (merged_test$ns_0_revisions_before) |
109 | | - merged_control$edits_decrease <<- (merged_control$ns_0_revisions_before - merged_control$ns_0_revisions_after_0_3) / (merged_control$ns_0_revisions_before) |
110 | | - |
111 | | - merged_test$edits_del_decrease <<- (merged_test$ns_0_revisions_deleted_before - (merged_test$ns_0_revisions_deleted_after_0_3)) / (merged_test$ns_0_revisions_deleted_before + 1) |
112 | | - merged_control$edits_del_decrease <<- (merged_control$ns_0_revisions_deleted_before - (merged_control$ns_0_revisions_deleted_after_0_3)) / (merged_control$ns_0_revisions_deleted_before + 1) |
113 | | - |
114 | | -} |
115 | | - |
116 | | - |
117 | | - |
118 | | -# IMPORT DATA |
119 | | - |
120 | | -template_indices_control <- c(84, 0) # c(107,109,111,113,115) # c(1,4) # c(84,99,101,103,105) # c(60,62,64,66,68,70,72,74,76) |
121 | | -template_indices_test <- c(86, 0) # c(108,110,114,116) # c(2,3) # c(85,86,100,102,104,106) # c(61,63,65,67,69,71,73,75,77) |
122 | | -fname_first_part <- paste(home_dir,"output/metrics_1108_1202_z",sep="") # paste(home_dir,"output/metrics_1122_1222_z",sep="") # paste(home_dir,"output/metrics_pt_z",sep="") # paste(home_dir,"output/metrics_1018_1119_z",sep="") # "/home/rfaulk/WSOR/message_templates/output/metrics_pt_z" |
123 | | - |
124 | | -# import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part) |
125 | | - |
126 | | - |
127 | | - |
128 | | -# PROCESS DATA |
129 | | - |
130 | | -# print("") |
131 | | -# print("Processing data frames.") |
132 | | -process.data.frames() |
133 | | - |
134 | | - |
135 | | - |
136 | | -# HYPOTHESIS TESTING |
137 | | - |
138 | | -#test_edits <- get.decrease.in.edits.after.template(edits_test$ns_0_revisions_before, edits_test$ns_0_revisions_after_3_30,lower_bound_rev_before=200,lower_bound_rev_after=0) |
139 | | -#control_edits <- get.decrease.in.edits.after.template(edits_control$ns_0_revisions_before, edits_control$ns_0_revisions_after_3_30,lower_bound_rev_before=200, lower_bound_rev_after=0) |
140 | | - |
141 | | -#test_blocks <- get.change.in.blocks(blocks_test$blocks_before, blocks_test$blocks_after) |
142 | | -#control_blocks <- get.change.in.blocks(blocks_control$blocks_before, blocks_control$blocks_after) |
143 | | - |
144 | | -#t_result_edits = t.test(x=test_edits, y=control_edits, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95) |
145 | | -#t_result_blocks = t.test(x=test_blocks, y=control_blocks, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95) |
146 | | - |
147 | | - |
148 | | - |
149 | | -# LOGISTIC REGRESSION MODELLING: |
150 | | - |
151 | | -all_data <- append.data.frames(merged_test, merged_control) |
152 | | -summary(glm(template ~ edits_decrease, data=all_data, family=binomial(link="logit"))) |
153 | | -# summary(glm(template ~ edits_del_decrease, data=all_data, family=binomial(link="logit"))) |
154 | | - |
155 | | - |
Index: trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R |
— | — | @@ -1,84 +0,0 @@ |
2 | | - |
3 | | -# Ryan Faulkner, January 25th 2012 |
4 | | -# |
5 | | -# Comparison of metrics for Huggle 3 using a chi-square goodness of fit test |
6 | | -# |
7 | | - |
8 | | - |
9 | | -source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R') |
10 | | - |
11 | | -# Read aggregated results for the template |
12 | | - |
13 | | -template_indices_control <- c(60,62,64,66,68,70,72,74,76) |
14 | | -template_indices_test <- c(61,63,65,67,69,71,73,75,77) |
15 | | - |
16 | | -fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z" |
17 | | -fname_last_part <- "_editcounts.tsv" |
18 | | - |
19 | | - |
20 | | -# MAIN EXECUTION |
21 | | -# ============== |
22 | | - |
23 | | -# BUILD THE DATA FRAMES |
24 | | - |
25 | | -metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part) |
26 | | -metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part) |
27 | | - |
28 | | - |
29 | | -# Compute the change in edits after the template |
30 | | -# =============================================== |
31 | | - |
32 | | - |
33 | | -test_samples <- c() |
34 | | -control_samples <- c() |
35 | | - |
36 | | -for (i in 1:length(metrics_test$ns_0_revisions_before)) |
37 | | - if (metrics_test$ns_0_revisions_before[i] != 0) |
38 | | - test_samples <- c(test_samples, |
39 | | - (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i]) |
40 | | - |
41 | | -for (i in 1:length(metrics_control$ns_0_revisions_before)) |
42 | | - if (metrics_control$ns_0_revisions_before[i] != 0) |
43 | | - control_samples <- c(control_samples, |
44 | | - (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i]) |
45 | | - |
46 | | - |
47 | | - |
48 | | -# Construct a distribution (Normal) using parameters computed from metric count data |
49 | | -# This will be used as the model distribution - do symmetrically (i.e. for both ways) |
50 | | -# ==================================================================================== |
51 | | - |
52 | | - |
53 | | -# Number of samples for each template |
54 | | - |
55 | | -n_test <- length(test_samples) |
56 | | -n_control <- length(control_samples) |
57 | | - |
58 | | - |
59 | | -# Produce probabilities for normal to be fit |
60 | | -# build data frames |
61 | | - |
62 | | -lower_bound_range <- trunc(min(min(c(control_samples, test_samples))) - 1) |
63 | | -upper_bound_range <- trunc(max(max(c(control_samples, test_samples))) + 1) |
64 | | -bins <- sort(lower_bound_range : upper_bound_range) |
65 | | - |
66 | | -probs_control <- get_normal_bins(bins, control_samples) |
67 | | -probs_test <- get_normal_bins(bins, test_samples) |
68 | | - |
69 | | -probs_control <- data.frame(values=bins, counts=probs_control) |
70 | | -probs_test <- data.frame(values=bins, counts=probs_test) |
71 | | - |
72 | | -counts_test <- get_bin_counts(bins, test_samples) |
73 | | -counts_control <- get_bin_counts(bins, control_samples) |
74 | | - |
75 | | -counts_test <- pad_counts(bins, counts_test) |
76 | | -counts_control <- pad_counts(bins, counts_control) |
77 | | - |
78 | | - |
79 | | -# Get chi-squared test results |
80 | | -chisq_res_1 = chisq.test(counts_test$counts, p=probs_control$counts) |
81 | | -chisq_res_2 = chisq.test(counts_control$counts, p=probs_test$counts) |
82 | | - |
83 | | - |
84 | | - |
85 | | - |