Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R |
— | — | @@ -1,52 +1,154 @@ |
2 | | - |
| 2 | +# source('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/huggle3_analysis.R') |
| 3 | +# |
3 | 4 | # Ryan Faulkner, January 23rd 2012 |
4 | 5 | # |
5 | 6 | # Comparison of edit counts for Huggle 3 test among templates z64 (http://en.wikipedia.org/wiki/Template:Uw-error1-default) / z65 (http://en.wikipedia.org/wiki/Template:Uw-error1-short) |
6 | 7 | # |
7 | 8 | |
| 9 | +# Import helper methods - GLOBAL |
8 | 10 | |
9 | | -source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R') |
| 11 | +home_dir <- "/home/rfaulkner/trunk/projects/WSOR/message_templates/" |
| 12 | +# home_dir <- "/home/rfaulk/trunk/projects/WSOR/message_templates/" |
10 | 13 | |
| 14 | +helper_import <- paste(home_dir,"R/R_helper_functions.R",sep="") |
| 15 | +source(helper_import) |
11 | 16 | |
12 | | -# Read aggregated results for the template |
13 | 17 | |
14 | | -template_indices_control <- c(60,62,64,66,68,70,72,74,76) |
15 | | -template_indices_test <- c(61,63,65,67,69,71,73,75,77) |
| 18 | +# FUNCTION :: import.experimental.metrics.data |
| 19 | +# |
| 20 | +# Builds the warnings, blocks and edit-count data frames for the test and control template indices by calling build.data.frames once per group and metric (helper presumably defined in the sourced R_helper_functions.R - confirm) |
| 21 | +# Results are assigned with <<- to warn_test, warn_control, blocks_test, blocks_control, edits_test and edits_control |
16 | 22 | |
17 | | -fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z" |
18 | | -fname_last_part <- "_editcounts.tsv" |
| 23 | +import.experimental.metrics.data <- function(template_indices_test, template_indices_control, fname_first_part) { |
| 24 | + |
| 25 | + # Read aggregated results for the templates: one file name suffix per metric type (edits, blocks, warnings), each passed to build.data.frames together with fname_first_part |
| 26 | + |
| 27 | + fname_last_part_edits <- "_editcounts.tsv" |
| 28 | + fname_last_part_blocks <- "_blocks.tsv" |
| 29 | + fname_last_part_warn <- "_warnings.tsv" |
| 30 | + |
| 31 | + warn_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_warn, string_frames=c(1)) |
| 32 | + warn_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_warn, string_frames=c(1)) |
| 33 | + |
| 34 | + blocks_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_blocks, string_frames=c(1)) |
| 35 | + blocks_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_blocks, string_frames=c(1)) |
| 36 | + |
| 37 | + edits_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_edits, string_frames=c(1)) |
| 38 | + edits_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_edits, string_frames=c(1)) |
| 39 | + |
| 40 | +} |
19 | 41 | |
20 | 42 | |
21 | | -# MAIN EXECUTION |
22 | | -# ============== |
23 | 43 | |
24 | | -# BUILD THE DATA FRAMES |
| 44 | +# FUNCTION :: process.data.frames |
| 45 | +# |
| 46 | +# Merges the edit, block and warning frames of the test group and of the control group, filters the merged rows, and adds the derived columns edits_decrease and edits_del_decrease |
| 47 | +# NOTE(review): both merges use all=TRUE, so rows missing from one input presumably carry NA in its columns; the logical filter below would then contain NA and select all-NA rows - confirm and consider which(indices) |
| 48 | +# GLOBALS assumed to exist: warn_test, warn_control, blocks_test, blocks_control, edits_test, edits_control |
| 49 | +# GLOBALS written with <<-: merged_test, merged_control |
25 | 50 | |
26 | | -metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part) |
27 | | -metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part) |
| 51 | +process.data.frames <- function() { |
| 52 | + |
| 53 | + # MERGE THE METRICS AND ADD TEMPLATE COLS (joined on all shared column names; template = 1 for test rows, 0 for control rows) |
28 | 54 | |
| 55 | + print("Merge Data..") |
| 56 | + |
| 57 | + merged_test <<- merge(edits_test, blocks_test, by=intersect(names(edits_test),names(blocks_test)), all=TRUE) |
| 58 | + merged_control <<- merge(edits_control, blocks_control, by=intersect(names(edits_control),names(blocks_control)), all=TRUE) |
| 59 | + |
| 60 | + merged_test <<- merge(merged_test, warn_test, by=intersect(names(merged_test),names(warn_test)), all=TRUE) |
| 61 | + merged_control <<- merge(merged_control, warn_control, by=intersect(names(merged_control),names(warn_control)), all=TRUE) |
| 62 | + |
| 63 | + merged_test$template <<- 1 |
| 64 | + merged_control$template <<- 0 |
| 65 | + |
| 66 | + |
| 67 | + # FILTER DATA - the same seven conditions are applied to the test frame and then to the control frame; IP_regex is defined but not used in this function; NOTE(review): in ASCII the class [a-zA-z] in IP_regex_not also covers the punctuation between Z and a - confirm [a-zA-Z] was intended |
29 | 68 | |
30 | | -# Compute the change in edits after the template -- default to namespace 0 |
31 | | -# User Talk namespace does not necessarily have edits before - in this case omit the result (it could be the case that templates stimulate user talk edits but that should be tested separately) |
32 | | -# Only append non-zero results - do this for just namespace 3 since it has zero entries for 'ns_3_revisions_before' |
| 69 | + print("Filter Data..") |
| 70 | + min_edits_before <- 5 |
| 71 | + min_deleted_edits_before <- 0 |
| 72 | + |
| 73 | + max_edits_before <- Inf |
| 74 | + max_deleted_edits_before <- Inf |
| 75 | + |
| 76 | + maximum_warns_before <- 0 |
| 77 | + |
| 78 | + IP_regex <- "^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$" |
| 79 | + IP_regex_not <- '.*[a-zA-z].*' |
| 80 | + |
| 81 | + condition_1 <- TRUE # merged_test$blocks_before > 0 |
| 82 | + condition_2 <- merged_test$blocks_after == 0 |
| 83 | + condition_3 <- merged_test$ns_0_revisions_before >= min_edits_before & merged_test$ns_0_revisions_before <= max_edits_before |
| 84 | + condition_4 <- merged_test$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_test$ns_0_revisions_deleted_before <= max_deleted_edits_before |
| 85 | + condition_5 <- merged_test$warns_before <= maximum_warns_before |
| 86 | + condition_6 <- filter.list.by.regex(IP_regex_not, merged_test$recipient_name) |
| 87 | + condition_7 <- merged_test$ns_0_revisions_after_0_3 > 0 |
| 88 | + |
| 89 | + indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7 |
| 90 | + merged_test <<- merged_test[indices,] |
| 91 | + |
| 92 | + condition_1 <- TRUE # merged_control$blocks_before > 0 |
| 93 | + condition_2 <- merged_control$blocks_after == 0 |
| 94 | + condition_3 <- merged_control$ns_0_revisions_before >= min_edits_before & merged_control$ns_0_revisions_before <= max_edits_before |
| 95 | + condition_4 <- merged_control$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_control$ns_0_revisions_deleted_before <= max_deleted_edits_before |
| 96 | + condition_5 <- merged_control$warns_before <= maximum_warns_before |
| 97 | + condition_6 <- filter.list.by.regex(IP_regex_not, merged_control$recipient_name) |
| 98 | + condition_7 <- merged_control$ns_0_revisions_after_0_3 > 0 |
| 99 | + |
| 100 | + indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7 |
| 101 | + merged_control <<- merged_control[indices,] |
| 102 | + |
| 103 | + |
| 104 | + # ADD DERIVED COLS - fractional drop from the before count to the after_0_3 count; the deleted-edit version adds 1 to the denominator (presumably to avoid dividing by zero) |
| 105 | + |
| 106 | + print("Add derived columns..") |
| 107 | + |
| 108 | + merged_test$edits_decrease <<- (merged_test$ns_0_revisions_before - merged_test$ns_0_revisions_after_0_3) / (merged_test$ns_0_revisions_before) |
| 109 | + merged_control$edits_decrease <<- (merged_control$ns_0_revisions_before - merged_control$ns_0_revisions_after_0_3) / (merged_control$ns_0_revisions_before) |
| 110 | + |
| 111 | + merged_test$edits_del_decrease <<- (merged_test$ns_0_revisions_deleted_before - (merged_test$ns_0_revisions_deleted_after_0_3)) / (merged_test$ns_0_revisions_deleted_before + 1) |
| 112 | + merged_control$edits_del_decrease <<- (merged_control$ns_0_revisions_deleted_before - (merged_control$ns_0_revisions_deleted_after_0_3)) / (merged_control$ns_0_revisions_deleted_before + 1) |
| 113 | + |
| 114 | +} |
33 | 115 | |
34 | | -z_test <- c() |
35 | | -z_control <- c() |
36 | 116 | |
37 | | -for (i in 1:length(metrics_test$ns_0_revisions_before)) |
38 | | - if (metrics_test$ns_0_revisions_before[i] != 0) |
39 | | - z_test <- c(z_test, |
40 | | - (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i]) |
41 | 117 | |
42 | | -for (i in 1:length(metrics_control['ns_0_revisions_before'][[1]])) |
43 | | - if (metrics_control$ns_0_revisions_before[i] != 0) |
44 | | - z_control <- c(z_control, |
45 | | - (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i]) |
| 118 | +# IMPORT DATA |
46 | 119 | |
| 120 | +template_indices_control <- c(84, 0) # c(107,109,111,113,115) # c(1,4) # c(84,99,101,103,105) # c(60,62,64,66,68,70,72,74,76) |
| 121 | +template_indices_test <- c(86, 0) # c(108,110,114,116) # c(2,3) # c(85,86,100,102,104,106) # c(61,63,65,67,69,71,73,75,77) |
| 122 | +fname_first_part <- paste(home_dir,"output/metrics_1108_1202_z",sep="") # paste(home_dir,"output/metrics_1122_1222_z",sep="") # paste(home_dir,"output/metrics_pt_z",sep="") # paste(home_dir,"output/metrics_1018_1119_z",sep="") # "/home/rfaulk/WSOR/message_templates/output/metrics_pt_z" |
47 | 123 | |
48 | | -# Generate results: |
| 124 | +# import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part) |
49 | 125 | |
50 | | -t_result = t.test(x=z_test, y=z_control, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95) |
51 | | -# t_result_ns3 = t.test(x=z64_ns3, y=z65_ns3, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95) |
52 | 126 | |
53 | 127 | |
| 128 | +# PROCESS DATA - NOTE(review): the import call above is commented out, so the six input globals (warn_*, blocks_*, edits_*) must already exist in the session before this runs |
| 129 | + |
| 130 | +# print("") |
| 131 | +# print("Processing data frames.") |
| 132 | +process.data.frames() |
| 133 | + |
| 134 | + |
| 135 | + |
| 136 | +# HYPOTHESIS TESTING (currently disabled - every call in this section is commented out) |
| 137 | + |
| 138 | +#test_edits <- get.decrease.in.edits.after.template(edits_test$ns_0_revisions_before, edits_test$ns_0_revisions_after_3_30,lower_bound_rev_before=200,lower_bound_rev_after=0) |
| 139 | +#control_edits <- get.decrease.in.edits.after.template(edits_control$ns_0_revisions_before, edits_control$ns_0_revisions_after_3_30,lower_bound_rev_before=200, lower_bound_rev_after=0) |
| 140 | + |
| 141 | +#test_blocks <- get.change.in.blocks(blocks_test$blocks_before, blocks_test$blocks_after) |
| 142 | +#control_blocks <- get.change.in.blocks(blocks_control$blocks_before, blocks_control$blocks_after) |
| 143 | + |
| 144 | +#t_result_edits = t.test(x=test_edits, y=control_edits, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95) |
| 145 | +#t_result_blocks = t.test(x=test_blocks, y=control_blocks, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95) |
| 146 | + |
| 147 | + |
| 148 | + |
| 149 | +# LOGISTIC REGRESSION MODELLING: fit the template indicator (1 = test, 0 = control) against edits_decrease on the frame returned by append.data.frames(merged_test, merged_control) |
| 150 | + |
| 151 | +all_data <- append.data.frames(merged_test, merged_control) |
| 152 | +summary(glm(template ~ edits_decrease, data=all_data, family=binomial(link="logit"))) |
| 153 | +# summary(glm(template ~ edits_del_decrease, data=all_data, family=binomial(link="logit"))) |
| 154 | + |
| 155 | + |