r112749 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r112748‎ | r112749 | r112750 >
Date:00:44, 1 March 2012
Author:rfaulk
Status:deferred
Tags:
Comment:
remove files. functionality has been consolidated into template_analysis.R
Modified paths:
  • /trunk/tools/wsor/message_templates/R/huggle3_analysis.R (deleted) (history)
  • /trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R (deleted) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/huggle3_analysis.R
@@ -1,154 +0,0 @@
2 -# source('/home/rfaulkner/trunk/projects/WSOR/message_templates/R/huggle3_analysis.R')
3 -#
4 -# Ryan Faulkner, January 23rd 2012
5 -#
6 -# Comparison of edit counts for Huggle 3 test among templates z64 (http://en.wikipedia.org/wiki/Template:Uw-error1-default) / z65 (http://en.wikipedia.org/wiki/Template:Uw-error1-short)
7 -#
8 -
9 -# Import helper methods - GLOBAL
10 -
11 -home_dir <- "/home/rfaulkner/trunk/projects/WSOR/message_templates/"
12 -# home_dir <- "/home/rfaulk/trunk/projects/WSOR/message_templates/"
13 -
14 -helper_import <- paste(home_dir,"R/R_helper_functions.R",sep="")
15 -source(helper_import)
16 -
17 -
18 -# FUNCTION :: import.experimental.metrics.data
19 -#
20 -# Import the template data and build data frames from it
21 -#
22 -
23 -import.experimental.metrics.data <- function(template_indices_test, template_indices_control, fname_first_part) {
24 -
25 - # Read aggregated results for the template
26 -
27 - fname_last_part_edits <- "_editcounts.tsv"
28 - fname_last_part_blocks <- "_blocks.tsv"
29 - fname_last_part_warn <- "_warnings.tsv"
30 -
31 - warn_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_warn, string_frames=c(1))
32 - warn_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_warn, string_frames=c(1))
33 -
34 - blocks_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_blocks, string_frames=c(1))
35 - blocks_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_blocks, string_frames=c(1))
36 -
37 - edits_test <<- build.data.frames(template_indices_test, fname_first_part, fname_last_part_edits, string_frames=c(1))
38 - edits_control <<- build.data.frames(template_indices_control, fname_first_part, fname_last_part_edits, string_frames=c(1))
39 -
40 -}
41 -
42 -
43 -
44 -# FUNCTION :: process.data.frames
45 -#
46 -# Given a set of data frames containing template test metrics per user posting, combine them and generate summary metric frames
47 -#
48 -# GLOBALS assumed to exist: warn_test, warn_control, blocks_test, blocks_control, edits_test, edits_control
49 -#
50 -
51 -process.data.frames <- function() {
52 -
53 - # MERGE THE METRICS AND ADD TEMPLATE COLS
54 -
55 - print("Merge Data..")
56 -
57 - merged_test <<- merge(edits_test, blocks_test, by=intersect(names(edits_test),names(blocks_test)), all=TRUE)
58 - merged_control <<- merge(edits_control, blocks_control, by=intersect(names(edits_control),names(blocks_control)), all=TRUE)
59 -
60 - merged_test <<- merge(merged_test, warn_test, by=intersect(names(merged_test),names(warn_test)), all=TRUE)
61 - merged_control <<- merge(merged_control, warn_control, by=intersect(names(merged_control),names(warn_control)), all=TRUE)
62 -
63 - merged_test$template <<- 1
64 - merged_control$template <<- 0
65 -
66 -
67 - # FILTER DATA
68 -
69 - print("Filter Data..")
70 - min_edits_before <- 5
71 - min_deleted_edits_before <- 0
72 -
73 - max_edits_before <- Inf
74 - max_deleted_edits_before <- Inf
75 -
76 - maximum_warns_before <- 0
77 -
78 - IP_regex <- "^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
79 - IP_regex_not <- '.*[a-zA-z].*'
80 -
81 - condition_1 <- TRUE # merged_test$blocks_before > 0
82 - condition_2 <- merged_test$blocks_after == 0
83 - condition_3 <- merged_test$ns_0_revisions_before >= min_edits_before & merged_test$ns_0_revisions_before <= max_edits_before
84 - condition_4 <- merged_test$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_test$ns_0_revisions_deleted_before <= max_deleted_edits_before
85 - condition_5 <- merged_test$warns_before <= maximum_warns_before
86 - condition_6 <- filter.list.by.regex(IP_regex_not, merged_test$recipient_name)
87 - condition_7 <- merged_test$ns_0_revisions_after_0_3 > 0
88 -
89 - indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
90 - merged_test <<- merged_test[indices,]
91 -
92 - condition_1 <- TRUE # merged_control$blocks_before > 0
93 - condition_2 <- merged_control$blocks_after == 0
94 - condition_3 <- merged_control$ns_0_revisions_before >= min_edits_before & merged_control$ns_0_revisions_before <= max_edits_before
95 - condition_4 <- merged_control$ns_0_revisions_deleted_before >= min_deleted_edits_before & merged_control$ns_0_revisions_deleted_before <= max_deleted_edits_before
96 - condition_5 <- merged_control$warns_before <= maximum_warns_before
97 - condition_6 <- filter.list.by.regex(IP_regex_not, merged_control$recipient_name)
98 - condition_7 <- merged_control$ns_0_revisions_after_0_3 > 0
99 -
100 - indices <- condition_1 & condition_2 & condition_3 & condition_4 & condition_5 & condition_6 & condition_7
101 - merged_control <<- merged_control[indices,]
102 -
103 -
104 - # ADD DERIVED COLS
105 -
106 - print("Add derived columns..")
107 -
108 - merged_test$edits_decrease <<- (merged_test$ns_0_revisions_before - merged_test$ns_0_revisions_after_0_3) / (merged_test$ns_0_revisions_before)
109 - merged_control$edits_decrease <<- (merged_control$ns_0_revisions_before - merged_control$ns_0_revisions_after_0_3) / (merged_control$ns_0_revisions_before)
110 -
111 - merged_test$edits_del_decrease <<- (merged_test$ns_0_revisions_deleted_before - (merged_test$ns_0_revisions_deleted_after_0_3)) / (merged_test$ns_0_revisions_deleted_before + 1)
112 - merged_control$edits_del_decrease <<- (merged_control$ns_0_revisions_deleted_before - (merged_control$ns_0_revisions_deleted_after_0_3)) / (merged_control$ns_0_revisions_deleted_before + 1)
113 -
114 -}
115 -
116 -
117 -
118 -# IMPORT DATA
119 -
120 -template_indices_control <- c(84, 0) # c(107,109,111,113,115) # c(1,4) # c(84,99,101,103,105) # c(60,62,64,66,68,70,72,74,76)
121 -template_indices_test <- c(86, 0) # c(108,110,114,116) # c(2,3) # c(85,86,100,102,104,106) # c(61,63,65,67,69,71,73,75,77)
122 -fname_first_part <- paste(home_dir,"output/metrics_1108_1202_z",sep="") # paste(home_dir,"output/metrics_1122_1222_z",sep="") # paste(home_dir,"output/metrics_pt_z",sep="") # paste(home_dir,"output/metrics_1018_1119_z",sep="") # "/home/rfaulk/WSOR/message_templates/output/metrics_pt_z"
123 -
124 -# import.experimental.metrics.data(template_indices_test, template_indices_control, fname_first_part)
125 -
126 -
127 -
128 -# PROCESS DATA
129 -
130 -# print("")
131 -# print("Processing data frames.")
132 -process.data.frames()
133 -
134 -
135 -
136 -# HYPOTHESIS TESTING
137 -
138 -#test_edits <- get.decrease.in.edits.after.template(edits_test$ns_0_revisions_before, edits_test$ns_0_revisions_after_3_30,lower_bound_rev_before=200,lower_bound_rev_after=0)
139 -#control_edits <- get.decrease.in.edits.after.template(edits_control$ns_0_revisions_before, edits_control$ns_0_revisions_after_3_30,lower_bound_rev_before=200, lower_bound_rev_after=0)
140 -
141 -#test_blocks <- get.change.in.blocks(blocks_test$blocks_before, blocks_test$blocks_after)
142 -#control_blocks <- get.change.in.blocks(blocks_control$blocks_before, blocks_control$blocks_after)
143 -
144 -#t_result_edits = t.test(x=test_edits, y=control_edits, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
145 -#t_result_blocks = t.test(x=test_blocks, y=control_blocks, alternative = "two.sided", paired = FALSE, var.equal = FALSE, conf.level = 0.95)
146 -
147 -
148 -
149 -# LOGISTIC REGRESSION MODELLING:
150 -
151 -all_data <- append.data.frames(merged_test, merged_control)
152 -summary(glm(template ~ edits_decrease, data=all_data, family=binomial(link="logit")))
153 -# summary(glm(template ~ edits_del_decrease, data=all_data, family=binomial(link="logit")))
154 -
155 -
Index: trunk/tools/wsor/message_templates/R/huggle3_analysis_chi_sq.R
@@ -1,84 +0,0 @@
2 -
3 -# Ryan Faulkner, January 25th 2012
4 -#
5 -# Comparison of metrics for Huggle 3 using a chi-square goodness of fit test
6 -#
7 -
8 -
9 -source('/home/rfaulk/WSOR/message_templates/R/R_helper_functions.R')
10 -
11 -# Read aggregated results for the template
12 -
13 -template_indices_control <- c(60,62,64,66,68,70,72,74,76)
14 -template_indices_test <- c(61,63,65,67,69,71,73,75,77)
15 -
16 -fname_first_part <- "/home/rfaulk/WSOR/message_templates/output/metrics_1018_1119_z"
17 -fname_last_part <- "_editcounts.tsv"
18 -
19 -
20 -# MAIN EXECUTION
21 -# ==============
22 -
23 -# BUILD THE DATA FRAMES
24 -
25 -metrics_test <- build.data.frames(template_indices_test, fname_first_part, fname_last_part)
26 -metrics_control <- build.data.frames(template_indices_control, fname_first_part, fname_last_part)
27 -
28 -
29 -# Compute the change in edits after the template
30 -# ===============================================
31 -
32 -
33 -test_samples <- c()
34 -control_samples <- c()
35 -
36 -for (i in 1:length(metrics_test$ns_0_revisions_before))
37 - if (metrics_test$ns_0_revisions_before[i] != 0)
38 - test_samples <- c(test_samples,
39 - (metrics_test$ns_0_revisions_before[i] - metrics_test$ns_0_revisions_after[i]) / metrics_test$ns_0_revisions_before[i])
40 -
41 -for (i in 1:length(metrics_control$ns_0_revisions_before))
42 - if (metrics_control$ns_0_revisions_before[i] != 0)
43 - control_samples <- c(control_samples,
44 - (metrics_control$ns_0_revisions_before[i] - metrics_control$ns_0_revisions_after[i]) / metrics_control$ns_0_revisions_before[i])
45 -
46 -
47 -
48 -# Construct a distribution (Normal) using parameters computed from metric count data
49 -# This will be used as the model distribution - do symmetrically (i.e. for both ways)
50 -# ====================================================================================
51 -
52 -
53 -# Number of samples for each template
54 -
55 -n_test <- length(test_samples)
56 -n_control <- length(control_samples)
57 -
58 -
59 -# Produce probabilities for normal to be fit
60 -# build data frames
61 -
62 -lower_bound_range <- trunc(min(min(c(control_samples, test_samples))) - 1)
63 -upper_bound_range <- trunc(max(max(c(control_samples, test_samples))) + 1)
64 -bins <- sort(lower_bound_range : upper_bound_range)
65 -
66 -probs_control <- get_normal_bins(bins, control_samples)
67 -probs_test <- get_normal_bins(bins, test_samples)
68 -
69 -probs_control <- data.frame(values=bins, counts=probs_control)
70 -probs_test <- data.frame(values=bins, counts=probs_test)
71 -
72 -counts_test <- get_bin_counts(bins, test_samples)
73 -counts_control <- get_bin_counts(bins, control_samples)
74 -
75 -counts_test <- pad_counts(bins, counts_test)
76 -counts_control <- pad_counts(bins, counts_control)
77 -
78 -
79 -# Get chi-squared test results
80 -chisq_res_1 = chisq.test(counts_test$counts, p=probs_control$counts)
81 -chisq_res_2 = chisq.test(counts_control$counts, p=probs_test$counts)
82 -
83 -
84 -
85 -