r106108 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r106107‎ | r106108 | r106109 >
Date:21:53, 13 December 2011
Author:halfak
Status:deferred
Tags:
Comment:
Added CHI^2 tests to hugglings
Modified paths:
  • /trunk/tools/wsor/newbie_warnings/R/chi2_tests.R (added) (history)
  • /trunk/tools/wsor/newbie_warnings/R/chi2_tests_mk2.R (added) (history)

Diff [purge]

Index: trunk/tools/wsor/newbie_warnings/R/chi2_tests_mk2.R
@@ -0,0 +1,377 @@
 2+source("loader/load_huggling_codings_mk2.R")
 3+library(doBy)
 4+hugglings = load_huggling_codings_mk2()
 5+
 6+#hugglingCounts = summaryBy(
 7+# recipient ~ recipient,
 8+# data = hugglings,
 9+# FUN=length
 10+#)
 11+#hugglingCounts$count = hugglingCounts$recipient.length
 12+#hugglingCounts$recipient.length = NULL
 13+#
 14+#hugglings = merge(hugglings, hugglingCounts, by=c("recipient"))
 15+
 16+#huggling_codings = load_huggling_codings(reload=T)
 17+#messaged_codings = huggling_codings[!is.na(huggling_codings$before_rating),]
 18+ifNA = function(val, naThen){
 19+ if(is.na(val)){
 20+ naThen
 21+ }else{
 22+ val
 23+ }
 24+}
 25+
 26+hugglings$contact = with(
 27+ hugglings,
 28+ mapply(
 29+ ifNA,
 30+ responds_hugglers_talk |
 31+ responds_own_talk |
 32+ responds_elsewhere |
 33+ retaliates,
 34+ F
 35+ )
 36+)
 37+hugglings$good_contact = mapply(
 38+ function(contact, retaliates){
 39+ if(!contact){
 40+ NA
 41+ }else{
 42+ !retaliates
 43+ }
 44+ },
 45+ hugglings$contact,
 46+ hugglings$retaliates
 47+)
 48+hugglings$stay = !is.na(hugglings$after_rating)
 49+hugglings$improves = hugglings$after_rating > hugglings$before_rating
 50+hugglings$talk_edits_before_msg = with(
 51+ hugglings,
 52+ user_talk_edits_after_msg + article_talk_edits_before_msg
 53+)
 54+# Can't do it
 55+#messaged_codings$ntalk_edits_before_msg = with(
 56+# messaged_codings,
 57+# edits_before_msg - talk_edits_before_msg
 58+#)
 59+
 60+hugglings$good_outcome = with(
 61+ hugglings,
 62+ ( #Suckas leave or get better
 63+ before_rating <= 2 &
 64+ (
 65+ is.na(after_rating) |
 66+ after_rating > 2
 67+ )
 68+ ) |
 69+ ( #Good contact was made
 70+ !is.na(good_contact) &
 71+ good_contact
 72+ ) |
 73+ ( #Edits are good after receiving message
 74+ !is.na(after_rating) &
 75+ after_rating > 3
 76+ )
 77+)
 78+
 79+##
 80+# Groups
 81+#
 82+# - <= 1 "vandal": We all agreed that the editor was a blatant vandal
 83+# - > 1 & <= 2 "bad faith": We all agreed that the editor was bad faith
 84+# - > 2 & < 4 "test": A test edit, but not good faith
 85+# - >= 4 "good faith": Good faith to excellent
 86+#
 87+# For each group:
 88+# - contact
 89+# - contact huggler + retaliate
 90+# - talk? (wait for staeiou)
 91+# - continue editing
 92+# - did they actually
 93+# - quality
 94+# - improve
 95+# - was it good
 96+# - degrade
 97+#
 98+#
 99+# Predictors:
 100+# - number of edilts before message
 101+# - number deleted
 102+# - makes edits to talk (before/after)
 103+
 104+hugglings$group = as.factor(sapply(
 105+ hugglings$before_rating,
 106+ function(rating){
 107+ if(is.na(rating)){
 108+ NA
 109+ }else if(rating <= 1){
 110+ "vandal"
 111+ }else if(rating > 1 & rating <= 2){
 112+ "bad faith"
 113+ }else if(rating > 2 & rating < 4){
 114+ "test"
 115+ }else if(rating >= 4){
 116+ "good faith"
 117+ }else{
 118+ NA
 119+ }
 120+ }
 121+))
 122+formatNum = function(num){
 123+ if(num >= 0){
 124+ paste(" ", format(round(num, 3), nsmall=3), sep="")
 125+ }else{
 126+ format(round(num, 3), nsmall=3)
 127+ }
 128+}
 129+
 130+for(group in c("vandal", "bad faith", "test", "good faith")){
 131+ group_codings = hugglings[hugglings$group == group,]
 132+
 133+
 134+ cat("Result's for ", length(group_codings$group), " '", group, "' editors:\n", sep='')
 135+ cat("============================================================\n")
 136+
 137+ control = group_codings[group_codings$def,]
 138+ personal = group_codings[group_codings$personal,]
 139+ nodirectives = group_codings[group_codings$nodirectives,]
 140+
 141+ experiments = list(
 142+ list(name="Personal ", data=personal),
 143+ list(name="No Directives", data=nodirectives)
 144+ )
 145+
 146+ outcomes = list(
 147+ list(name="Good outcome", field="good_outcome"),
 148+ list(name="Improves", field="improves"),
 149+ list(name="Contact", field="contact"),
 150+ list(name="Stays", field="stay"),
 151+ list(name="Good contact", field="good_contact")
 152+ )
 153+ for(outcome in outcomes){
 154+ cat(outcome$name, ": \n", sep="")
 155+
 156+ controlLen = length(na.omit(control[[outcome$field]]))
 157+ controlSuccess = sum(control[[outcome$field]], na.rm=T)
 158+ cat(
 159+ "\tControl ",
 160+ ": prop=", formatNum(controlSuccess/controlLen),
 161+ ", n=", controlLen, "\n",
 162+ sep=""
 163+ )
 164+ for(experiment in experiments){
 165+ expSuccess = sum(experiment$data[[outcome$field]], na.rm=T)
 166+ expLen = length(na.omit(experiment$data[[outcome$field]]))
 167+ t = prop.test(
 168+ c(
 169+ expSuccess,
 170+ controlSuccess
 171+ ),
 172+ c(
 173+ expLen,
 174+ controlLen
 175+ )
 176+ )
 177+
 178+ propDiff = mean(experiment$data[[outcome$field]], na.rm=T)-mean(control[[outcome$field]], na.rm=T)
 179+ cat(
 180+ "\t", experiment$name,
 181+ ": prop=", formatNum(expSuccess/expLen),
 182+ ", diff=", formatNum(propDiff),
 183+ ", p-value=", formatNum(t$p.value),
 184+ ", conf.int=(", formatNum(t$conf.int[1]), ", ", formatNum(t$conf.int[2]), ")",
 185+ ", n=", expLen, "\n",
 186+ sep=""
 187+ )
 188+ }
 189+ cat("\n")
 190+ }
 191+
 192+
 193+ cat("\n\n\n")
 194+}
 195+
 196+meanNoNA = function(x){
 197+ mean(x, na.rm=T)
 198+}
 199+lengthNoNA = function(x){
 200+ length(na.omit(x))
 201+}
 202+
 203+library(lattice)
 204+outcomeNames = list(
 205+ good_outcome = "with a \"good outcome\"",
 206+ improves = "who show improvement",
 207+ contact = "who contact the reverting editor",
 208+ good_contact = "who contact the reverting editor nicely",
 209+ stay = "who make at least one edit after reading the message"
 210+)
 211+for(outcomeName in c("good_outcome", "improves", "contact", "good_contact", "stay")){
 212+ f = with(
 213+ summaryBy(
 214+ outcome ~ group + teaching + personal,
 215+ data = data.frame(
 216+ outcome = messaged_codings[[outcomeName]],
 217+ teaching = messaged_codings$teaching,
 218+ personal = messaged_codings$personal,
 219+ group = messaged_codings$group
 220+ ),
 221+ FUN=c(meanNoNA, lengthNoNA)
 222+ ),
 223+ data.frame(
 224+ group = group,
 225+ message = mapply(
 226+ function(personal, teaching){
 227+ if(personal & teaching){
 228+ "personal & teaching"
 229+ }else if(personal){
 230+ "personal"
 231+ }else if(teaching){
 232+ "teaching"
 233+ }else{
 234+ "control"
 235+ }
 236+ },
 237+ personal,
 238+ teaching
 239+ ),
 240+ #teaching = teaching,
 241+ #personal = personal,
 242+ prop = outcome.meanNoNA,
 243+ n = outcome.lengthNoNA
 244+ )
 245+ )
 246+ cat(outcomeName, "\n")
 247+ cat(f$prop, "\n\n")
 248+ svg(paste("plots/outcome", outcomeName, "all_groups.svg", sep="."), height=4, width=8)
 249+ print(barchart(
 250+ prop ~ group | message,
 251+ data = f,
 252+ layout=c(4,1),
 253+ xlab="Pre-message rating",
 254+ lab="Proportion of editors",
 255+ main=paste("Proportion of editors", outcomeNames[[outcomeName]])
 256+ ))
 257+ dev.off()
 258+}
 259+
 260+messaged_codings$default = !messaged_codings$personal & !messaged_codings$teaching
 261+messaged_codings$teaching_only = messaged_codings$teaching & !messaged_codings$personal
 262+messaged_codings$personal_only = !messaged_codings$teaching & messaged_codings$personal
 263+messaged_codings$teaching_and_personal = messaged_codings$teaching & messaged_codings$personal
 264+
 265+s = scale
 266+
 267+for(condition in c("teaching_only", "personal_only", "teaching_and_personal")){
 268+ cat("-----------------------------------------------------------\n")
 269+ cat("-----------", condition, "\n")
 270+ cat("-----------------------------------------------------------\n")
 271+ exp_codings = messaged_codings[
 272+ messaged_codings[[condition]] |
 273+ messaged_codings$default,
 274+ ]
 275+
 276+ exp_codings$condition = exp_codings[[condition]]
 277+
 278+ print(summary(glm(
 279+ good_outcome ~
 280+ anon +
 281+ s(ntalk_edits_before_msg) +
 282+ s(talk_edits_before_msg) +
 283+ s(before_rating) *
 284+ condition,
 285+ data = exp_codings[exp_codings$image,]
 286+ )))
 287+ print(summary(glm(
 288+ good_outcome ~
 289+ anon +
 290+ s(ntalk_edits_before_msg) +
 291+ s(talk_edits_before_msg) +
 292+ s(before_rating) *
 293+ condition,
 294+ data = exp_codings[!exp_codings$image,]
 295+ )))
 296+
 297+
 298+ print(summary(glm(
 299+ improves ~
 300+ anon +
 301+ s(ntalk_edits_before_msg) +
 302+ s(talk_edits_before_msg) +
 303+ s(before_rating) *
 304+ condition,
 305+ data = exp_codings[exp_codings$image,]
 306+ )))
 307+ print(summary(glm(
 308+ improves ~
 309+ anon +
 310+ s(ntalk_edits_before_msg) +
 311+ s(talk_edits_before_msg) +
 312+ s(before_rating) *
 313+ condition,
 314+ data = exp_codings[!exp_codings$image,]
 315+ )))
 316+
 317+
 318+ print(summary(glm(
 319+ contact ~
 320+ anon +
 321+ s(ntalk_edits_before_msg) +
 322+ s(talk_edits_before_msg) +
 323+ s(before_rating) *
 324+ condition,
 325+ data = exp_codings[exp_codings$image,]
 326+ )))
 327+ print(summary(glm(
 328+ contact ~
 329+ anon +
 330+ s(ntalk_edits_before_msg) +
 331+ s(talk_edits_before_msg) +
 332+ s(before_rating) *
 333+ condition,
 334+ data = exp_codings[!exp_codings$image,]
 335+ )))
 336+
 337+
 338+ print(summary(glm(
 339+ good_contact ~
 340+ anon +
 341+ s(ntalk_edits_before_msg) +
 342+ s(talk_edits_before_msg) +
 343+ s(before_rating) *
 344+ condition,
 345+ data = exp_codings[exp_codings$image,]
 346+ )))
 347+ print(summary(glm(
 348+ good_contact ~
 349+ anon +
 350+ s(ntalk_edits_before_msg) +
 351+ s(talk_edits_before_msg) +
 352+ s(before_rating) *
 353+ condition,
 354+ data = exp_codings[!exp_codings$image,]
 355+ )))
 356+
 357+
 358+ print(summary(glm(
 359+ stay ~
 360+ anon +
 361+ s(ntalk_edits_before_msg) +
 362+ s(talk_edits_before_msg) +
 363+ s(before_rating) *
 364+ condition,
 365+ data = exp_codings[exp_codings$image,]
 366+ )))
 367+ print(summary(glm(
 368+ stay ~
 369+ anon +
 370+ s(ntalk_edits_before_msg) +
 371+ s(talk_edits_before_msg) +
 372+ s(before_rating) *
 373+ condition,
 374+ data = exp_codings[!exp_codings$image,]
 375+ )))
 376+}
 377+
 378+
Index: trunk/tools/wsor/newbie_warnings/R/chi2_tests.R
@@ -0,0 +1,188 @@
 2+source("loader/load_hugglings.R")
 3+source("loader/load_huggling_codings.R")
 4+library(doBy)
 5+hugglings = load_hugglings()
 6+
 7+hugglingCounts = summaryBy(
 8+ recipient ~ recipient,
 9+ data = hugglings,
 10+ FUN=length
 11+)
 12+hugglingCounts$count = hugglingCounts$recipient.length
 13+hugglingCounts$recipient.length = NULL
 14+
 15+hugglings = merge(hugglings, hugglingCounts, by=c("recipient"))
 16+
 17+huggling_codings = load_huggling_codings(reload=T)
 18+messaged_codings = huggling_codings[!is.na(huggling_codings$before_rating),]
 19+
 20+messaged_codings$retailates = messaged_codings$retaliates > 0
 21+messaged_codings$contact = !is.na(messaged_codings$contacts_huggler) & (messaged_codings$contacts_huggler > 0 | messaged_codings$retaliates > 0)
 22+messaged_codings$quality_work = messaged_codings$after_rating >= 3.0
 23+messaged_codings$stay = !is.na(messaged_codings$after_rating)
 24+messaged_codings$improves = messaged_codings$after_rating > messaged_codings$before_rating
 25+messaged_codings$anon = messaged_codings$is_anon > 0
 26+messaged_codings$talk_edits_before_msg = with(
 27+ messaged_codings,
 28+ user_talk_edits_after_msg + article_talk_edits_before_msg
 29+)
 30+messaged_codings$ntalk_edits_before_msg = with(
 31+ messaged_codings,
 32+ edits_before_msg - talk_edits_before_msg
 33+)
 34+messaged_codings$good_contact = mapply(
 35+ function(contact, retaliates){
 36+ if(!is.na(contact) & contact){
 37+ retaliates <= 0
 38+ }else{
 39+ NA
 40+ }
 41+ },
 42+ messaged_codings$contact,
 43+ messaged_codings$retaliates
 44+)
 45+messaged_codings$good_outcome = with(
 46+ messaged_codings,
 47+ (
 48+ before_rating <= 2 &
 49+ (
 50+ is.na(after_rating) |
 51+ after_rating > 2
 52+ )
 53+ ) |
 54+ (
 55+ !is.na(good_contact) &
 56+ good_contact
 57+ ) |
 58+ (
 59+ !is.na(quality_work) &
 60+ quality_work
 61+ )
 62+)
 63+
 64+##
 65+# Groups
 66+#
 67+# - < 2 at least one of us thought "no hope"
 68+# - >= 2 & <= 3 possibles
 69+# - > 3 at least one of us thought "golden"
 70+#
 71+# For each group:
 72+# - contact
 73+# - contact huggler + retaliate
 74+# - talk? (wait for staeiou)
 75+# - continue editing
 76+# - did they actually
 77+# - quality
 78+# - improve
 79+# - was it good
 80+# - degrade
 81+#
 82+#
 83+# Predictors:
 84+# - number of edilts before message
 85+# - number deleted
 86+# - makes edits to talk (before/after)
 87+
 88+messaged_codings$group = as.factor(sapply(
 89+ messaged_codings$before_rating,
 90+ function(rating){
 91+ if(is.na(rating)){
 92+ NA
 93+ }else if(rating < 2){
 94+ "unlikely"
 95+ }else if(rating <= 3){
 96+ "possible"
 97+ }else{
 98+ "golden"
 99+ }
 100+ }
 101+))
 102+
 103+formatNum = function(num){
 104+ if(!is.numeric(num) | is.nan(num)){
 105+ " --- "
 106+ }
 107+ else if(num >= 0){
 108+ paste(" ", format(round(num, 3), nsmall=3), sep="")
 109+ }else{
 110+ format(round(num, 3), nsmall=3)
 111+ }
 112+}
 113+
 114+for(group in c("unlikely", "possible", "golden")){
 115+ group_codings = messaged_codings[messaged_codings$group == group,]
 116+
 117+
 118+ cat("Result's for ", length(group_codings$group), " '", group, "' editors:\n", sep='')
 119+ cat("============================================================\n")
 120+
 121+ control = group_codings[!group_codings$personal & !group_codings$teaching,]
 122+ personal = group_codings[group_codings$personal & !group_codings$teaching,]
 123+ teaching = group_codings[group_codings$teaching & !group_codings$personal,]
 124+ both = group_codings[group_codings$teaching & group_codings$personal,]
 125+
 126+ experiments = list(
 127+ list(name="Personal ", data=personal),
 128+ list(name="Teaching ", data=teaching),
 129+ list(name="Personal & Teaching", data=teaching)
 130+ )
 131+
 132+ outcomes = list(
 133+ list(name="Good outcome", field="good_outcome"),
 134+ list(name="Improves", field="improves"),
 135+ list(name="Contact", field="contact"),
 136+ list(name="Stays", field="stay"),
 137+ list(name="Good contact", field="good_contact")
 138+ )
 139+
 140+ for(outcome in outcomes){
 141+ cat(outcome$name, ": \n", sep="")
 142+
 143+ controlLen = length(na.omit(control[[outcome$field]]))
 144+ controlSuccess = sum(control[[outcome$field]], na.rm=T)
 145+ cat(
 146+ "\tControl ",
 147+ ": prop=", formatNum(controlSuccess/controlLen),
 148+ ", n=", controlLen, "\n",
 149+ sep=""
 150+ )
 151+ for(experiment in experiments){
 152+ expSuccess = sum(experiment$data[[outcome$field]], na.rm=T)
 153+ expLen = length(na.omit(experiment$data[[outcome$field]]))
 154+ if(controlLen > 0 & expLen > 0){
 155+ t = prop.test(
 156+ c(
 157+ expSuccess,
 158+ controlSuccess
 159+ ),
 160+ c(
 161+ expLen,
 162+ controlLen
 163+ )
 164+ )
 165+ }else{
 166+ t = list(
 167+ p.value=NA,
 168+ conf.int=c(NA, NA)
 169+ )
 170+ }
 171+
 172+ propDiff = mean(experiment$data[[outcome$field]], na.rm=T)-mean(control[[outcome$field]], na.rm=T)
 173+ cat(
 174+ "\t", experiment$name,
 175+ ": prop=", formatNum(expSuccess/expLen),
 176+ ", diff=", formatNum(propDiff),
 177+ ", p-value=", formatNum(t$p.value),
 178+ ", conf.int=(", formatNum(t$conf.int[1]), ", ", formatNum(t$conf.int[2]), ")",
 179+ ", n=", expLen, "\n",
 180+ sep=""
 181+ )
 182+ }
 183+ cat("\n")
 184+ }
 185+
 186+
 187+ cat("\n\n\n")
 188+}
 189+

Status & tagging log