Index: trunk/tools/wsor/newbie_warnings/R/outcomes.R |
— | — | @@ -113,27 +113,27 @@ |
114 | 114 | cat("============================================================\n") |
115 | 115 | |
# Fit one model per newcomer outcome on group_codings and print its summary.
# Every model uses the same predictors: anon, the two pre-message edit counts,
# and the full teaching x personal x image interaction.
print(summary(glm(
    formula = good_outcome ~
        anon +
        ntalk_edits_before_msg +
        talk_edits_before_msg +
        teaching * personal * image,
    data = group_codings
)))

print(summary(glm(
    formula = improves ~
        anon +
        ntalk_edits_before_msg +
        talk_edits_before_msg +
        teaching * personal * image,
    data = group_codings
)))

print(summary(glm(
    formula = contact ~
        anon +
        ntalk_edits_before_msg +
        talk_edits_before_msg +
        teaching * personal * image,
    data = group_codings
)))

print(summary(glm(
    formula = good_contact ~
        anon +
        ntalk_edits_before_msg +
        talk_edits_before_msg +
        teaching * personal * image,
    data = group_codings
)))

print(summary(glm(
    formula = stay ~
        anon +
        ntalk_edits_before_msg +
        talk_edits_before_msg +
        teaching * personal * image,
    data = group_codings
)))
140 | 140 | |
— | — | @@ -203,3 +203,123 @@ |
204 | 204 | )) |
205 | 205 | dev.off() |
206 | 206 | } |
| 207 | + |
# Label every messaged newcomer with the cell of the teaching x personal
# design it falls into; "default" is the cell with neither flag set.
messaged_codings$default = !messaged_codings$personal & !messaged_codings$teaching
messaged_codings$teaching_only = messaged_codings$teaching & !messaged_codings$personal
messaged_codings$personal_only = !messaged_codings$teaching & messaged_codings$personal
messaged_codings$teaching_and_personal = messaged_codings$teaching & messaged_codings$personal

# Short alias for scale() so the model formula below stays readable.
s = scale

# Outcomes to model, in the order their summaries are printed.
outcomes = c("good_outcome", "improves", "contact", "good_contact", "stay")

# Row subsets each outcome is fitted on: image messages first, then
# non-image messages.  They are kept as unevaluated expressions so that the
# "Call:" line printed by summary() shows the real subset expression rather
# than a loop variable name.
image_subsets = list(
	quote(exp_codings[exp_codings$image,]),
	quote(exp_codings[!exp_codings$image,])
)

# Compare each experimental condition against the default condition.
for(condition in c("teaching_only", "personal_only", "teaching_and_personal")){
	cat("-----------------------------------------------------------\n")
	cat("-----------", condition, "\n")
	cat("-----------------------------------------------------------\n")

	# Keep only rows in the current condition or in the default condition.
	exp_codings = messaged_codings[
		messaged_codings[[condition]] |
		messaged_codings$default,
	]

	# TRUE for rows in the experimental condition, FALSE for default rows.
	# Inside the formula, `condition` resolves to this column (looked up in
	# `data` first), not to the loop's character value.
	exp_codings$condition = exp_codings[[condition]]

	for(outcome in outcomes){
		for(image_subset in image_subsets){
			# Splice the outcome name and the subset expression into a
			# literal glm() call, then evaluate it here.
			model_call = substitute(
				glm(
					formula = OUTCOME ~
						anon +
						s(ntalk_edits_before_msg) +
						s(talk_edits_before_msg) +
						s(before_rating) *
						condition,
					data = ROWS
				),
				list(OUTCOME = as.name(outcome), ROWS = image_subset)
			)
			print(summary(eval(model_call)))
		}
	}
}
| 325 | + |
| 326 | + |
Index: trunk/tools/wsor/newbie_warnings/get_hugglings.v2.py |
— | — | @@ -0,0 +1,198 @@ |
| 2 | +import json, urllib2, re, argparse, os, MySQLdb, MySQLdb.cursors, sys |
| 3 | +import logging, urllib, types, time |
| 4 | + |
def main():
    """Write one tab-separated row per experimental Huggle talk-page message.

    Command-line arguments select the start timestamp, the MediaWiki API
    endpoint, the MySQL connection settings and the output stream.  For
    every matching recentchanges row the revision diff is fetched from the
    API; if the text added by the revision carries one of the experimental
    template markers, the row (with its condition flags) is written to
    ``--out``.  Progress and problems are logged to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Gathers huggle messages after a specified date that contain experimental comments'
    )
    parser.add_argument(
        'since',
        type=str,
        help='a date string to search for hugglings after'
    )
    parser.add_argument(
        '-u', '--uri',
        type=str,
        help='the uri for the mediawiki API',
        default="http://en.wikipedia.org/w/api.php"
    )
    parser.add_argument(
        '-c', '--cnf',
        metavar="<path>",
        type=str,
        help='the path to MySQL config info (defaults to ~/.my.cnf)',
        default=os.path.expanduser("~/.my.cnf")
    )
    parser.add_argument(
        '-s', '--host',
        type=str,
        help='the database host to connect to (defaults to localhost)',
        default="localhost"
    )
    parser.add_argument(
        '-d', '--db',
        type=str,
        help='the language db to run the query in (defaults to enwiki)',
        default="enwiki"
    )
    parser.add_argument(
        '-o', '--out',
        type=lambda fn:open(fn, 'a+'),
        help='Where should output be appended',
        default=sys.stdout
    )
    args = parser.parse_args()

    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
    db = Database(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf
    )
    wp = WPAPI(args.uri)

    # Column order of every output row.
    headers = [
        'id',
        'timestamp',
        'poster_id',
        'poster_name',
        'recipient',
        'personal',
        'directives'
    ]
    #print("\t".join(headers))

    logging.info("Getting huggling messages.")
    for post in db.getHugglingsSince(args.since):
        try:
            diff = wp.getRevisionDiff(post['id'])
        except Exception as e:
            logging.warning("%s: error retrieving posting from API: %s" % (post['timestamp'], e))
            # Without a diff there is nothing to classify; never fall
            # through with an undefined or stale `diff` from an earlier row.
            continue

        if diff is None:
            # getRevisionDiff() returns None once its retries are used up.
            logging.warning("%s: no diff available for revision %s" % (post['timestamp'], post['id']))
            continue

        condition = getConditionFromDiff(diff)
        if condition is None:
            logging.debug("%(timestamp)s: non-experimental posting by %(poster_name)s to %(recipient)s" % post)
        else:
            logging.debug("%(timestamp)s: experimental posting by %(poster_name)s to %(recipient)s" % post)
            post.update(condition)
            # Honour --out (defaults to stdout) instead of always printing.
            args.out.write("\t".join(encode(post[h]) for h in headers) + "\n")

    args.out.flush()
| 87 | + |
| 88 | + |
| 89 | + |
| 90 | + |
| 91 | + |
| 92 | + |
| 93 | + |
| 94 | + |
| 95 | + |
class Database:
    """Thin wrapper around a MySQLdb connection to a wiki database."""

    def __init__(self, *args, **kwargs):
        """Connect immediately; all arguments are passed to MySQLdb.connect()."""
        self.args = args
        self.kwargs = kwargs
        self.conn = MySQLdb.connect(*args, **kwargs)

    def getHugglingsSince(self, timestamp):
        """Yield Huggle message postings made at or after `timestamp`.

        Selects recentchanges rows in namespace 3 whose comment matches the
        Huggle "Message re. ... [[WP:HG" pattern.  Each yielded row is a
        dict with the keys id, timestamp, poster_id, poster_name, comment
        and recipient (the page title with underscores turned into spaces).
        """
        cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
        try:
            cursor.execute("""
                SELECT
                    rc_this_oldid AS id,
                    rc_timestamp AS timestamp,
                    rc_user AS poster_id,
                    rc_user_text AS poster_name,
                    rc_comment AS comment,
                    REPLACE(rc_title, "_", " ") AS recipient
                FROM recentchanges r
                WHERE rc_namespace = 3
                AND rc_new IN (0, 1)
                AND rc_timestamp >= %(timestamp)s
                AND rc_comment LIKE %(huggle)s
                """,
                {
                    'timestamp': timestamp,
                    'huggle': "Message re." + "%" + "[[WP:HG" + "%"
                }
            )
            for post in cursor:
                yield post
        finally:
            # Release the cursor when the rows are exhausted or the
            # generator is closed early.
            cursor.close()
| 126 | + |
| 127 | + |
class WPAPI:
    """Minimal client for a MediaWiki web API endpoint."""

    def __init__(self, uri):
        """Remember the API endpoint `uri` (e.g. .../w/api.php)."""
        self.uri = uri

    def getRevisionDiff(self, revId, retries=10):
        """Return the diff HTML between revision `revId` and its predecessor.

        The request is POSTed to the API and retried on HTTP errors, waiting
        2 * (attempts so far) seconds after each failure.  Returns None when
        all `retries` attempts fail with an HTTP error; any other exception
        (including an unexpected response structure) propagates.
        """
        params = urllib.urlencode({
            'action': 'query',
            'prop': 'revisions',
            'revids': revId,
            'rvprop': 'ids',
            'rvdiffto': 'prev',
            'format': 'json'
        })
        for attempt in range(retries):
            try:
                response = urllib2.urlopen(self.uri, params)
                try:
                    result = json.load(response)
                finally:
                    # Do not leak the HTTP connection.
                    response.close()
                pages = result['query']['pages']
                # Only one revision id was requested, so take the only page.
                return list(pages.values())[0]['revisions'][0]['diff']['*']
            except urllib2.HTTPError as e:
                logging.warning(
                    "HTTP error fetching diff for revision %s (attempt %s of %s): %s" %
                    (revId, attempt + 1, retries, e)
                )
                time.sleep(attempt*2)

        logging.error("Giving up on revision %s after %s attempts." % (revId, retries))
        return None
| 153 | + |
| 154 | + |
# Experimental warning templates, keyed by the part of the template name that
# follows "uw-".  The flags are merged into the output row of a posting that
# used the template.
# NOTE(review): "personal1" is flagged personal=False while
# "personal1-noimage" is personal=True -- confirm these values are intended.
WARNINGS = {
    "personal1": dict(personal=False, directives=False),
    "personal1-noimage": dict(personal=True, directives=True),
    "default1": dict(personal=False, directives=True),
}

# Matches the HTML comment marker a template leaves in the page text and
# captures the template key.
WARNING_RE = re.compile(r"<!-- Template:uw-(" + "|".join(WARNINGS) + ") -->")
| 170 | + |
# Earlier pattern, kept for reference:
#DIFF_ADD_RE = re.compile(r'<td class="diff-addedline">([^<]|(<[^/]|(</[^t]|(</t[^d]|</td[^>]))))+</td>')
# Captures the inner HTML of one "added line" cell of a diff table.
DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>')

def getAddedContent(diff):
    """Return the text added by a diff, one added line per output line.

    `diff` is the diff table HTML.  The cell contents are entity-escaped
    HTML (e.g. "&lt;!--" for "<!--"), so they are unescaped here; otherwise
    markers such as "<!-- Template:... -->" can never be found in the result.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
        unescape = HTMLParser().unescape
    except ImportError:
        from html import unescape  # Python 3

    return "\n".join(
        unescape(match.group(1)) for match in DIFF_ADD_RE.finditer(diff)
    )
| 176 | + |
def getCondition(message):
    """Return the WARNINGS entry for the template marker found in `message`.

    Returns None when `message` contains no experimental template marker.
    """
    match = WARNING_RE.search(message)
    if match is None:
        return None
    return WARNINGS[match.group(1)]
| 183 | + |
def getConditionFromDiff(diff):
    """Return the experimental condition for the text added by `diff`, or None."""
    return getCondition(getAddedContent(diff))
| 187 | + |
| 188 | + |
def encode(v):
    """Render `v` as one field of a tab-separated output row.

    None becomes the two-character marker backslash-N.  Long integers are
    converted to int and unicode strings are encoded as UTF-8 before the
    value is stringified and passed through the "string-escape" codec.
    """
    if v is None:
        return "\\N"

    if isinstance(v, types.LongType):
        v = int(v)
    elif isinstance(v, types.UnicodeType):
        v = v.encode('utf-8')

    return str(v).encode("string-escape")
| 196 | + |
| 197 | + |
| 198 | + |
| 199 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/newbie_warnings/get_contribs.py |
— | — | @@ -0,0 +1,99 @@ |
| 2 | +import sys, subprocess, os, random, logging, argparse |
| 3 | +from StringIO import StringIO |
| 4 | + |
# Path prefix of the PHP contribs scripts; main() appends the chosen tense
# and ".php" to form the script to run.
staeiouScriptPrefix = "/home/staeiou/contribs-peachy/REL0_1BETA/contribs-"

def isDir(d):
    """argparse type: expand `~` in `d` and require an existing directory.

    Returns the expanded path.  Raises argparse.ArgumentTypeError otherwise,
    so argparse reports a usage error instead of a traceback (and the check
    is not stripped when Python runs with -O, as an assert would be).
    """
    d = os.path.expanduser(d)
    if not os.path.isdir(d):
        raise argparse.ArgumentTypeError("%r is not a directory" % d)
    return d
| 11 | + |
def tense(s):
    """argparse type: accept only 'before' or 'after' and return it unchanged.

    Raises argparse.ArgumentTypeError for any other value, so argparse
    reports a usage error instead of an AssertionError traceback.
    """
    if s not in ('before', 'after'):
        raise argparse.ArgumentTypeError("%r is not one of 'before' or 'after'" % s)
    return s
| 15 | + |
def main():
    """Run the contribs PHP script once per input line, saving each result.

    Each non-blank input line must hold a user name and a timestamp
    separated by a tab.  For every line the script
    ``staeiouScriptPrefix + tense + ".php"`` is run with those two values
    and its standard output is written to a randomly named ``.html`` file in
    the output directory.  A "." (success) or "!" (failure) is written to
    stderr per line.  A malformed non-blank line is logged and re-raised.
    """

    parser = argparse.ArgumentParser(
        description='Gathers a user\'s contribs surrounding a date into an html file'
    )
    parser.add_argument(
        'tense',
        type=tense,
        help='the chronological direction to look for contribs (before or after)'
    )
    parser.add_argument(
        '-u', '--uri',
        type=str,
        help='the uri for the mediawiki API',
        default="http://en.wikipedia.org/w/api.php"
    )
    parser.add_argument(
        '-i', '--input',
        type=lambda fn:open(os.path.expanduser(fn), "r"),
        help='the input file to find users and timestamps (defaults to stdin)',
        default=sys.stdin
    )
    parser.add_argument(
        '-o', '--output_dir',
        type=isDir,
        help='Where should the output files be written (defaults to current directory)',
        default=os.getcwd()
    )
    args = parser.parse_args()

    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    scriptName = staeiouScriptPrefix + args.tense + ".php"

    logging.debug("Script name: %s" % scriptName)

    successes = 0
    errors = 0
    for lineNumber, line in enumerate(args.input.read().split("\n"), 1):
        if not line.strip():
            # A trailing newline yields an empty final element; skip blank
            # lines rather than crash on them.
            continue

        try:
            userText, timestamp = line.strip().split("\t")
        except Exception as e:
            logging.error("Error occured while processing line %s:'%s' in input: %s" % (lineNumber, line, e))
            # Bare raise keeps the original traceback.
            raise

        # Pick a random output file name that is not already in use.
        outFileName = os.path.join(args.output_dir, str(round(random.random(), 7))[2:] + ".html")
        while os.path.exists(outFileName):
            logging.warning("File name mismatch, re-randomizing.")
            outFileName = os.path.join(args.output_dir, str(round(random.random(), 7))[2:] + ".html")

        try:
            with open(outFileName, "w") as outFile:
                # Argument list, no shell: user names may contain spaces,
                # quotes or shell metacharacters.
                process = subprocess.Popen(
                    ['php', scriptName, userText, timestamp],
                    stdout=outFile,
                    stderr=subprocess.PIPE
                )
                _, error = process.communicate()

            if process.returncode != 0:
                logging.error("The subscript exited with an error: %s" % error)
                errors += 1
                LOGGING_STREAM.write("!")
            else:
                successes += 1
                LOGGING_STREAM.write(".")
        except Exception as e:
            logging.error("An error occurred while running subscript: %s" % e)
            LOGGING_STREAM.write("!")
            errors += 1

        #if (successes + errors) % 100 == 0:
        #    logging.info("Processed %s users. %s successful and %s errorred" % (successes + errors, successes, errors))
| 95 | + |
| 96 | + |
| 97 | + |
| 98 | + |
| 99 | +if __name__ == "__main__": |
| 100 | + main() |