r99600 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r99599‎ | r99600 | r99601 >
Date:23:43, 11 October 2011
Author:halfak
Status:deferred
Tags:
Comment:
Added new huggle scripts for new huggling stuffs.
Modified paths:
  • /trunk/tools/wsor/newbie_warnings/R/outcomes.R (modified) (history)
  • /trunk/tools/wsor/newbie_warnings/get_contribs.py (added) (history)
  • /trunk/tools/wsor/newbie_warnings/get_hugglings.v2.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/newbie_warnings/R/outcomes.R
@@ -113,27 +113,27 @@
114114 cat("============================================================\n")
115115
116116 print(summary(glm(
117 - good_outcome ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,
 117+ good_outcome ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
118118 data = group_codings
119119 )))
120120
121121 print(summary(glm(
122 - improves ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,
 122+ improves ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
123123 data = group_codings
124124 )))
125125
126126 print(summary(glm(
127 - contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,
 127+ contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
128128 data = group_codings
129129 )))
130130
131131 print(summary(glm(
132 - good_contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,
 132+ good_contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
133133 data = group_codings
134134 )))
135135
136136 print(summary(glm(
137 - stay ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,
 137+ stay ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
138138 data = group_codings
139139 )))
140140
@@ -203,3 +203,123 @@
204204 ))
205205 dev.off()
206206 }
 207+
# Derive mutually exclusive message conditions from the two treatment flags
# (`personal` and `teaching`) already present on messaged_codings.
messaged_codings$default = !messaged_codings$personal & !messaged_codings$teaching
messaged_codings$teaching_only = messaged_codings$teaching & !messaged_codings$personal
messaged_codings$personal_only = !messaged_codings$teaching & messaged_codings$personal
messaged_codings$teaching_and_personal = messaged_codings$teaching & messaged_codings$personal

# Short alias so the model formula below stays readable.
s = scale

# Outcome columns that get one model per condition and image subset.
outcomes = c("good_outcome", "improves", "contact", "good_contact", "stay")

for(condition in c("teaching_only", "personal_only", "teaching_and_personal")){
    cat("-----------------------------------------------------------\n")
    cat("-----------", condition, "\n")
    cat("-----------------------------------------------------------\n")

    # Compare the rows in this condition against the default-message rows only.
    exp_codings = messaged_codings[
        messaged_codings[[condition]] |
        messaged_codings$default,
    ]

    # TRUE for rows in the experimental condition, FALSE for default rows.
    exp_codings$condition = exp_codings[[condition]]

    for(outcome in outcomes){
        # Same right-hand side for every outcome; only the response changes.
        # NOTE: the "Call:" line of each summary now shows `model_formula`
        # instead of the inline formula, hence the explicit header below.
        model_formula = as.formula(paste(
            outcome,
            "~ anon +",
            "s(ntalk_edits_before_msg) +",
            "s(talk_edits_before_msg) +",
            "s(before_rating) * condition"
        ))

        # Fit once on the image rows, then once on the non-image rows.
        for(with_image in c(TRUE, FALSE)){
            image_rows = if(with_image) exp_codings$image else !exp_codings$image
            cat("-----------", outcome, if(with_image) "(image)" else "(no image)", "\n")
            print(summary(glm(
                model_formula,
                data = exp_codings[image_rows,]
            )))
        }
    }
}
 325+
 326+
Index: trunk/tools/wsor/newbie_warnings/get_hugglings.v2.py
@@ -0,0 +1,198 @@
 2+import json, urllib2, re, argparse, os, MySQLdb, MySQLdb.cursors, sys
 3+import logging, urllib, types, time
 4+
def main():
    """Write experimental Huggle talk-page postings as tab-separated rows.

    Parses the command line, queries the database for Huggle postings made
    since the given timestamp, fetches each posting's diff from the MediaWiki
    API and, when the added content carries one of the experimental template
    markers, writes one row (columns as listed in ``headers``) to ``--out``.
    Progress and problems are logged to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Gathers huggle messages after a specified date that contain experimental comments'
    )
    parser.add_argument(
        'since',
        type=str,
        help='a date string to search for hugglings after'
    )
    parser.add_argument(
        '-u', '--uri',
        type=str,
        help='the uri for the mediawiki API',
        default="http://en.wikipedia.org/w/api.php"
    )
    parser.add_argument(
        '-c', '--cnf',
        metavar="<path>",
        type=str,
        help='the path to MySQL config info (defaults to ~/.my.cnf)',
        default=os.path.expanduser("~/.my.cnf")
    )
    parser.add_argument(
        '-s', '--host',
        type=str,
        help='the database host to connect to (defaults to localhost)',
        default="localhost"
    )
    parser.add_argument(
        '-d', '--db',
        type=str,
        help='the language db to run the query in (defaults to enwiki)',
        default="enwiki"
    )
    parser.add_argument(
        '-o', '--out',
        type=lambda fn:open(fn, 'a+'),
        help='Where should output be appended',
        default=sys.stdout
    )
    args = parser.parse_args()

    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
    db = Database(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf
    )
    wp = WPAPI(args.uri)

    # Output columns, in order.  No header row is written because the output
    # file is opened for appending.
    headers = [
        'id',
        'timestamp',
        'poster_id',
        'poster_name',
        'recipient',
        'personal',
        'directives'
    ]

    logging.info("Getting huggling messages.")
    for post in db.getHugglingsSince(args.since):
        try:
            diff = wp.getRevisionDiff(post['id'])
        except Exception as e:
            logging.warning("%s: error retrieving posting from API: %s" % (post['timestamp'], e))
            # Without a diff there is nothing to classify; previously the loop
            # fell through and reused an undefined or stale `diff`.
            continue

        if diff is None:
            # getRevisionDiff returns None once its retries are exhausted.
            logging.warning("%s: no diff available for revision %s" % (post['timestamp'], post['id']))
            continue

        condition = getConditionFromDiff(diff)
        if condition is None:
            logging.debug("%(timestamp)s: non-experimental posting by %(poster_name)s to %(recipient)s" % post)
        else:
            logging.debug("%(timestamp)s: experimental posting by %(poster_name)s to %(recipient)s" % post)
            post.update(condition)
            # Honour --out (defaults to stdout) instead of always printing.
            args.out.write("\t".join(encode(post[h]) for h in headers) + "\n")

    args.out.flush()
 87+
 88+
 89+
 90+
 91+
 92+
 93+
 94+
 95+
class Database:
    """Wrapper around a MySQLdb connection used to read recent changes."""

    def __init__(self, *args, **kwargs):
        """Connect immediately; all arguments are passed to MySQLdb.connect."""
        self.args = args
        self.kwargs = kwargs
        self.conn = MySQLdb.connect(*args, **kwargs)

    def getHugglingsSince(self, timestamp):
        """Yield one dict per Huggle user-talk posting at or after `timestamp`.

        Rows come from `recentchanges`, restricted to namespace 3 and to
        edit comments matching the Huggle "Message re. ... [[WP:HG" pattern.
        Each dict has the keys id, timestamp, poster_id, poster_name, comment
        and recipient (the page title with underscores replaced by spaces).
        """
        cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
        try:
            cursor.execute("""
                SELECT
                    rc_this_oldid AS id,
                    rc_timestamp AS timestamp,
                    rc_user AS poster_id,
                    rc_user_text AS poster_name,
                    rc_comment AS comment,
                    REPLACE(rc_title, "_", " ") AS recipient
                FROM recentchanges r
                WHERE rc_namespace = 3
                AND rc_new IN (0, 1)
                AND rc_timestamp >= %(timestamp)s
                AND rc_comment LIKE %(huggle)s
                """,
                {
                    'timestamp': timestamp,
                    # LIKE wildcards are part of the bound value, not the SQL.
                    'huggle': "Message re." + "%" + "[[WP:HG" + "%"
                }
            )
            for post in cursor:
                yield post
        finally:
            # Release the cursor even if the consumer stops iterating early.
            cursor.close()
 126+
 127+
class WPAPI:
    """Minimal client for the MediaWiki API endpoint at `uri`."""

    def __init__(self, uri):
        self.uri = uri

    def getRevisionDiff(self, revId, retries=10):
        """Return the HTML diff between revision `revId` and its predecessor.

        The request is retried up to `retries` times on urllib2.HTTPError,
        sleeping 0, 2, 4, ... seconds between attempts.  Returns None (and
        logs a warning) when every attempt failed or `retries` is 0.  Other
        exceptions, such as an unexpected response structure, propagate to
        the caller.
        """
        attempt = 0
        lastError = None
        while attempt < retries:
            try:
                response = urllib2.urlopen(
                    self.uri,
                    urllib.urlencode({
                        'action': 'query',
                        'prop': 'revisions',
                        'revids': revId,
                        'rvprop': 'ids',
                        'rvdiffto': 'prev',
                        'format': 'json'
                    })
                )
                try:
                    result = json.load(response)
                finally:
                    # Do not leak the connection, whatever json.load does.
                    response.close()
                # The result holds a single page keyed by its page id.
                pages = result['query']['pages']
                return list(pages.values())[0]['revisions'][0]['diff']['*']
            except urllib2.HTTPError as e:
                lastError = e
                attempt += 1
                # Back off before the next try, but not after the last one.
                if attempt < retries:
                    time.sleep((attempt - 1) * 2)

        logging.warning(
            "Giving up on revision %s after %s attempts: %s" % (revId, retries, lastError)
        )
        return None
 153+
 154+
# Experimental condition flags attached to a posting, keyed by the suffix of
# the uw-* template marker found in the posted message.
# NOTE(review): "personal1" maps to personal=False while "personal1-noimage"
# maps to personal=True -- confirm these flag values are intended.
WARNINGS = {
    "personal1": {'personal': False, 'directives': False},
    "personal1-noimage": {'personal': True, 'directives': True},
    "default1": {'personal': False, 'directives': True},
}

# Matches the HTML-escaped template comment for any known warning; group 1 is
# the key into WARNINGS.
WARNING_RE = re.compile(
    r"&lt;!-- Template:uw-(%s) --&gt;" % "|".join(WARNINGS.keys())
)

# Earlier variant of the added-line pattern, kept for reference:
#DIFF_ADD_RE = re.compile(r'<td class="diff-addedline">([^<]|(<[^/]|(</[^t]|(</t[^d]|</td[^>]))))+</td>')
# Captures the content of one added-line cell of the API's HTML diff.
DIFF_ADD_RE = re.compile(
    r'<td class="diff-addedline"><div>(.+)</div></td>'
)
 173+
def getAddedContent(diff):
    """Return the text of every added-line cell in `diff`, newline-joined."""
    addedLines = [found.group(1) for found in DIFF_ADD_RE.finditer(diff)]
    return "\n".join(addedLines)
 176+
def getCondition(message):
    """Return the WARNINGS entry whose template marker appears in `message`.

    Returns None when no known marker is found.
    """
    found = WARNING_RE.search(message)
    if found is not None:
        return WARNINGS[found.group(1)]
    return None
 183+
def getConditionFromDiff(diff):
    """Return the experimental condition found in the added lines of `diff`.

    Only content added by the revision is inspected; returns None when it
    carries no known template marker.
    """
    return getCondition(getAddedContent(diff))
 187+
 188+
def encode(v):
    """Return `v` as an escaped byte string for one tab-separated field.

    None becomes the two-character marker backslash-N.  Unicode values are
    encoded as UTF-8 first; the result is then passed through the
    "string-escape" codec so tabs and newlines cannot break the row layout.
    """
    if v is None:
        # Spelled with an explicit backslash: "\N" only works by accident of
        # Python 2 leaving unknown escapes untouched.
        return "\\N"

    # isinstance also covers subclasses, which exact type() comparison missed.
    if isinstance(v, types.LongType):
        v = int(v)
    elif isinstance(v, types.UnicodeType):
        v = v.encode('utf-8')

    return str(v).encode("string-escape")
 196+
 197+
 198+
 199+if __name__ == "__main__": main()
Index: trunk/tools/wsor/newbie_warnings/get_contribs.py
@@ -0,0 +1,99 @@
 2+import sys, subprocess, os, random, logging, argparse
 3+from StringIO import StringIO
 4+
 5+staeiouScriptPrefix = "/home/staeiou/contribs-peachy/REL0_1BETA/contribs-"
 6+
def isDir(d):
    """argparse type: expand `~` in `d` and require an existing directory.

    Returns the expanded path.  Raises argparse.ArgumentTypeError otherwise,
    so argparse reports a usage error instead of a traceback (and the check
    is not stripped when Python runs with -O, as an assert would be).
    """
    d = os.path.expanduser(d)
    if not os.path.isdir(d):
        raise argparse.ArgumentTypeError("%r is not a directory" % d)
    return d
 11+
def tense(s):
    """argparse type: accept only 'before' or 'after' and return it unchanged.

    Raises argparse.ArgumentTypeError for anything else, so argparse reports
    a usage error instead of a traceback (and the check is not stripped when
    Python runs with -O, as an assert would be).
    """
    if s not in ('before', 'after'):
        raise argparse.ArgumentTypeError("%r is not one of 'before', 'after'" % s)
    return s
 15+
def _newOutFileName(outputDir):
    """Return a random .html path inside `outputDir` that does not exist yet."""
    while True:
        outFileName = os.path.join(outputDir, str(round(random.random(), 7))[2:] + ".html")
        if not os.path.exists(outFileName):
            return outFileName
        logging.warning("File name mismatch, re-randomizing.")


def main():
    """Run the contribs PHP script once per input line.

    Each non-blank input line must hold a user name and a timestamp separated
    by a tab.  For every line the "before" or "after" script (chosen by the
    `tense` argument) is run with those two values and its standard output is
    saved to a randomly named .html file in the output directory.  A "." or
    "!" is written to stderr per user to show success or failure.
    """

    parser = argparse.ArgumentParser(
        description='Gathers a user\'s contribs surrounding a date into an html file'
    )
    parser.add_argument(
        'tense',
        type=tense,
        help='the chronological direction to look for contribs (before or after)'
    )
    parser.add_argument(
        '-u', '--uri',
        type=str,
        help='the uri for the mediawiki API',
        default="http://en.wikipedia.org/w/api.php"
    )
    parser.add_argument(
        '-i', '--input',
        type=lambda fn:open(os.path.expanduser(fn), "r"),
        help='the input file to find users and timestamps (defaults to stdin)',
        default=sys.stdin
    )
    parser.add_argument(
        '-o', '--output_dir',
        type=isDir,
        help='Where should the output files be written (defaults to current directory)',
        default=os.getcwd()
    )
    args = parser.parse_args()

    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    scriptName = staeiouScriptPrefix + args.tense + ".php"

    logging.debug("Script name: %s" % scriptName)

    successes = 0
    errors = 0
    for lineNumber, line in enumerate(args.input.read().split("\n"), 1):
        # A trailing newline yields an empty last element; skip blank lines
        # rather than aborting the whole run on them.
        if not line.strip():
            continue

        try:
            userText, timestamp = line.strip().split("\t")
        except Exception as e:
            logging.error("Error occured while processing line %s:'%s' in input: %s" % (lineNumber, line, e))
            raise

        outFileName = _newOutFileName(args.output_dir)

        try:
            with open(outFileName, "w") as outFile:
                # Argument list with no shell: user names containing spaces
                # or shell metacharacters are passed through verbatim.
                process = subprocess.Popen(
                    ['php', scriptName, userText, timestamp],
                    stdout=outFile,
                    stderr=subprocess.PIPE
                )
                # Capture stderr so a failure can actually be reported.
                _, error = process.communicate()

            if process.returncode != 0:
                logging.error("The subscript exited with an error: %s" % error)
                errors += 1
                LOGGING_STREAM.write("!")
            else:
                successes += 1
                LOGGING_STREAM.write(".")
        except Exception as e:
            logging.error("An error occurred while running subscript: %s" % e)
            LOGGING_STREAM.write("!")
            errors += 1
 91+
 92+
 93+ #if (successes + errors) % 100 == 0:
 94+ # logging.info("Processed %s users. %s successful and %s errorred" % (successes + errors, successes, errors))
 95+
 96+
 97+
 98+
 99+if __name__ == "__main__":
 100+ main()

Status & tagging log