r99600 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r99599‎ \| r99600 \| r99601 >
Date:	23:43, 11 October 2011
Author:	halfak
Status:	deferred
Tags:
Comment:	Added new huggle scripts for new huggling stuffs.
Modified paths:	/trunk/tools/wsor/newbie_warnings/R/outcomes.R (modified) (history) /trunk/tools/wsor/newbie_warnings/get_contribs.py (added) (history) /trunk/tools/wsor/newbie_warnings/get_hugglings.v2.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/newbie_warnings/R/outcomes.R
—	—	@@ -113,27 +113,27 @@
114	114	cat("============================================================\n")
115	115
116	116	print(summary(glm(
117		~~- good_outcome ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,~~
	117	+ good_outcome ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
118	118	data = group_codings
119	119	)))
120	120
121	121	print(summary(glm(
122		~~- improves ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,~~
	122	+ improves ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
123	123	data = group_codings
124	124	)))
125	125
126	126	print(summary(glm(
127		~~- contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,~~
	127	+ contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
128	128	data = group_codings
129	129	)))
130	130
131	131	print(summary(glm(
132		~~- good_contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,~~
	132	+ good_contact ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
133	133	data = group_codings
134	134	)))
135	135
136	136	print(summary(glm(
137		~~- stay ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal,~~
	137	+ stay ~ anon + ntalk_edits_before_msg + talk_edits_before_msg + teaching * personal * image,
138	138	data = group_codings
139	139	)))
140	140
—	—	@@ -203,3 +203,123 @@
204	204	))
205	205	dev.off()
206	206	}
	207	+
	208	+messaged_codings$default = !messaged_codings$personal & !messaged_codings$teaching
	209	+messaged_codings$teaching_only = messaged_codings$teaching & !messaged_codings$personal
	210	+messaged_codings$personal_only = !messaged_codings$teaching & messaged_codings$personal
	211	+messaged_codings$teaching_and_personal = messaged_codings$teaching & messaged_codings$personal
	212	+
	213	+s = scale
	214	+
	215	+for(condition in c("teaching_only", "personal_only", "teaching_and_personal")){
	216	+ cat("-----------------------------------------------------------\n")
	217	+ cat("-----------", condition, "\n")
	218	+ cat("-----------------------------------------------------------\n")
	219	+ exp_codings = messaged_codings[
	220	+ messaged_codings[[condition]] \|
	221	+ messaged_codings$default,
	222	+ ]
	223	+
	224	+ exp_codings$condition = exp_codings[[condition]]
	225	+
	226	+ print(summary(glm(
	227	+ good_outcome ~
	228	+ anon +
	229	+ s(ntalk_edits_before_msg) +
	230	+ s(talk_edits_before_msg) +
	231	+ s(before_rating) *
	232	+ condition,
	233	+ data = exp_codings[exp_codings$image,]
	234	+ )))
	235	+ print(summary(glm(
	236	+ good_outcome ~
	237	+ anon +
	238	+ s(ntalk_edits_before_msg) +
	239	+ s(talk_edits_before_msg) +
	240	+ s(before_rating) *
	241	+ condition,
	242	+ data = exp_codings[!exp_codings$image,]
	243	+ )))
	244	+
	245	+
	246	+ print(summary(glm(
	247	+ improves ~
	248	+ anon +
	249	+ s(ntalk_edits_before_msg) +
	250	+ s(talk_edits_before_msg) +
	251	+ s(before_rating) *
	252	+ condition,
	253	+ data = exp_codings[exp_codings$image,]
	254	+ )))
	255	+ print(summary(glm(
	256	+ improves ~
	257	+ anon +
	258	+ s(ntalk_edits_before_msg) +
	259	+ s(talk_edits_before_msg) +
	260	+ s(before_rating) *
	261	+ condition,
	262	+ data = exp_codings[!exp_codings$image,]
	263	+ )))
	264	+
	265	+
	266	+ print(summary(glm(
	267	+ contact ~
	268	+ anon +
	269	+ s(ntalk_edits_before_msg) +
	270	+ s(talk_edits_before_msg) +
	271	+ s(before_rating) *
	272	+ condition,
	273	+ data = exp_codings[exp_codings$image,]
	274	+ )))
	275	+ print(summary(glm(
	276	+ contact ~
	277	+ anon +
	278	+ s(ntalk_edits_before_msg) +
	279	+ s(talk_edits_before_msg) +
	280	+ s(before_rating) *
	281	+ condition,
	282	+ data = exp_codings[!exp_codings$image,]
	283	+ )))
	284	+
	285	+
	286	+ print(summary(glm(
	287	+ good_contact ~
	288	+ anon +
	289	+ s(ntalk_edits_before_msg) +
	290	+ s(talk_edits_before_msg) +
	291	+ s(before_rating) *
	292	+ condition,
	293	+ data = exp_codings[exp_codings$image,]
	294	+ )))
	295	+ print(summary(glm(
	296	+ good_contact ~
	297	+ anon +
	298	+ s(ntalk_edits_before_msg) +
	299	+ s(talk_edits_before_msg) +
	300	+ s(before_rating) *
	301	+ condition,
	302	+ data = exp_codings[!exp_codings$image,]
	303	+ )))
	304	+
	305	+
	306	+ print(summary(glm(
	307	+ stay ~
	308	+ anon +
	309	+ s(ntalk_edits_before_msg) +
	310	+ s(talk_edits_before_msg) +
	311	+ s(before_rating) *
	312	+ condition,
	313	+ data = exp_codings[exp_codings$image,]
	314	+ )))
	315	+ print(summary(glm(
	316	+ stay ~
	317	+ anon +
	318	+ s(ntalk_edits_before_msg) +
	319	+ s(talk_edits_before_msg) +
	320	+ s(before_rating) *
	321	+ condition,
	322	+ data = exp_codings[!exp_codings$image,]
	323	+ )))
	324	+}
	325	+
	326	+
Index: trunk/tools/wsor/newbie_warnings/get_hugglings.v2.py
—	—	@@ -0,0 +1,198 @@
	2	+import json, urllib2, re, argparse, os, MySQLdb, MySQLdb.cursors, sys
	3	+import logging, urllib, types, time
	4	+
	5	+def main():
	6	+ parser = argparse.ArgumentParser(
	7	+ description='Gathers huggle messages after a specified date that contain experimental comments'
	8	+ )
	9	+ parser.add_argument(
	10	+ 'since',
	11	+ type=str,
	12	+ help='a date string to search for hugglings after'
	13	+ )
	14	+ parser.add_argument(
	15	+ '-u', '--uri',
	16	+ type=str,
	17	+ help='the uri for the mediawiki API',
	18	+ default="http://en.wikipedia.org/w/api.php"
	19	+ )
	20	+ parser.add_argument(
	21	+ '-c', '--cnf',
	22	+ metavar="<path>",
	23	+ type=str,
	24	+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
	25	+ default=os.path.expanduser("~/.my.cnf")
	26	+ )
	27	+ parser.add_argument(
	28	+ '-s', '--host',
	29	+ type=str,
	30	+ help='the database host to connect to (defaults to localhost)',
	31	+ default="localhost"
	32	+ )
	33	+ parser.add_argument(
	34	+ '-d', '--db',
	35	+ type=str,
	36	+ help='the language db to run the query in (defaults to enwiki)',
	37	+ default="enwiki"
	38	+ )
	39	+ parser.add_argument(
	40	+ '-o', '--out',
	41	+ type=lambda fn:open(fn, 'a+'),
	42	+ help='Where should output be appended',
	43	+ default=sys.stdout
	44	+ )
	45	+ args = parser.parse_args()
	46	+
	47	+ LOGGING_STREAM = sys.stderr
	48	+ logging.basicConfig(
	49	+ level=logging.DEBUG,
	50	+ stream=LOGGING_STREAM,
	51	+ format='%(asctime)s %(levelname)-8s %(message)s',
	52	+ datefmt='%b-%d %H:%M:%S'
	53	+ )
	54	+
	55	+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
	56	+ db = Database(
	57	+ host=args.host,
	58	+ db=args.db,
	59	+ read_default_file=args.cnf
	60	+ )
	61	+ wp = WPAPI(args.uri)
	62	+
	63	+ headers = [
	64	+ 'id',
	65	+ 'timestamp',
	66	+ 'poster_id',
	67	+ 'poster_name',
	68	+ 'recipient',
	69	+ 'personal',
	70	+ 'directives'
	71	+ ]
	72	+ #print("\t".join(headers))
	73	+
	74	+ logging.info("Getting huggling messages.")
	75	+ for post in db.getHugglingsSince(args.since):
	76	+ try:
	77	+ diff = wp.getRevisionDiff(post['id'])
	78	+ except Exception as e:
	79	+ logging.warning("%s: error retrieving posting from API: %s" % (post['timestamp'], e))
	80	+ condition = getConditionFromDiff(diff)
	81	+ if condition == None:
	82	+ logging.debug("%(timestamp)s: non-experimental posting by %(poster_name)s to %(recipient)s" % post)
	83	+ else:
	84	+ logging.debug("%(timestamp)s: experimental posting by %(poster_name)s to %(recipient)s" % post)
	85	+ post.update(condition)
	86	+ print("\t".join(encode(post[h]) for h in headers))
	87	+
	88	+
	89	+
	90	+
	91	+
	92	+
	93	+
	94	+
	95	+
	96	+class Database:
	97	+
	98	+ def __init__(self, args, *kwargs):
	99	+ self.args = args
	100	+ self.kwargs = kwargs
	101	+ self.conn = MySQLdb.connect(args, *kwargs)
	102	+
	103	+ def getHugglingsSince(self, timestamp):
	104	+ cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
	105	+ cursor.execute("""
	106	+ SELECT
	107	+ rc_this_oldid AS id,
	108	+ rc_timestamp AS timestamp,
	109	+ rc_user AS poster_id,
	110	+ rc_user_text AS poster_name,
	111	+ rc_comment AS comment,
	112	+ REPLACE(rc_title, "_", " ") AS recipient
	113	+ FROM recentchanges r
	114	+ WHERE rc_namespace = 3
	115	+ AND rc_new IN (0, 1)
	116	+ AND rc_timestamp >= %(timestamp)s
	117	+ AND rc_comment LIKE %(huggle)s
	118	+ """,
	119	+ {
	120	+ 'timestamp': timestamp,
	121	+ 'huggle': "Message re." + "%" + "[[WP:HG" + "%"
	122	+ }
	123	+ )
	124	+ for post in cursor:
	125	+ yield post
	126	+
	127	+
	128	+class WPAPI:
	129	+
	130	+ def __init__(self, uri):
	131	+ self.uri = uri
	132	+
	133	+ def getRevisionDiff(self, revId, retries=10):
	134	+ attempt = 0
	135	+ while attempt < retries:
	136	+ try:
	137	+ response = urllib2.urlopen(
	138	+ self.uri,
	139	+ urllib.urlencode({
	140	+ 'action': 'query',
	141	+ 'prop': 'revisions',
	142	+ 'revids': revId,
	143	+ 'rvprop': 'ids',
	144	+ 'rvdiffto': 'prev',
	145	+ 'format': 'json'
	146	+ })
	147	+ )
	148	+ result = json.load(response)
	149	+ return result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
	150	+ except urllib2.HTTPError as e:
	151	+ time.sleep(attempt*2)
	152	+ attempt += 1
	153	+
	154	+
	155	+WARNINGS = {
	156	+ "personal1": {
	157	+ 'personal': False,
	158	+ 'directives': False
	159	+ },
	160	+ "personal1-noimage": {
	161	+ 'personal': True,
	162	+ 'directives': True
	163	+ },
	164	+ "default1": {
	165	+ 'personal': False,
	166	+ 'directives': True
	167	+ }
	168	+}
	169	+WARNING_RE = re.compile(r"<!-- Template:uw-(" + "\|".join(WARNINGS.keys()) + ") -->")
	170	+
	171	+#DIFF_ADD_RE = re.compile(r'<td class="diff-addedline">([^<]\|(<[^/]\|(</[^t]\|(</t[^d]\|</td[^>]))))+</td>')
	172	+DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>')
	173	+
	174	+def getAddedContent(diff):
	175	+ return "\n".join(match.group(1) for match in DIFF_ADD_RE.finditer(diff))
	176	+
	177	+def getCondition(message):
	178	+ match = WARNING_RE.search(message)
	179	+ if match == None:
	180	+ return None
	181	+ else:
	182	+ return WARNINGS[match.group(1)]
	183	+
	184	+def getConditionFromDiff(diff):
	185	+ content = getAddedContent(diff)
	186	+ return getCondition(content)
	187	+
	188	+
	189	+def encode(v):
	190	+ if v == None: return "\N"
	191	+
	192	+ if type(v) == types.LongType: v = int(v)
	193	+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
	194	+
	195	+ return str(v).encode("string-escape")
	196	+
	197	+
	198	+
	199	+if __name__ == "__main__": main()
Index: trunk/tools/wsor/newbie_warnings/get_contribs.py
—	—	@@ -0,0 +1,99 @@
	2	+import sys, subprocess, os, random, logging, argparse
	3	+from StringIO import StringIO
	4	+
	5	+staeiouScriptPrefix = "/home/staeiou/contribs-peachy/REL0_1BETA/contribs-"
	6	+
	7	+def isDir(d):
	8	+ d = os.path.expanduser(d)
	9	+ assert os.path.isdir(d)
	10	+ return d
	11	+
	12	+def tense(s):
	13	+ assert s in ('before', 'after')
	14	+ return s
	15	+
	16	+def main():
	17	+
	18	+ parser = argparse.ArgumentParser(
	19	+ description='Gathers a user\'s contribs surrounding a date into an html file'
	20	+ )
	21	+ parser.add_argument(
	22	+ 'tense',
	23	+ type=tense,
	24	+ help='the chronological direction to look for contribs (before or after)'
	25	+ )
	26	+ parser.add_argument(
	27	+ '-u', '--uri',
	28	+ type=str,
	29	+ help='the uri for the mediawiki API',
	30	+ default="http://en.wikipedia.org/w/api.php"
	31	+ )
	32	+ parser.add_argument(
	33	+ '-i', '--input',
	34	+ type=lambda fn:open(os.path.expanduser(fn), "r"),
	35	+ help='the input file to find users and timestamps (defaults to stdin)',
	36	+ default=sys.stdin
	37	+ )
	38	+ parser.add_argument(
	39	+ '-o', '--output_dir',
	40	+ type=isDir,
	41	+ help='Where should the output files be written (defaults to current directory)',
	42	+ default=os.getcwd()
	43	+ )
	44	+ args = parser.parse_args()
	45	+
	46	+ LOGGING_STREAM = sys.stderr
	47	+ logging.basicConfig(
	48	+ level=logging.DEBUG,
	49	+ stream=LOGGING_STREAM,
	50	+ format='%(asctime)s %(levelname)-8s %(message)s',
	51	+ datefmt='%b-%d %H:%M:%S'
	52	+ )
	53	+
	54	+ scriptName = staeiouScriptPrefix + args.tense + ".php"
	55	+
	56	+ logging.debug("Script name: %s" % scriptName)
	57	+
	58	+ successes = 0
	59	+ errors = 0
	60	+ for line in args.input.read().split("\n"):
	61	+ try:
	62	+ userText, timestamp = line.strip().split("\t")
	63	+ except Exception as e:
	64	+ logging.error("Error occured while processing line %s:'%s' in input: %s" % (successes+errors+1,line, e))
	65	+ raise e
	66	+
	67	+ outFileName = os.path.join(args.output_dir, str(round(random.random(), 7))[2:] + ".html")
	68	+ while os.path.exists(outFileName):
	69	+ logging.warning("File name mismatch, re-randomizing.")
	70	+ outFileName = os.path.join(args.output_dir, str(round(random.random(), 7))[2:] + ".html")
	71	+
	72	+ try:
	73	+ outFile = open(outFileName, "w")
	74	+ process = subprocess.Popen(
	75	+ " ".join(['php', scriptName, userText, timestamp, ">", outFileName]),
	76	+ shell=True,
	77	+ stderr=open('/dev/null', "w")
	78	+ )
	79	+ #error = process.stderr.read()
	80	+ if process.wait() != 0:
	81	+ logging.error("The subscript exited with an error: %s" % error)
	82	+ errors += 1
	83	+ LOGGING_STREAM.write("!")
	84	+ else:
	85	+ successes += 1
	86	+ LOGGING_STREAM.write(".")
	87	+ except Exception as e:
	88	+ logging.error("An error occurred while running subscript: %s" % e)
	89	+ LOGGING_STREAM.write("!")
	90	+ errors += 1
	91	+
	92	+
	93	+ #if (successes + errors) % 100 == 0:
	94	+ # logging.info("Processed %s users. %s successful and %s errorred" % (successes + errors, successes, errors))
	95	+
	96	+
	97	+
	98	+
	99	+if __name__ == "__main__":
	100	+ main()

Status & tagging log

21:16, 21 October 2011 Reedy (talk | contribs) changed the status of r99600 [removed: new added: deferred]