Index: trunk/tools/wsor/ts_samples/sample_talk_edits.py
@@ -0,0 +1,210 @@
+import os, sys, logging, argparse, MySQLdb
+
+# Escape a value for tab-separated output; None becomes MySQL-style \N.
+def clean(v):
+    if v is None:
+        return "\\N"
+    else:
+        # Escape backslashes first so the \t and \n escapes are not doubled.
+        return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
+
+
+def main(args):
+    LOGGING_STREAM = sys.stderr
+    logging.basicConfig(
+        level=logging.DEBUG,
+        stream=LOGGING_STREAM,
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%b-%d %H:%M:%S'
+    )
+
+    logging.info("Connecting to %s_p using %s." % (args.db, args.cnf))
+    conn = MySQLdb.connect(
+        host="%s-p.rrdb.toolserver.org" % args.db,
+        db='%s_p' % args.db,
+        read_default_file=args.cnf
+    )
+    fetchConn = MySQLdb.connect(
+        host="%s-p.rrdb.toolserver.org" % args.db,
+        db='%s_p' % args.db,
+        read_default_file=args.cnf
+    )
+
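+    # Two connections to the same replica: fetchConn streams the user scan
+    # with a server-side cursor while conn handles the nested per-user queries.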
+    #Printing headers
+    print(
+        "\t".join([
+            'user_id',
+            'username',
+            'registration',
+            'end_of_newbie',
+            'rev_id',
+            'timestamp',
+            'comment'
+        ])
+    )
+    # Progress markers on stderr: "." = user sampled, "s" = no qualifying
+    # talk-page post, "-" = user has no revisions.
+    for year in args.year:
+        logging.info("Processing %s:" % year)
+        yearCount = 0
+        for user in getUsersByYear(fetchConn, year):
+            initialRevs = list(getFirst10Revs(conn, user['user_id']))
+            if len(initialRevs) > 0:
+                endOfNoob = initialRevs[-1]['rev_timestamp']
+                talkRev = getRandNonSelfPostToTalkPage(
+                    conn,
+                    user['user_id'],
+                    user['user_name'],
+                    user['user_registration'],
+                    endOfNoob
+                )
+                if talkRev is not None:
+                    print(
+                        "\t".join(clean(v) for v in [
+                            user['user_id'],
+                            user['user_name'],
+                            user['user_registration'],
+                            endOfNoob,
+                            talkRev['rev_id'],
+                            talkRev['rev_timestamp'],
+                            talkRev['rev_comment']
+                        ])
+                    )
+                    LOGGING_STREAM.write(".")
+                    yearCount += 1
+                    if yearCount >= args.n:
+                        break
+                else:
+                    LOGGING_STREAM.write("s")
+                    #logging.debug("User %s has no talk page revisions by other users. Skipping..." % user['user_name'])
+
+            else:
+                LOGGING_STREAM.write("-")
+                #logging.debug("User %s has no revisions. Skipping..." % user['user_name'])
+
+        LOGGING_STREAM.write("\n")
+
+
+
+
+def getUsersByYear(conn, year):
+    year = int(year)
+    cursor = conn.cursor(MySQLdb.cursors.SSCursor)
+    yearBegin = "%s0000000000" % year
+    yearEnd = "%s1231235959" % year  # 23:59:59 on Dec 31st, not 11:59:59
+    cursor.execute("""
+        SELECT * FROM user
+        WHERE user_registration BETWEEN %(year_begin)s AND %(year_end)s
+        ORDER BY RAND()
+        """,
+        {
+            'year_begin': yearBegin,
+            'year_end': yearEnd
+        }
+    )
+    for row in cursor:
+        yield dict(
+            zip(
+                (d[0] for d in cursor.description),
+                row
+            )
+        )
+
+
+
+
+def getFirst10Revs(conn, userId):
+    user_id = int(userId)
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT * FROM revision
+        WHERE rev_user = %(user_id)s
+        ORDER BY rev_timestamp ASC
+        LIMIT 10
+        """,
+        {
+            'user_id': user_id
+        }
+    )
+    for row in cursor:
+        yield dict(
+            zip(
+                (d[0] for d in cursor.description),
+                row
+            )
+        )
+
+def getRandNonSelfPostToTalkPage(conn, userId, username, start, end):
+    pageId = getTalkPageId(conn, username)
+    if pageId is None:
+        return None
+    else:
+        cursor = conn.cursor()
+        cursor.execute("""
+            SELECT * FROM revision
+            WHERE rev_page = %(page_id)s
+            AND rev_user != %(user_id)s
+            AND rev_timestamp BETWEEN %(start)s AND %(end)s
+            ORDER BY RAND()
+            LIMIT 1
+            """,
+            {
+                'page_id': pageId,
+                'user_id': userId,
+                'start': start,
+                'end': end
+            }
+        )
+        for rev in cursor:
+            return dict(
+                zip(
+                    (d[0] for d in cursor.description),
+                    rev
+                )
+            )
+
+        return None
+
+
+def getTalkPageId(conn, title):
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT page_id FROM page
+        WHERE page_title = %(title)s
+        AND page_namespace = 3
+        """,
+        {
+            'title': title.replace(" ", "_")  # page_title stores underscores; user_name stores spaces
+        }
+    )
+    # Namespace 3 is User_talk; return the first (only) match, if any.
+    for page in cursor:
+        return page[0]
+
+    return None
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=
+            'Samples editors by the year they made their first edit.'
+    )
+    parser.add_argument(
+        'n',
+        type=int,
+        help='the number of editors to sample from each year'
+    )
+    parser.add_argument(
+        'year',
+        type=int,
+        help='year(s) to sample from',
+        nargs="+"
+    )
+    parser.add_argument(
+        '-c', '--cnf',
+        metavar="<path>",
+        type=str,
+        help='the path to MySQL config info (defaults to ~/.my.cnf)',
+        default=os.path.expanduser("~/.my.cnf")
+    )
+    parser.add_argument(
+        '-d', '--db',
+        type=str,
+        help='the language db to run the query in (defaults to enwiki)',
+        default="enwiki"
+    )
+    args = parser.parse_args()
+    main(args)
Index: trunk/tools/wsor/ts_samples/testing.sql
@@ -0,0 +1,25 @@
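+-- Scratch queries comparing two ways to sample users who registered in a
+-- given year: a derived user_meta table indexed on reg_year vs. a BETWEEN
+-- range scan on user_registration, both ordered by RAND().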
+CREATE TABLE u_grphack.user_meta (
+    user_id INT,
+    username VARCHAR(255),
+    registration VARCHAR(14),
+    reg_year INT
+);
+
+INSERT INTO u_grphack.user_meta
+SELECT user_id, user_name, user_registration, SUBSTRING(user_registration, 1, 4)
+FROM user;
+
+CREATE INDEX user_meta_pkey ON u_grphack.user_meta (user_id) USING BTREE;
+CREATE INDEX user_meta_reg_year ON u_grphack.user_meta (reg_year) USING BTREE;
+
+
+
+
+EXPLAIN SELECT * FROM u_grphack.user_meta
+WHERE reg_year = 2004
+ORDER BY RAND();
+
+EXPLAIN SELECT * FROM user
+-- end of year is 23:59:59, not 11:59:59
+WHERE user_registration BETWEEN "20040000000000" AND "20041231235959"
+ORDER BY RAND()
+LIMIT 10;
Index: trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
@@ -0,0 +1,152 @@
+import os, sys, logging, argparse, MySQLdb, urllib2, urllib, json
+
+# Escape a value for tab-separated output; None becomes MySQL-style \N.
+def clean(v):
+    if v is None:
+        return "\\N"
+    else:
+        # Escape backslashes first so the \t and \n escapes are not doubled.
+        return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
+
+
+def main(args):
+    LOGGING_STREAM = sys.stderr
+    logging.basicConfig(
+        level=logging.DEBUG,
+        stream=LOGGING_STREAM,
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%b-%d %H:%M:%S'
+    )
+
+    logging.info("Reading from %s." % args.input)
+
+    #Print header
+    print(
+        "\t".join([
+            'rev_id',
+            'diff'
+        ])
+    )
+
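+    # One API request is issued per revision; "<" and ">" written to stderr
+    # bracket each request.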
+    rowBuffer = []
+    for row in readTSVFile(args.input):
+        LOGGING_STREAM.write("<")
+        print(
+            "\t".join([
+                row['rev_id'],
+                (getSingleDiff(args.uri, row['rev_id']) or "").replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t")
+            ])
+        )
+        LOGGING_STREAM.write(">")
+        #rowBuffer.append(row)
+        #if len(rowBuffer) == 50:
+        #    LOGGING_STREAM.write("\n")
+        #    diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
+        #    for row in rowBuffer:
+        #        LOGGING_STREAM.write(">")
+        #        print(
+        #            "\t".join([
+        #                row['rev_id'],
+        #                diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
+        #            ])
+        #        )
+        #
+        #    rowBuffer = []
+        #
+        #    LOGGING_STREAM.write("\n")
+
+    LOGGING_STREAM.write("\n")
+    #diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
+    #for row in rowBuffer:
+    #    LOGGING_STREAM.write(">")
+    #    print(
+    #        "\t".join([
+    #            row['rev_id'],
+    #            diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
+    #        ])
+    #    )
+    #
+    #LOGGING_STREAM.write("\n")
+
+
+
+def getSingleDiff(uri, revId):
+    response = urllib2.urlopen(
+        uri,
+        urllib.urlencode({
+            'action': 'query',
+            'prop': 'revisions',
+            'revids': revId,
+            'rvprop': 'ids|content',
+            'rvdiffto': 'prev',
+            'format': 'json'
+        })
+    ).read()
+    result = json.loads(response)
+    try:
+        for page in result['query']['pages'].values():
+            for rev in page['revisions']:
+                return rev['diff'].get("*", "").encode('utf-8')
+    except Exception:
+        logging.error(response)
+        logging.error(result)
+        raise
+
+    # No pages/revisions in the response.
+    return None
+def buildDiffMap(uri, revIds):
+    if len(revIds) == 0:
+        return {}
+    else:
+        response = urllib2.urlopen(
+            uri,
+            urllib.urlencode({
+                'action': 'query',
+                'prop': 'revisions',
+                'revids': '|'.join(revIds),
+                'rvprop': 'ids|content',
+                'rvdiffto': 'prev',
+                'format': 'json'
+            })
+        ).read()
+        result = json.loads(response)
+        diffMap = {}
+        try:
+            for page in result['query']['pages'].values():
+                for rev in page['revisions']:
+                    diffMap[str(rev['revid'])] = rev['diff'].get("*", "")
+        except Exception:
+            logging.error(response)
+            logging.error(result)
+            raise
+
+        return diffMap
+
+
+
+def readTSVFile(f):
+    headers = f.readline().strip().split("\t")
+    for line in f:
+        values = line.strip().split("\t")
+        yield dict(zip(headers, values))
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=
+            'Adds diff information to a sample of talk edits'
+    )
+    parser.add_argument(
+        '-u', '--uri',
+        type=str,
+        help='the uri of the api to connect to (defaults to enwp api)',
+        default="http://en.wikipedia.org/w/api.php"
+    )
+    parser.add_argument(
+        '-i', '--input',
+        metavar="<path>",
+        type=lambda fn: open(fn, "r"),
+        help='the sample file to read (defaults to standard in)',
+        default=sys.stdin
+    )
+    args = parser.parse_args()
+    main(args)