r89424 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r89424 (previous: r89423 | next: r89425)
Date: 16:56, 3 June 2011
Author: halfak
Status: deferred
Tags: (none)
Comment: Added code for Toolserver language samples for history coding
Modified paths:
  • /trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py (added)
  • /trunk/tools/wsor/ts_samples/sample_talk_edits.py (added)
  • /trunk/tools/wsor/ts_samples/testing.sql (added)
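
sample_talk_edits.py samples editors by registration year from a Toolserver replica and, for each, picks a random user-talk-page edit made by another user before the editor's tenth revision; add_diff_for_talk_edits.py reads that TSV and attaches the diff text for each rev_id from the MediaWiki API. A plausible way to chain the two, based on the argparse definitions in the diff below (the output file names here are illustrative only):

  python sample_talk_edits.py 100 2004 2005 --db enwiki > talk_sample.tsv
  python add_diff_for_talk_edits.py --input talk_sample.tsv > talk_sample_diffs.tsv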

Diff

Index: trunk/tools/wsor/ts_samples/sample_talk_edits.py
@@ -0,0 +1,210 @@
import os, sys, logging, argparse, MySQLdb

def clean(v):
    """Escape a value for tab-separated output; NULLs become the MySQL-style \\N."""
    if v is None:
        return "\\N"
    else:
        # Escape backslashes first so the escapes added for tabs and newlines
        # are not themselves doubled.
        return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")


def main(args):
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Connecting to %s_p using %s." % (args.db, args.cnf))
    conn = MySQLdb.connect(
        host="%s-p.rrdb.toolserver.org" % args.db,
        db='%s_p' % args.db,
        read_default_file=args.cnf
    )
    # Second connection so the streaming user cursor and the per-user lookups
    # do not share a connection.
    fetchConn = MySQLdb.connect(
        host="%s-p.rrdb.toolserver.org" % args.db,
        db='%s_p' % args.db,
        read_default_file=args.cnf
    )

    # Print headers
    print(
        "\t".join([
            'user_id',
            'username',
            'registration',
            'end_of_newbie',
            'rev_id',
            'timestamp',
            'comment'
        ])
    )
    for year in args.year:
        logging.info("Processing %s:" % year)
        yearCount = 0
        for user in getUsersByYear(fetchConn, year):
            initialRevs = list(getFirst10Revs(conn, user['user_id']))
            if len(initialRevs) > 0:
                endOfNoob = initialRevs[-1]['rev_timestamp']
                talkRev = getRandNonSelfPostToTalkPage(
                    conn,
                    user['user_id'],
                    user['user_name'],
                    user['user_registration'],
                    endOfNoob
                )
                if talkRev is not None:
                    print(
                        "\t".join(clean(v) for v in [
                            user['user_id'],
                            user['user_name'],
                            user['user_registration'],
                            endOfNoob,
                            talkRev['rev_id'],
                            talkRev['rev_timestamp'],
                            talkRev['rev_comment']
                        ])
                    )
                    LOGGING_STREAM.write(".")
                    yearCount += 1
                    if yearCount >= args.n:
                        break
                else:
                    LOGGING_STREAM.write("s")
                    #logging.debug("User %s has no talk page revisions by other users. Skipping..." % user['user_name'])
            else:
                LOGGING_STREAM.write("-")
                #logging.debug("User %s has no revisions. Skipping..." % user['user_name'])

        LOGGING_STREAM.write("\n")


def getUsersByYear(conn, year):
    """Yields user rows (as dicts) registered in the given year, in random order."""
    year = int(year)
    cursor = conn.cursor(MySQLdb.cursors.SSCursor)
    yearBegin = "%s0000000000" % year
    yearEnd = "%s1231235959" % year  # end of the year in YYYYMMDDHHMMSS form
    cursor.execute(
        """
        SELECT * FROM user
        WHERE user_registration BETWEEN %(year_begin)s AND %(year_end)s
        ORDER BY RAND()
        """,
        {
            'year_begin': yearBegin,
            'year_end': yearEnd
        }
    )
    for row in cursor:
        yield dict(
            zip(
                (d[0] for d in cursor.description),
                row
            )
        )


def getFirst10Revs(conn, userId):
    """Yields the user's first 10 revisions (as dicts), oldest first."""
    userId = int(userId)
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT * FROM revision
        WHERE rev_user = %(user_id)s
        ORDER BY rev_timestamp ASC
        LIMIT 10
        """,
        {
            'user_id': userId
        }
    )
    for row in cursor:
        yield dict(
            zip(
                (d[0] for d in cursor.description),
                row
            )
        )


def getRandNonSelfPostToTalkPage(conn, userId, username, start, end):
    """Returns a random edit by another user to this user's talk page between
    start and end, or None if there is no such edit."""
    pageId = getTalkPageId(conn, username)
    if pageId is None:
        return None
    else:
        cursor = conn.cursor()
        cursor.execute(
            """
            SELECT * FROM revision
            WHERE rev_page = %(page_id)s
            AND rev_user != %(user_id)s
            AND rev_timestamp BETWEEN %(start)s AND %(end)s
            ORDER BY RAND()
            LIMIT 1
            """,
            {
                'page_id': pageId,
                'user_id': userId,
                'start': start,
                'end': end
            }
        )
        for rev in cursor:
            return dict(
                zip(
                    (d[0] for d in cursor.description),
                    rev
                )
            )

        return None


def getTalkPageId(conn, title):
    """Returns the page_id of the user_talk page for the given username, or None."""
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT page_id FROM page
        WHERE page_title = %(title)s
        AND page_namespace = 3
        """,
        {
            'title': title.replace(" ", "_")  # page_title stores underscores rather than spaces
        }
    )
    for page in cursor:
        return page[0]

    return None


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Samples editors by the year they made their first edit.'
    )
    parser.add_argument(
        'n',
        type=int,
        help='the number of editors to sample from each year'
    )
    parser.add_argument(
        'year',
        type=int,
        help='year(s) to sample from',
        nargs="+"
    )
    parser.add_argument(
        '-c', '--cnf',
        metavar="<path>",
        type=str,
        help='the path to MySQL config info (defaults to ~/.my.cnf)',
        default=os.path.expanduser("~/.my.cnf")
    )
    parser.add_argument(
        '-d', '--db',
        type=str,
        help='the language db to run the query in (defaults to enwiki)',
        default="enwiki"
    )
    args = parser.parse_args()
    main(args)
Index: trunk/tools/wsor/ts_samples/testing.sql
@@ -0,0 +1,25 @@
CREATE TABLE u_grphack.user_meta (
    user_id INT,
    username VARCHAR(255),
    registration VARCHAR(14),
    reg_year INT
);

INSERT INTO u_grphack.user_meta
SELECT user_id, user_name, user_registration, SUBSTRING(user_registration, 1, 4)
FROM user;

CREATE INDEX user_meta_pkey ON u_grphack.user_meta (user_id) USING BTREE;
CREATE INDEX user_meta_reg_year ON u_grphack.user_meta (reg_year) USING BTREE;


EXPLAIN SELECT * FROM u_grphack.user_meta
WHERE reg_year = 2004
ORDER BY RAND();

EXPLAIN SELECT * FROM user
WHERE user_registration BETWEEN "20040000000000" AND "20041231115959"
ORDER BY RAND()
LIMIT 10;
Index: trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
@@ -0,0 +1,152 @@
import sys, logging, argparse, urllib2, urllib, json

def clean(v):
    """Escape a value for tab-separated output; NULLs become the MySQL-style \\N."""
    if v is None:
        return "\\N"
    else:
        # Escape backslashes first so the escapes added for tabs and newlines
        # are not themselves doubled.
        return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")


def main(args):
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Reading from %s." % args.input)

    # Print header
    print(
        "\t".join([
            'rev_id',
            'diff'
        ])
    )

    rowBuffer = []
    for row in readTSVFile(args.input):
        LOGGING_STREAM.write("<")
        print(
            "\t".join([
                row['rev_id'],
                getSingleDiff(args.uri, row['rev_id']).replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t")
            ])
        )
        LOGGING_STREAM.write(">")
        # A batched alternative (50 revisions per API request) is left disabled:
        #rowBuffer.append(row)
        #if len(rowBuffer) == 50:
        #    LOGGING_STREAM.write("\n")
        #    diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
        #    for row in rowBuffer:
        #        LOGGING_STREAM.write(">")
        #        print(
        #            "\t".join([
        #                row['rev_id'],
        #                diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
        #            ])
        #        )
        #
        #    rowBuffer = []
        #
        #    LOGGING_STREAM.write("\n")

    LOGGING_STREAM.write("\n")
    # Final flush for the batched alternative, also disabled:
    #diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
    #for row in rowBuffer:
    #    LOGGING_STREAM.write(">")
    #    print(
    #        "\t".join([
    #            row['rev_id'],
    #            diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
    #        ])
    #    )
    #
    #LOGGING_STREAM.write("\n")


def getSingleDiff(uri, revId):
    """Fetches the diff-to-previous text for one revision from the MediaWiki API."""
    response = urllib2.urlopen(
        uri,
        urllib.urlencode({
            'action': 'query',
            'prop': 'revisions',
            'revids': revId,
            'rvprop': 'ids|content',
            'rvdiffto': 'prev',
            'format': 'json'
        })
    ).read()
    result = json.loads(response)
    try:
        for page in result['query']['pages'].values():
            for rev in page['revisions']:
                return rev['diff'].get("*", "").encode('utf-8')
    except Exception as e:
        logging.error(response)
        logging.error(result)
        raise e

    return ""  # no diff found for this revision


def buildDiffMap(uri, revIds):
    """Fetches diffs-to-previous for a batch of revisions; returns {rev_id: diff}."""
    if len(revIds) == 0:
        return {}
    else:
        response = urllib2.urlopen(
            uri,
            urllib.urlencode({
                'action': 'query',
                'prop': 'revisions',
                'revids': '|'.join(revIds),
                'rvprop': 'ids|content',
                'rvdiffto': 'prev',
                'format': 'json'
            })
        ).read()
        result = json.loads(response)
        diffMap = {}
        try:
            for page in result['query']['pages'].values():
                for rev in page['revisions']:
                    diffMap[str(rev['revid'])] = rev['diff'].get("*", "")
        except Exception as e:
            logging.error(response)
            logging.error(result)
            raise e

        return diffMap


def readTSVFile(f):
    """Yields each data line of a tab-separated file as a dict keyed by the header row."""
    headers = f.readline().strip().split("\t")
    for line in f:
        values = line.strip().split("\t")
        yield dict(zip(headers, values))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Adds diff information to a sample of talk edits'
    )
    parser.add_argument(
        '-u', '--uri',
        type=str,
        help='the uri of the api to connect to (defaults to enwp api)',
        default="http://en.wikipedia.org/w/api.php"
    )
    parser.add_argument(
        '-i', '--input',
        metavar="<path>",
        type=lambda fn: open(fn, "r"),
        help='the sample file to read (defaults to standard in)',
        default=sys.stdin
    )
    args = parser.parse_args()
    main(args)
