Index: trunk/tools/wsor/ts_samples/sample_talk_edits.py
@@ -0,0 +1,210 @@
+import os, sys, logging, argparse, MySQLdb
+
+# Escape a value for tab-separated output; None becomes MySQL-style \N.
+def clean(v):
+    if v is None:
+        return "\\N"
+    else:
+        # Escape backslashes first so the \t and \n escapes are not doubled.
+        return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
+
+
+def main(args):
+    LOGGING_STREAM = sys.stderr
+    logging.basicConfig(
+        level=logging.DEBUG,
+        stream=LOGGING_STREAM,
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%b-%d %H:%M:%S'
+    )
+
+    logging.info("Connecting to %s_p using %s." % (args.db, args.cnf))
+    conn = MySQLdb.connect(
+        host="%s-p.rrdb.toolserver.org" % args.db,
+        db='%s_p' % args.db,
+        read_default_file=args.cnf
+    )
+    fetchConn = MySQLdb.connect(
+        host="%s-p.rrdb.toolserver.org" % args.db,
+        db='%s_p' % args.db,
+        read_default_file=args.cnf
+    )
+
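+    # Two connections to the same replica: fetchConn streams the user scan
+    # with a server-side cursor while conn handles the nested per-user queries.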
+    #Printing headers
+    print(
+        "\t".join([
+            'user_id',
+            'username',
+            'registration',
+            'end_of_newbie',
+            'rev_id',
+            'timestamp',
+            'comment'
+        ])
+    )
+    # Progress markers on stderr: "." = user sampled, "s" = no qualifying
+    # talk-page post, "-" = user has no revisions.
+    for year in args.year:
+        logging.info("Processing %s:" % year)
+        yearCount = 0
+        for user in getUsersByYear(fetchConn, year):
+            initialRevs = list(getFirst10Revs(conn, user['user_id']))
+            if len(initialRevs) > 0:
+                endOfNoob = initialRevs[-1]['rev_timestamp']
+                talkRev = getRandNonSelfPostToTalkPage(
+                    conn,
+                    user['user_id'],
+                    user['user_name'],
+                    user['user_registration'],
+                    endOfNoob
+                )
+                if talkRev is not None:
+                    print(
+                        "\t".join(clean(v) for v in [
+                            user['user_id'],
+                            user['user_name'],
+                            user['user_registration'],
+                            endOfNoob,
+                            talkRev['rev_id'],
+                            talkRev['rev_timestamp'],
+                            talkRev['rev_comment']
+                        ])
+                    )
+                    LOGGING_STREAM.write(".")
+                    yearCount += 1
+                    if yearCount >= args.n:
+                        break
+                else:
+                    LOGGING_STREAM.write("s")
+                    #logging.debug("User %s has no talk page revisions by other users. Skipping..." % user['user_name'])
+
+            else:
+                LOGGING_STREAM.write("-")
+                #logging.debug("User %s has no revisions. Skipping..." % user['user_name'])
+
+        LOGGING_STREAM.write("\n")
+
+
+
+
+def getUsersByYear(conn, year):
+    year = int(year)
+    cursor = conn.cursor(MySQLdb.cursors.SSCursor)
+    yearBegin = "%s0000000000" % year
+    yearEnd = "%s1231235959" % year  # 23:59:59 on Dec 31st, not 11:59:59
+    cursor.execute("""
+        SELECT * FROM user
+        WHERE user_registration BETWEEN %(year_begin)s AND %(year_end)s
+        ORDER BY RAND()
+        """,
+        {
+            'year_begin': yearBegin,
+            'year_end': yearEnd
+        }
+    )
+    for row in cursor:
+        yield dict(
+            zip(
+                (d[0] for d in cursor.description),
+                row
+            )
+        )
+
+
+
+
+def getFirst10Revs(conn, userId):
+    user_id = int(userId)
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT * FROM revision
+        WHERE rev_user = %(user_id)s
+        ORDER BY rev_timestamp ASC
+        LIMIT 10
+        """,
+        {
+            'user_id': user_id
+        }
+    )
+    for row in cursor:
+        yield dict(
+            zip(
+                (d[0] for d in cursor.description),
+                row
+            )
+        )
+
+def getRandNonSelfPostToTalkPage(conn, userId, username, start, end):
+    pageId = getTalkPageId(conn, username)
+    if pageId is None:
+        return None
+    else:
+        cursor = conn.cursor()
+        cursor.execute("""
+            SELECT * FROM revision
+            WHERE rev_page = %(page_id)s
+            AND rev_user != %(user_id)s
+            AND rev_timestamp BETWEEN %(start)s AND %(end)s
+            ORDER BY RAND()
+            LIMIT 1
+            """,
+            {
+                'page_id': pageId,
+                'user_id': userId,
+                'start': start,
+                'end': end
+            }
+        )
+        for rev in cursor:
+            return dict(
+                zip(
+                    (d[0] for d in cursor.description),
+                    rev
+                )
+            )
+
+        return None
+
+
+def getTalkPageId(conn, title):
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT page_id FROM page
+        WHERE page_title = %(title)s
+        AND page_namespace = 3
+        """,
+        {
+            'title': title.replace(" ", "_")  # page_title stores underscores; user_name stores spaces
+        }
+    )
+    # Namespace 3 is User_talk; return the first (only) match, if any.
+    for page in cursor:
+        return page[0]
+
+    return None
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=
+            'Samples editors by the year they made their first edit.'
+    )
+    parser.add_argument(
+        'n',
+        type=int,
+        help='the number of editors to sample from each year'
+    )
+    parser.add_argument(
+        'year',
+        type=int,
+        help='year(s) to sample from',
+        nargs="+"
+    )
+    parser.add_argument(
+        '-c', '--cnf',
+        metavar="<path>",
+        type=str,
+        help='the path to MySQL config info (defaults to ~/.my.cnf)',
+        default=os.path.expanduser("~/.my.cnf")
+    )
+    parser.add_argument(
+        '-d', '--db',
+        type=str,
+        help='the language db to run the query in (defaults to enwiki)',
+        default="enwiki"
+    )
+    args = parser.parse_args()
+    main(args)
Index: trunk/tools/wsor/ts_samples/testing.sql
@@ -0,0 +1,25 @@
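+-- Scratch queries comparing two ways to sample users who registered in a
+-- given year: a derived user_meta table indexed on reg_year vs. a BETWEEN
+-- range scan on user_registration, both ordered by RAND().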
+CREATE TABLE u_grphack.user_meta (
+    user_id INT,
+    username VARCHAR(255),
+    registration VARCHAR(14),
+    reg_year INT
+);
+
+INSERT INTO u_grphack.user_meta
+SELECT user_id, user_name, user_registration, SUBSTRING(user_registration, 1, 4)
+FROM user;
+
+CREATE INDEX user_meta_pkey ON u_grphack.user_meta (user_id) USING BTREE;
+CREATE INDEX user_meta_reg_year ON u_grphack.user_meta (reg_year) USING BTREE;
+
+
+
+
+EXPLAIN SELECT * FROM u_grphack.user_meta
+WHERE reg_year = 2004
+ORDER BY RAND();
+
+EXPLAIN SELECT * FROM user
+-- end of year is 23:59:59, not 11:59:59
+WHERE user_registration BETWEEN "20040000000000" AND "20041231235959"
+ORDER BY RAND()
+LIMIT 10;
Index: trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
@@ -0,0 +1,152 @@
+import os, sys, logging, argparse, MySQLdb, urllib2, urllib, json
+
+# Escape a value for tab-separated output; None becomes MySQL-style \N.
+def clean(v):
+    if v is None:
+        return "\\N"
+    else:
+        # Escape backslashes first so the \t and \n escapes are not doubled.
+        return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
+
+
+def main(args):
+    LOGGING_STREAM = sys.stderr
+    logging.basicConfig(
+        level=logging.DEBUG,
+        stream=LOGGING_STREAM,
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%b-%d %H:%M:%S'
+    )
+
+    logging.info("Reading from %s." % args.input)
+
+    #Print header
+    print(
+        "\t".join([
+            'rev_id',
+            'diff'
+        ])
+    )
+
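+    # One API request is issued per revision; "<" and ">" written to stderr
+    # bracket each request.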
+    rowBuffer = []
+    for row in readTSVFile(args.input):
+        LOGGING_STREAM.write("<")
+        print(
+            "\t".join([
+                row['rev_id'],
+                (getSingleDiff(args.uri, row['rev_id']) or "").replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t")
+            ])
+        )
+        LOGGING_STREAM.write(">")
+        #rowBuffer.append(row)
+        #if len(rowBuffer) == 50:
+        #    LOGGING_STREAM.write("\n")
+        #    diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
+        #    for row in rowBuffer:
+        #        LOGGING_STREAM.write(">")
+        #        print(
+        #            "\t".join([
+        #                row['rev_id'],
+        #                diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
+        #            ])
+        #        )
+        #
+        #    rowBuffer = []
+        #
+        #    LOGGING_STREAM.write("\n")
+
+    LOGGING_STREAM.write("\n")
+    #diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
+    #for row in rowBuffer:
+    #    LOGGING_STREAM.write(">")
+    #    print(
+    #        "\t".join([
+    #            row['rev_id'],
+    #            diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
+    #        ])
+    #    )
+    #
+    #LOGGING_STREAM.write("\n")
+
+
+
+def getSingleDiff(uri, revId):
+    response = urllib2.urlopen(
+        uri,
+        urllib.urlencode({
+            'action': 'query',
+            'prop': 'revisions',
+            'revids': revId,
+            'rvprop': 'ids|content',
+            'rvdiffto': 'prev',
+            'format': 'json'
+        })
+    ).read()
+    result = json.loads(response)
+    try:
+        for page in result['query']['pages'].values():
+            for rev in page['revisions']:
+                return rev['diff'].get("*", "").encode('utf-8')
+    except Exception:
+        logging.error(response)
+        logging.error(result)
+        raise
+
+    # No pages/revisions in the response.
+    return None
+def buildDiffMap(uri, revIds):
+    if len(revIds) == 0:
+        return {}
+    else:
+        response = urllib2.urlopen(
+            uri,
+            urllib.urlencode({
+                'action': 'query',
+                'prop': 'revisions',
+                'revids': '|'.join(revIds),
+                'rvprop': 'ids|content',
+                'rvdiffto': 'prev',
+                'format': 'json'
+            })
+        ).read()
+        result = json.loads(response)
+        diffMap = {}
+        try:
+            for page in result['query']['pages'].values():
+                for rev in page['revisions']:
+                    diffMap[str(rev['revid'])] = rev['diff'].get("*", "")
+        except Exception:
+            logging.error(response)
+            logging.error(result)
+            raise
+
+        return diffMap
+
+
+
+def readTSVFile(f):
+    headers = f.readline().strip().split("\t")
+    for line in f:
+        values = line.strip().split("\t")
+        yield dict(zip(headers, values))
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=
+            'Adds diff information to a sample of talk edits'
+    )
+    parser.add_argument(
+        '-u', '--uri',
+        type=str,
+        help='the uri of the api to connect to (defaults to enwp api)',
+        default="http://en.wikipedia.org/w/api.php"
+    )
+    parser.add_argument(
+        '-i', '--input',
+        metavar="<path>",
+        type=lambda fn: open(fn, "r"),
+        help='the sample file to read (defaults to standard in)',
+        default=sys.stdin
+    )
+    args = parser.parse_args()
+    main(args)