r90495 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r90494‎ | r90495 | r90496 >
Date:23:20, 20 June 2011
Author:halfak
Status:deferred
Tags:
Comment:
added sampling script for staeiou
Modified paths:
  • /trunk/tools/wsor/ts_samples/sample_talk_edits_staeiou.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/ts_samples/sample_talk_edits_staeiou.py
@@ -0,0 +1,359 @@
 2+#
 3+# Sample talk page postings to newbies' talk pages in various languages.
 4+#
 5+# This script is intended to be run on one of the toolserver machines.
 6+#
 7+# Run `python sample_talk_edits_staeiou.py --help` for command line parameters.
 8+#
 9+import os, sys, logging, argparse, MySQLdb, datetime
 10+
def clean(v):
    r"""
    Render a single value as one field of a tab-separated output line.

    None becomes the two-character marker \N (presumably the NULL marker
    expected by whatever loads this output -- confirm against the consumer).
    Any other value is converted with str() and has its backslashes, tabs and
    newlines replaced by the escape sequences \\, \t and \n, so a field can
    never contain a literal tab or line break.
    """
    if v is None:
        # The backslash must be escaped: a bare "\N" starts a named-unicode
        # escape and is a syntax error under Python 3.
        return "\\N"

    # Backslashes are escaped first so that the backslashes introduced by
    # the tab/newline escapes are not doubled afterwards.
    return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
 16+
 17+
def _newbieCutoff(firstEdit, days=30):
    """
    Return the timestamp that lies `days` days after firstEdit.

    firstEdit is a string whose first eight characters are YYYYMMDD; whatever
    follows (the time-of-day part) is carried over unchanged.
    """
    firstDay = datetime.date(
        int(firstEdit[0:4]),
        int(firstEdit[4:6]),
        int(firstEdit[6:8])
    )
    lastDay = firstDay + datetime.timedelta(days=days)
    return lastDay.strftime("%Y%m%d") + firstEdit[8:]


def main(args):
    """
    Sample users per half-year and print one tab-separated row for each.

    args is the argparse namespace built at the bottom of this file; the
    attributes read here are host, db, cnf, year (a list of ints) and n.

    For every requested year, and for each half of that year, users are
    pulled from getUsers() until args.n of them turn out to have at least one
    revision on their namespace-3 page (see getPostsToTalkPage) within 30
    days of their first edit.  Users without such a revision are skipped.

    Rows are written to stdout.  Progress marks are written to stderr: ":"
    before each per-user lookup, "." for a sampled user, "s" for a skipped
    user and a newline after each half-year.
    """
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Connecting to %s:%s using %s.", args.host, args.db, args.cnf)
    # Two connections are opened: fetchConn is handed to getUsers() only,
    # conn serves every per-user lookup made while that user list is still
    # being iterated.
    conn = MySQLdb.connect(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf
    )
    fetchConn = MySQLdb.connect(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf
    )

    try:
        # Printing headers
        print(
            "\t".join([
                'user_id',
                'username',
                'registration',
                'first_edit',
                'end of newbie',
                'last user rev_id',
                'last utalk rev_id',
                'Main edits',
                'Talk edits',
                'User edits',
                'User_talk edits',
                'Wikipedia edits',
                'Wikipedia_talk edits',
                'Image edits',
                'Image_talk edits',
                'MediaWiki edits',
                'MediaWiki_talk edits',
                'Template edits',
                'Template_talk edits',
                'Help edits',
                'Help_talk edits',
                'Category edits',
                'Category_talk edits',
                'blocks'
            ])
        )
        for year in args.year:
            # Each year is processed as two halves (months 01-06 and 07-12).
            # The bounds are padded with 0s/9s and only ever used as BETWEEN
            # limits; both halves now use suffixes of the same width.
            for semStart, semEnd in [('000000', '069999'), ('070000', '999999')]:
                logging.info("Processing %s:%s", year, semStart)
                start = str(year) + semStart + "000000"
                end = str(year) + semEnd + "999999"
                count = 0
                for user in getUsers(fetchConn, start, end):
                    # End of the 30-day "newbie" window that starts at the
                    # user's first edit.
                    endOfNoob = _newbieCutoff(user['first_edit'])

                    LOGGING_STREAM.write(":")
                    talkRevs = list(getPostsToTalkPage(
                        conn,
                        user['user_id'],
                        user['user_name'],
                        user['first_edit'],
                        endOfNoob
                    ))
                    if not talkRevs:
                        # Nothing was posted to the talk page in the window:
                        # skip before running the remaining, unneeded queries.
                        LOGGING_STREAM.write("s")
                        continue

                    LOGGING_STREAM.write(":")
                    # Number of the user's own revisions per namespace id.
                    newbieRevs = {}
                    for rev in getUserRevs(conn, user['user_id'], user['first_edit'], endOfNoob):
                        namespace = rev['page_namespace']
                        newbieRevs[namespace] = newbieRevs.get(namespace, 0) + 1

                    LOGGING_STREAM.write(":")
                    blocks = '\n'.join(
                        "%(action)s: %(comment)s - %(params)s" % b
                        for b in getBlockEvents(
                            conn, user['user_name'], user['first_edit'], endOfNoob
                        )
                    )

                    LOGGING_STREAM.write(":")
                    userPageRev = getLastPostToUserPage(
                        conn,
                        user['user_id'],
                        user['user_name'],
                        user['first_edit'],
                        endOfNoob
                    )
                    if userPageRev is None:
                        userPageRevId = None
                    else:
                        userPageRevId = userPageRev['rev_id']

                    row = [
                        user['user_id'],
                        user['user_name'],
                        user['user_registration'],
                        user['first_edit'],
                        endOfNoob,
                        userPageRevId,
                        talkRevs[-1]['rev_id']
                    ]
                    # One count column per namespace id 0..15, in the same
                    # order as the sixteen "... edits" header columns.
                    row.extend(newbieRevs.get(namespace, 0) for namespace in range(16))
                    row.append(blocks)

                    print("\t".join(clean(v) for v in row))
                    LOGGING_STREAM.write(".")
                    count += 1
                    if count >= args.n:
                        break

                LOGGING_STREAM.write("\n")
    finally:
        # Release both connections even when a query fails part-way through.
        fetchConn.close()
        conn.close()
 161+
 162+
 163+
 164+
def getUsers(conn, start, end):
    """
    Generate, in random order, the users whose first edit lies between start
    and end (both bounds inclusive, as evaluated by SQL BETWEEN).

    Each item is a dict keyed by column name: user_id, user_name,
    user_registration, first_edit and last_edit.  first_edit/last_edit come
    from the halfak.user_meta table joined on user_id.

    The query runs on an SSCursor, so conn should not be used for anything
    else while this generator is still being iterated (NOTE(review): per the
    MySQLdb documentation for unbuffered cursors -- confirm).  The cursor is
    closed as soon as the generator is exhausted or discarded, e.g. when the
    caller breaks out of its loop early.
    """
    cursor = conn.cursor(MySQLdb.cursors.SSCursor)
    try:
        cursor.execute("""
            SELECT
                u.user_id,
                u.user_name,
                u.user_registration,
                um.first_edit,
                um.last_edit
            FROM user u
            INNER JOIN halfak.user_meta um
                ON u.user_id = um.user_id
            WHERE um.first_edit BETWEEN %(start)s AND %(end)s
            ORDER BY RAND()
            """,
            {
                'start': start,
                'end': end
            }
        )
        # Column names are the same for every row; read them once.
        columns = [d[0] for d in cursor.description]
        for row in cursor:
            yield dict(zip(columns, row))
    finally:
        cursor.close()
 192+
 193+
 194+
 195+
def getUserRevs(conn, userId, start, end):
    """
    Generate the revisions made by a user between start and end (inclusive),
    oldest first.

    userId is coerced with int(), so a value that is not integer-like raises
    (ValueError/TypeError) when iteration begins.  Each item is a dict keyed
    by column name holding every column of the revision row plus the
    page_namespace of the page the revision belongs to.
    """
    # Bind the coerced value; the original computed it and then passed the
    # raw argument to the query instead.
    user_id = int(userId)
    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT
                r.*,
                p.page_namespace
            FROM revision r
            INNER JOIN page p
                ON r.rev_page = p.page_id
            WHERE rev_user = %(user_id)s
            AND rev_timestamp BETWEEN %(start)s AND %(end)s
            ORDER BY rev_timestamp ASC
            """,
            {
                'user_id': user_id,
                'start': start,
                'end': end
            }
        )
        # Column names are the same for every row; read them once.
        columns = [d[0] for d in cursor.description]
        for row in cursor:
            yield dict(zip(columns, row))
    finally:
        cursor.close()
 223+
 224+
def getBlockEvents(conn, username, start, end):
    """
    Generate the "block" log entries recorded against a user name between
    start and end (inclusive), oldest first.

    Each item is a dict with the keys action, comment and params (taken from
    log_action, log_comment and log_params).

    Spaces in username are replaced by underscores before matching on
    log_title: MediaWiki stores titles with underscores, while the caller in
    this file passes user.user_name, which uses spaces.
    """
    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT
                log_action as action,
                log_comment as comment,
                log_params as params
            FROM logging
            WHERE log_title = %(username)s
            AND log_type = 'block'
            AND log_timestamp BETWEEN %(start)s AND %(end)s
            ORDER BY log_timestamp ASC
            """,
            {
                'username': username.replace(" ", "_"),
                'start': start,
                'end': end
            }
        )
        # Column names are the same for every row; read them once.
        columns = [d[0] for d in cursor.description]
        for row in cursor:
            yield dict(zip(columns, row))
    finally:
        cursor.close()
 251+
def getLastPostToUserPage(conn, userId, username, start, end):
    """
    Return the latest revision of the user's namespace-2 page made between
    start and end (inclusive), or None.

    None is returned both when no page titled username exists in namespace 2
    and when that page has no revision inside the window.  Otherwise the
    result is a dict of the full revision row keyed by column name.

    The query does not filter on the revision's author, so userId is accepted
    for signature compatibility only and is not used.
    """
    pageId = getPageId(conn, username, 2)
    if pageId is None:
        return None

    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT * FROM revision
            WHERE rev_page = %(page_id)s
            AND rev_timestamp BETWEEN %(start)s AND %(end)s
            ORDER BY rev_timestamp DESC
            LIMIT 1
            """,
            {
                'page_id': pageId,
                'start': start,
                'end': end
            }
        )
        rev = cursor.fetchone()
        if rev is None:
            return None

        return dict(zip((d[0] for d in cursor.description), rev))
    finally:
        cursor.close()
 279+
 280+
def getPageId(conn, title, namespace):
    """
    Return the page_id of the page with the given title and namespace, or
    None when no such page exists (or title is None).

    Spaces in title are replaced by underscores before the lookup: MediaWiki
    stores page_title with underscores, while the callers in this file pass
    user.user_name, which uses spaces.
    """
    if title is None:
        return None

    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT page_id FROM page
            WHERE page_title = %(title)s
            AND page_namespace = %(namespace)s
            """,
            {
                'title': title.replace(" ", "_"),
                'namespace': namespace
            }
        )
        page = cursor.fetchone()
    finally:
        cursor.close()

    if page is None:
        return None

    return page[0]
 297+
def getPostsToTalkPage(conn, userId, username, start, end):
    """
    Generate every revision of the user's namespace-3 page made between
    start and end (inclusive), ordered by rev_id.

    Nothing is generated when no page titled username exists in namespace 3.
    Each item is a dict of the full revision row keyed by column name.

    The query does not filter on the revision's author, so userId is accepted
    for signature compatibility only and is not used.
    """
    pageId = getPageId(conn, username, 3)
    if pageId is None:
        return

    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT * FROM revision
            WHERE rev_page = %(page_id)s
            AND rev_timestamp BETWEEN %(start)s AND %(end)s
            ORDER BY rev_id
            """,
            {
                'page_id': pageId,
                'start': start,
                'end': end
            }
        )
        # Column names are the same for every row; read them once.
        columns = [d[0] for d in cursor.description]
        for rev in cursor:
            yield dict(zip(columns, rev))
    finally:
        cursor.close()
 322+
 323+
if __name__ == "__main__":
    # Command line: <n> <year> [<year> ...] [-c <path>] [-s HOST] [-d DB]
    parser = argparse.ArgumentParser(
        description='Samples editors by the year they made their first edit.'
    )

    # Positional arguments.  main() restarts its counter for every half-year
    # it processes, so n is a per-half-year limit rather than a per-year one.
    parser.add_argument(
        'n',
        type=int,
        help='the number of editors to sample from each half of each year'
    )
    parser.add_argument(
        'year',
        type=int,
        nargs="+",
        help='year(s) to sample from'
    )

    # Optional connection settings.
    parser.add_argument(
        '-c', '--cnf',
        metavar="<path>",
        type=str,
        default=os.path.expanduser("~/.my.cnf"),
        help='the path to MySQL config info (defaults to ~/.my.cnf)'
    )
    parser.add_argument(
        '-s', '--host',
        type=str,
        default="localhost",
        help='the database host to connect to (defaults to localhost)'
    )
    parser.add_argument(
        '-d', '--db',
        type=str,
        default="enwiki",
        help='the language db to run the query in (defaults to enwiki)'
    )

    main(parser.parse_args())

Status & tagging log