Index: trunk/tools/wsor/newbie_warnings/track_hugglers_ng.py |
— | — | @@ -0,0 +1,192 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time |
| 3 | +import wmf |
| 4 | + |
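| | +#Serialize a value for tab-separated output; "\N" is the MySQL-style |
| | +#NULL marker, and unicode is encoded as UTF-8 before escaping. |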
| 5 | +def encode(v): |
| 6 | +    if v is None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
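| | +#Emit one tab-separated event record (event, user_id, user_name, time) |
| | +#on stdout and flush so downstream readers see it immediately. |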
| 14 | +def emit(event, p, time): |
| 15 | + print( |
| 16 | + "\t".join(encode(v) for v in [ |
| 17 | + event, |
| 18 | + p['user_id'], |
| 19 | + p['user_name'], |
| 20 | + time |
| 21 | + ]) |
| 22 | + ) |
| 23 | + sys.stdout.flush() |
| 24 | + |
| 25 | + |
| 26 | +def main(): |
| 27 | + parser = argparse.ArgumentParser( |
| 28 | +        description='Track Huggle warning posts to user talk pages and report when they appear to be read' |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-c', '--cnf', |
| 32 | + metavar="<path>", |
| 33 | + type=str, |
| 34 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 35 | + default=os.path.expanduser("~/.my.cnf") |
| 36 | + ) |
| 37 | + parser.add_argument( |
| 38 | + '-s', '--host', |
| 39 | + type=str, |
| 40 | + help='the database host to connect to (defaults to localhost)', |
| 41 | + default="localhost" |
| 42 | + ) |
| 43 | + parser.add_argument( |
| 44 | + '-d', '--db', |
| 45 | + type=str, |
| 46 | + help='the language db to run the query in (defaults to enwiki)', |
| 47 | + default="enwiki" |
| 48 | + ) |
| 49 | + parser.add_argument( |
| 50 | + '-o', '--out', |
| 51 | + type=lambda fn:open(fn, 'a+'), |
| 52 | +        help='a file to append output to (defaults to stdout)', |
| 53 | + default=sys.stdout |
| 54 | + ) |
| 55 | + args = parser.parse_args() |
| 56 | + |
| 57 | + LOGGING_STREAM = sys.stderr |
| 58 | + logging.basicConfig( |
| 59 | + level=logging.DEBUG, |
| 60 | + stream=LOGGING_STREAM, |
| 61 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 62 | + datefmt='%b-%d %H:%M:%S' |
| 63 | + ) |
| 64 | + |
| 65 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 66 | + db = Database( |
| 67 | + host=args.host, |
| 68 | + db=args.db, |
| 69 | + read_default_file=args.cnf |
| 70 | + ) |
| 71 | + |
| 72 | + try: |
| 73 | + oldPosts = {} |
| 74 | + lastTime = db.getTime() |
| 75 | + time.sleep(5) |
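| | +#Poll loop: every 5 seconds, look for new Huggle warnings, then emit |
| | +#"received" when a warned user gains a new-message flag and "read" when |
| | +#that flag clears or the user drops out of user_newtalk. |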
| 76 | + while True: |
| 77 | + logging.info("Tracking %s posts. Looking for new ones since %s." % (len(oldPosts), lastTime)) |
| 78 | + newUsers = set(db.getHugglePostsSince(lastTime)) |
| 79 | + currTime = db.getTime() |
| 80 | + currUsers = set() |
| 81 | + for p in db.getWaitingPosts(oldPosts.viewkeys() | newUsers): |
| 82 | + if p['user_name'] not in oldPosts: |
| 83 | + #Found a new posting |
| 84 | + LOGGING_STREAM.write(">") |
| 85 | + p['posting'] = currTime |
| 86 | + oldPosts[p['user_name']] = p |
| 87 | + emit("received", p, currTime) |
| 88 | + elif p['messages'] < oldPosts[p['user_name']]['messages']: |
| 89 | + #Looks like someone checked the message |
| 90 | + LOGGING_STREAM.write("<") |
| 91 | + emit("read", oldPosts[p['user_name']], currTime) |
| 92 | + del oldPosts[p['user_name']] |
| 93 | + else: |
| 94 | +#No change since the last poll |
| 95 | + pass |
| 96 | + |
| 97 | + currUsers.add(p['user_name']) |
| 98 | + |
| 99 | + for missing in oldPosts.viewkeys() - currUsers: |
| 100 | + LOGGING_STREAM.write("<") |
| 101 | + emit("read", oldPosts[missing], currTime) |
| 102 | + del oldPosts[missing] |
| 103 | + |
| 104 | + lastTime = currTime |
| 105 | + LOGGING_STREAM.write("\n") |
| 106 | + time.sleep(5) |
| 107 | + |
| 108 | + except KeyboardInterrupt: |
| 109 | + logging.info("Keyboard interrupt detected. Shutting down.") |
| 110 | + except Exception as e: |
| 111 | + logging.error(str(e)) |
| 112 | + |
| 113 | + print(repr(oldPosts)) |
| 114 | + print(lastTime) |
| 115 | + |
| 116 | + |
| 117 | + |
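| | +#Minimal quoting for user names interpolated into SQL; this only escapes |
| | +#double quotes, so parameterized queries would be a safer choice here. |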
| 118 | +def safe(val): |
| 119 | + return '"' + val.replace('"', '\\"') + '"' |
| 120 | + |
| 121 | +class Database: |
| 122 | + |
| 123 | + def __init__(self, *args, **kwargs): |
| 124 | + self.args = args |
| 125 | + self.kwargs = kwargs |
| 126 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + def getTime(self): |
| 131 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 132 | + cursor.execute( |
| 133 | + """ |
| 134 | + SELECT rc_timestamp AS time |
| 135 | + FROM recentchanges |
| 136 | + ORDER BY rc_timestamp DESC |
| 137 | + LIMIT 1 |
| 138 | + """ |
| 139 | + ) |
| 140 | + self.usersConn.commit() |
| 141 | + for row in cursor: |
| 142 | + return row['time'] |
| 143 | + |
| 144 | + |
| 145 | + def getHugglePostsSince(self, timestamp): |
| 146 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 147 | + cursor.execute(""" |
| 148 | + SELECT DISTINCT p.page_title AS title |
| 149 | + FROM revision r |
| 150 | + INNER JOIN page p |
| 151 | + ON r.rev_page = p.page_id |
| 152 | + WHERE p.page_namespace = 3 |
| 153 | + AND r.rev_timestamp >= %(timestamp)s |
| 154 | + AND r.rev_comment LIKE %(like)s |
| 155 | + """, |
| 156 | + { |
| 157 | + "timestamp": timestamp, |
| 158 | +"like": "%WP:HG%" |
| 160 | + } |
| 161 | + ) |
| 162 | + return (p['title'].replace("_", " ") for p in cursor) |
| 163 | + |
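| | +#The UNION combines registered users (matched by user_name) with |
| | +#anonymous users (matched by user_ip) holding new-talk flags. |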
| 164 | + def getWaitingPosts(self, users): |
| 165 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 166 | + userString = ",".join(safe(u) for u in users) |
| 167 | + if len(userString) != 0: |
| 168 | + cursor.execute(""" |
| 169 | + SELECT |
| 170 | + u.user_id, |
| 171 | + u.user_name, |
| 172 | + count(*) as messages, |
| 173 | +u.user_touched as last_touched |
| 174 | + FROM user_newtalk nt |
| 175 | + LEFT JOIN user u |
| 176 | + ON u.user_id = nt.user_id |
| 177 | + WHERE u.user_name IN (""" + userString + """) |
| 178 | + GROUP BY u.user_id, u.user_name |
| 179 | + UNION |
| 180 | + SELECT |
| 181 | + NULL as user_id, |
| 182 | + nt.user_ip as user_name, |
| 183 | + count(*) as messages, |
| 184 | +NULL as last_touched |
| 185 | + FROM user_newtalk nt |
| 186 | + WHERE nt.user_ip IN (""" + userString + """) |
| 187 | + GROUP BY nt.user_ip, NULL |
| 188 | + """ |
| 189 | + ) |
| 190 | + for post in cursor: |
| 191 | + yield post |
| 192 | + |
| 193 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/newbie_warnings/track_hugglings.py |
— | — | @@ -0,0 +1,184 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | +    if v is None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
| 14 | +def emit(event, p, time): |
| 15 | + print( |
| 16 | + "\t".join(encode(v) for v in [ |
| 17 | + event, |
| 18 | + p['user_id'], |
| 19 | + p['user_name'], |
| 20 | + time |
| 21 | + ]) |
| 22 | + ) |
| 23 | + sys.stdout.flush() |
| 24 | + |
| 25 | + |
| 26 | +def main(): |
| 27 | + parser = argparse.ArgumentParser( |
| 28 | +        description='Track Huggle warning posts to user talk pages and report when they appear to be read' |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-c', '--cnf', |
| 32 | + metavar="<path>", |
| 33 | + type=str, |
| 34 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 35 | + default=os.path.expanduser("~/.my.cnf") |
| 36 | + ) |
| 37 | + parser.add_argument( |
| 38 | + '-s', '--host', |
| 39 | + type=str, |
| 40 | + help='the database host to connect to (defaults to localhost)', |
| 41 | + default="localhost" |
| 42 | + ) |
| 43 | + parser.add_argument( |
| 44 | + '-d', '--db', |
| 45 | + type=str, |
| 46 | + help='the language db to run the query in (defaults to enwiki)', |
| 47 | + default="enwiki" |
| 48 | + ) |
| 49 | + parser.add_argument( |
| 50 | + '-o', '--out', |
| 51 | + type=lambda fn:open(fn, 'a+'), |
| 52 | +        help='a file to append output to (defaults to stdout)', |
| 53 | + default=sys.stdout |
| 54 | + ) |
| 55 | + args = parser.parse_args() |
| 56 | + |
| 57 | + LOGGING_STREAM = sys.stderr |
| 58 | + logging.basicConfig( |
| 59 | + level=logging.DEBUG, |
| 60 | + stream=LOGGING_STREAM, |
| 61 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 62 | + datefmt='%b-%d %H:%M:%S' |
| 63 | + ) |
| 64 | + |
| 65 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 66 | + db = Database( |
| 67 | + host=args.host, |
| 68 | + db=args.db, |
| 69 | + read_default_file=args.cnf |
| 70 | + ) |
| 71 | + |
| 72 | + try: |
| 73 | + oldPosts = {} |
| 74 | + lastTime = db.getTime() |
| 75 | + time.sleep(5) |
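| | +#Same polling loop as track_hugglers_ng.py: emit "received" for new |
| | +#warning posts and "read" once the new-message flag clears. |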
| 76 | + while True: |
| 77 | + logging.info("Tracking %s posts. Looking for new ones since %s." % (len(oldPosts), lastTime)) |
| 78 | + newUsers = set(db.getHugglePostsSince(lastTime)) |
| 79 | + currTime = db.getTime() |
| 80 | + currUsers = set() |
| 81 | + for p in db.getWaitingPosts(oldPosts.viewkeys() | newUsers): |
| 82 | + if p['user_name'] not in oldPosts: |
| 83 | + #Found a new posting |
| 84 | + LOGGING_STREAM.write(">") |
| 85 | + p['posting'] = currTime |
| 86 | + oldPosts[p['user_name']] = p |
| 87 | + emit("received", p, currTime) |
| 88 | + elif p['messages'] < oldPosts[p['user_name']]['messages']: |
| 89 | + #Looks like someone checked the message |
| 90 | + LOGGING_STREAM.write("<") |
| 91 | + emit("read", oldPosts[p['user_name']], currTime) |
| 92 | + del oldPosts[p['user_name']] |
| 93 | + else: |
| 94 | +#No change since the last poll |
| 95 | + pass |
| 96 | + |
| 97 | + currUsers.add(p['user_name']) |
| 98 | + |
| 99 | + for missing in oldPosts.viewkeys() - currUsers: |
| 100 | + LOGGING_STREAM.write("<") |
| 101 | + emit("read", oldPosts[missing], currTime) |
| 102 | + del oldPosts[missing] |
| 103 | + |
| 104 | + lastTime = currTime |
| 105 | + LOGGING_STREAM.write("\n") |
| 106 | + time.sleep(5) |
| 107 | + |
| 108 | + except KeyboardInterrupt: |
| 109 | + logging.info("Keyboard interrupt detected. Shutting down.") |
| 110 | + except Exception as e: |
| 111 | + logging.error(str(e)) |
| 112 | + |
| 113 | + print(repr(oldPosts)) |
| 114 | + print(lastTime) |
| 115 | + |
| 116 | + |
| 117 | + |
| 118 | +def safe(val): |
| 119 | + return '"' + val.replace('"', '\\"') + '"' |
| 120 | + |
| 121 | +class Database: |
| 122 | + |
| 123 | + def __init__(self, *args, **kwargs): |
| 124 | + self.args = args |
| 125 | + self.kwargs = kwargs |
| 126 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + def getTime(self): |
| 131 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 132 | + cursor.execute( |
| 133 | + """ |
| 134 | + SELECT rc_timestamp AS time |
| 135 | + FROM recentchanges |
| 136 | + ORDER BY rc_timestamp DESC |
| 137 | + LIMIT 1 |
| 138 | + """ |
| 139 | + ) |
| 140 | + self.usersConn.commit() |
| 141 | + for row in cursor: |
| 142 | + return row['time'] |
| 143 | + |
| 144 | + |
| 145 | + def getHugglePostsSince(self, timestamp): |
| 146 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 147 | + cursor.execute(""" |
| 148 | +SELECT DISTINCT p.page_title AS title |
| | +FROM revision r |
| | +INNER JOIN page p |
| | +ON r.rev_page = p.page_id |
| | +WHERE p.page_namespace = 3 |
| | +AND r.rev_timestamp >= %(timestamp)s |
| | +AND r.rev_comment LIKE %(like)s |
| 149 | + """, |
| 150 | + { |
| 151 | + "timestamp": timestamp, |
| 152 | +"like": "%WP:HG%" |
| 154 | + } |
| 155 | + ) |
| 156 | + return (p['title'].replace("_", " ") for p in cursor) |
| 157 | + |
| 158 | + def getWaitingPosts(self, users): |
| 159 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 160 | + userString = ",".join(safe(u) for u in users) |
| 161 | + if len(userString) != 0: |
| 162 | + cursor.execute(""" |
| 163 | + SELECT |
| 164 | + u.user_id, |
| 165 | + u.user_name, |
| 166 | + count(*) as messages |
| 167 | + FROM user_newtalk nt |
| 168 | + LEFT JOIN user u |
| 169 | + ON u.user_id = nt.user_id |
| 170 | + WHERE u.user_name IN (""" + userString + """) |
| 171 | + GROUP BY u.user_id, u.user_name |
| 172 | + UNION |
| 173 | + SELECT |
| 174 | + NULL as user_id, |
| 175 | + nt.user_ip as user_name, |
| 176 | + count(*) as messages |
| 177 | + FROM user_newtalk nt |
| 178 | + WHERE nt.user_ip IN (""" + userString + """) |
| 179 | + GROUP BY nt.user_ip, NULL |
| 180 | + """ |
| 181 | + ) |
| 182 | + for post in cursor: |
| 183 | + yield post |
| 184 | + |
| 185 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/newbie_warnings/track_messages.py |
— | — | @@ -91,7 +91,7 @@ |
92 | 92 | cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
93 | 93 | cursor.execute( |
94 | 94 | """ |
95 | | - SELECT rc_timestamp AS time |
| 95 | + SELECT SQL_NO_CACHE rc_timestamp AS time |
96 | 96 | FROM recentchanges |
97 | 97 | ORDER BY rc_timestamp DESC |
98 | 98 | LIMIT 1 |
Index: trunk/tools/wsor/newbie_warnings/queries.sql |
— | — | @@ -181,3 +181,18 @@ |
182 | 182 | FROM user_newtalk nt |
183 | 183 | WHERE nt.user_ip IN ("EpochFail") |
184 | 184 | GROUP BY nt.user_ip, NULL; |
| 185 | + |
| 186 | + |
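| | +-- User talk pages (namespace 3) that transclude the Huggle warning |
| | +-- tracking templates Z49-Z56 from the Template namespace (10). |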
| 187 | +SELECT |
| 188 | + p.page_id as user_talk_id, |
| 189 | + p.page_title as user_talk_page, |
| 190 | +  REPLACE(p.page_title, "_", " ") as user_name, |
| 191 | + tl.tl_title as template |
| 192 | +FROM enwiki.templatelinks tl |
| 193 | +INNER JOIN enwiki.page p |
| 194 | + ON page_id = tl_from |
| 195 | +WHERE tl_title IN ('Z49','Z50','Z51','Z52','Z53','Z54','Z55','Z56') |
| 196 | +AND tl_namespace = 10 |
| 197 | +AND page_namespace = 3; |
| 198 | + |
| 199 | + |
Index: trunk/tools/wsor/newbie_warnings/track_hugglers.py |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | time |
21 | 21 | ]) |
22 | 22 | ) |
| 23 | + sys.stdout.flush() |
23 | 24 | |
24 | 25 | |
25 | 26 | def main(): |
— | — | @@ -149,10 +150,7 @@ |
150 | 151 | ON r.rev_page = p.page_id |
151 | 152 | WHERE p.page_namespace = 3 |
152 | 153 | AND r.rev_timestamp >= %(timestamp)s |
153 | | - AND ( |
154 | | - r.rev_comment LIKE %(like)s OR |
155 | | - r.rev_comment LIKE %(clue)s |
156 | | - ) |
| 154 | + AND r.rev_comment LIKE %(like)s |
157 | 155 | """, |
158 | 156 | { |
159 | 157 | "timestamp": timestamp, |
— | — | @@ -167,7 +165,7 @@ |
168 | 166 | userString = ",".join(safe(u) for u in users) |
169 | 167 | if len(userString) != 0: |
170 | 168 | cursor.execute(""" |
171 | | - SELECT |
| 169 | + SELECT |
172 | 170 | u.user_id, |
173 | 171 | u.user_name, |
174 | 172 | count(*) as messages |
Index: trunk/tools/wsor/first_session/get_first_n_sessions.py |
— | — | @@ -0,0 +1,247 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | +    if v is None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
| 14 | +def main(): |
| 15 | + parser = argparse.ArgumentParser( |
| 16 | + description='Gathers editor data for first and last session' |
| 17 | + ) |
| 18 | + parser.add_argument( |
| 19 | + 'n', |
| 20 | + type=int, |
| 21 | +    help='the minimum number of edits that editors must have performed to be included' |
| 22 | + ) |
| 23 | + parser.add_argument( |
| 24 | + 'session', |
| 25 | + type=int, |
| 26 | + help='maximum time between session edits (in seconds)' |
| 27 | + ) |
| 28 | + parser.add_argument( |
| 29 | + '-c', '--cnf', |
| 30 | + metavar="<path>", |
| 31 | + type=str, |
| 32 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 33 | + default=os.path.expanduser("~/.my.cnf") |
| 34 | + ) |
| 35 | + parser.add_argument( |
| 36 | + '-s', '--host', |
| 37 | + type=str, |
| 38 | + help='the database host to connect to (defaults to localhost)', |
| 39 | + default="localhost" |
| 40 | + ) |
| 41 | + parser.add_argument( |
| 42 | + '-d', '--db', |
| 43 | + type=str, |
| 44 | + help='the language db to run the query in (defaults to enwiki)', |
| 45 | + default="enwiki" |
| 46 | + ) |
| 47 | + parser.add_argument( |
| 48 | + '-o', '--out', |
| 49 | + type=lambda fn:open(fn, 'w'), |
| 50 | + help='an output file to write to (defaults to stdout)', |
| 51 | + default=sys.stdout |
| 52 | + ) |
| 53 | + args = parser.parse_args() |
| 54 | + |
| 55 | + LOGGING_STREAM = sys.stderr |
| 56 | + logging.basicConfig( |
| 57 | + level=logging.DEBUG, |
| 58 | + stream=LOGGING_STREAM, |
| 59 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 60 | + datefmt='%b-%d %H:%M:%S' |
| 61 | + ) |
| 62 | + |
| 63 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 64 | + db = Database( |
| 65 | + host=args.host, |
| 66 | + db=args.db, |
| 67 | + read_default_file=args.cnf |
| 68 | + ) |
| 69 | + headers = [ |
| 70 | + 'user_id', |
| 71 | + 'user_name', |
| 72 | + 'first_edit', |
| 73 | + 'last_edit', |
| 74 | + 'edit_count' |
| 75 | + ] |
| 76 | + for i in range(0, args.n): |
| 77 | + headers.append("es_%s_start" % i) |
| 78 | + headers.append("es_%s_end" % i) |
| 79 | + headers.append("es_%s_edits" % i) |
| 80 | + headers.append("es_%s_reverted" % i) |
| 81 | + headers.append("es_%s_vandalism" % i) |
| 82 | + headers.append("es_%s_deleted" % i) |
| 83 | + |
| 84 | + |
| 85 | +    args.out.write("\t".join(headers) + "\n") #write headers to the same stream as the data rows |
| 86 | + |
| 87 | + logging.info("Loading users:") |
| 88 | + |
| 89 | + users = [] |
| 90 | + for user in db.getSampledUsers(): |
| 91 | + users.append(user) |
| 92 | + LOGGING_STREAM.write(".") |
| 93 | + LOGGING_STREAM.write("\n") |
| 94 | + |
| 95 | + logging.info("Processing users:") |
| 96 | + for user in users: |
| 97 | + i = 0 |
| 98 | + for session in sessions(db.getEdits(user['user_id']), args.session): |
| 99 | + user['es_%s_start' % i] = session[0]['timestamp'] |
| 100 | + user['es_%s_end' % i] = session[-1]['timestamp'] |
| 101 | + user['es_%s_edits' % i] = len(session) |
| 102 | + user['es_%s_reverted' % i] = 0 |
| 103 | + user['es_%s_vandalism' % i] = 0 |
| 104 | + user['es_%s_deleted' % i] = 0 |
| 105 | + |
| 106 | + for edit in session: |
| 107 | + user['es_%s_reverted' % i] += edit['is_reverted'] |
| 108 | + user['es_%s_vandalism' % i] += edit['is_vandalism'] |
| 109 | + user['es_%s_deleted' % i] += edit['deleted'] |
| 110 | + |
| 111 | + i += 1 |
| 112 | + if i >= args.n: |
| 113 | + break |
| 114 | + |
| 115 | + |
| 116 | + args.out.write("\t".join(encode(user.get(h)) for h in headers) + "\n") |
| 117 | + LOGGING_STREAM.write(".") |
| 118 | + |
| 119 | + LOGGING_STREAM.write("\n") |
| 120 | + |
| 121 | + |
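| | +#Split a chronological edit stream into sessions: consecutive edits less |
| | +#than sessionThreshold seconds apart share a session. E.g. with a 3600s |
| | +#threshold, edits at t=0, 100 and 4000 form two sessions: [0, 100] and [4000]. |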
| 122 | +def sessions(edits, sessionThreshold=3600): |
| 123 | + sessionEdits = [] |
| 124 | + for edit in edits: |
| 125 | + edit['timestamp'] = wmf.wp2Timestamp(edit['rev_timestamp']) |
| 126 | + if len(sessionEdits) == 0: |
| 127 | + sessionEdits.append(edit) |
| 128 | + elif (edit['timestamp'] - sessionEdits[-1]['timestamp']) < sessionThreshold: |
| 129 | + sessionEdits.append(edit) |
| 130 | + else: |
| 131 | + yield sessionEdits |
| 132 | + sessionEdits = [edit] |
| 133 | + |
| 134 | + |
| 135 | + if len(sessionEdits) > 0: |
| 136 | + yield sessionEdits |
| 137 | + |
| 138 | + |
| 139 | + |
| 140 | + |
| 141 | +class Database: |
| 142 | + |
| 143 | + def __init__(self, *args, **kwargs): |
| 144 | + self.args = args |
| 145 | + self.kwargs = kwargs |
| 146 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 147 | + self.revsConn = MySQLdb.connect(*args, **kwargs) |
| 148 | + self.archConn = MySQLdb.connect(*args, **kwargs) |
| 149 | + |
| 150 | + def getSampledUsers(self): |
| 151 | + cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 152 | + cursor.execute( |
| 153 | + """ |
| 154 | + SELECT |
| 155 | + u.user_id, |
| 156 | + u.user_name, |
| 157 | + um.first_edit, |
| 158 | + um.last_edit, |
| 159 | + u.user_editcount as edit_count |
| 160 | + FROM halfak.user_session_sample us |
| 161 | + INNER JOIN user u |
| 162 | + ON u.user_id = us.user_id |
| 163 | + INNER JOIN halfak.user_meta_20110715 um |
| 164 | + ON u.user_id = um.user_id |
| 165 | + """ |
| 166 | + ) |
| 167 | + for row in cursor: |
| 168 | + yield row |
| 169 | + |
| 170 | + |
| 171 | + |
| 172 | + def getEdits(self, userId, chronologically=True): |
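| | +#Merge live edits (revision) and deleted edits (archive) into one |
| | +#timestamp-ordered stream using two server-side cursors. |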
| 173 | + userId = int(userId) |
| 174 | + revisionCursor = self.revsConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 175 | + archiveCursor = self.archConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 176 | + |
| 177 | + if chronologically: direction = "ASC" |
| 178 | + else: direction = "DESC" |
| 179 | + |
| 180 | + revisionCursor.execute( |
| 181 | + """ |
| 182 | + SELECT |
| 183 | + r.rev_id, |
| 184 | + r.rev_timestamp, |
| 185 | + rvtd.revision_id IS NOT NULL AS is_reverted, |
| 186 | + rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE AS is_vandalism, |
| 187 | + False AS deleted |
| 188 | + FROM revision r |
| 189 | + LEFT JOIN halfak.reverted_20110115 rvtd |
| 190 | + ON r.rev_id = rvtd.revision_id |
| 191 | + WHERE rev_user = %(user_id)s |
| 192 | + ORDER BY r.rev_timestamp """ + direction + """ |
| 193 | + """, |
| 194 | + { |
| 195 | + 'user_id': userId |
| 196 | + } |
| 197 | + ) |
| 198 | + archiveCursor.execute( |
| 199 | + """ |
| 200 | + SELECT |
| 201 | + ar_rev_id AS rev_id, |
| 202 | + ar_timestamp AS rev_timestamp, |
| 203 | + False AS is_reverted, |
| 204 | + False AS is_vandalism, |
| 205 | + True AS deleted |
| 206 | + FROM archive |
| 207 | + WHERE ar_user = %(user_id)s |
| 208 | + ORDER BY ar_timestamp """ + direction + """ |
| 209 | + """, |
| 210 | + { |
| 211 | + 'user_id': userId |
| 212 | + } |
| 213 | + ) |
| 214 | + if chronologically: |
| 215 | + order = lambda t1, t2:t1 < t2 |
| 216 | + else: |
| 217 | + order = lambda t1, t2:t1 > t2 |
| 218 | + |
| 219 | + revPointer = revisionCursor.fetchone() |
| 220 | + archPointer = archiveCursor.fetchone() |
| 221 | + while revPointer != None or archPointer != None: #still something to output |
| 222 | + if revPointer != None and archPointer != None: #both cursors still have something |
| 223 | + if order(revPointer['rev_timestamp'], archPointer['rev_timestamp']): |
| 224 | + yield revPointer |
| 225 | + revPointer = revisionCursor.fetchone() |
| 226 | + else: |
| 227 | + yield archPointer |
| 228 | + archPointer = archiveCursor.fetchone() |
| 229 | + elif revPointer != None: #only revisions left |
| 230 | + yield revPointer |
| 231 | + revPointer = revisionCursor.fetchone() |
| 232 | + elif archPointer != None: #only archives left |
| 233 | + yield archPointer |
| 234 | + archPointer = archiveCursor.fetchone() |
| 235 | + |
| 236 | + revisionCursor.close() |
| 237 | + archiveCursor.close() |
| 238 | + |
| 239 | + |
| 240 | + |
| 241 | +    def getFirstEdits(self, userId, maximum=10000): |
| | +        #'maximum' is accepted for symmetry but currently unused |
| 242 | +        return self.getEdits(userId, chronologically=True) |
| 243 | + |
| 244 | +    def getLastEdits(self, userId, maximum=10000): |
| 245 | +        return self.getEdits(userId, chronologically=False) |
| 246 | + |
| 247 | + |
| 248 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/first_session/R/.Rhistory |
— | — | @@ -0,0 +1,512 @@ |
| 2 | +pch=4, |
| 3 | +lty=4 |
| 4 | +), |
| 5 | +"32"=list( |
| 6 | +col="#00BBBB", |
| 7 | +pch=5, |
| 8 | +lty=5 |
| 9 | +), |
| 10 | +"64"=list( |
| 11 | +col="#BB00BB", |
| 12 | +pch=6, |
| 13 | +lty=6 |
| 14 | +) |
| 15 | +) |
| 16 | +xyplot( |
| 17 | +early_survival ~ year, |
| 18 | +data=limited_year_edits_props, |
| 19 | +groups=es_0_bucket, |
| 20 | +panel=function(x, y, subscripts, groups, ...){ |
| 21 | +f = limited_year_edits_props[subscripts,] |
| 22 | +for(group in groups){ |
| 23 | +group = as.character(group) |
| 24 | +subf = f[f$es_0_bucket == group,] |
| 25 | +p = subf$early_survival |
| 26 | +x = subf$year |
| 27 | +n = subf$n |
| 28 | +panel.xyplot( |
| 29 | +x, p, |
| 30 | +col=params[[group]]$col, |
| 31 | +pch=params[[group]]$pch, |
| 32 | +... |
| 33 | +) |
| 34 | +panel.lines( |
| 35 | +x, p, |
| 36 | +col=params[[group]]$col, |
| 37 | +lwd=2, |
| 38 | +... |
| 39 | +) |
| 40 | +se = sqrt(p*(1-p)/n) |
| 41 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 42 | +} |
| 43 | +}, |
| 44 | +ylim=c(0, 1), |
| 45 | +main="Early survival proportion for new editors grouped by edits in their first session", |
| 46 | +ylab="Proportion of surviving editors", |
| 47 | +xlab="Years", |
| 48 | +sub="early survival = editing more than 1 month after first session", |
| 49 | +auto.key=list( |
| 50 | +text=paste("~", names(params), "edits"), |
| 51 | +col=c( |
| 52 | +"#000000", |
| 53 | +"#FF0000", |
| 54 | +"#00FF00", |
| 55 | +"#0000FF", |
| 56 | +"#BBBB00", |
| 57 | +"#00BBBB", |
| 58 | +"#BB00BB" |
| 59 | +) |
| 60 | +) |
| 61 | +) |
| 62 | +dev.off() |
| 63 | +user_sessions$es_0_no_arch = 2^round(log((user_sessions$es_0_edits - user_sessions$es_0_deleted)+1, base=2)) |
| 64 | +no_arch_edits_props = with( |
| 65 | +summaryBy( |
| 66 | +early_survival ~ year + es_0_no_arch, |
| 67 | +data=user_sessions[ |
| 68 | +!is.na(user_sessions$year) & |
| 69 | +user_sessions$es_0_no_arch <= 256, |
| 70 | +], |
| 71 | +FUN=c(mean, length) |
| 72 | +), |
| 73 | +data.frame( |
| 74 | +year = year, |
| 75 | +es_0_no_arch = es_0_no_arch, |
| 76 | +early_survival = early_survival.mean, |
| 77 | +n = early_survival.length |
| 78 | +) |
| 79 | +) |
| 80 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 81 | +limited_year_edits_props = no_arch_edits_props[ |
| 82 | +no_arch_edits_props$n >= 10 & |
| 83 | +no_arch_edits_props$es_0_no_arch <= 16, |
| 84 | +] |
| 85 | +params = list( |
| 86 | +"0"=list( |
| 87 | +col="#AAAAAA", |
| 88 | +pch=0, |
| 89 | +lty=0 |
| 90 | +), |
| 91 | +"1"=list( |
| 92 | +col="#000000", |
| 93 | +pch=0, |
| 94 | +lty=0 |
| 95 | +), |
| 96 | +"2"=list( |
| 97 | +col="#FF0000", |
| 98 | +pch=1, |
| 99 | +lty=1 |
| 100 | +), |
| 101 | +"4"=list( |
| 102 | +col="#00FF00", |
| 103 | +pch=2, |
| 104 | +lty=2 |
| 105 | +), |
| 106 | +"8"=list( |
| 107 | +col="#0000FF", |
| 108 | +pch=3, |
| 109 | +lty=3 |
| 110 | +), |
| 111 | +"16"=list( |
| 112 | +col="#BBBB00", |
| 113 | +pch=4, |
| 114 | +lty=4 |
| 115 | +) |
| 116 | +) |
| 117 | +xyplot( |
| 118 | +early_survival ~ year, |
| 119 | +data=limited_year_edits_props, |
| 120 | +groups=es_0_no_arch, |
| 121 | +panel=function(x, y, subscripts, groups, ...){ |
| 122 | +f = limited_year_edits_props[subscripts,] |
| 123 | +for(group in groups){ |
| 124 | +group = as.character(group) |
| 125 | +subf = f[f$es_0_no_arch == group,] |
| 126 | +p = subf$early_survival |
| 127 | +x = subf$year |
| 128 | +n = subf$n |
| 129 | +panel.xyplot( |
| 130 | +x, p, |
| 131 | +col=params[[group]]$col, |
| 132 | +pch=params[[group]]$pch, |
| 133 | +... |
| 134 | +) |
| 135 | +panel.lines( |
| 136 | +x, p, |
| 137 | +col=params[[group]]$col, |
| 138 | +lwd=2, |
| 139 | +... |
| 140 | +) |
| 141 | +se = sqrt(p*(1-p)/n) |
| 142 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 143 | +} |
| 144 | +}, |
| 145 | +ylim=c(0, 1), |
| 146 | +main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 147 | +ylab="Proportion of surviving editors", |
| 148 | +xlab="Years", |
| 149 | +sub="early survival = editing more than 1 month after first session", |
| 150 | +auto.key=list( |
| 151 | +text=paste("~", names(params), "edits"), |
| 152 | +col=c( |
| 153 | +"#AAAAAA", |
| 154 | +"#000000", |
| 155 | +"#FF0000", |
| 156 | +"#00FF00", |
| 157 | +"#0000FF", |
| 158 | +"#BBBB00", |
| 159 | +"#00BBBB", |
| 160 | +"#BB00BB" |
| 161 | +) |
| 162 | +) |
| 163 | +) |
| 164 | +dev.off() |
| 165 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 166 | +limited_year_edits_props = no_arch_edits_props[ |
| 167 | +no_arch_edits_props$n >= 10 & |
| 168 | +no_arch_edits_props$es_0_no_arch <= 16, |
| 169 | +] |
| 170 | +params = list( |
| 171 | +"0"=list( |
| 172 | +col="#AAAAAA", |
| 173 | +pch=0, |
| 174 | +lty=0 |
| 175 | +), |
| 176 | +"1"=list( |
| 177 | +col="#000000", |
| 178 | +pch=0, |
| 179 | +lty=0 |
| 180 | +), |
| 181 | +"2"=list( |
| 182 | +col="#FF0000", |
| 183 | +pch=1, |
| 184 | +lty=1 |
| 185 | +), |
| 186 | +"4"=list( |
| 187 | +col="#00FF00", |
| 188 | +pch=2, |
| 189 | +lty=2 |
| 190 | +), |
| 191 | +"8"=list( |
| 192 | +col="#0000FF", |
| 193 | +pch=3, |
| 194 | +lty=3 |
| 195 | +), |
| 196 | +"16"=list( |
| 197 | +col="#BBBB00", |
| 198 | +pch=4, |
| 199 | +lty=4 |
| 200 | +) |
| 201 | +) |
| 202 | +xyplot( |
| 203 | +early_survival ~ year, |
| 204 | +data=limited_year_edits_props, |
| 205 | +groups=es_0_no_arch, |
| 206 | +panel=function(x, y, subscripts, groups, ...){ |
| 207 | +f = limited_year_edits_props[subscripts,] |
| 208 | +for(group in groups){ |
| 209 | +group = as.character(group) |
| 210 | +subf = f[f$es_0_no_arch == group,] |
| 211 | +p = subf$early_survival |
| 212 | +x = subf$year |
| 213 | +n = subf$n |
| 214 | +panel.xyplot( |
| 215 | +x, p, |
| 216 | +col=params[[group]]$col, |
| 217 | +pch=params[[group]]$pch, |
| 218 | +... |
| 219 | +) |
| 220 | +panel.lines( |
| 221 | +x, p, |
| 222 | +col=params[[group]]$col, |
| 223 | +lwd=2, |
| 224 | +... |
| 225 | +) |
| 226 | +se = sqrt(p*(1-p)/n) |
| 227 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 228 | +} |
| 229 | +}, |
| 230 | +ylim=c(0, 1), |
| 231 | +main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 232 | +ylab="Proportion of surviving editors", |
| 233 | +xlab="Years", |
| 234 | +sub="early survival = editing more than 1 month after first session", |
| 235 | +auto.key=list( |
| 236 | +text=paste("~", names(params), "edits"), |
| 237 | +col=c( |
| 238 | +"#AAAAAA", |
| 239 | +"#000000", |
| 240 | +"#FF0000", |
| 241 | +"#00FF00", |
| 242 | +"#0000FF", |
| 243 | +"#BBBB00", |
| 244 | +"#00BBBB", |
| 245 | +"#BB00BB" |
| 246 | +), |
| 247 | +points=F |
| 248 | +) |
| 249 | +) |
| 250 | +dev.off() |
| 251 | +png("plots/early_survival.by_year.es_lines.png", height=768, width=1024) |
| 252 | +limited_year_edits_props = year_edits_props[ |
| 253 | +year_edits_props$n >= 10 & |
| 254 | +year_edits_props$es_0_bucket <= 16, |
| 255 | +] |
| 256 | +params = list( |
| 257 | +"1"=list( |
| 258 | +col="#000000", |
| 259 | +pch=0, |
| 260 | +lty=0 |
| 261 | +), |
| 262 | +"2"=list( |
| 263 | +col="#FF0000", |
| 264 | +pch=1, |
| 265 | +lty=1 |
| 266 | +), |
| 267 | +"4"=list( |
| 268 | +col="#00FF00", |
| 269 | +pch=2, |
| 270 | +lty=2 |
| 271 | +), |
| 272 | +"8"=list( |
| 273 | +col="#0000FF", |
| 274 | +pch=3, |
| 275 | +lty=3 |
| 276 | +), |
| 277 | +"16"=list( |
| 278 | +col="#BBBB00", |
| 279 | +pch=4, |
| 280 | +lty=4 |
| 281 | +), |
| 282 | +"32"=list( |
| 283 | +col="#00BBBB", |
| 284 | +pch=5, |
| 285 | +lty=5 |
| 286 | +), |
| 287 | +"64"=list( |
| 288 | +col="#BB00BB", |
| 289 | +pch=6, |
| 290 | +lty=6 |
| 291 | +) |
| 292 | +) |
| 293 | +xyplot( |
| 294 | +early_survival ~ year, |
| 295 | +data=limited_year_edits_props, |
| 296 | +groups=es_0_bucket, |
| 297 | +panel=function(x, y, subscripts, groups, ...){ |
| 298 | +f = limited_year_edits_props[subscripts,] |
| 299 | +for(group in groups){ |
| 300 | +group = as.character(group) |
| 301 | +subf = f[f$es_0_bucket == group,] |
| 302 | +p = subf$early_survival |
| 303 | +x = subf$year |
| 304 | +n = subf$n |
| 305 | +panel.xyplot( |
| 306 | +x, p, |
| 307 | +col=params[[group]]$col, |
| 308 | +pch=params[[group]]$pch, |
| 309 | +... |
| 310 | +) |
| 311 | +panel.lines( |
| 312 | +x, p, |
| 313 | +col=params[[group]]$col, |
| 314 | +lwd=2, |
| 315 | +... |
| 316 | +) |
| 317 | +se = sqrt(p*(1-p)/n) |
| 318 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 319 | +} |
| 320 | +}, |
| 321 | +ylim=c(0, 1), |
| 322 | +main="Early survival proportion for new editors grouped by edits in their first session", |
| 323 | +ylab="Proportion of surviving editors", |
| 324 | +xlab="Years", |
| 325 | +sub="early survival = editing more than 1 month after first session", |
| 326 | +auto.key=list( |
| 327 | +text=paste("~", names(params), "edits"), |
| 328 | +col=c( |
| 329 | +"#000000", |
| 330 | +"#FF0000", |
| 331 | +"#00FF00", |
| 332 | +"#0000FF", |
| 333 | +"#BBBB00", |
| 334 | +"#00BBBB", |
| 335 | +"#BB00BB" |
| 336 | +), |
| 337 | +points=F |
| 338 | +) |
| 339 | +) |
| 340 | +dev.off() |
| 341 | +user_sessions$es_0_no_arch = 2^round(log(user_sessions$es_0_edits - user_sessions$es_0_deleted, base=2)) |
| 342 | +no_arch_edits_props = with( |
| 343 | +summaryBy( |
| 344 | +early_survival ~ year + es_0_no_arch, |
| 345 | +data=user_sessions[ |
| 346 | +!is.na(user_sessions$year) & |
| 347 | +user_sessions$es_0_no_arch <= 256, |
| 348 | +], |
| 349 | +FUN=c(mean, length) |
| 350 | +), |
| 351 | +data.frame( |
| 352 | +year = year, |
| 353 | +es_0_no_arch = es_0_no_arch, |
| 354 | +early_survival = early_survival.mean, |
| 355 | +n = early_survival.length |
| 356 | +) |
| 357 | +) |
| 358 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 359 | +limited_year_edits_props = no_arch_edits_props[ |
| 360 | +no_arch_edits_props$n >= 10 & |
| 361 | +no_arch_edits_props$es_0_no_arch <= 16, |
| 362 | +] |
| 363 | +params = list( |
| 364 | +"0"=list( |
| 365 | +col="#AAAAAA", |
| 366 | +pch=0, |
| 367 | +lty=0 |
| 368 | +), |
| 369 | +"1"=list( |
| 370 | +col="#000000", |
| 371 | +pch=0, |
| 372 | +lty=0 |
| 373 | +), |
| 374 | +"2"=list( |
| 375 | +col="#FF0000", |
| 376 | +pch=1, |
| 377 | +lty=1 |
| 378 | +), |
| 379 | +"4"=list( |
| 380 | +col="#00FF00", |
| 381 | +pch=2, |
| 382 | +lty=2 |
| 383 | +), |
| 384 | +"8"=list( |
| 385 | +col="#0000FF", |
| 386 | +pch=3, |
| 387 | +lty=3 |
| 388 | +), |
| 389 | +"16"=list( |
| 390 | +col="#BBBB00", |
| 391 | +pch=4, |
| 392 | +lty=4 |
| 393 | +) |
| 394 | +) |
| 395 | +xyplot( |
| 396 | +early_survival ~ year, |
| 397 | +data=limited_year_edits_props, |
| 398 | +groups=es_0_no_arch, |
| 399 | +panel=function(x, y, subscripts, groups, ...){ |
| 400 | +f = limited_year_edits_props[subscripts,] |
| 401 | +for(group in groups){ |
| 402 | +group = as.character(group) |
| 403 | +subf = f[f$es_0_no_arch == group,] |
| 404 | +p = subf$early_survival |
| 405 | +x = subf$year |
| 406 | +n = subf$n |
| 407 | +panel.xyplot( |
| 408 | +x, p, |
| 409 | +col=params[[group]]$col, |
| 410 | +pch=params[[group]]$pch, |
| 411 | +... |
| 412 | +) |
| 413 | +panel.lines( |
| 414 | +x, p, |
| 415 | +col=params[[group]]$col, |
| 416 | +lwd=2, |
| 417 | +... |
| 418 | +) |
| 419 | +se = sqrt(p*(1-p)/n) |
| 420 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 421 | +} |
| 422 | +}, |
| 423 | +ylim=c(0, 1), |
| 424 | +main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 425 | +ylab="Proportion of surviving editors", |
| 426 | +xlab="Years", |
| 427 | +sub="early survival = editing more than 1 month after first session", |
| 428 | +auto.key=list( |
| 429 | +text=paste("~", names(params), "edits"), |
| 430 | +col=c( |
| 431 | +"#AAAAAA", |
| 432 | +"#000000", |
| 433 | +"#FF0000", |
| 434 | +"#00FF00", |
| 435 | +"#0000FF", |
| 436 | +"#BBBB00", |
| 437 | +"#00BBBB", |
| 438 | +"#BB00BB" |
| 439 | +), |
| 440 | +points=F |
| 441 | +) |
| 442 | +) |
| 443 | +dev.off() |
| 444 | +es_0_bucket = 10^floor(log(user_sessions$es_0_edits, base=10)) |
| 445 | +table(es_0_bucket) |
| 446 | +three_es_buckets = with( |
| 447 | +summaryBy( |
| 448 | +es_0_edits + |
| 449 | +es_1_edits + |
| 450 | +es_2_edits ~ |
| 451 | +year + es_0_bucket, |
| 452 | +data=user_sessions, |
| 453 | +FUN=c(mean, sd, length) |
| 454 | +), |
| 455 | +data.frame( |
| 456 | +year = year |
| 457 | +es_0_bucket = es_0_bucket, |
| 458 | +es_0_mean = es_0_edits.mean, |
| 459 | +es_0_sd = es_0_edits.sd, |
| 460 | +es_0_n = es_0_edits.length, |
| 461 | +es_1_mean = es_1_edits.mean, |
| 462 | +es_1_sd = es_1_edits.sd, |
| 463 | +es_1_n = es_1_edits.length, |
| 464 | +es_2_mean = es_2_edits.mean, |
| 465 | +es_2_sd = es_2_edits.sd, |
| 466 | +es_2_n = es_2_edits.length |
| 467 | +) |
| 468 | +) |
| | +three_es_buckets = with( |
| 469 | +summaryBy( |
| 470 | +es_0_edits + |
| 471 | +es_1_edits + |
| 472 | +es_2_edits ~ |
| 473 | +year + es_0_bucket, |
| 474 | +data=user_sessions, |
| 475 | +FUN=c(mean, sd, length) |
| 476 | +), |
| 477 | +data.frame( |
| 478 | +year = year, |
| 479 | +bucket = es_0_bucket, |
| 480 | +es_0_mean = es_0_edits.mean, |
| 481 | +es_0_sd = es_0_edits.sd, |
| 482 | +es_0_n = es_0_edits.length, |
| 483 | +es_1_mean = es_1_edits.mean, |
| 484 | +es_1_sd = es_1_edits.sd, |
| 485 | +es_1_n = es_1_edits.length, |
| 486 | +es_2_mean = es_2_edits.mean, |
| 487 | +es_2_sd = es_2_edits.sd, |
| 488 | +es_2_n = es_2_edits.length |
| 489 | +) |
| 490 | +three_es_buckets = with( |
| 491 | +summaryBy( |
| 492 | +es_0_edits + |
| 493 | +es_1_edits + |
| 494 | +es_2_edits ~ |
| 495 | +year + es_0_bucket, |
| 496 | +data=user_sessions, |
| 497 | +FUN=c(mean, sd, length) |
| 498 | +), |
| 499 | +data.frame( |
| 500 | +year = year, |
| 501 | +bucket = es_0_bucket, |
| 502 | +es_0_mean = es_0_edits.mean, |
| 503 | +es_0_sd = es_0_edits.sd, |
| 504 | +es_0_n = es_0_edits.length, |
| 505 | +es_1_mean = es_1_edits.mean, |
| 506 | +es_1_sd = es_1_edits.sd, |
| 507 | +es_1_n = es_1_edits.length, |
| 508 | +es_2_mean = es_2_edits.mean, |
| 509 | +es_2_sd = es_2_edits.sd, |
| 510 | +es_2_n = es_2_edits.length |
| 511 | +) |
| 512 | +) |
| 513 | +three_es_buckets |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.no_archive.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.no_archive.png |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 514 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_first_session.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_first_session.png |
___________________________________________________________________ |
Added: svn:mime-type |
2 | 515 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.png |
___________________________________________________________________ |
Added: svn:mime-type |
3 | 516 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.png |
___________________________________________________________________ |
Added: svn:mime-type |
4 | 517 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.png |
___________________________________________________________________ |
Added: svn:mime-type |
5 | 518 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.no_vandals.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.no_vandals.png |
___________________________________________________________________ |
Added: svn:mime-type |
6 | 519 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.png |
___________________________________________________________________ |
Added: svn:mime-type |
7 | 520 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.boxplot.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.boxplot.png |
___________________________________________________________________ |
Added: svn:mime-type |
8 | 521 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_10.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_10.png |
___________________________________________________________________ |
Added: svn:mime-type |
9 | 522 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_100.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_100.png |
___________________________________________________________________ |
Added: svn:mime-type |
10 | 523 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.no_vandals.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.no_vandals.png |
___________________________________________________________________ |
Added: svn:mime-type |
11 | 524 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/edit_sessions.by_year_and_es_0_bucket.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_sessions.by_year_and_es_0_bucket.png |
___________________________________________________________________ |
Added: svn:mime-type |
12 | 525 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.prop.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.prop.png |
___________________________________________________________________ |
Added: svn:mime-type |
13 | 526 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/first_session_characteristics.R |
— | — | @@ -0,0 +1,132 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | +user_sessions = load_user_sessions() |
| 8 | +user_sessions$year = strftime(user_sessions$first_edit, format="%Y") |
| 9 | +user_sessions$early_survival = user_sessions$last_edit - user_sessions$es_0_end >= 30 |
| 10 | + |
| 11 | +user_sessions$es_0_bucket = 10^floor(log(user_sessions$es_0_edits, base=10)) |
| 12 | +user_sessions$es_1_edits = naReplace(user_sessions$es_1_edits, 0) |
| 13 | +user_sessions$es_2_edits = naReplace(user_sessions$es_2_edits, 0) |
| 14 | + |
| 15 | + |
| 16 | + |
| 17 | + |
| 18 | + |
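| | +# Mean, sd, and count of edits in each of the first three sessions, per |
| | +# year and first-session-size bucket, reshaped to long form for plotting. |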
| 19 | +three_es_buckets = with( |
| 20 | + summaryBy( |
| 21 | + es_0_edits + |
| 22 | + es_1_edits + |
| 23 | + es_2_edits ~ |
| 24 | + year + es_0_bucket, |
| 25 | + data=user_sessions, |
| 26 | + FUN=c(mean, sd, length) |
| 27 | + ), |
| 28 | + rbind( |
| 29 | + data.frame( |
| 30 | + year = year, |
| 31 | + bucket = es_0_bucket, |
| 32 | + es = 0, |
| 33 | + mean = es_0_edits.mean, |
| 34 | + sd = es_0_edits.sd, |
| 35 | + n = es_0_edits.length |
| 36 | + ), |
| 37 | + data.frame( |
| 38 | + year = year, |
| 39 | + bucket = es_0_bucket, |
| 40 | + es = 1, |
| 41 | + mean = es_1_edits.mean, |
| 42 | + sd = es_1_edits.sd, |
| 43 | + n = es_1_edits.length |
| 44 | + ), |
| 45 | + data.frame( |
| 46 | + year = year, |
| 47 | + bucket = es_0_bucket, |
| 48 | + es = 2, |
| 49 | + mean = es_2_edits.mean, |
| 50 | + sd = es_2_edits.sd, |
| 51 | + n = es_2_edits.length |
| 52 | + ) |
| 53 | + ) |
| 54 | +) |
| 55 | + |
| 56 | + |
| 57 | +png("plots/edit_sessions.by_year_and_es_0_bucket.png", height=768, width=1024) |
| 58 | +limited_three_es_buckets = three_es_buckets[ |
| 59 | + three_es_buckets$n >= 10 & |
| 60 | + three_es_buckets$bucket <= 16, |
| 61 | +] |
| 62 | +params = list( |
| 63 | + "1"=list( |
| 64 | + col="#000000", |
| 65 | + pch=0, |
| 66 | + lty=0 |
| 67 | + ), |
| 68 | + "2"=list( |
| 69 | + col="#FF0000", |
| 70 | + pch=1, |
| 71 | + lty=1 |
| 72 | + ), |
| 73 | + "4"=list( |
| 74 | + col="#00FF00", |
| 75 | + pch=2, |
| 76 | + lty=2 |
| 77 | + ), |
| 78 | + "8"=list( |
| 79 | + col="#0000FF", |
| 80 | + pch=3, |
| 81 | + lty=3 |
| 82 | + ), |
| 83 | + "16"=list( |
| 84 | + col="#BBBB00", |
| 85 | + pch=4, |
| 86 | + lty=4 |
| 87 | + ) |
| 88 | +) |
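| | +# One line per first-session bucket; arrows mark +/- 1 standard error of |
| | +# the mean (sd/sqrt(n)). |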
| 89 | +xyplot( |
| 90 | + mean ~ es | as.factor(year), |
| 91 | + data=limited_three_es_buckets, |
| 92 | + groups=bucket, |
| 93 | + panel=function(x, y, subscripts, groups, ...){ |
| 94 | + f = limited_three_es_buckets[subscripts,] |
| 95 | + for(group in groups){ |
| 96 | + group = as.character(group) |
| 97 | + subf = f[f$bucket == group,] |
| 98 | + y = subf$mean |
| 99 | + x = subf$es |
| 100 | + n = subf$n |
| 101 | + sd = subf$sd |
| 102 | + se = sd/sqrt(n) |
| 103 | + panel.xyplot( |
| 104 | + x, y, |
| 105 | + col=params[[group]]$col, |
| 106 | + pch=params[[group]]$pch, |
| 107 | + ... |
| 108 | + ) |
| 109 | + panel.lines( |
| 110 | + x, y, |
| 111 | + col=params[[group]]$col, |
| 112 | + lwd=2, |
| 113 | + ... |
| 114 | + ) |
| 115 | + panel.arrows(x, y+se, x, y-se, ends="both", col="#777777", angle=90, length=.01) |
| 116 | + } |
| 117 | + }, |
| 118 | + main="Session activity by editor first session group", |
| 119 | + ylab="Average session edits", |
| 120 | + xlab="Edit session", |
| 121 | + auto.key=list( |
| 122 | + text=paste("~", names(params), "edits"), |
| 123 | + col=c( |
| 124 | + "#000000", |
| 125 | + "#FF0000", |
| 126 | + "#00FF00", |
| 127 | + "#0000FF", |
| 128 | + "#BBBB00" |
| 129 | + ), |
| 130 | + points=F |
| 131 | + ) |
| 132 | +) |
| 133 | +dev.off() |
Index: trunk/tools/wsor/first_session/R/first_session_survival.R |
— | — | @@ -0,0 +1,457 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | +user_sessions = load_user_sessions() |
| 8 | +user_sessions$year = strftime(user_sessions$first_edit, format="%Y") |
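| | +# "Early survival": the last edit falls at least 30 days after the end of |
| | +# the first edit session (see plot subtitles). |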
| 9 | +user_sessions$early_survival = user_sessions$last_edit - user_sessions$es_0_end >= 30 |
| 10 | + |
| 11 | +year_props = with( |
| 12 | + summaryBy( |
| 13 | + early_survival ~ year, |
| 14 | + data=user_sessions[!is.na(user_sessions$year),], |
| 15 | + FUN=c(mean, length) |
| 16 | + ), |
| 17 | + data.frame( |
| 18 | + year = year, |
| 19 | + early_survival = early_survival.mean, |
| 20 | + n = early_survival.length |
| 21 | + ) |
| 22 | +) |
| 23 | + |
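| | +# Error bars below use the binomial standard error sqrt(p*(1-p)/n). |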
| 24 | +png("plots/early_survival.by_year.png", height=768, width=1024) |
| 25 | +xyplot( |
| 26 | + early_survival ~ year, |
| 27 | + data=year_props, |
| 28 | + panel=function(x, y, subscripts, ...){ |
| 29 | + f = year_props[subscripts,] |
| 30 | + panel.xyplot(x, y, ...) |
| 31 | + panel.lines(x, y, ...) |
| 32 | + x = f$year |
| 33 | + p = f$early_survival |
| 34 | + n = f$n |
| 35 | + se = sqrt(p*(1-p)/n) |
| 36 | + panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1) |
| 37 | + }, |
| 38 | + ylim=c(0, 1), |
| 39 | + main="Early survival proportion for new editors", |
| 40 | + ylab="Proportion of surviving editors", |
| 41 | + xlab="Year", |
| 42 | + sub="early survival = editing more than 1 month after first session" |
| 43 | +) |
| 44 | +dev.off() |
| 45 | + |
| 46 | +year_props.no_vandal = with( |
| 47 | + summaryBy( |
| 48 | + early_survival ~ year, |
| 49 | + data=user_sessions[ |
| 50 | + !is.na(user_sessions$year) & |
| 51 | + user_sessions$es_0_edits >= 2 & |
| 52 | + user_sessions$es_0_vandalism / user_sessions$es_0_edits <= .25, |
| 53 | + ], |
| 54 | + FUN=c(mean, length) |
| 55 | + ), |
| 56 | + data.frame( |
| 57 | + year = year, |
| 58 | + early_survival = early_survival.mean, |
| 59 | + n = early_survival.length |
| 60 | + ) |
| 61 | +) |
| 62 | + |
| 63 | +png("plots/early_survival.by_year.no_vandals.png", height=768, width=1024) |
| 64 | +xyplot( |
| 65 | + early_survival ~ year, |
| 66 | + data=year_props.no_vandal, |
| 67 | + panel=function(x, y, subscripts, ...){ |
| 68 | + f = year_props.no_vandal[subscripts,] |
| 69 | + panel.xyplot(x, y, ...) |
| 70 | + panel.lines(x, y, ...) |
| 71 | + x = f$year |
| 72 | + p = f$early_survival |
| 73 | + n = f$n |
| 74 | + se = sqrt(p*(1-p)/n) |
| 75 | + panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1) |
| 76 | + }, |
| 77 | + ylim=c(0, 1), |
| 78 | + main="Early survival proportion for new editors (no vandals)", |
| 79 | + ylab="Proportion of surviving editors", |
| 80 | + xlab="Year", |
| 81 | + sub="early survival = editing more than 1 month after first session" |
| 82 | +) |
| 83 | +dev.off() |
| 84 | + |
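| | +# Bucket first-session edit counts to the nearest power of two. |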
| 85 | +user_sessions$es_0_bucket = 2^round(log(user_sessions$es_0_edits, base=2)) |
| 86 | + |
| 87 | +year_edits_props = with( |
| 88 | + summaryBy( |
| 89 | + early_survival ~ year + es_0_bucket, |
| 90 | + data=user_sessions[ |
| 91 | + !is.na(user_sessions$year) & |
| 92 | + user_sessions$es_0_bucket <= 256, |
| 93 | + ], |
| 94 | + FUN=c(mean, length) |
| 95 | + ), |
| 96 | + data.frame( |
| 97 | + year = year, |
| 98 | + es_0_bucket = es_0_bucket, |
| 99 | + early_survival = early_survival.mean, |
| 100 | + n = early_survival.length |
| 101 | + ) |
| 102 | +) |
| 103 | + |
| 104 | +png("plots/early_survival.by_year_and_first_session.png", height=768, width=1024) |
| 105 | +xyplot( |
| 106 | + early_survival ~ es_0_bucket | as.factor(year), |
| 107 | + data=year_edits_props, |
| 108 | + panel=function(x, y, subscripts, ...){ |
| 109 | + f = year_edits_props[subscripts,] |
| 110 | + panel.xyplot(x, y, ...) |
| 111 | + x = log(f$es_0_bucket, base=2) |
| 112 | + p = f$early_survival |
| 113 | + n = f$n |
| 114 | + se = sqrt(p*(1-p)/n) |
| 115 | + panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1) |
| 116 | + panel.lines(-5:10, .2, col="#BBBBBB") |
| 117 | + panel.lines(-5:10, .4, col="#BBBBBB") |
| 118 | + panel.lines(-5:10, .6, col="#BBBBBB") |
| 119 | + panel.lines(-5:10, .8, col="#BBBBBB") |
| 120 | + }, |
| 121 | + ylim=c(0, 1), |
| 122 | + main="Early survival proportion for new editors by first session edits", |
| 123 | + ylab="Proportion of surviving editors", |
| 124 | + xlab="First session edits", |
| 125 | + sub="early survival = editing more than 1 month after first session", |
| 126 | + scales=list(x=list(log=2, at=2^(0:8))), |
| 127 | + xlim=c(.5, 300) |
| 128 | +) |
| 129 | +dev.off() |
| 130 | + |
| 131 | +png("plots/early_survival.by_year.es_lines.png", height=768, width=1024) |
| 132 | +limited_year_edits_props = year_edits_props[ |
| 133 | + year_edits_props$n >= 10 & |
| 134 | + year_edits_props$es_0_bucket <= 16, |
| 135 | +] |
| 136 | +params = list( |
| 137 | + "1"=list( |
| 138 | + col="#000000", |
| 139 | + pch=0, |
| 140 | + lty=0 |
| 141 | + ), |
| 142 | + "2"=list( |
| 143 | + col="#FF0000", |
| 144 | + pch=1, |
| 145 | + lty=1 |
| 146 | + ), |
| 147 | + "4"=list( |
| 148 | + col="#00FF00", |
| 149 | + pch=2, |
| 150 | + lty=2 |
| 151 | + ), |
| 152 | + "8"=list( |
| 153 | + col="#0000FF", |
| 154 | + pch=3, |
| 155 | + lty=3 |
| 156 | + ), |
| 157 | + "16"=list( |
| 158 | + col="#BBBB00", |
| 159 | + pch=4, |
| 160 | + lty=4 |
| 161 | + ), |
| 162 | + "32"=list( |
| 163 | + col="#00BBBB", |
| 164 | + pch=5, |
| 165 | + lty=5 |
| 166 | + ), |
| 167 | + "64"=list( |
| 168 | + col="#BB00BB", |
| 169 | + pch=6, |
| 170 | + lty=6 |
| 171 | + ) |
| 172 | +) |
| 173 | +xyplot( |
| 174 | + early_survival ~ year, |
| 175 | + data=limited_year_edits_props, |
| 176 | + groups=es_0_bucket, |
| 177 | + panel=function(x, y, subscripts, groups, ...){ |
| 178 | + f = limited_year_edits_props[subscripts,] |
| 179 | + for(group in groups){ |
| 180 | + group = as.character(group) |
| 181 | + subf = f[f$es_0_bucket == group,] |
| 182 | + p = subf$early_survival |
| 183 | + x = subf$year |
| 184 | + n = subf$n |
| 185 | + panel.xyplot( |
| 186 | + x, p, |
| 187 | + col=params[[group]]$col, |
| 188 | + pch=params[[group]]$pch, |
| 189 | + ... |
| 190 | + ) |
| 191 | + panel.lines( |
| 192 | + x, p, |
| 193 | + col=params[[group]]$col, |
| 194 | + lwd=2, |
| 195 | + ... |
| 196 | + ) |
| 197 | + se = sqrt(p*(1-p)/n) |
| 198 | + panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 199 | + } |
| 200 | + }, |
| 201 | + ylim=c(0, 1), |
| 202 | + main="Early survival proportion for new editors grouped by edits in their first session", |
| 203 | + ylab="Proportion of surviving editors", |
| 204 | + xlab="Years", |
| 205 | + sub="early survival = editing more than 1 month after first session", |
| 206 | + auto.key=list( |
| 207 | + text=paste("~", names(params), "edits"), |
| 208 | + col=c( |
| 209 | + "#000000", |
| 210 | + "#FF0000", |
| 211 | + "#00FF00", |
| 212 | + "#0000FF", |
| 213 | + "#BBBB00", |
| 214 | + "#00BBBB", |
| 215 | + "#BB00BB" |
| 216 | + ), |
| 217 | + points=F |
| 218 | + ) |
| 219 | +) |
| 220 | +dev.off() |
| 221 | + |
| 222 | + |
| 223 | +user_sessions$es_0_no_arch = 2^round(log(user_sessions$es_0_edits - user_sessions$es_0_deleted, base=2)) |
| 224 | + |
| 225 | +no_arch_edits_props = with( |
| 226 | + summaryBy( |
| 227 | + early_survival ~ year + es_0_no_arch, |
| 228 | + data=user_sessions[ |
| 229 | + !is.na(user_sessions$year) & |
| 230 | + user_sessions$es_0_no_arch <= 256, |
| 231 | + ], |
| 232 | + FUN=c(mean, length) |
| 233 | + ), |
| 234 | + data.frame( |
| 235 | + year = year, |
| 236 | + es_0_no_arch = es_0_no_arch, |
| 237 | + early_survival = early_survival.mean, |
| 238 | + n = early_survival.length |
| 239 | + ) |
| 240 | +) |
| 241 | + |
| 242 | + |
| 243 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 244 | +limited_year_edits_props = no_arch_edits_props[ |
| 245 | + no_arch_edits_props$n >= 10 & |
| 246 | + no_arch_edits_props$es_0_no_arch <= 16, |
| 247 | +] |
| 248 | +params = list( |
| 249 | + "0"=list( |
| 250 | + col="#AAAAAA", |
| 251 | + pch=0, |
| 252 | + lty=0 |
| 253 | + ), |
| 254 | + "1"=list( |
| 255 | + col="#000000", |
| 256 | + pch=0, |
| 257 | + lty=0 |
| 258 | + ), |
| 259 | + "2"=list( |
| 260 | + col="#FF0000", |
| 261 | + pch=1, |
| 262 | + lty=1 |
| 263 | + ), |
| 264 | + "4"=list( |
| 265 | + col="#00FF00", |
| 266 | + pch=2, |
| 267 | + lty=2 |
| 268 | + ), |
| 269 | + "8"=list( |
| 270 | + col="#0000FF", |
| 271 | + pch=3, |
| 272 | + lty=3 |
| 273 | + ), |
| 274 | + "16"=list( |
| 275 | + col="#BBBB00", |
| 276 | + pch=4, |
| 277 | + lty=4 |
| 278 | + ) |
| 279 | +) |
| 280 | +xyplot( |
| 281 | + early_survival ~ year, |
| 282 | + data=limited_year_edits_props, |
| 283 | + groups=es_0_no_arch, |
| 284 | + panel=function(x, y, subscripts, groups, ...){ |
| 285 | + f = limited_year_edits_props[subscripts,] |
| 286 | + for(group in groups){ |
| 287 | + group = as.character(group) |
| 288 | + subf = f[f$es_0_no_arch == group,] |
| 289 | + p = subf$early_survival |
| 290 | + x = subf$year |
| 291 | + n = subf$n |
| 292 | + panel.xyplot( |
| 293 | + x, p, |
| 294 | + col=params[[group]]$col, |
| 295 | + pch=params[[group]]$pch, |
| 296 | + ... |
| 297 | + ) |
| 298 | + panel.lines( |
| 299 | + x, p, |
| 300 | + col=params[[group]]$col, |
| 301 | + lwd=2, |
| 302 | + ... |
| 303 | + ) |
| 304 | + se = sqrt(p*(1-p)/n) |
| 305 | + panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 306 | + } |
| 307 | + }, |
| 308 | + ylim=c(0, 1), |
| 309 | + main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 310 | + ylab="Proportion of surviving editors", |
| 311 | + xlab="Years", |
| 312 | + sub="early survival = editing more than 1 month after first session", |
| 313 | + auto.key=list( |
| 314 | + text=paste("~", names(params), "edits"), |
| 315 | + col=c( |
| 316 | + "#AAAAAA", |
| 317 | + "#000000", |
| 318 | + "#FF0000", |
| 319 | + "#00FF00", |
| 320 | + "#0000FF", |
| 321 | + "#BBBB00", |
| 322 | + "#00BBBB", |
| 323 | + "#BB00BB" |
| 324 | + ), |
| 325 | + points=F |
| 326 | + ) |
| 327 | +) |
| 328 | +dev.off() |
| 329 | + |
| 330 | + |
| 331 | +user_sessions$years_since_2001 = as.numeric((user_sessions$first_edit - as.POSIXct("2001-01-01"))/365) |
| 332 | +user_sessions$initial_rejection = with( |
| 333 | + user_sessions, |
| 334 | + ( |
| 335 | + naReplace(es_0_deleted, 0) + naReplace(es_0_reverted, 0) + |
| 336 | + naReplace(es_1_deleted, 0) + naReplace(es_1_reverted, 0) + |
| 337 | + naReplace(es_2_deleted, 0) + naReplace(es_2_reverted, 0) |
| 338 | + )/( |
| 339 | + naReplace(es_0_edits, 0) + |
| 340 | + naReplace(es_1_edits, 0) + |
| 341 | + naReplace(es_2_edits, 0) |
| 342 | + ) |
| 343 | +) |
| 344 | +sc = scale |
| 345 | +summary(glm( |
| 346 | + early_survival ~ |
| 347 | + sc(es_0_edits) * |
| 348 | + sc(years_since_2001) * |
| 349 | + sc(initial_rejection), |
| 350 | + data=user_sessions[ |
| 351 | + user_sessions$es_0_edits > 3, |
| 352 | + ], |
| 353 | + family=binomial(link="logit") |
| 354 | +)) |
| 355 | + |
| 356 | + |
| 357 | +user_sessions$initial_rejection_group = round(user_sessions$initial_rejection/2, 1)*2 |
| 358 | + |
| 359 | +survival_by_year_and_rejection = with( |
| 360 | + summaryBy( |
| 361 | + early_survival ~ year + initial_rejection_group, |
| 362 | + data=user_sessions[ |
| 363 | + user_sessions$es_0_edits > 3 & |
| 364 | + user_sessions$es_0_vandalism == 0, |
| 365 | + ], |
| 366 | + FUN=c(mean, length) |
| 367 | + ), |
| 368 | + data.frame( |
| 369 | + year = year, |
| 370 | + rejection_group = initial_rejection_group, |
| 371 | + early_survival = early_survival.mean, |
| 372 | + n = early_survival.length |
| 373 | + ) |
| 374 | +) |
| 375 | + |
| 376 | +png("plots/early_survival.by_year_and_rejection.no_vandals.png", height=768, width=1024) |
| 377 | +limited_frame = survival_by_year_and_rejection[ |
| 378 | + survival_by_year_and_rejection$n >= 10, |
| 379 | +] |
| 380 | +params = list( |
| 381 | + "0"=list( |
| 382 | + col="#AAAAAA", |
| 383 | + pch=0, |
| 384 | + lty=0 |
| 385 | + ), |
| 386 | + "0.2"=list( |
| 387 | + col="#FF0000", |
| 388 | + pch=1, |
| 389 | + lty=1 |
| 390 | + ), |
| 391 | + "0.4"=list( |
| 392 | + col="#0000FF", |
| 393 | + pch=3, |
| 394 | + lty=3 |
| 395 | + ), |
| 396 | + "0.6"=list( |
| 397 | + col="#00BBBB", |
| 398 | + pch=5, |
| 399 | + lty=4 |
| 400 | + ), |
| 401 | + "0.8"=list( |
| 402 | + col="#BB0000", |
| 403 | + pch=7, |
| 404 | + lty=4 |
| 405 | + ), |
| 406 | + "1"=list( |
| 407 | + col="#00BB00", |
| 408 | + pch=9, |
| 409 | + lty=4 |
| 410 | + ) |
| 411 | +) |
| 412 | +xyplot( |
| 413 | + early_survival ~ year, |
| 414 | + data=limited_frame, |
| 415 | + groups=rejection_group, |
| 416 | + panel=function(x, y, subscripts, groups, ...){ |
| 417 | + f = limited_frame[subscripts,] |
| 418 | + for(group in groups){ |
| 419 | + group = as.character(group) |
| 420 | + subf = f[f$rejection_group == group,] |
| 421 | + p = subf$early_survival |
| 422 | + x = subf$year |
| 423 | + n = subf$n |
| 424 | + panel.xyplot( |
| 425 | + x, p, |
| 426 | + col=params[[group]]$col, |
| 427 | + pch=params[[group]]$pch, |
| 428 | + ... |
| 429 | + ) |
| 430 | + panel.lines( |
| 431 | + x, p, |
| 432 | + col=params[[group]]$col, |
| 433 | + lwd=2, |
| 434 | + ... |
| 435 | + ) |
| 436 | + se = sqrt(p*(1-p)/n) |
| 437 | + panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 438 | + } |
| 439 | + }, |
| 440 | + ylim=c(0, 1), |
| 441 | + main="Early survival proportion for new editors grouped by early rejection proportion", |
| 442 | + ylab="Proportion of surviving editors", |
| 443 | + xlab="Years", |
| 444 | + sub="early survival = editing more than 1 month after first session\nrejection = proportion of revisions reverted or deleted in first edit sessions.", |
| 445 | + auto.key=list( |
| 446 | + text=paste("~", names(params), "rejection"),
| 447 | + col=c( |
| 448 | + "#AAAAAA", |
| 449 | + "#FF0000", |
| 450 | + "#0000FF", |
| 451 | + "#00BBBB", |
| 452 | + "#BB0000", |
| 453 | + "#00BB00" |
| 454 | + ), |
| 455 | + points=F |
| 456 | + ) |
| 457 | +) |
| 458 | +dev.off() |
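
The error bars drawn by panel.arrows() in the panel functions above come from the normal approximation to the binomial: for a survival proportion p estimated from n editors, se = sqrt(p*(1-p)/n). A minimal Python sketch of that arithmetic (the counts are made up for illustration):

    import math

    def proportion_se(successes, n):
        # Normal-approximation standard error of a proportion,
        # as used for the error bars in the lattice panels above
        p = successes / float(n)
        return p, math.sqrt(p * (1 - p) / n)

    p, se = proportion_se(120, 400)   # e.g. 120 of 400 sampled editors survive
    print("p = %.3f +/- %.3f" % (p, se))   # p = 0.300 +/- 0.023
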
Index: trunk/tools/wsor/first_session/R/loader/user_sessions.R |
— | — | @@ -0,0 +1,38 @@ |
| 2 | +source("util/env.R") |
| 3 | + |
| 4 | +load_user_sessions = function(verbose=T, reload=F){ |
| 5 | + filename = paste(DATA_DIR, "user_sessions.3.tsv", sep="/") |
| 6 | + if(!exists("USER_SESSIONS")){ |
| 7 | + USER_SESSIONS <<- NULL |
| 8 | + } |
| 9 | + if(is.null(USER_SESSIONS) || reload){
| 10 | + USER_SESSIONS <<- NULL |
| 11 | + } |
| 12 | + if(is.null(USER_SESSIONS)){ |
| 13 | + if(verbose){cat("Loading ", filename, "...")} |
| 14 | + USER_SESSIONS <<- read.table( |
| 15 | + filename, |
| 16 | + header=T, sep="\t", |
| 17 | + quote="", comment.char="", |
| 18 | + na.strings="\\N" |
| 19 | + ) |
| 20 | + USER_SESSIONS$first_edit = strptime( |
| 21 | + as.character(USER_SESSIONS$first_edit), |
| 22 | + "%Y%m%d%H%M%S" |
| 23 | + ) |
| 24 | + USER_SESSIONS$last_edit = strptime( |
| 25 | + as.character(USER_SESSIONS$last_edit), |
| 26 | + "%Y%m%d%H%M%S" |
| 27 | + ) |
| 28 | + USER_SESSIONS$es_0_start = as.POSIXct(USER_SESSIONS$es_0_start, origin="1970-01-01") |
| 29 | + USER_SESSIONS$es_1_start = as.POSIXct(USER_SESSIONS$es_1_start, origin="1970-01-01") |
| 30 | + USER_SESSIONS$es_2_start = as.POSIXct(USER_SESSIONS$es_2_start, origin="1970-01-01") |
| 31 | + USER_SESSIONS$es_0_end = as.POSIXct(USER_SESSIONS$es_0_end, origin="1970-01-01") |
| 32 | + USER_SESSIONS$es_1_end = as.POSIXct(USER_SESSIONS$es_1_end, origin="1970-01-01") |
| 33 | + USER_SESSIONS$es_2_end = as.POSIXct(USER_SESSIONS$es_2_end, origin="1970-01-01") |
| 34 | + if(verbose){cat("DONE!\n")} |
| 35 | + } |
| 36 | + USER_SESSIONS |
| 37 | +} |
| 38 | + |
| 39 | + |
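
The loader above caches the parsed table in a global (via <<-) so repeated source()s do not re-read the TSV. A rough Python equivalent of the same load-once pattern, assuming the same tab-separated file with \N marking missing values (as in the R na.strings):

    import csv

    _USER_SESSIONS = None   # module-level cache, like the R global USER_SESSIONS

    def load_user_sessions(filename="../data/user_sessions.3.tsv", reload=False):
        # Read the file once; later calls return the cached rows
        global _USER_SESSIONS
        if _USER_SESSIONS is None or reload:
            with open(filename) as f:
                rows = list(csv.DictReader(f, delimiter="\t"))
            for row in rows:
                for k, v in row.items():
                    if v == "\\N":
                        row[k] = None   # \N marks NULL, like na.strings in R
            _USER_SESSIONS = rows
        return _USER_SESSIONS
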
Index: trunk/tools/wsor/first_session/R/edit_distributions.R |
— | — | @@ -0,0 +1,64 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | +user_sessions = load_user_sessions() |
| 8 | +user_sessions$year = as.numeric(format(user_sessions$first_edit, "%Y")) #first_edit is POSIXlt after loading, so numeric division is not defined for it
| 9 | + |
| 10 | + |
| 11 | +year_edits = data.frame() |
| 12 | +for(year in unique(user_sessions$year)){ |
| 13 | + tab = data.frame( |
| 14 | + table( |
| 15 | + 10^round( |
| 16 | + log( |
| 17 | + user_sessions[user_sessions$year == year,]$edit_count, |
| 18 | + base=10 |
| 19 | + ) |
| 20 | + ) |
| 21 | + ) |
| 22 | + ) |
| 23 | + |
| 24 | + year_edits = rbind( |
| 25 | + year_edits, |
| 26 | + data.frame( |
| 27 | + year = year, |
| 28 | + edits = as.numeric(as.character(tab$Var1)), |
| 29 | + freq = tab$Freq, |
| 30 | + prop = tab$Freq/sum(tab$Freq) |
| 31 | + ) |
| 32 | + ) |
| 33 | +} |
| 34 | + |
| 35 | +png("plots/edit_count_distribution.png", height=768, width=1024) |
| 36 | +xyplot( |
| 37 | + freq ~ edits | as.factor(year), |
| 38 | + data = year_edits[year_edits$edits > 0,], |
| 39 | + type="o", |
| 40 | + scales=list( |
| 41 | + x=list(log=10, at=10^(0:6), labels=10^(0:6))#, |
| 42 | + #y=list(log=10) |
| 43 | + ), |
| 44 | + main="Editor edit count distributions by editor first edit year", |
| 45 | + xlab="Number of edits (log10 bucketed)", |
| 46 | + ylab="Number of editors", |
| 47 | + sub="based on a random sample of <= 10,000 editors from each year" |
| 48 | +) |
| 49 | +dev.off() |
| 50 | + |
| 51 | +png("plots/edit_count_distribution.prop.png", height=768, width=1024) |
| 52 | +xyplot( |
| 53 | + prop ~ edits | as.factor(year), |
| 54 | + data = year_edits[year_edits$edits > 0,], |
| 55 | + type="o", |
| 56 | + scales=list( |
| 57 | + x=list(log=10, at=10^(0:6), labels=10^(0:6))#, |
| 58 | + #y=list(log=10) |
| 59 | + ), |
| 60 | + main="Editor edit count distributions by editor first edit year", |
| 61 | + xlab="Number of edits (log10 bucketed)", |
| 62 | + ylab="Proportion of editors", |
| 63 | + sub="based on a random sample of <= 10,000 editors from each year" |
| 64 | +) |
| 65 | +dev.off() |
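
Both plots bucket edit counts to the nearest power of ten with 10^round(log(x, base=10)). The same bucketing in a small Python illustration:

    import math
    from collections import Counter

    def log10_bucket(count):
        # Snap a positive count to the nearest power of ten: 7 -> 10, 42 -> 100
        return 10 ** int(round(math.log10(count)))

    edit_counts = [1, 3, 7, 42, 430, 8200]
    print(Counter(log10_bucket(c) for c in edit_counts))
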
Index: trunk/tools/wsor/first_session/R/.RData |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/.RData |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 66 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/Rplots.pdf |
Cannot display: PDF content omitted ("R Graphics Output" written by R 2.13.1 on 2011-07-19; the PDF's page list is empty, so no plots are recoverable).
Index: trunk/tools/wsor/first_session/R/first_sessions.R |
— | — | @@ -0,0 +1,8 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | + |
| 8 | + |
| 9 | + |
Index: trunk/tools/wsor/first_session/R/util/env.R |
— | — | @@ -0,0 +1,15 @@ |
| 2 | +DATA_DIR = "../data" |
| 3 | + |
| 4 | + |
| 5 | +naReplace = function(x, replacement){ |
| 6 | + sapply( |
| 7 | + x, |
| 8 | + function(v){ |
| 9 | + if(is.na(v)){ |
| 10 | + replacement |
| 11 | + }else{ |
| 12 | + v |
| 13 | + } |
| 14 | + } |
| 15 | + ) |
| 16 | +} |
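
naReplace substitutes a default for missing values element-wise. The same idea in Python, treating None as R's NA:

    def na_replace(values, replacement):
        # Element-wise NA replacement, like naReplace() above
        return [replacement if v is None else v for v in values]

    print(na_replace([1, None, 3], 0))   # [1, 0, 3]
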
Index: trunk/tools/wsor/first_session/foo |
— | — | @@ -0,0 +1,24 @@ |
| 2 | +user_id user_name first_edit last_edit edit_count es_0_start es_0_end es_0_edits es_0_reverted es_0_vandalism es_0_deleted es_1_start es_1_end es_1_edits es_1_reverted es_1_vandalism es_1_deleted es_2_start es_2_end es_2_edits es_2_reverted es_2_vandalism es_2_deleted |
| 3 | +1 Damian Yerrick 20010929004320 20110715131605 13196 1001724200 1001724224 2 0 0 0 1001735270 1001735270 1 0 0 0 1001778000 1001781732 4 1 0 1 |
| 4 | +2 AxelBoldt 20010726145009 20110715175901 34804 996159009 996159009 1 0 0 0 996164049 996164049 1 0 0 0 996175238 996177464 3 0 0 0 |
| 5 | +3 Tobias Hoevekamp 20010326202105 20040329205621 1903 985638065 985638065 1 1 0 0 985683223 985683379 2 0 0 0 985944995 985944995 1 0 0 0 |
| 6 | +4 Magnus Manske 20010728082538 20110714220907 20038 996308738 996308738 1 0 0 0 996429215 996429215 2 2 0 0 996439880 996439880 1 0 0 0 |
| 7 | +5 Hoevekam 20030709192137 20041227165610 3 1057778497 1057778497 1 0 0 0 1095446631 1095446631 1 0 0 0 1104166570 1104166570 1 0 0 0 |
| 8 | +6 Paul Drye 20010919131128 20080605202716 1135 1000905088 1000906103 5 3 0 0 1001350404 1001350920 2 0 0 0 1001359775 1001359775 1 0 0 0 |
| 9 | +7 Joao 20010826124114 20040606005523 266 998829674 998832110 6 0 0 0 998850967 998852514 3 0 0 0 998856582 998856582 1 0 0 0 |
| 10 | +8 TwoOneTwo 20010909202356 20110205015022 2135 1000067036 1000067036 1 0 0 0 1000157045 1000157045 1 0 0 0 1000326898 1000326898 1 0 0 0 |
| 11 | +9 Chenyu 20011118233022 20020124230110 166 1006126222 1006130510 5 0 0 0 1006138034 1006140164 7 3 0 2 1006147796 1006147828 3 0 0 1 |
| 12 | +10 Tbc 20010803144007 20020105091549 125 996849607 996849607 1 0 0 0 996875902 996875902 1 0 0 0 996967861 996967861 1 1 0 0 |
| 13 | +11 Kpjas 20010506173149 20110714103545 6302 989170309 989170309 1 0 0 0 990957006 990957006 1 0 0 0 991557466 991557466 1 0 0 0 |
| 14 | +12 Matthew Woodcraft 20011202215229 20110510203406 725 1007329949 1007336697 7 0 0 1 1007344959 1007344959 1 0 0 0 1007852954 1007852954 1 0 0 0 |
| 15 | +13 SteveSmith 20020124050249 20030520014515 82 1011848569 1011848569 1 0 0 0 1011879408 1011880799 5 0 0 0 1011892200 1011900596 10 1 0 0 |
| 16 | +14 RjLesch 20010727142501 20020708073228 872 996243901 996245695 4 1 0 0 996522629 996522629 1 0 0 0 996698056 996698374 2 0 0 0 |
| 17 | +15 Trelvis 20011213170430 20040906184834 673 1008263070 1008263442 2 1 0 0 1008279572 1008283623 6 3 0 0 1008287606 1008287606 1 0 0 0 |
| 18 | +16 General Wesc 20010805053252 20110403162228 1505 996989572 996989815 2 0 0 0 996994099 996998763 4 0 0 0 997040223 997044154 11 0 0 1 |
| 19 | +17 Peter Winnberg 20011110110118 20061124144257 464 1005390078 1005390078 1 0 0 0 1005586430 1005586639 2 1 0 0 1005596572 1005596572 1 0 0 0 |
| 20 | +18 MichaelTinkler 20010731150633 20020903033518 2468 996591993 996594597 2 0 0 0 996610359 996610359 1 0 0 0 996767279 996767279 1 0 0 0 |
| 21 | +19 Ignaciovicario 20020126191706 20020225154311 2 1012072626 1012072626 1 0 0 0 1014651791 1014651791 1 0 0 0 \N \N \N \N \N \N |
| 22 | +20 Pingos 20020113023235 20040311053834 17 1010889155 1010889303 4 2 0 1 1011038119 1011038119 1 0 0 0 1011044975 1011045028 3 0 0 3 |
| 23 | +21 Firepink 20020118155248 20020225155115 63 1011369168 1011371693 3 0 0 0 1011487743 1011487946 3 0 0 0 1011545356 1011546803 2 0 0 0 |
| 24 | +22 Luis Oliveira 20020124235009 20050130223105 27 1011916209 1011918167 4 0 0 0 1011965218 1011965218 1 0 0 0 1011969929 1011971575 3 0 0 0 |
| 25 | +23 Goran 20020225154311 20030221002037 11 1014651791 1014652275 4 2 0 0 1039118155 1039118752 4 0 0 0 1039135447 1039135447 1 0 0 0 |
Index: trunk/tools/wsor/first_session/data |
— | — | @@ -0,0 +1 @@ |
| 2 | +link /home/halfak/data/first_session |
\ No newline at end of file |
Property changes on: trunk/tools/wsor/first_session/data |
___________________________________________________________________ |
Added: svn:special |
1 | 3 | + * |
Index: trunk/tools/wsor/first_session/testing.sql |
— | — | @@ -0,0 +1,197 @@ |
| 2 | + |
| 3 | +CREATE TABLE halfak.user_session_sample |
| 4 | +SELECT |
| 5 | + user_id, |
| 6 | + YEAR(first_edit) AS year, |
| 7 | + MONTH(first_edit) >= 7 AS semester |
| 8 | +FROM halfak.user_meta_20110715 |
| 9 | +WHERE first_edit BETWEEN "20010000000000" AND "20019999999999" |
| 10 | +ORDER BY RAND() |
| 11 | +LIMIT 10000; |
| 12 | + |
| 13 | +INSERT INTO halfak.user_session_sample |
| 14 | +SELECT |
| 15 | + user_id, |
| 16 | + YEAR(first_edit) AS year, |
| 17 | + MONTH(first_edit) >= 7 AS semester |
| 18 | +FROM halfak.user_meta_20110715 |
| 19 | +WHERE first_edit BETWEEN "20020000000000" AND "20029999999999" |
| 20 | +ORDER BY RAND() |
| 21 | +LIMIT 10000; |
| 22 | + |
| 23 | +INSERT INTO halfak.user_session_sample |
| 24 | +SELECT |
| 25 | + user_id, |
| 26 | + YEAR(first_edit) AS year, |
| 27 | + MONTH(first_edit) >= 7 AS semester |
| 28 | +FROM halfak.user_meta_20110715 |
| 29 | +WHERE first_edit BETWEEN "20030000000000" AND "20039999999999" |
| 30 | +ORDER BY RAND() |
| 31 | +LIMIT 10000; |
| 32 | + |
| 33 | +INSERT INTO halfak.user_session_sample |
| 34 | +SELECT |
| 35 | + user_id, |
| 36 | + YEAR(first_edit) AS year, |
| 37 | + MONTH(first_edit) >= 7 AS semester |
| 38 | +FROM halfak.user_meta_20110715 |
| 39 | +WHERE first_edit BETWEEN "20040000000000" AND "20049999999999" |
| 40 | +ORDER BY RAND() |
| 41 | +LIMIT 10000; |
| 42 | + |
| 43 | +INSERT INTO halfak.user_session_sample |
| 44 | +SELECT |
| 45 | + user_id, |
| 46 | + YEAR(first_edit) AS year, |
| 47 | + MONTH(first_edit) >= 7 AS semester |
| 48 | +FROM halfak.user_meta_20110715 |
| 49 | +WHERE first_edit BETWEEN "20050000000000" AND "20059999999999" |
| 50 | +ORDER BY RAND() |
| 51 | +LIMIT 10000; |
| 52 | + |
| 53 | +INSERT INTO halfak.user_session_sample |
| 54 | +SELECT |
| 55 | + user_id, |
| 56 | + YEAR(first_edit) AS year, |
| 57 | + MONTH(first_edit) >= 7 AS semester |
| 58 | +FROM halfak.user_meta_20110715 |
| 59 | +WHERE first_edit BETWEEN "20060000000000" AND "20069999999999" |
| 60 | +ORDER BY RAND() |
| 61 | +LIMIT 10000; |
| 62 | + |
| 63 | +INSERT INTO halfak.user_session_sample |
| 64 | +SELECT |
| 65 | + user_id, |
| 66 | + YEAR(first_edit) AS year, |
| 67 | + MONTH(first_edit) >= 7 AS semester |
| 68 | +FROM halfak.user_meta_20110715 |
| 69 | +WHERE first_edit BETWEEN "20070000000000" AND "20079999999999" |
| 70 | +ORDER BY RAND() |
| 71 | +LIMIT 10000; |
| 72 | + |
| 73 | +INSERT INTO halfak.user_session_sample |
| 74 | +SELECT |
| 75 | + user_id, |
| 76 | + YEAR(first_edit) AS year, |
| 77 | + MONTH(first_edit) >= 7 AS semester |
| 78 | +FROM halfak.user_meta_20110715 |
| 79 | +WHERE first_edit BETWEEN "20080000000000" AND "20089999999999" |
| 80 | +ORDER BY RAND() |
| 81 | +LIMIT 10000; |
| 82 | + |
| 83 | +INSERT INTO halfak.user_session_sample |
| 84 | +SELECT |
| 85 | + user_id, |
| 86 | + YEAR(first_edit) AS year, |
| 87 | + MONTH(first_edit) >= 7 AS semester |
| 88 | +FROM halfak.user_meta_20110715 |
| 89 | +WHERE first_edit BETWEEN "20090000000000" AND "20099999999999" |
| 90 | +ORDER BY RAND() |
| 91 | +LIMIT 10000; |
| 92 | + |
| 93 | +INSERT INTO halfak.user_session_sample |
| 94 | +SELECT |
| 95 | + user_id, |
| 96 | + YEAR(first_edit) AS year, |
| 97 | + MONTH(first_edit) >= 7 AS semester |
| 98 | +FROM halfak.user_meta_20110715 |
| 99 | +WHERE first_edit BETWEEN "20100000000000" AND "20109999999999" |
| 100 | +ORDER BY RAND() |
| 101 | +LIMIT 10000; |
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | +USE enwiki; |
| 106 | +CREATE TABLE zexley.user_meta_firsts |
| 107 | +SELECT |
| 108 | + user_id, |
| 109 | + first_edit, |
| 110 | + last_edit, |
| 111 | + sum(rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 3 MONTH)) as `1q`,
| 112 | + sum(rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL 3 MONTH) AND DATE_ADD(u.first_edit, INTERVAL 6 MONTH)) as `2q`,
| 113 | + sum(rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL 6 MONTH) AND DATE_ADD(u.first_edit, INTERVAL 9 MONTH)) as `3q`,
| 114 | + sum(rev_timestamp > DATE_ADD(u.first_edit, INTERVAL 9 MONTH)) as `4q`
| 115 | +FROM ( |
| 116 | +SELECT |
| 117 | + u.user_id AS user_id, |
| 118 | + u.first_edit AS first_edit, |
| 119 | + u.last_edit AS last_edit, |
| 120 | + r.rev_timestamp |
| 121 | +FROM halfak.user_meta_20110715 u |
| 122 | +LEFT JOIN revision r |
| 123 | + ON u.user_id = r.rev_user AND |
| 124 | + r.rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR) |
| 125 | +UNION |
| 126 | +SELECT |
| 127 | + u.user_id AS user_id, |
| 128 | + u.first_edit AS first_edit, |
| 129 | + u.last_edit AS last_edit, |
| 130 | + ar_timestamp AS rev_timestamp |
| 131 | +FROM halfak.user_meta_20110715 u |
| 132 | +LEFT JOIN archive a |
| 133 | + ON u.user_id = ar_user AND |
| 134 | + ar_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR) |
| 135 | +) AS r |
| 136 | +GROUP BY user_id;
| 137 | + |
| 138 | + |
| 139 | +CREATE TABLE halfak.rev_len_changed |
| 140 | +SELECT |
| 141 | + r.rev_id, |
| 142 | + r.rev_timestamp, |
| 143 | + YEAR(r.rev_timestamp) AS rev_year, |
| 144 | + MONTH(r.rev_timestamp) AS rev_month, |
| 145 | + r.rev_len, |
| 146 | + r.rev_user AS user_id, |
| 147 | + r.rev_user_text AS user_text, |
| 148 | + `change` AS len_change,
| 149 | + p.page_id AS page_id, |
| 150 | + p.page_namespace AS namespace |
| 151 | +FROM revision r |
| 152 | +INNER JOIN user u |
| 153 | + ON r.rev_user = u.user_id |
| 154 | +INNER JOIN halfak.user_meta_20110715 um |
| 155 | + ON um.user_id = r.rev_user |
| 156 | +INNER JOIN halfak.rev_len_change rlc |
| 157 | + ON r.rev_id = rlc.rev_id |
| 158 | +INNER JOIN page p |
| 159 | + ON p.page_id = r.rev_page; |
| 160 | + |
| 161 | +ALTER TABLE halfak.rev_len_changed |
| 162 | +ADD COLUMN rev_year INT UNSIGNED |
| 163 | +AFTER rev_timestamp; |
| 164 | + |
| 165 | +ALTER TABLE halfak.rev_len_changed |
| 166 | +ADD COLUMN rev_month INT UNSIGNED |
| 167 | +AFTER rev_timestamp; |
| 168 | + |
| 169 | +UPDATE halfak.rev_len_changed |
| 170 | +SET |
| 171 | + rev_year = YEAR(rev_timestamp), |
| 172 | + rev_month = MONTH(rev_timestamp); |
| 173 | + |
| 174 | + |
| 175 | +CREATE UNIQUE INDEX rev_idx ON halfak.rev_len_changed_final (rev_id); |
| 176 | +CREATE INDEX user_year_month_namespace ON halfak.rev_len_changed_final (user_id, rev_year, rev_month, namespace); |
| 177 | + |
| 178 | + |
| 179 | + |
| 180 | + |
| 181 | + |
| 182 | +SELECT |
| 183 | + user_id, |
| 184 | + rev_year, |
| 185 | + rev_month, |
| 186 | + namespace, |
| 187 | + first_edit, |
| 188 | + COUNT(*) as edits, |
| 189 | + SUM(IF(len_change > 0,len_change,0)) as len_added, |
| 190 | + SUM(IF(len_change < 0,len_change*-1,0)) as len_removed |
| 191 | +FROM halfak.rev_len_changed
| 192 | +WHERE user_id = 2356767
| 193 | +GROUP BY
| 194 | + user_id,
| 195 | + rev_year,
| 196 | + rev_month,
| 197 | + namespace,
| 198 | + first_edit;
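
The ten sampling statements at the top of this file differ only in the year range. A sketch of generating them with a loop instead of by hand, reusing the table and column names above:

    YEARS = range(2001, 2011)
    TEMPLATE = (
        '%(verb)s halfak.user_session_sample\n'
        'SELECT\n'
        '    user_id,\n'
        '    YEAR(first_edit) AS year,\n'
        '    MONTH(first_edit) >= 7 AS semester\n'
        'FROM halfak.user_meta_20110715\n'
        'WHERE first_edit BETWEEN "%(year)d0000000000" AND "%(year)d9999999999"\n'
        'ORDER BY RAND()\n'
        'LIMIT 10000;\n'
    )

    for year in YEARS:
        # The first statement creates the table; the rest append to it
        verb = 'CREATE TABLE' if year == YEARS[0] else 'INSERT INTO'
        print(TEMPLATE % {'verb': verb, 'year': year})
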
Index: trunk/tools/wsor/vandal_conversion/R/conversions.R |
— | — | @@ -1,4 +1,120 @@ |
2 | 2 | source("loader/load_editor_first_and_last.R") |
| 3 | +source("loader/load_editor_edit_count.R") |
3 | 4 | |
4 | 5 | editor_first_and_last = load_editor_first_and_last() |
| 6 | +efl = unique(editor_first_and_last) |
| 7 | +efl = efl[efl$last10_edits == 10,] |
5 | 8 | |
| 9 | +editor_edit_count = load_editor_edit_count() |
| 10 | +efl = merge( |
| 11 | + efl, |
| 12 | + editor_edit_count, |
| 13 | + by=c("user_id", "user_name") |
| 14 | +) |
| 15 | + |
| 16 | +library(lattice) |
| 17 | + |
| 18 | +#plot(table(efl$fes_edits)) |
| 19 | +#xyplot(table(efl$fes_edits)~as.numeric(names(table(efl$fes_edits))), scales=list(x=list(log=2), y=list(log=2))) |
| 20 | + |
| 21 | +png("plots/fes_discarded.hist.png", height=768, width=1024) |
| 22 | +efl$fes_discarded = efl$fes_reverted + efl$fes_deleted |
| 23 | +efl$fes_discarded_prop = efl$fes_discarded / efl$fes_edits |
| 24 | +plot( |
| 25 | + table(round(efl[efl$fes_edits >= 4,]$fes_discarded_prop, 1)), |
| 26 | + main="Histogram of the proportion of first session edits that were discarded", |
| 27 | + sub="for editors with at least 20 edits and 4 in first session. Discarded edits have been reverted or deleted", |
| 28 | + ylab="Frequency", |
| 29 | + xlab="Proportion of discarded edits" |
| 30 | +) |
| 31 | +dev.off() |
| 32 | + |
| 33 | +png("plots/fes_vandalism.hist.png", height=768, width=1024) |
| 34 | +efl$fes_vandalism_prop = efl$fes_vandalism / (efl$fes_edits - efl$fes_deleted) |
| 35 | +plot( |
| 36 | + table(round(efl[(efl$fes_edits - efl$fes_deleted) >= 1,]$fes_vandalism_prop, 1)), |
| 37 | + main="Histogram of the proportion of kept 1st session edits that were vandalism", |
| 38 | + sub="for editors with at least 20 edits and 1 kept edit in first session.",
| 39 | + ylab="Frequency", |
| 40 | + xlab="Proportion of vandalism edits" |
| 41 | +) |
| 42 | +dev.off() |
| 43 | + |
| 44 | +png("plots/fes_reverted.hist.png", height=768, width=1024) |
| 45 | +efl$fes_reverted_prop = efl$fes_reverted / (efl$fes_edits - efl$fes_deleted) |
| 46 | +plot( |
| 47 | + table(round(efl[(efl$fes_edits - efl$fes_deleted) >= 1,]$fes_reverted_prop, 1)), |
| 48 | + main="Histogram of the proportion of kept 1st session edits that were reverted", |
| 49 | + sub="for editors with at least 20 edits and 1 kept edit in first session.",
| 50 | + ylab="Frequency", |
| 51 | + xlab="Proportion of reverted edits" |
| 52 | +) |
| 53 | +dev.off() |
| 54 | + |
| 55 | + |
| 56 | +png("plots/last10_discarded.hist.png", height=768, width=1024) |
| 57 | +efl$last10_discarded = efl$last10_reverted + efl$last10_deleted |
| 58 | +efl$last10_discarded_prop = efl$last10_discarded / efl$last10_edits |
| 59 | +plot( |
| 60 | + table(round(efl$last10_discarded_prop, 1)), |
| 61 | + main="Histogram of the proportion of the last 10 edits that were discarded", |
| 62 | + sub="for editors with at least 20 edits. Discarded edits have been reverted or deleted", |
| 63 | + ylab="Frequency", |
| 64 | + xlab="Proportion of discarded edits" |
| 65 | +) |
| 66 | +dev.off() |
| 67 | + |
| 68 | + |
| 69 | +png("plots/future_edits.hist.png", height=768, width=1024) |
| 70 | +efl$future_edits = efl$edit_count - efl$fes_edits |
| 71 | +plot( |
| 72 | + table(10^round(log(efl$future_edits, base=10), 1)), |
| 73 | + main="Histogram of edits after first session for editors who made at least 20 edits",
| 74 | + xlab="Edits after first session (log10 bucketed, scaled)", |
| 75 | + ylab="Frequency", |
| 76 | + type="o", |
| 77 | + log="x" |
| 78 | +) |
| 79 | +dev.off() |
| 80 | + |
| 81 | + |
| 82 | +top_100 = efl[order(efl$edit_count, decreasing=T),][1:100,] |
| 83 | +png("plots/fes_discarded.hist.top_100.png", height=768, width=1024) |
| 84 | +top_100$fes_discarded = top_100$fes_reverted + top_100$fes_deleted |
| 85 | +top_100$fes_discarded_prop = top_100$fes_discarded / top_100$fes_edits |
| 86 | +plot( |
| 87 | + table(round(top_100$fes_discarded_prop, 1)), |
| 88 | + main="Histogram of the proportion of first session edits that were discarded",
| 89 | + sub="for the top 100 editors by edit count. Discarded edits have been reverted or deleted", |
| 90 | + ylab="Frequency", |
| 91 | + xlab="Proportion of discarded edits" |
| 92 | +) |
| 93 | +dev.off() |
| 94 | + |
| 95 | +png("plots/fes_reverted.hist.top_100.png", height=768, width=1024) |
| 96 | +top_100$fes_reverted_prop = top_100$fes_reverted / (top_100$fes_edits - top_100$fes_deleted) |
| 97 | +plot( |
| 98 | + table(round(top_100[top_100$fes_edits - top_100$fes_deleted >= 1,]$fes_reverted_prop, 1)), |
| 99 | + main="Histogram of the proportion of first session edits that were reverted",
| 100 | + sub="for the top 100 editors by edit count.", |
| 101 | + ylab="Frequency", |
| 102 | + xlab="Proportion of reverted edits" |
| 103 | +) |
| 104 | +dev.off() |
| 105 | + |
| 106 | +png("plots/fes_vandal.hist.top_100.png", height=768, width=1024) |
| 107 | +top_100$fes_vandalism_prop = top_100$fes_vandalism / (top_100$fes_edits - top_100$fes_deleted) |
| 108 | +plot( |
| 109 | + table(round(top_100[top_100$fes_edits - top_100$fes_deleted >= 1,]$fes_vandalism_prop, 1)), |
| 110 | + main="Histogram of the proportion of first session edits that were reverted for vandalism",
| 111 | + sub="for the top 100 editors by edit count.", |
| 112 | + ylab="Frequency", |
| 113 | + xlab="Proportion of edits reverted for vandalism" |
| 114 | +) |
| 115 | +dev.off() |
| 116 | + |
| 117 | + |
| 118 | +summary(top_100$fes_vandalism > 0) |
| 119 | +summary(top_100$fes_reverted > 0) |
| 120 | +summary(top_100$fes_discarded > 0) |
| 121 | + |
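
The discarded proportion used throughout this script is (reverted + deleted) / edits for a session. The same computation in Python, with a guard for sessions that have nothing to judge (the R code instead filters rows before plotting):

    def discarded_prop(reverted, deleted, edits):
        # Share of a session's edits later reverted or deleted;
        # None when the session has no edits at all
        if edits == 0:
            return None
        return (reverted + deleted) / float(edits)

    print(discarded_prop(3, 1, 10))   # 0.4
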
Index: trunk/tools/wsor/vandal_conversion/R/util/env.R |
— | — | @@ -1 +1 @@ |
2 | | -DATA_DIR = "/home/aaron/data/vandal_conversion" |
| 2 | +DATA_DIR = "../data" |
Index: trunk/tools/wsor/vandal_conversion/get_editor_editcount.py |
— | — | @@ -0,0 +1,109 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | + if v == None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
| 14 | +def main(): |
| 15 | + parser = argparse.ArgumentParser( |
| 16 | + description='Gathers editor edit counts'
| 17 | + ) |
| 18 | + parser.add_argument( |
| 19 | + 'min_edits', |
| 20 | + type=int, |
| 21 | + help='the minimum number of edits that editors must have performed to be included'
| 22 | + ) |
| 23 | + parser.add_argument( |
| 24 | + '-c', '--cnf', |
| 25 | + metavar="<path>", |
| 26 | + type=str, |
| 27 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 28 | + default=os.path.expanduser("~/.my.cnf") |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-s', '--host', |
| 32 | + type=str, |
| 33 | + help='the database host to connect to (defaults to localhost)', |
| 34 | + default="localhost" |
| 35 | + ) |
| 36 | + parser.add_argument( |
| 37 | + '-d', '--db', |
| 38 | + type=str, |
| 39 | + help='the language db to run the query in (defaults to enwiki)', |
| 40 | + default="enwiki" |
| 41 | + ) |
| 42 | + args = parser.parse_args() |
| 43 | + |
| 44 | + LOGGING_STREAM = sys.stderr |
| 45 | + logging.basicConfig( |
| 46 | + level=logging.DEBUG, |
| 47 | + stream=LOGGING_STREAM, |
| 48 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 49 | + datefmt='%b-%d %H:%M:%S' |
| 50 | + ) |
| 51 | + |
| 52 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 53 | + db = Database( |
| 54 | + host=args.host, |
| 55 | + db=args.db, |
| 56 | + read_default_file=args.cnf |
| 57 | + ) |
| 58 | + headers = [ |
| 59 | + 'user_id', |
| 60 | + 'user_name', |
| 61 | + 'edit_count' |
| 62 | + ] |
| 63 | + print("\t".join(headers)) |
| 64 | + |
| 65 | + logging.info("Processing users:") |
| 66 | + |
| 67 | + for user in db.getUsers(minimumEdits=args.min_edits): |
| 68 | + print("\t".join(encode(user[h]) for h in headers)) |
| 69 | + LOGGING_STREAM.write(".") |
| 70 | + |
| 71 | + LOGGING_STREAM.write("\n") |
| 72 | + |
| 73 | + |
| 74 | +class Database: |
| 75 | + |
| 76 | + def __init__(self, *args, **kwargs): |
| 77 | + self.args = args |
| 78 | + self.kwargs = kwargs |
| 79 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 80 | + self.revsConn = MySQLdb.connect(*args, **kwargs) |
| 81 | + self.archConn = MySQLdb.connect(*args, **kwargs) |
| 82 | + |
| 83 | + def getUsers(self, minimumEdits=0): |
| 84 | + minimumEdits = int(minimumEdits) |
| 85 | + cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 86 | + cursor.execute( |
| 87 | + """ |
| 88 | + SELECT |
| 89 | + u.user_id, |
| 90 | + u.user_name, |
| 91 | + u.user_editcount as edit_count |
| 92 | + FROM user u |
| 93 | + WHERE u.user_editcount >= %(minimum_edits)s |
| 94 | + """, |
| 95 | + { |
| 96 | + 'minimum_edits': minimumEdits |
| 97 | + } |
| 98 | + ) |
| 99 | + for row in cursor: |
| 100 | + yield row |
| 101 | + |
| 102 | + |
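| | + #NOTE: the two helpers below call a getEdits() method that is not
| | + #defined in this file; nothing in this script invokes them.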
| 103 | + def getFirstEdits(self, userId, maximum=10000): |
| 104 | + return self.getEdits(userId, maximum, chronologically=True) |
| 105 | + |
| 106 | + def getLastEdits(self, userId, maximum=10000): |
| 107 | + return self.getEdits(userId, maximum, chronologically=False) |
| 108 | + |
| 109 | + |
| 110 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/diffs/example.py |
— | — | @@ -0,0 +1,101 @@ |
| 2 | +from StringIO import StringIO |
| 3 | +from diff_match_patch import diff_match_patch |
| 4 | +import re |
| 5 | + |
| 6 | +revs = [ |
| 7 | + {'rev_id': 1, 'content':'Foo derp 263254'}, |
| 8 | + {'rev_id': 2, 'content':'Foo derp 26354'} |
| 9 | +] |
| 10 | + |
| 11 | +def tokenize(content): |
| 12 | + return re.findall( |
| 13 | + r"[\w]+" + #Word |
| 14 | + r"|\[\[" + #Opening internal link |
| 15 | + r"|\]\]" + #Closing internal link |
| 16 | + r"|\{\{" + #Opening template |
| 17 | + r"|\}\}" + #Closing template |
| 18 | + r"|\{\{\{" + #Opening template var |
| 19 | + r"|\}\}\}" + #Closing template var |
| 20 | + r"|\n+" + #Line breaks |
| 21 | + r"| +" + #Spaces |
| 22 | + r"|&\w+;" + #HTML escape sequence |
| 23 | + r"|'''" + #Bold |
| 24 | + r"|''" + #Italics |
| 25 | + r"|=+" + #Header |
| 26 | + r"|\{\|" + #Opening table |
| 27 | + r"|\|\}" + #Closing table |
| 28 | + r"|\|\-" + #Table row |
| 29 | + r"|.", #Misc character |
| 30 | + content |
| 31 | + ) |
| 32 | + |
| 33 | +def hashTokens(tokens, hash2Token=None, token2Hash=None):
| | + #None defaults: mutable default arguments would persist between calls
| | + #and leak the token table from one diff into the next
| | + if hash2Token is None: hash2Token = []
| | + if token2Hash is None: token2Hash = {}
| 34 | + hashBuffer = StringIO()
| 35 | + for t in tokens: |
| 36 | + if t in token2Hash: |
| 37 | + hashBuffer.write(unichr(token2Hash[t]+1)) |
| 38 | + else: |
| 39 | + hashId = len(hash2Token) |
| 40 | + hash2Token.append(t) |
| 41 | + token2Hash[t] = hashId |
| 42 | + hashBuffer.write(unichr(hashId+1)) |
| 43 | + |
| 44 | + return (hashBuffer.getvalue(), hash2Token, token2Hash) |
| 45 | + |
| 46 | +def unhash(hashes, hash2Token, sep=''): |
| 47 | + return sep.join(hash2Token[ord(h)-1] for h in hashes) |
| 48 | + |
| 49 | +def simpleDiff(content1, content2, tokenize=tokenize, sep='', report=[-1,0,1]): |
| 50 | + hashes1, h2t, t2h = hashTokens(tokenize(content1)) |
| 51 | + hashes2, h2t, t2h = hashTokens(tokenize(content2), h2t, t2h) |
| 52 | + |
| 53 | + report = set(report) |
| 54 | + |
| 55 | + dmp = diff_match_patch() |
| 56 | + |
| 57 | + diffs = dmp.diff_main(hashes1, hashes2, checklines=False) |
| 58 | + |
| 59 | + position = 0 |
| 60 | + for (ar,hashes) in diffs: |
| 61 | + content = unhash(hashes,h2t,sep=sep) |
| 62 | + if ar in report: |
| 63 | + yield position, ar, content |
| 64 | + |
| 65 | + if ar != -1: position += len(content) |
| 66 | + |
| 67 | + |
| 68 | +def main(): |
| 69 | + |
| 70 | + lastRev = {'content':''} |
| 71 | + content = '' |
| 72 | + for rev in revs: |
| 73 | + buff = StringIO() |
| 74 | + oldPos = 0 |
| 75 | + lastPos = 0 |
| 76 | + for pos, ar, c in simpleDiff(lastRev['content'], rev['content'], report=[-1,1]): |
| 77 | + equal = content[oldPos:oldPos+pos-lastPos] |
| 78 | + buff.write(equal) |
| 79 | + lastPos += len(equal) |
| 80 | + oldPos += len(equal) |
| 81 | + |
| 82 | + if ar == 1: |
| 83 | + buff.write(c) |
| 84 | + lastPos += len(c) |
| 85 | + elif ar == -1: |
| 86 | + oldPos += len(c) |
| 87 | + |
| 88 | + |
| 89 | + print("%s, %s, %r" % (pos, ar, c)) |
| 90 | + |
| 91 | + buff.write(content[oldPos:]) |
| 92 | + |
| 93 | + |
| 94 | + content = buff.getvalue() |
| 95 | + print("Rev: id=%s\n\t%r\n\t%r" % (rev['rev_id'], rev['content'], content)) |
| 96 | + lastRev = rev |
| 97 | + |
| 98 | + |
| 99 | + |
| 100 | + |
| 101 | +if __name__ == "__main__": main() |
| 102 | + |
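
The hashing step is what turns a character-diff library into a token diff: each distinct token is mapped to a single unicode character, so diff_match_patch compares tokens while believing it compares characters. The trick in miniature (Python 2, matching the script):

    vocab = {}
    def h(tokens):
        # One unicode character per distinct token (offset by 1 to avoid NUL)
        return u''.join(unichr(vocab.setdefault(t, len(vocab)) + 1) for t in tokens)

    a = h(['Foo', ' ', 'derp', ' ', '263254'])
    b = h(['Foo', ' ', 'derp', ' ', '26354'])
    print(repr(a), repr(b))   # identical except for the final character
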
Index: trunk/tools/wsor/diffs/revision_differ.py |
— | — | @@ -0,0 +1,150 @@ |
| 2 | +#!/usr/local/bin/pypy |
| 3 | + |
| 4 | +import logging,traceback |
| 5 | +import sys, re |
| 6 | +from StringIO import StringIO |
| 7 | + |
| 8 | +from diff_match_patch import diff_match_patch |
| 9 | + |
| 10 | +from xml_simulator import RecordingFileWrapper |
| 11 | +from wmf.dump.iterator import Iterator |
| 12 | +import wmf |
| 13 | + |
| 14 | +def tokenize(content): |
| 15 | + return re.findall( |
| 16 | + r"[\w]+" + #Word |
| 17 | + r"|\[\[" + #Opening internal link |
| 18 | + r"|\]\]" + #Closing internal link |
| 19 | + r"|\{\{" + #Opening template |
| 20 | + r"|\}\}" + #Closing template |
| 21 | + r"|\{\{\{" + #Opening template var |
| 22 | + r"|\}\}\}" + #Closing template var |
| 23 | + r"|\n+" + #Line breaks |
| 24 | + r"| +" + #Spaces |
| 25 | + r"|&\w+;" + #HTML escape sequence |
| 26 | + r"|'''" + #Bold |
| 27 | + r"|''" + #Italics |
| 28 | + r"|=+" + #Header |
| 29 | + r"|\{\|" + #Opening table |
| 30 | + r"|\|\}" + #Closing table |
| 31 | + r"|\|\-" + #Table row |
| 32 | + r"|.", #Misc character |
| 33 | + content |
| 34 | + ) |
| 35 | + |
| 36 | +def hashTokens(tokens, hash2Token=None, token2Hash=None):
| | + #None defaults: mutable default arguments would persist between calls
| | + #and leak the token table from one diff into the next
| | + if hash2Token is None: hash2Token = []
| | + if token2Hash is None: token2Hash = {}
| 37 | + hashBuffer = StringIO()
| 38 | + for t in tokens: |
| 39 | + if t in token2Hash: |
| 40 | + hashBuffer.write(unichr(token2Hash[t]+1)) |
| 41 | + else: |
| 42 | + hashId = len(hash2Token) |
| 43 | + hash2Token.append(t) |
| 44 | + token2Hash[t] = hashId |
| 45 | + hashBuffer.write(unichr(hashId+1)) |
| 46 | + |
| 47 | + return (hashBuffer.getvalue(), hash2Token, token2Hash) |
| 48 | + |
| 49 | +def unhash(hashes, hash2Token, sep=''): |
| 50 | + return sep.join(hash2Token[ord(h)-1] for h in hashes) |
| 51 | + |
| 52 | +def simpleDiff(content1, content2, tokenize=tokenize, sep='', report=[-1,0,1]): |
| 53 | + hashes1, h2t, t2h = hashTokens(tokenize(content1)) |
| 54 | + hashes2, h2t, t2h = hashTokens(tokenize(content2), h2t, t2h) |
| 55 | + |
| 56 | + report = set(report) |
| 57 | + |
| 58 | + dmp = diff_match_patch() |
| 59 | + |
| 60 | + diffs = dmp.diff_main(hashes1, hashes2, checklines=False) |
| 61 | + |
| 62 | + position = 0 |
| 63 | + for (ar,hashes) in diffs: |
| 64 | + content = unhash(hashes,h2t,sep=sep) |
| 65 | + if ar in report: |
| 66 | + yield position, ar, content |
| 67 | + |
| 68 | + if ar != -1: position += len(content) |
| 69 | + |
| 70 | + |
| 71 | +metaXML = """ |
| 72 | +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en"> |
| 73 | +<siteinfo> |
| 74 | +<sitename>Wikipedia</sitename> |
| 75 | +<base>http://en.wikipedia.org/wiki/Main_Page</base> |
| 76 | +<generator>MediaWiki 1.17wmf1</generator> |
| 77 | +<case>first-letter</case> |
| 78 | +<namespaces> |
| 79 | +<namespace key="-2" case="first-letter">Media</namespace> |
| 80 | +<namespace key="-1" case="first-letter">Special</namespace> |
| 81 | +<namespace key="0" case="first-letter" /> |
| 82 | +<namespace key="1" case="first-letter">Talk</namespace> |
| 83 | +<namespace key="2" case="first-letter">User</namespace> |
| 84 | +<namespace key="3" case="first-letter">User talk</namespace> |
| 85 | +<namespace key="4" case="first-letter">Wikipedia</namespace> |
| 86 | +<namespace key="5" case="first-letter">Wikipedia talk</namespace> |
| 87 | +<namespace key="6" case="first-letter">File</namespace> |
| 88 | +<namespace key="7" case="first-letter">File talk</namespace> |
| 89 | +<namespace key="8" case="first-letter">MediaWiki</namespace> |
| 90 | +<namespace key="9" case="first-letter">MediaWiki talk</namespace> |
| 91 | +<namespace key="10" case="first-letter">Template</namespace> |
| 92 | +<namespace key="11" case="first-letter">Template talk</namespace> |
| 93 | +<namespace key="12" case="first-letter">Help</namespace> |
| 94 | +<namespace key="13" case="first-letter">Help talk</namespace> |
| 95 | +<namespace key="14" case="first-letter">Category</namespace> |
| 96 | +<namespace key="15" case="first-letter">Category talk</namespace> |
| 97 | +<namespace key="100" case="first-letter">Portal</namespace> |
| 98 | +<namespace key="101" case="first-letter">Portal talk</namespace> |
| 99 | +<namespace key="108" case="first-letter">Book</namespace> |
| 100 | +<namespace key="109" case="first-letter">Book talk</namespace> |
| 101 | +</namespaces> |
| 102 | +</siteinfo> |
| 103 | +""" |
| 104 | +xmlSim = RecordingFileWrapper(sys.stdin, pre=metaXML, post='</mediawiki>') |
| 105 | + |
| 106 | +try: |
| 107 | + dump = Iterator(xmlSim) |
| 108 | +except Exception as e: |
| 109 | + sys.stderr.write(str(e) + xmlSim.getHistory()) |
| 110 | + sys.exit(1) |
| 111 | + |
| 112 | + |
| 113 | +for page in dump.readPages(): |
| 114 | + sys.stderr.write('Processing: %s - %s\n' % (page.getId(), page.getTitle().encode('UTF-8'))) |
| 115 | + try: |
| 116 | + lastRev = None |
| 117 | + for revision in page.readRevisions(): |
| 118 | + if lastRev == None: |
| 119 | + lastRev = revision |
| 120 | + else: |
| 121 | + namespace, title = wmf.normalizeTitle(page.getTitle(), namespaces=dump.namespaces) |
| 122 | + nsId = dump.namespaces[namespace] |
| 123 | + row = [ |
| 124 | + repr(revision.getId()), |
| 125 | + repr(page.getId()), |
| 126 | + repr(nsId), |
| 127 | + repr(title), |
| 128 | + repr(revision.getTimestamp()), |
| 129 | + repr(revision.getComment()), |
| 130 | + repr(revision.getMinor()), |
| 131 | + repr(revision.getContributor().getId()), |
| 132 | + repr(revision.getContributor().getUsername()) |
| 133 | + ] |
| 134 | + |
| 135 | + for d in simpleDiff(lastRev.getText(), revision.getText(), report=[-1,1]): |
| 136 | + row.append(":".join(repr(v) for v in d)) |
| 137 | + |
| 138 | + print("\t".join(row))
| | + lastRev = revision #advance so each diff compares consecutive revisions
| 139 | + |
| 140 | + except Exception as e: |
| 141 | + sys.stderr.write('%s' % e) |
| 142 | + #fh.write('%s' % e) |
| 143 | + #logging.error( |
| 144 | + # "Failed to process page %s:%s - %s" % ( |
| 145 | + # page.getId(), |
| 146 | + # page.getTitle(), |
| 147 | + # e |
| 148 | + # )) |
| 149 | + #logging.error(traceback.print_exc()) |
| 150 | +#fh.close() |
| 151 | +#sys.exit(0) |
Property changes on: trunk/tools/wsor/diffs/revision_differ.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 152 | + * |
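
Each output line from the differ is a fixed set of tab-separated metadata cells followed by one repr()'d position:op:content triple per diff operation. A hypothetical reader for that format (the column count and encoding are assumed from the row-building code above):

    import ast

    def parse_row(line):
        cells = line.rstrip('\n').split('\t')
        meta, diff_cells = cells[:9], cells[9:]
        triples = []
        for cell in diff_cells:
            pos, op, content = cell.split(':', 2)   # content may itself contain ':'
            triples.append((int(pos), int(op), ast.literal_eval(content)))
        return meta, triples
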
Index: trunk/tools/wsor/diffs/xml_simulator.py |
— | — | @@ -0,0 +1,80 @@ |
| 2 | +import sys |
| 3 | +from StringIO import StringIO |
| 4 | +from collections import deque |
| 5 | + |
| 6 | +class FileWrapper: |
| 7 | + |
| 8 | + def __init__(self, fp, pre='', post=''): |
| 9 | + self.fp = fp |
| 10 | + self.pre = StringIO(pre) |
| 11 | + self.post = StringIO(post) |
| 12 | + self.closed = False |
| 13 | + self.mode = "r" |
| 14 | + |
| 15 | + def read(self, bytes=sys.maxint): |
| 16 | + bytes = int(bytes) |
| 17 | + if self.closed: raise ValueError("I/O operation on closed file") |
| 18 | + |
| 19 | + preBytes = self.pre.read(bytes) |
| 20 | + if len(preBytes) < bytes: |
| 21 | + fpBytes = self.fp.read(bytes-len(preBytes)) |
| 22 | + else: |
| 23 | + fpBytes = '' |
| 24 | + |
| 25 | + if len(preBytes) + len(fpBytes) < bytes: |
| 26 | + postBytes = self.post.read(bytes-(len(preBytes) + len(fpBytes))) |
| 27 | + else: |
| 28 | + postBytes = '' |
| 29 | + |
| 30 | + return preBytes + fpBytes + postBytes |
| 31 | + |
| 32 | + def readline(self): |
| 33 | + if self.closed: raise ValueError("I/O operation on closed file") |
| 34 | + |
| 35 | + output = self.pre.readline() |
| 36 | + if len(output) == 0 or output[-1] != "\n": |
| 37 | + output += self.fp.readline() |
| 38 | + if len(output) == 0 or output[-1] != "\n": |
| 39 | + output += self.post.readline() |
| 40 | + |
| 41 | + return output |
| 42 | + |
| 43 | + def readlines(self): raise NotImplementedError() |
| 44 | + |
| 45 | + def __iter__(self): |
| 46 | + |
| 47 | + line = self.readline() |
| 48 | + while line != '': |
| 49 | + yield line |
| 50 | + line = self.readline() |
| 51 | + |
| 52 | + |
| 53 | + def seek(self): raise NotImplementedError() |
| 54 | + def write(self): raise NotImplementedError() |
| 55 | + def writelines(self): raise NotImplementedError() |
| 56 | + def tell(self): |
| 57 | + return self.pre.tell() + self.fp.tell() + self.post.tell() |
| 58 | + |
| 59 | + |
| 60 | + def close(self): |
| 61 | + self.closed = True |
| 62 | + self.fp.close() |
| 63 | + |
| 64 | +class RecordingFileWrapper(FileWrapper): |
| 65 | + |
| 66 | + def __init__(self, fp, pre='', post='', record=10000): |
| 67 | + self.history = deque(maxlen=record) |
| 68 | + FileWrapper.__init__(self, fp, pre=pre, post=post) |
| 69 | + |
| 70 | + def read(self, bytes=sys.maxint): |
| 71 | + outBytes = FileWrapper.read(self, bytes) |
| 72 | + self.history.extend(outBytes) |
| 73 | + return outBytes |
| 74 | + |
| 75 | + def readline(self): |
| 76 | + outBytes = FileWrapper.readline(self) |
| 77 | + self.history.extend(outBytes) |
| 78 | + return outBytes |
| 79 | + |
| 80 | + def getHistory(self): |
| 81 | + return ''.join(self.history) |
Property changes on: trunk/tools/wsor/diffs/xml_simulator.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 82 | + * |
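
FileWrapper splices a header and a footer around a stream so that downstream XML parsing sees one well-formed document, which is exactly how revision_differ.py frames stdin with <mediawiki> tags. A small usage sketch, assuming xml_simulator.py is importable:

    from StringIO import StringIO
    from xml_simulator import FileWrapper

    body = StringIO('<page><title>Foo</title></page>\n')
    wrapped = FileWrapper(body, pre='<mediawiki>\n', post='</mediawiki>\n')
    print(wrapped.read())   # header, body and footer read back as one stream
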
Index: trunk/tools/wsor/diffs/diff_match_patch.py |
— | — | @@ -0,0 +1,1949 @@ |
| 2 | +#!/usr/bin/env python |
| 3 | + |
| 4 | +"""Diff Match and Patch |
| 5 | + |
| 6 | +Copyright 2006 Google Inc. |
| 7 | +http://code.google.com/p/google-diff-match-patch/ |
| 8 | + |
| 9 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 10 | +you may not use this file except in compliance with the License. |
| 11 | +You may obtain a copy of the License at |
| 12 | + |
| 13 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 14 | + |
| 15 | +Unless required by applicable law or agreed to in writing, software |
| 16 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 17 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 18 | +See the License for the specific language governing permissions and |
| 19 | +limitations under the License. |
| 20 | +""" |
| 21 | + |
| 22 | +"""Functions for diff, match and patch. |
| 23 | + |
| 24 | +Computes the difference between two texts to create a patch. |
| 25 | +Applies the patch onto another text, allowing for errors. |
| 26 | +""" |
| 27 | + |
| 28 | +__author__ = 'fraser@google.com (Neil Fraser)' |
| 29 | + |
| 30 | +import math |
| 31 | +import time |
| 32 | +import urllib |
| 33 | +import re |
| 34 | +import sys |
| 35 | + |
| 36 | +class diff_match_patch: |
| 37 | + """Class containing the diff, match and patch methods. |
| 38 | + |
| 39 | + Also contains the behaviour settings. |
| 40 | + """ |
| 41 | + |
| 42 | + def __init__(self): |
| 43 | + """Inits a diff_match_patch object with default settings. |
| 44 | + Redefine these in your program to override the defaults. |
| 45 | + """ |
| 46 | + |
| 47 | + # Number of seconds to map a diff before giving up (0 for infinity). |
| 48 | + self.Diff_Timeout = 1.0 |
| 49 | + # Cost of an empty edit operation in terms of edit characters. |
| 50 | + self.Diff_EditCost = 4 |
| 51 | + # At what point is no match declared (0.0 = perfection, 1.0 = very loose). |
| 52 | + self.Match_Threshold = 0.5 |
| 53 | + # How far to search for a match (0 = exact location, 1000+ = broad match). |
| 54 | + # A match this many characters away from the expected location will add |
| 55 | + # 1.0 to the score (0.0 is a perfect match). |
| 56 | + self.Match_Distance = 1000 |
| 57 | + # When deleting a large block of text (over ~64 characters), how close does |
| 58 | + # the contents have to match the expected contents. (0.0 = perfection, |
| 59 | + # 1.0 = very loose). Note that Match_Threshold controls how closely the |
| 60 | + # end points of a delete need to match. |
| 61 | + self.Patch_DeleteThreshold = 0.5 |
| 62 | + # Chunk size for context length. |
| 63 | + self.Patch_Margin = 4 |
| 64 | + |
| 65 | + # The number of bits in an int. |
| 66 | + # Python has no maximum, thus to disable patch splitting set to 0. |
| 67 | + # However to avoid long patches in certain pathological cases, use 32. |
| 68 | + # Multiple short patches (using native ints) are much faster than long ones. |
| 69 | + self.Match_MaxBits = 32 |
| 70 | + |
| 71 | + # DIFF FUNCTIONS |
| 72 | + |
| 73 | + # The data structure representing a diff is an array of tuples: |
| 74 | + # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")] |
| 75 | + # which means: delete "Hello", add "Goodbye" and keep " world." |
| 76 | + DIFF_DELETE = -1 |
| 77 | + DIFF_INSERT = 1 |
| 78 | + DIFF_EQUAL = 0 |
| 79 | + |
| 80 | + def diff_main(self, text1, text2, checklines=True, deadline=None): |
| 81 | + """Find the differences between two texts. Simplifies the problem by |
| 82 | + stripping any common prefix or suffix off the texts before diffing. |
| 83 | + |
| 84 | + Args: |
| 85 | + text1: Old string to be diffed. |
| 86 | + text2: New string to be diffed. |
| 87 | + checklines: Optional speedup flag. If present and false, then don't run |
| 88 | + a line-level diff first to identify the changed areas. |
| 89 | + Defaults to true, which does a faster, slightly less optimal diff. |
| 90 | + deadline: Optional time when the diff should be complete by. Used |
| 91 | + internally for recursive calls. Users should set Diff_Timeout instead.
| 92 | + |
| 93 | + Returns: |
| 94 | + Array of changes. |
| 95 | + """ |
| 96 | + # Set a deadline by which time the diff must be complete. |
| 97 | + if deadline == None: |
| 98 | + # Unlike in most languages, Python counts time in seconds. |
| 99 | + if self.Diff_Timeout <= 0: |
| 100 | + deadline = sys.maxint |
| 101 | + else: |
| 102 | + deadline = time.time() + self.Diff_Timeout |
| 103 | + |
| 104 | + # Check for null inputs. |
| 105 | + if text1 == None or text2 == None: |
| 106 | + raise ValueError("Null inputs. (diff_main)") |
| 107 | + |
| 108 | + # Check for equality (speedup). |
| 109 | + if text1 == text2: |
| 110 | + if text1: |
| 111 | + return [(self.DIFF_EQUAL, text1)] |
| 112 | + return [] |
| 113 | + |
| 114 | + # Trim off common prefix (speedup). |
| 115 | + commonlength = self.diff_commonPrefix(text1, text2) |
| 116 | + commonprefix = text1[:commonlength] |
| 117 | + text1 = text1[commonlength:] |
| 118 | + text2 = text2[commonlength:] |
| 119 | + |
| 120 | + # Trim off common suffix (speedup). |
| 121 | + commonlength = self.diff_commonSuffix(text1, text2) |
| 122 | + if commonlength == 0: |
| 123 | + commonsuffix = '' |
| 124 | + else: |
| 125 | + commonsuffix = text1[-commonlength:] |
| 126 | + text1 = text1[:-commonlength] |
| 127 | + text2 = text2[:-commonlength] |
| 128 | + |
| 129 | + # Compute the diff on the middle block. |
| 130 | + diffs = self.diff_compute(text1, text2, checklines, deadline) |
| 131 | + |
| 132 | + # Restore the prefix and suffix. |
| 133 | + if commonprefix: |
| 134 | + diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] |
| 135 | + if commonsuffix: |
| 136 | + diffs.append((self.DIFF_EQUAL, commonsuffix)) |
| 137 | + self.diff_cleanupMerge(diffs) |
| 138 | + return diffs |
| 139 | + |
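
A minimal sketch of the top-level entry point. The import assumes this file is importable as diff_match_patch; the tuple list in the comment is indicative, not guaranteed:

    # Diff two strings into a list of (operation, text) tuples.
    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    dmp.Diff_Timeout = 0  # optional: disable the time limit for a tiny input
    diffs = dmp.diff_main("The quick brown fox", "The slow brown fox")
    # Expect something like:
    #   [(0, 'The '), (-1, 'quick'), (1, 'slow'), (0, ' brown fox')]
    # where -1, 1 and 0 are DIFF_DELETE, DIFF_INSERT and DIFF_EQUAL.
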
| 140 | + def diff_compute(self, text1, text2, checklines, deadline): |
| 141 | + """Find the differences between two texts. Assumes that the texts do not |
| 142 | + have any common prefix or suffix. |
| 143 | + |
| 144 | + Args: |
| 145 | + text1: Old string to be diffed. |
| 146 | + text2: New string to be diffed. |
| 147 | + checklines: Speedup flag. If false, then don't run a line-level diff |
| 148 | + first to identify the changed areas. |
| 149 | + If true, then run a faster, slightly less optimal diff. |
| 150 | + deadline: Time when the diff should be complete by. |
| 151 | + |
| 152 | + Returns: |
| 153 | + Array of changes. |
| 154 | + """ |
| 155 | + if not text1: |
| 156 | + # Just add some text (speedup). |
| 157 | + return [(self.DIFF_INSERT, text2)] |
| 158 | + |
| 159 | + if not text2: |
| 160 | + # Just delete some text (speedup). |
| 161 | + return [(self.DIFF_DELETE, text1)] |
| 162 | + |
| 163 | + if len(text1) > len(text2): |
| 164 | + (longtext, shorttext) = (text1, text2) |
| 165 | + else: |
| 166 | + (shorttext, longtext) = (text1, text2) |
| 167 | + i = longtext.find(shorttext) |
| 168 | + if i != -1: |
| 169 | + # Shorter text is inside the longer text (speedup). |
| 170 | + diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext), |
| 171 | + (self.DIFF_INSERT, longtext[i + len(shorttext):])] |
| 172 | + # Swap insertions for deletions if diff is reversed. |
| 173 | + if len(text1) > len(text2): |
| 174 | + diffs[0] = (self.DIFF_DELETE, diffs[0][1]) |
| 175 | + diffs[2] = (self.DIFF_DELETE, diffs[2][1]) |
| 176 | + return diffs |
| 177 | + |
| 178 | + if len(shorttext) == 1: |
| 179 | + # Single character string. |
| 180 | + # After the previous speedup, the character can't be an equality. |
| 181 | + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] |
| 182 | + longtext = shorttext = None # Garbage collect. |
| 183 | + |
| 184 | + # Check to see if the problem can be split in two. |
| 185 | + hm = self.diff_halfMatch(text1, text2) |
| 186 | + if hm: |
| 187 | + # A half-match was found, sort out the return data. |
| 188 | + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm |
| 189 | + # Send both pairs off for separate processing. |
| 190 | + diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline) |
| 191 | + diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline) |
| 192 | + # Merge the results. |
| 193 | + return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b |
| 194 | + |
| 195 | + if checklines and len(text1) > 100 and len(text2) > 100: |
| 196 | + return self.diff_lineMode(text1, text2, deadline) |
| 197 | + |
| 198 | + return self.diff_bisect(text1, text2, deadline) |
| 199 | + |
| 200 | + def diff_lineMode(self, text1, text2, deadline): |
| 201 | + """Do a quick line-level diff on both strings, then rediff the parts for |
| 202 | + greater accuracy. |
| 203 | + This speedup can produce non-minimal diffs. |
| 204 | + |
| 205 | + Args: |
| 206 | + text1: Old string to be diffed. |
| 207 | + text2: New string to be diffed. |
| 208 | + deadline: Time when the diff should be complete by. |
| 209 | + |
| 210 | + Returns: |
| 211 | + Array of changes. |
| 212 | + """ |
| 213 | + |
| 214 | + # Scan the text on a line-by-line basis first. |
| 215 | + (text1, text2, linearray) = self.diff_linesToChars(text1, text2) |
| 216 | + |
| 217 | + diffs = self.diff_main(text1, text2, False, deadline) |
| 218 | + |
| 219 | + # Convert the diff back to original text. |
| 220 | + self.diff_charsToLines(diffs, linearray) |
| 221 | + # Eliminate freak matches (e.g. blank lines) |
| 222 | + self.diff_cleanupSemantic(diffs) |
| 223 | + |
| 224 | + # Rediff any replacement blocks, this time character-by-character. |
| 225 | + # Add a dummy entry at the end. |
| 226 | + diffs.append((self.DIFF_EQUAL, '')) |
| 227 | + pointer = 0 |
| 228 | + count_delete = 0 |
| 229 | + count_insert = 0 |
| 230 | + text_delete = '' |
| 231 | + text_insert = '' |
| 232 | + while pointer < len(diffs): |
| 233 | + if diffs[pointer][0] == self.DIFF_INSERT: |
| 234 | + count_insert += 1 |
| 235 | + text_insert += diffs[pointer][1] |
| 236 | + elif diffs[pointer][0] == self.DIFF_DELETE: |
| 237 | + count_delete += 1 |
| 238 | + text_delete += diffs[pointer][1] |
| 239 | + elif diffs[pointer][0] == self.DIFF_EQUAL: |
| 240 | + # Upon reaching an equality, check for prior redundancies. |
| 241 | + if count_delete >= 1 and count_insert >= 1: |
| 242 | + # Delete the offending records and add the merged ones. |
| 243 | + a = self.diff_main(text_delete, text_insert, False, deadline) |
| 244 | + diffs[pointer - count_delete - count_insert : pointer] = a |
| 245 | + pointer = pointer - count_delete - count_insert + len(a) |
| 246 | + count_insert = 0 |
| 247 | + count_delete = 0 |
| 248 | + text_delete = '' |
| 249 | + text_insert = '' |
| 250 | + |
| 251 | + pointer += 1 |
| 252 | + |
| 253 | + diffs.pop() # Remove the dummy entry at the end. |
| 254 | + |
| 255 | + return diffs |
| 256 | + |
| 257 | + def diff_bisect(self, text1, text2, deadline): |
| 258 | + """Find the 'middle snake' of a diff, split the problem in two |
| 259 | + and return the recursively constructed diff. |
| 260 | + See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. |
| 261 | + |
| 262 | + Args: |
| 263 | + text1: Old string to be diffed. |
| 264 | + text2: New string to be diffed. |
| 265 | + deadline: Time at which to bail if not yet complete. |
| 266 | + |
| 267 | + Returns: |
| 268 | + Array of diff tuples. |
| 269 | + """ |
| 270 | + |
| 271 | + # Cache the text lengths to prevent multiple calls. |
| 272 | + text1_length = len(text1) |
| 273 | + text2_length = len(text2) |
| 274 | + max_d = (text1_length + text2_length + 1) / 2 |
| 275 | + v_offset = max_d |
| 276 | + v_length = 2 * max_d |
| 277 | + v1 = [-1] * v_length |
| 278 | + v1[v_offset + 1] = 0 |
| 279 | + v2 = v1[:] |
| 280 | + delta = text1_length - text2_length |
| 281 | + # If the total number of characters is odd, then the front path will |
| 282 | + # collide with the reverse path. |
| 283 | + front = (delta % 2 != 0) |
| 284 | + # Offsets for start and end of k loop. |
| 285 | + # Prevents mapping of space beyond the grid. |
| 286 | + k1start = 0 |
| 287 | + k1end = 0 |
| 288 | + k2start = 0 |
| 289 | + k2end = 0 |
| 290 | + for d in xrange(max_d): |
| 291 | + # Bail out if deadline is reached. |
| 292 | + if time.time() > deadline: |
| 293 | + break |
| 294 | + |
| 295 | + # Walk the front path one step. |
| 296 | + for k1 in xrange(-d + k1start, d + 1 - k1end, 2): |
| 297 | + k1_offset = v_offset + k1 |
| 298 | + if (k1 == -d or k1 != d and |
| 299 | + v1[k1_offset - 1] < v1[k1_offset + 1]): |
| 300 | + x1 = v1[k1_offset + 1] |
| 301 | + else: |
| 302 | + x1 = v1[k1_offset - 1] + 1 |
| 303 | + y1 = x1 - k1 |
| 304 | + while (x1 < text1_length and y1 < text2_length and |
| 305 | + text1[x1] == text2[y1]): |
| 306 | + x1 += 1 |
| 307 | + y1 += 1 |
| 308 | + v1[k1_offset] = x1 |
| 309 | + if x1 > text1_length: |
| 310 | + # Ran off the right of the graph. |
| 311 | + k1end += 2 |
| 312 | + elif y1 > text2_length: |
| 313 | + # Ran off the bottom of the graph. |
| 314 | + k1start += 2 |
| 315 | + elif front: |
| 316 | + k2_offset = v_offset + delta - k1 |
| 317 | + if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1: |
| 318 | + # Mirror x2 onto top-left coordinate system. |
| 319 | + x2 = text1_length - v2[k2_offset] |
| 320 | + if x1 >= x2: |
| 321 | + # Overlap detected. |
| 322 | + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) |
| 323 | + |
| 324 | + # Walk the reverse path one step. |
| 325 | + for k2 in xrange(-d + k2start, d + 1 - k2end, 2): |
| 326 | + k2_offset = v_offset + k2 |
| 327 | + if (k2 == -d or k2 != d and |
| 328 | + v2[k2_offset - 1] < v2[k2_offset + 1]): |
| 329 | + x2 = v2[k2_offset + 1] |
| 330 | + else: |
| 331 | + x2 = v2[k2_offset - 1] + 1 |
| 332 | + y2 = x2 - k2 |
| 333 | + while (x2 < text1_length and y2 < text2_length and |
| 334 | + text1[-x2 - 1] == text2[-y2 - 1]): |
| 335 | + x2 += 1 |
| 336 | + y2 += 1 |
| 337 | + v2[k2_offset] = x2 |
| 338 | + if x2 > text1_length: |
| 339 | + # Ran off the left of the graph. |
| 340 | + k2end += 2 |
| 341 | + elif y2 > text2_length: |
| 342 | + # Ran off the top of the graph. |
| 343 | + k2start += 2 |
| 344 | + elif not front: |
| 345 | + k1_offset = v_offset + delta - k2 |
| 346 | + if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1: |
| 347 | + x1 = v1[k1_offset] |
| 348 | + y1 = v_offset + x1 - k1_offset |
| 349 | + # Mirror x2 onto top-left coordinate system. |
| 350 | + x2 = text1_length - x2 |
| 351 | + if x1 >= x2: |
| 352 | + # Overlap detected. |
| 353 | + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) |
| 354 | + |
| 355 | + # Diff took too long and hit the deadline or |
| 356 | + # number of diffs equals number of characters, no commonality at all. |
| 357 | + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] |
| 358 | + |
| 359 | + def diff_bisectSplit(self, text1, text2, x, y, deadline): |
| 360 | + """Given the location of the 'middle snake', split the diff in two parts |
| 361 | + and recurse. |
| 362 | + |
| 363 | + Args: |
| 364 | + text1: Old string to be diffed. |
| 365 | + text2: New string to be diffed. |
| 366 | + x: Index of split point in text1. |
| 367 | + y: Index of split point in text2. |
| 368 | + deadline: Time at which to bail if not yet complete. |
| 369 | + |
| 370 | + Returns: |
| 371 | + Array of diff tuples. |
| 372 | + """ |
| 373 | + text1a = text1[:x] |
| 374 | + text2a = text2[:y] |
| 375 | + text1b = text1[x:] |
| 376 | + text2b = text2[y:] |
| 377 | + |
| 378 | + # Compute both diffs serially. |
| 379 | + diffs = self.diff_main(text1a, text2a, False, deadline) |
| 380 | + diffsb = self.diff_main(text1b, text2b, False, deadline) |
| 381 | + |
| 382 | + return diffs + diffsb |
| 383 | + |
| 384 | + def diff_linesToChars(self, text1, text2): |
| 385 | + """Split two texts into an array of strings. Reduce the texts to a string |
| 386 | + of hashes where each Unicode character represents one line. |
| 387 | + |
| 388 | + Args: |
| 389 | + text1: First string. |
| 390 | + text2: Second string. |
| 391 | + |
| 392 | + Returns: |
| 393 | + Three element tuple, containing the encoded text1, the encoded text2 and |
| 394 | + the array of unique strings. The zeroth element of the array of unique |
| 395 | + strings is intentionally blank. |
| 396 | + """ |
| 397 | + lineArray = [] # e.g. lineArray[4] == "Hello\n" |
| 398 | + lineHash = {} # e.g. lineHash["Hello\n"] == 4 |
| 399 | + |
| 400 | + # "\x00" is a valid character, but various debuggers don't like it. |
| 401 | + # So we'll insert a junk entry to avoid generating a null character. |
| 402 | + lineArray.append('') |
| 403 | + |
| 404 | + def diff_linesToCharsMunge(text): |
| 405 | + """Split a text into an array of strings. Reduce the texts to a string |
| 406 | + of hashes where each Unicode character represents one line. |
| 407 | + Modifies linearray and linehash through being a closure. |
| 408 | + |
| 409 | + Args: |
| 410 | + text: String to encode. |
| 411 | + |
| 412 | + Returns: |
| 413 | + Encoded string. |
| 414 | + """ |
| 415 | + chars = [] |
| 416 | + # Walk the text, pulling out a substring for each line. |
| 417 | + # text.split('\n') would temporarily double our memory footprint.
| 418 | + # Modifying text would create many large strings to garbage collect. |
| 419 | + lineStart = 0 |
| 420 | + lineEnd = -1 |
| 421 | + while lineEnd < len(text) - 1: |
| 422 | + lineEnd = text.find('\n', lineStart) |
| 423 | + if lineEnd == -1: |
| 424 | + lineEnd = len(text) - 1 |
| 425 | + line = text[lineStart:lineEnd + 1] |
| 426 | + lineStart = lineEnd + 1 |
| 427 | + |
| 428 | + if line in lineHash: |
| 429 | + chars.append(unichr(lineHash[line])) |
| 430 | + else: |
| 431 | + lineArray.append(line) |
| 432 | + lineHash[line] = len(lineArray) - 1 |
| 433 | + chars.append(unichr(len(lineArray) - 1)) |
| 434 | + return "".join(chars) |
| 435 | + |
| 436 | + chars1 = diff_linesToCharsMunge(text1) |
| 437 | + chars2 = diff_linesToCharsMunge(text2) |
| 438 | + return (chars1, chars2, lineArray) |
| 439 | + |
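
A sketch of the encoding round trip that diff_lineMode builds on: each distinct line is mapped to a single Unicode character, the cheap character-level diff runs on those, and diff_charsToLines (defined below) rehydrates the result:

    # Line-mode round trip.
    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    a = "alpha\nbeta\ngamma\n"
    b = "alpha\nbeta\ndelta\n"
    (chars1, chars2, lineArray) = dmp.diff_linesToChars(a, b)
    diffs = dmp.diff_main(chars1, chars2, False)  # one character == one line
    dmp.diff_charsToLines(diffs, lineArray)       # back to real lines of text
    # diffs now describes whole-line edits, e.g. 'gamma\n' -> 'delta\n'.
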
| 440 | + def diff_linesToWords(self, text1, text2): |
| 441 | + """ |
| 442 | + INSERT BY FABIAN |
| 443 | + Split two texts into an array of strings. Reduce the texts to a string |
| 444 | + of hashes where each Unicode character represents one word. |
| 445 | + |
| 446 | + Args: |
| 447 | + text1: First string. |
| 448 | + text2: Second string. |
| 449 | + |
| 450 | + Returns: |
| 451 | + Three element tuple, containing the encoded text1, the encoded text2 and |
| 452 | + the array of unique strings. The zeroth element of the array of unique |
| 453 | + strings is intentionally blank. |
| 454 | + """ |
| 455 | + lineArray = [] # e.g. lineArray[4] == "Hello "
| 456 | + lineHash = {} # e.g. lineHash["Hello "] == 4
| 457 | + |
| 458 | + # "\x00" is a valid character, but various debuggers don't like it. |
| 459 | + # So we'll insert a junk entry to avoid generating a null character. |
| 460 | + lineArray.append('') |
| 461 | + |
| 462 | + def diff_linesToCharsMunge(text): |
| 463 | + """Split a text into an array of strings. Reduce the texts to a string
| 464 | + of hashes where each Unicode character represents one word.
| 465 | + Modifies linearray and linehash through being a closure. |
| 466 | + |
| 467 | + Args: |
| 468 | + text: String to encode. |
| 469 | + |
| 470 | + Returns: |
| 471 | + Encoded string. |
| 472 | + """ |
| 473 | + chars = [] |
| 474 | + # Walk the text, pulling out a substring for each word.
| 475 | + # text.split(' ') would temporarily double our memory footprint.
| 476 | + # Modifying text would create many large strings to garbage collect. |
| 477 | + lineStart = 0 |
| 478 | + lineEnd = -1 |
| 479 | + while lineEnd < len(text) - 1: |
| 480 | + lineEnd = text.find(' ', lineStart) |
| 481 | + if lineEnd == -1: |
| 482 | + lineEnd = len(text) - 1 |
| 483 | + line = text[lineStart:lineEnd + 1] |
| 484 | + lineStart = lineEnd + 1 |
| 485 | + |
| 486 | + if line in lineHash: |
| 487 | + chars.append(unichr(lineHash[line])) |
| 488 | + else: |
| 489 | + lineArray.append(line) |
| 490 | + lineHash[line] = len(lineArray) - 1 |
| 491 | + chars.append(unichr(len(lineArray) - 1)) |
| 492 | + return "".join(chars) |
| 493 | + |
| 494 | + chars1 = diff_linesToCharsMunge(text1) |
| 495 | + chars2 = diff_linesToCharsMunge(text2) |
| 496 | + return (chars1, chars2, lineArray) |
| 497 | + |
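
The word-level variant above (the "INSERT BY FABIAN" addition) tokenizes on spaces rather than newlines but uses the identical encoding scheme, so the same rehydration helper applies. A hedged sketch:

    # Word-mode diff via the Fabian variant.
    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    (words1, words2, wordArray) = dmp.diff_linesToWords("the quick fox",
                                                        "the slow fox")
    diffs = dmp.diff_main(words1, words2, False)
    dmp.diff_charsToLines(diffs, wordArray)  # works on any token array
    # diffs now describes whole-word edits: 'quick ' deleted, 'slow ' inserted.
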
| 498 | + |
| 499 | + |
| 500 | + def diff_charsToLines(self, diffs, lineArray): |
| 501 | + """Rehydrate the text in a diff from a string of line hashes to real lines |
| 502 | + of text. |
| 503 | + |
| 504 | + Args: |
| 505 | + diffs: Array of diff tuples. |
| 506 | + lineArray: Array of unique strings. |
| 507 | + """ |
| 508 | + for x in xrange(len(diffs)): |
| 509 | + text = [] |
| 510 | + for char in diffs[x][1]: |
| 511 | + text.append(lineArray[ord(char)]) |
| 512 | + diffs[x] = (diffs[x][0], "".join(text)) |
| 513 | + |
| 514 | + def diff_commonPrefix(self, text1, text2): |
| 515 | + """Determine the common prefix of two strings. |
| 516 | + |
| 517 | + Args: |
| 518 | + text1: First string. |
| 519 | + text2: Second string. |
| 520 | + |
| 521 | + Returns: |
| 522 | + The number of characters common to the start of each string. |
| 523 | + """ |
| 524 | + # Quick check for common null cases. |
| 525 | + if not text1 or not text2 or text1[0] != text2[0]: |
| 526 | + return 0 |
| 527 | + # Binary search. |
| 528 | + # Performance analysis: http://neil.fraser.name/news/2007/10/09/ |
| 529 | + pointermin = 0 |
| 530 | + pointermax = min(len(text1), len(text2)) |
| 531 | + pointermid = pointermax |
| 532 | + pointerstart = 0 |
| 533 | + while pointermin < pointermid: |
| 534 | + if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: |
| 535 | + pointermin = pointermid |
| 536 | + pointerstart = pointermin |
| 537 | + else: |
| 538 | + pointermax = pointermid |
| 539 | + pointermid = int((pointermax - pointermin) / 2 + pointermin) |
| 540 | + return pointermid |
| 541 | + |
| 542 | + def diff_commonSuffix(self, text1, text2): |
| 543 | + """Determine the common suffix of two strings. |
| 544 | + |
| 545 | + Args: |
| 546 | + text1: First string. |
| 547 | + text2: Second string. |
| 548 | + |
| 549 | + Returns: |
| 550 | + The number of characters common to the end of each string. |
| 551 | + """ |
| 552 | + # Quick check for common null cases. |
| 553 | + if not text1 or not text2 or text1[-1] != text2[-1]: |
| 554 | + return 0 |
| 555 | + # Binary search. |
| 556 | + # Performance analysis: http://neil.fraser.name/news/2007/10/09/ |
| 557 | + pointermin = 0 |
| 558 | + pointermax = min(len(text1), len(text2)) |
| 559 | + pointermid = pointermax |
| 560 | + pointerend = 0 |
| 561 | + while pointermin < pointermid: |
| 562 | + if (text1[-pointermid:len(text1) - pointerend] == |
| 563 | + text2[-pointermid:len(text2) - pointerend]): |
| 564 | + pointermin = pointermid |
| 565 | + pointerend = pointermin |
| 566 | + else: |
| 567 | + pointermax = pointermid |
| 568 | + pointermid = int((pointermax - pointermin) / 2 + pointermin) |
| 569 | + return pointermid |
| 570 | + |
| 571 | + def diff_commonOverlap(self, text1, text2): |
| 572 | + """Determine if the suffix of one string is the prefix of another. |
| 573 | + |
| 574 | + Args: |
| 575 | + text1 First string. |
| 576 | + text2 Second string. |
| 577 | + |
| 578 | + Returns: |
| 579 | + The number of characters common to the end of the first |
| 580 | + string and the start of the second string. |
| 581 | + """ |
| 582 | + # Cache the text lengths to prevent multiple calls. |
| 583 | + text1_length = len(text1) |
| 584 | + text2_length = len(text2) |
| 585 | + # Eliminate the null case. |
| 586 | + if text1_length == 0 or text2_length == 0: |
| 587 | + return 0 |
| 588 | + # Truncate the longer string. |
| 589 | + if text1_length > text2_length: |
| 590 | + text1 = text1[-text2_length:] |
| 591 | + elif text1_length < text2_length: |
| 592 | + text2 = text2[:text1_length] |
| 593 | + text_length = min(text1_length, text2_length) |
| 594 | + # Quick check for the worst case. |
| 595 | + if text1 == text2: |
| 596 | + return text_length |
| 597 | + |
| 598 | + # Start by looking for a single character match |
| 599 | + # and increase length until no match is found. |
| 600 | + # Performance analysis: http://neil.fraser.name/news/2010/11/04/ |
| 601 | + best = 0 |
| 602 | + length = 1 |
| 603 | + while True: |
| 604 | + pattern = text1[-length:] |
| 605 | + found = text2.find(pattern) |
| 606 | + if found == -1: |
| 607 | + return best |
| 608 | + length += found |
| 609 | + if found == 0 or text1[-length:] == text2[:length]: |
| 610 | + best = length |
| 611 | + length += 1 |
| 612 | + |
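
For concreteness, the three common-substring helpers behave as follows (a quick self-check sketch; the values follow directly from the definitions above):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    assert dmp.diff_commonPrefix("1234abcdef", "1234xyz") == 4   # '1234'
    assert dmp.diff_commonSuffix("abcdef1234", "xyz1234") == 4   # '1234'
    assert dmp.diff_commonOverlap("123456xxx", "xxxabcd") == 3   # 'xxx'
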
| 613 | + def diff_halfMatch(self, text1, text2): |
| 614 | + """Do the two texts share a substring which is at least half the length of |
| 615 | + the longer text? |
| 616 | + This speedup can produce non-minimal diffs. |
| 617 | + |
| 618 | + Args: |
| 619 | + text1: First string. |
| 620 | + text2: Second string. |
| 621 | + |
| 622 | + Returns: |
| 623 | + Five element Array, containing the prefix of text1, the suffix of text1, |
| 624 | + the prefix of text2, the suffix of text2 and the common middle. Or None |
| 625 | + if there was no match. |
| 626 | + """ |
| 627 | + if self.Diff_Timeout <= 0: |
| 628 | + # Don't risk returning a non-optimal diff if we have unlimited time. |
| 629 | + return None |
| 630 | + if len(text1) > len(text2): |
| 631 | + (longtext, shorttext) = (text1, text2) |
| 632 | + else: |
| 633 | + (shorttext, longtext) = (text1, text2) |
| 634 | + if len(longtext) < 4 or len(shorttext) * 2 < len(longtext): |
| 635 | + return None # Pointless. |
| 636 | + |
| 637 | + def diff_halfMatchI(longtext, shorttext, i): |
| 638 | + """Does a substring of shorttext exist within longtext such that the |
| 639 | + substring is at least half the length of longtext? |
| 640 | + Closure, but does not reference any external variables. |
| 641 | + |
| 642 | + Args: |
| 643 | + longtext: Longer string. |
| 644 | + shorttext: Shorter string. |
| 645 | + i: Start index of quarter length substring within longtext. |
| 646 | + |
| 647 | + Returns: |
| 648 | + Five element Array, containing the prefix of longtext, the suffix of |
| 649 | + longtext, the prefix of shorttext, the suffix of shorttext and the |
| 650 | + common middle. Or None if there was no match. |
| 651 | + """ |
| 652 | + seed = longtext[i:i + len(longtext) / 4] |
| 653 | + best_common = '' |
| 654 | + j = shorttext.find(seed) |
| 655 | + while j != -1: |
| 656 | + prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) |
| 657 | + suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) |
| 658 | + if len(best_common) < suffixLength + prefixLength: |
| 659 | + best_common = (shorttext[j - suffixLength:j] + |
| 660 | + shorttext[j:j + prefixLength]) |
| 661 | + best_longtext_a = longtext[:i - suffixLength] |
| 662 | + best_longtext_b = longtext[i + prefixLength:] |
| 663 | + best_shorttext_a = shorttext[:j - suffixLength] |
| 664 | + best_shorttext_b = shorttext[j + prefixLength:] |
| 665 | + j = shorttext.find(seed, j + 1) |
| 666 | + |
| 667 | + if len(best_common) * 2 >= len(longtext): |
| 668 | + return (best_longtext_a, best_longtext_b, |
| 669 | + best_shorttext_a, best_shorttext_b, best_common) |
| 670 | + else: |
| 671 | + return None |
| 672 | + |
| 673 | + # First check if the second quarter is the seed for a half-match. |
| 674 | + hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4) |
| 675 | + # Check again based on the third quarter. |
| 676 | + hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2) |
| 677 | + if not hm1 and not hm2: |
| 678 | + return None |
| 679 | + elif not hm2: |
| 680 | + hm = hm1 |
| 681 | + elif not hm1: |
| 682 | + hm = hm2 |
| 683 | + else: |
| 684 | + # Both matched. Select the longest. |
| 685 | + if len(hm1[4]) > len(hm2[4]): |
| 686 | + hm = hm1 |
| 687 | + else: |
| 688 | + hm = hm2 |
| 689 | + |
| 690 | + # A half-match was found, sort out the return data. |
| 691 | + if len(text1) > len(text2): |
| 692 | + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm |
| 693 | + else: |
| 694 | + (text2_a, text2_b, text1_a, text1_b, mid_common) = hm |
| 695 | + return (text1_a, text1_b, text2_a, text2_b, mid_common) |
| 696 | + |
| 697 | + def diff_cleanupSemantic(self, diffs): |
| 698 | + """Reduce the number of edits by eliminating semantically trivial |
| 699 | + equalities. |
| 700 | + |
| 701 | + Args: |
| 702 | + diffs: Array of diff tuples. |
| 703 | + """ |
| 704 | + changes = False |
| 705 | + equalities = [] # Stack of indices where equalities are found. |
| 706 | + lastequality = None # Always equal to equalities[-1][1] |
| 707 | + pointer = 0 # Index of current position. |
| 708 | + # Number of chars that changed prior to the equality. |
| 709 | + length_insertions1, length_deletions1 = 0, 0 |
| 710 | + # Number of chars that changed after the equality. |
| 711 | + length_insertions2, length_deletions2 = 0, 0 |
| 712 | + while pointer < len(diffs): |
| 713 | + if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. |
| 714 | + equalities.append(pointer) |
| 715 | + length_insertions1, length_insertions2 = length_insertions2, 0 |
| 716 | + length_deletions1, length_deletions2 = length_deletions2, 0 |
| 717 | + lastequality = diffs[pointer][1] |
| 718 | + else: # An insertion or deletion. |
| 719 | + if diffs[pointer][0] == self.DIFF_INSERT: |
| 720 | + length_insertions2 += len(diffs[pointer][1]) |
| 721 | + else: |
| 722 | + length_deletions2 += len(diffs[pointer][1]) |
| 723 | + # Eliminate an equality that is smaller or equal to the edits on both |
| 724 | + # sides of it. |
| 725 | + if (lastequality != None and (len(lastequality) <= |
| 726 | + max(length_insertions1, length_deletions1)) and |
| 727 | + (len(lastequality) <= max(length_insertions2, length_deletions2))): |
| 728 | + # Duplicate record. |
| 729 | + diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality)) |
| 730 | + # Change second copy to insert. |
| 731 | + diffs[equalities[-1] + 1] = (self.DIFF_INSERT, |
| 732 | + diffs[equalities[-1] + 1][1]) |
| 733 | + # Throw away the equality we just deleted. |
| 734 | + equalities.pop() |
| 735 | + # Throw away the previous equality (it needs to be reevaluated). |
| 736 | + if len(equalities): |
| 737 | + equalities.pop() |
| 738 | + if len(equalities): |
| 739 | + pointer = equalities[-1] |
| 740 | + else: |
| 741 | + pointer = -1 |
| 742 | + # Reset the counters. |
| 743 | + length_insertions1, length_deletions1 = 0, 0 |
| 744 | + length_insertions2, length_deletions2 = 0, 0 |
| 745 | + lastequality = None |
| 746 | + changes = True |
| 747 | + pointer += 1 |
| 748 | + |
| 749 | + # Normalize the diff. |
| 750 | + if changes: |
| 751 | + self.diff_cleanupMerge(diffs) |
| 752 | + self.diff_cleanupSemanticLossless(diffs) |
| 753 | + |
| 754 | + # Find any overlaps between deletions and insertions. |
| 755 | + # e.g: <del>abcxxx</del><ins>xxxdef</ins> |
| 756 | + # -> <del>abc</del>xxx<ins>def</ins> |
| 757 | + # Only extract an overlap if it is as big as the edit ahead or behind it. |
| 758 | + pointer = 1 |
| 759 | + while pointer < len(diffs): |
| 760 | + if (diffs[pointer - 1][0] == self.DIFF_DELETE and |
| 761 | + diffs[pointer][0] == self.DIFF_INSERT): |
| 762 | + deletion = diffs[pointer - 1][1] |
| 763 | + insertion = diffs[pointer][1] |
| 764 | + overlap_length = self.diff_commonOverlap(deletion, insertion) |
| 765 | + if (overlap_length >= len(deletion) / 2.0 or |
| 766 | + overlap_length >= len(insertion) / 2.0): |
| 767 | + # Overlap found. Insert an equality and trim the surrounding edits. |
| 768 | + diffs.insert(pointer, (self.DIFF_EQUAL, insertion[:overlap_length])) |
| 769 | + diffs[pointer - 1] = (self.DIFF_DELETE, |
| 770 | + deletion[:len(deletion) - overlap_length]) |
| 771 | + diffs[pointer + 1] = (self.DIFF_INSERT, insertion[overlap_length:]) |
| 772 | + pointer += 1 |
| 773 | + pointer += 1 |
| 774 | + pointer += 1 |
| 775 | + |
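
The overlap extraction described in the comment above, shown end to end (a small sketch; the expected list follows the <del>abcxxx</del><ins>xxxdef</ins> example):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_DELETE, "abcxxx"), (dmp.DIFF_INSERT, "xxxdef")]
    dmp.diff_cleanupSemantic(diffs)
    assert diffs == [(dmp.DIFF_DELETE, "abc"),
                     (dmp.DIFF_EQUAL, "xxx"),
                     (dmp.DIFF_INSERT, "def")]
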
| 776 | + def diff_cleanupSemanticLossless(self, diffs): |
| 777 | + """Look for single edits surrounded on both sides by equalities |
| 778 | + which can be shifted sideways to align the edit to a word boundary. |
| 779 | + e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. |
| 780 | + |
| 781 | + Args: |
| 782 | + diffs: Array of diff tuples. |
| 783 | + """ |
| 784 | + |
| 785 | + def diff_cleanupSemanticScore(one, two): |
| 786 | + """Given two strings, compute a score representing whether the |
| 787 | + internal boundary falls on logical boundaries. |
| 788 | + Scores range from 5 (best) to 0 (worst). |
| 789 | + Closure, but does not reference any external variables. |
| 790 | + |
| 791 | + Args: |
| 792 | + one: First string. |
| 793 | + two: Second string. |
| 794 | + |
| 795 | + Returns: |
| 796 | + The score. |
| 797 | + """ |
| 798 | + if not one or not two: |
| 799 | + # Edges are the best. |
| 800 | + return 5 |
| 801 | + |
| 802 | + # Each port of this function behaves slightly differently due to |
| 803 | + # subtle differences in each language's definition of things like |
| 804 | + # 'whitespace'. Since this function's purpose is largely cosmetic, |
| 805 | + # the choice has been made to use each language's native features |
| 806 | + # rather than force total conformity. |
| 807 | + score = 0 |
| 808 | + # One point for non-alphanumeric. |
| 809 | + if not one[-1].isalnum() or not two[0].isalnum(): |
| 810 | + score += 1 |
| 811 | + # Two points for whitespace. |
| 812 | + if one[-1].isspace() or two[0].isspace(): |
| 813 | + score += 1 |
| 814 | + # Three points for line breaks. |
| 815 | + if (one[-1] == "\r" or one[-1] == "\n" or |
| 816 | + two[0] == "\r" or two[0] == "\n"): |
| 817 | + score += 1 |
| 818 | + # Four points for blank lines. |
| 819 | + if (re.search("\\n\\r?\\n$", one) or |
| 820 | + re.match("^\\r?\\n\\r?\\n", two)): |
| 821 | + score += 1 |
| 822 | + return score |
| 823 | + |
| 824 | + pointer = 1 |
| 825 | + # Intentionally ignore the first and last element (don't need checking). |
| 826 | + while pointer < len(diffs) - 1: |
| 827 | + if (diffs[pointer - 1][0] == self.DIFF_EQUAL and |
| 828 | + diffs[pointer + 1][0] == self.DIFF_EQUAL): |
| 829 | + # This is a single edit surrounded by equalities. |
| 830 | + equality1 = diffs[pointer - 1][1] |
| 831 | + edit = diffs[pointer][1] |
| 832 | + equality2 = diffs[pointer + 1][1] |
| 833 | + |
| 834 | + # First, shift the edit as far left as possible. |
| 835 | + commonOffset = self.diff_commonSuffix(equality1, edit) |
| 836 | + if commonOffset: |
| 837 | + commonString = edit[-commonOffset:] |
| 838 | + equality1 = equality1[:-commonOffset] |
| 839 | + edit = commonString + edit[:-commonOffset] |
| 840 | + equality2 = commonString + equality2 |
| 841 | + |
| 842 | + # Second, step character by character right, looking for the best fit. |
| 843 | + bestEquality1 = equality1 |
| 844 | + bestEdit = edit |
| 845 | + bestEquality2 = equality2 |
| 846 | + bestScore = (diff_cleanupSemanticScore(equality1, edit) + |
| 847 | + diff_cleanupSemanticScore(edit, equality2)) |
| 848 | + while edit and equality2 and edit[0] == equality2[0]: |
| 849 | + equality1 += edit[0] |
| 850 | + edit = edit[1:] + equality2[0] |
| 851 | + equality2 = equality2[1:] |
| 852 | + score = (diff_cleanupSemanticScore(equality1, edit) + |
| 853 | + diff_cleanupSemanticScore(edit, equality2)) |
| 854 | + # The >= encourages trailing rather than leading whitespace on edits. |
| 855 | + if score >= bestScore: |
| 856 | + bestScore = score |
| 857 | + bestEquality1 = equality1 |
| 858 | + bestEdit = edit |
| 859 | + bestEquality2 = equality2 |
| 860 | + |
| 861 | + if diffs[pointer - 1][1] != bestEquality1: |
| 862 | + # We have an improvement, save it back to the diff. |
| 863 | + if bestEquality1: |
| 864 | + diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1) |
| 865 | + else: |
| 866 | + del diffs[pointer - 1] |
| 867 | + pointer -= 1 |
| 868 | + diffs[pointer] = (diffs[pointer][0], bestEdit) |
| 869 | + if bestEquality2: |
| 870 | + diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2) |
| 871 | + else: |
| 872 | + del diffs[pointer + 1] |
| 873 | + pointer -= 1 |
| 874 | + pointer += 1 |
| 875 | + |
| 876 | + def diff_cleanupEfficiency(self, diffs): |
| 877 | + """Reduce the number of edits by eliminating operationally trivial |
| 878 | + equalities. |
| 879 | + |
| 880 | + Args: |
| 881 | + diffs: Array of diff tuples. |
| 882 | + """ |
| 883 | + changes = False |
| 884 | + equalities = [] # Stack of indices where equalities are found. |
| 885 | + lastequality = '' # Always equal to equalities[-1][1] |
| 886 | + pointer = 0 # Index of current position. |
| 887 | + pre_ins = False # Is there an insertion operation before the last equality. |
| 888 | + pre_del = False # Is there a deletion operation before the last equality. |
| 889 | + post_ins = False # Is there an insertion operation after the last equality. |
| 890 | + post_del = False # Is there a deletion operation after the last equality. |
| 891 | + while pointer < len(diffs): |
| 892 | + if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. |
| 893 | + if (len(diffs[pointer][1]) < self.Diff_EditCost and |
| 894 | + (post_ins or post_del)): |
| 895 | + # Candidate found. |
| 896 | + equalities.append(pointer) |
| 897 | + pre_ins = post_ins |
| 898 | + pre_del = post_del |
| 899 | + lastequality = diffs[pointer][1] |
| 900 | + else: |
| 901 | + # Not a candidate, and can never become one. |
| 902 | + equalities = [] |
| 903 | + lastequality = '' |
| 904 | + |
| 905 | + post_ins = post_del = False |
| 906 | + else: # An insertion or deletion. |
| 907 | + if diffs[pointer][0] == self.DIFF_DELETE: |
| 908 | + post_del = True |
| 909 | + else: |
| 910 | + post_ins = True |
| 911 | + |
| 912 | + # Five types to be split: |
| 913 | + # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del> |
| 914 | + # <ins>A</ins>X<ins>C</ins><del>D</del> |
| 915 | + # <ins>A</ins><del>B</del>X<ins>C</ins> |
| 916 | + # <ins>A</ins>X<ins>C</ins><del>D</del>
| 917 | + # <ins>A</ins><del>B</del>X<del>C</del> |
| 918 | + |
| 919 | + if lastequality and ((pre_ins and pre_del and post_ins and post_del) or |
| 920 | + ((len(lastequality) < self.Diff_EditCost / 2) and |
| 921 | + (pre_ins + pre_del + post_ins + post_del) == 3)): |
| 922 | + # Duplicate record. |
| 923 | + diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality)) |
| 924 | + # Change second copy to insert. |
| 925 | + diffs[equalities[-1] + 1] = (self.DIFF_INSERT, |
| 926 | + diffs[equalities[-1] + 1][1]) |
| 927 | + equalities.pop() # Throw away the equality we just deleted. |
| 928 | + lastequality = '' |
| 929 | + if pre_ins and pre_del: |
| 930 | + # No changes made which could affect previous entry, keep going. |
| 931 | + post_ins = post_del = True |
| 932 | + equalities = [] |
| 933 | + else: |
| 934 | + if len(equalities): |
| 935 | + equalities.pop() # Throw away the previous equality. |
| 936 | + if len(equalities): |
| 937 | + pointer = equalities[-1] |
| 938 | + else: |
| 939 | + pointer = -1 |
| 940 | + post_ins = post_del = False |
| 941 | + changes = True |
| 942 | + pointer += 1 |
| 943 | + |
| 944 | + if changes: |
| 945 | + self.diff_cleanupMerge(diffs) |
| 946 | + |
| 947 | + def diff_cleanupMerge(self, diffs): |
| 948 | + """Reorder and merge like edit sections. Merge equalities. |
| 949 | + Any edit section can move as long as it doesn't cross an equality. |
| 950 | + |
| 951 | + Args: |
| 952 | + diffs: Array of diff tuples. |
| 953 | + """ |
| 954 | + diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end. |
| 955 | + pointer = 0 |
| 956 | + count_delete = 0 |
| 957 | + count_insert = 0 |
| 958 | + text_delete = '' |
| 959 | + text_insert = '' |
| 960 | + while pointer < len(diffs): |
| 961 | + if diffs[pointer][0] == self.DIFF_INSERT: |
| 962 | + count_insert += 1 |
| 963 | + text_insert += diffs[pointer][1] |
| 964 | + pointer += 1 |
| 965 | + elif diffs[pointer][0] == self.DIFF_DELETE: |
| 966 | + count_delete += 1 |
| 967 | + text_delete += diffs[pointer][1] |
| 968 | + pointer += 1 |
| 969 | + elif diffs[pointer][0] == self.DIFF_EQUAL: |
| 970 | + # Upon reaching an equality, check for prior redundancies. |
| 971 | + if count_delete + count_insert > 1: |
| 972 | + if count_delete != 0 and count_insert != 0: |
| 973 | + # Factor out any common prefixes.
| 974 | + commonlength = self.diff_commonPrefix(text_insert, text_delete) |
| 975 | + if commonlength != 0: |
| 976 | + x = pointer - count_delete - count_insert - 1 |
| 977 | + if x >= 0 and diffs[x][0] == self.DIFF_EQUAL: |
| 978 | + diffs[x] = (diffs[x][0], diffs[x][1] + |
| 979 | + text_insert[:commonlength]) |
| 980 | + else: |
| 981 | + diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength])) |
| 982 | + pointer += 1 |
| 983 | + text_insert = text_insert[commonlength:] |
| 984 | + text_delete = text_delete[commonlength:] |
| 985 | + # Factor out any common suffixes.
| 986 | + commonlength = self.diff_commonSuffix(text_insert, text_delete) |
| 987 | + if commonlength != 0: |
| 988 | + diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] + |
| 989 | + diffs[pointer][1]) |
| 990 | + text_insert = text_insert[:-commonlength] |
| 991 | + text_delete = text_delete[:-commonlength] |
| 992 | + # Delete the offending records and add the merged ones. |
| 993 | + if count_delete == 0: |
| 994 | + diffs[pointer - count_insert : pointer] = [ |
| 995 | + (self.DIFF_INSERT, text_insert)] |
| 996 | + elif count_insert == 0: |
| 997 | + diffs[pointer - count_delete : pointer] = [ |
| 998 | + (self.DIFF_DELETE, text_delete)] |
| 999 | + else: |
| 1000 | + diffs[pointer - count_delete - count_insert : pointer] = [ |
| 1001 | + (self.DIFF_DELETE, text_delete), |
| 1002 | + (self.DIFF_INSERT, text_insert)] |
| 1003 | + pointer = pointer - count_delete - count_insert + 1 |
| 1004 | + if count_delete != 0: |
| 1005 | + pointer += 1 |
| 1006 | + if count_insert != 0: |
| 1007 | + pointer += 1 |
| 1008 | + elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL: |
| 1009 | + # Merge this equality with the previous one. |
| 1010 | + diffs[pointer - 1] = (diffs[pointer - 1][0], |
| 1011 | + diffs[pointer - 1][1] + diffs[pointer][1]) |
| 1012 | + del diffs[pointer] |
| 1013 | + else: |
| 1014 | + pointer += 1 |
| 1015 | + |
| 1016 | + count_insert = 0 |
| 1017 | + count_delete = 0 |
| 1018 | + text_delete = '' |
| 1019 | + text_insert = '' |
| 1020 | + |
| 1021 | + if diffs[-1][1] == '': |
| 1022 | + diffs.pop() # Remove the dummy entry at the end. |
| 1023 | + |
| 1024 | + # Second pass: look for single edits surrounded on both sides by equalities |
| 1025 | + # which can be shifted sideways to eliminate an equality. |
| 1026 | + # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC |
| 1027 | + changes = False |
| 1028 | + pointer = 1 |
| 1029 | + # Intentionally ignore the first and last element (don't need checking). |
| 1030 | + while pointer < len(diffs) - 1: |
| 1031 | + if (diffs[pointer - 1][0] == self.DIFF_EQUAL and |
| 1032 | + diffs[pointer + 1][0] == self.DIFF_EQUAL): |
| 1033 | + # This is a single edit surrounded by equalities. |
| 1034 | + if diffs[pointer][1].endswith(diffs[pointer - 1][1]): |
| 1035 | + # Shift the edit over the previous equality. |
| 1036 | + diffs[pointer] = (diffs[pointer][0], |
| 1037 | + diffs[pointer - 1][1] + |
| 1038 | + diffs[pointer][1][:-len(diffs[pointer - 1][1])]) |
| 1039 | + diffs[pointer + 1] = (diffs[pointer + 1][0], |
| 1040 | + diffs[pointer - 1][1] + diffs[pointer + 1][1]) |
| 1041 | + del diffs[pointer - 1] |
| 1042 | + changes = True |
| 1043 | + elif diffs[pointer][1].startswith(diffs[pointer + 1][1]): |
| 1044 | + # Shift the edit over the next equality. |
| 1045 | + diffs[pointer - 1] = (diffs[pointer - 1][0], |
| 1046 | + diffs[pointer - 1][1] + diffs[pointer + 1][1]) |
| 1047 | + diffs[pointer] = (diffs[pointer][0], |
| 1048 | + diffs[pointer][1][len(diffs[pointer + 1][1]):] + |
| 1049 | + diffs[pointer + 1][1]) |
| 1050 | + del diffs[pointer + 1] |
| 1051 | + changes = True |
| 1052 | + pointer += 1 |
| 1053 | + |
| 1054 | + # If shifts were made, the diff needs reordering and another shift sweep. |
| 1055 | + if changes: |
| 1056 | + self.diff_cleanupMerge(diffs) |
| 1057 | + |
| 1058 | + def diff_xIndex(self, diffs, loc): |
| 1059 | + """loc is a location in text1, compute and return the equivalent location |
| 1060 | + in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 |
| 1061 | + |
| 1062 | + Args: |
| 1063 | + diffs: Array of diff tuples. |
| 1064 | + loc: Location within text1. |
| 1065 | + |
| 1066 | + Returns: |
| 1067 | + Location within text2. |
| 1068 | + """ |
| 1069 | + chars1 = 0 |
| 1070 | + chars2 = 0 |
| 1071 | + last_chars1 = 0 |
| 1072 | + last_chars2 = 0 |
| 1073 | + for x in xrange(len(diffs)): |
| 1074 | + (op, text) = diffs[x] |
| 1075 | + if op != self.DIFF_INSERT: # Equality or deletion. |
| 1076 | + chars1 += len(text) |
| 1077 | + if op != self.DIFF_DELETE: # Equality or insertion. |
| 1078 | + chars2 += len(text) |
| 1079 | + if chars1 > loc: # Overshot the location. |
| 1080 | + break |
| 1081 | + last_chars1 = chars1 |
| 1082 | + last_chars2 = chars2 |
| 1083 | + |
| 1084 | + if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: |
| 1085 | + # The location was deleted. |
| 1086 | + return last_chars2 |
| 1087 | + # Add the remaining character length.
| 1088 | + return last_chars2 + (loc - last_chars1) |
| 1089 | + |
| 1090 | + def diff_prettyHtml(self, diffs): |
| 1091 | + """Convert a diff array into a pretty HTML report. |
| 1092 | + |
| 1093 | + Args: |
| 1094 | + diffs: Array of diff tuples. |
| 1095 | + |
| 1096 | + Returns: |
| 1097 | + HTML representation. |
| 1098 | + """ |
| 1099 | + html = [] |
| 1100 | + i = 0 |
| 1101 | + for (op, data) in diffs: |
| 1102 | + text = (data.replace("&", "&amp;").replace("<", "&lt;")
| 1103 | + .replace(">", "&gt;").replace("\n", "&para;<br>"))
| 1104 | + if op == self.DIFF_INSERT: |
| 1105 | + html.append("<ins style=\"background:#e6ffe6;\">%s</ins>" % text) |
| 1106 | + elif op == self.DIFF_DELETE: |
| 1107 | + html.append("<del style=\"background:#ffe6e6;\">%s</del>" % text) |
| 1108 | + elif op == self.DIFF_EQUAL: |
| 1109 | + html.append("<span>%s</span>" % text) |
| 1110 | + if op != self.DIFF_DELETE: |
| 1111 | + i += len(data) |
| 1112 | + return "".join(html) |
| 1113 | + |
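
A sketch of the HTML rendering, showing the entity escaping of markup characters and the pilcrow-plus-<br> treatment of newlines:

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_EQUAL, "a\n"), (dmp.DIFF_DELETE, "<B>b</B>"),
             (dmp.DIFF_INSERT, "c&d")]
    html = dmp.diff_prettyHtml(diffs)
    # Expect:
    #   <span>a&para;<br></span>
    #   <del style="background:#ffe6e6;">&lt;B&gt;b&lt;/B&gt;</del>
    #   <ins style="background:#e6ffe6;">c&amp;d</ins>
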
| 1114 | + def diff_text1(self, diffs): |
| 1115 | + """Compute and return the source text (all equalities and deletions). |
| 1116 | + |
| 1117 | + Args: |
| 1118 | + diffs: Array of diff tuples. |
| 1119 | + |
| 1120 | + Returns: |
| 1121 | + Source text. |
| 1122 | + """ |
| 1123 | + text = [] |
| 1124 | + for (op, data) in diffs: |
| 1125 | + if op != self.DIFF_INSERT: |
| 1126 | + text.append(data) |
| 1127 | + return "".join(text) |
| 1128 | + |
| 1129 | + def diff_text2(self, diffs): |
| 1130 | + """Compute and return the destination text (all equalities and insertions). |
| 1131 | + |
| 1132 | + Args: |
| 1133 | + diffs: Array of diff tuples. |
| 1134 | + |
| 1135 | + Returns: |
| 1136 | + Destination text. |
| 1137 | + """ |
| 1138 | + text = [] |
| 1139 | + for (op, data) in diffs: |
| 1140 | + if op != self.DIFF_DELETE: |
| 1141 | + text.append(data) |
| 1142 | + return "".join(text) |
| 1143 | + |
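
By construction, diff_text1 and diff_text2 losslessly reconstruct the two sides of the diff:

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = dmp.diff_main("jumps over the lazy", "jumped over a lazy", False)
    assert dmp.diff_text1(diffs) == "jumps over the lazy"   # source side
    assert dmp.diff_text2(diffs) == "jumped over a lazy"    # destination side
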
| 1144 | + def diff_levenshtein(self, diffs): |
| 1145 | + """Compute the Levenshtein distance; the number of inserted, deleted or |
| 1146 | + substituted characters. |
| 1147 | + |
| 1148 | + Args: |
| 1149 | + diffs: Array of diff tuples. |
| 1150 | + |
| 1151 | + Returns: |
| 1152 | + Number of changes. |
| 1153 | + """ |
| 1154 | + levenshtein = 0 |
| 1155 | + insertions = 0 |
| 1156 | + deletions = 0 |
| 1157 | + for (op, data) in diffs: |
| 1158 | + if op == self.DIFF_INSERT: |
| 1159 | + insertions += len(data) |
| 1160 | + elif op == self.DIFF_DELETE: |
| 1161 | + deletions += len(data) |
| 1162 | + elif op == self.DIFF_EQUAL: |
| 1163 | + # A deletion and an insertion is one substitution. |
| 1164 | + levenshtein += max(insertions, deletions) |
| 1165 | + insertions = 0 |
| 1166 | + deletions = 0 |
| 1167 | + levenshtein += max(insertions, deletions) |
| 1168 | + return levenshtein |
| 1169 | + |
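
A small check of the Levenshtein accounting: a paired deletion and insertion counts as one block of substitutions, so only the larger side contributes:

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_DELETE, "abc"), (dmp.DIFF_INSERT, "1234"),
             (dmp.DIFF_EQUAL, "xyz")]
    assert dmp.diff_levenshtein(diffs) == 4  # max(3 deleted, 4 inserted)
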
| 1170 | + def diff_toDelta(self, diffs): |
| 1171 | + """Crush the diff into an encoded string which describes the operations |
| 1172 | + required to transform text1 into text2. |
| 1173 | + E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. |
| 1174 | + Operations are tab-separated. Inserted text is escaped using %xx notation. |
| 1175 | + |
| 1176 | + Args: |
| 1177 | + diffs: Array of diff tuples. |
| 1178 | + |
| 1179 | + Returns: |
| 1180 | + Delta text. |
| 1181 | + """ |
| 1182 | + text = [] |
| 1183 | + for (op, data) in diffs: |
| 1184 | + if op == self.DIFF_INSERT: |
| 1185 | + # High ascii will raise UnicodeDecodeError. Use Unicode instead. |
| 1186 | + data = data.encode("utf-8") |
| 1187 | + text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) |
| 1188 | + elif op == self.DIFF_DELETE: |
| 1189 | + text.append("-%d" % len(data)) |
| 1190 | + elif op == self.DIFF_EQUAL: |
| 1191 | + text.append("=%d" % len(data)) |
| 1192 | + return "\t".join(text) |
| 1193 | + |
| 1194 | + def diff_fromDelta(self, text1, delta): |
| 1195 | + """Given the original text1, and an encoded string which describes the |
| 1196 | + operations required to transform text1 into text2, compute the full diff. |
| 1197 | + |
| 1198 | + Args: |
| 1199 | + text1: Source string for the diff. |
| 1200 | + delta: Delta text. |
| 1201 | + |
| 1202 | + Returns: |
| 1203 | + Array of diff tuples. |
| 1204 | + |
| 1205 | + Raises: |
| 1206 | + ValueError: If invalid input. |
| 1207 | + """ |
| 1208 | + if type(delta) == unicode: |
| 1209 | + # Deltas should be composed of a subset of ascii chars, Unicode not |
| 1210 | + # required. If this encode raises UnicodeEncodeError, delta is invalid. |
| 1211 | + delta = delta.encode("ascii") |
| 1212 | + diffs = [] |
| 1213 | + pointer = 0 # Cursor in text1 |
| 1214 | + tokens = delta.split("\t") |
| 1215 | + for token in tokens: |
| 1216 | + if token == "": |
| 1217 | + # Blank tokens are ok (from a trailing \t). |
| 1218 | + continue |
| 1219 | + # Each token begins with a one character parameter which specifies the |
| 1220 | + # operation of this token (delete, insert, equality). |
| 1221 | + param = token[1:] |
| 1222 | + if token[0] == "+": |
| 1223 | + param = urllib.unquote(param).decode("utf-8") |
| 1224 | + diffs.append((self.DIFF_INSERT, param)) |
| 1225 | + elif token[0] == "-" or token[0] == "=": |
| 1226 | + try: |
| 1227 | + n = int(param) |
| 1228 | + except ValueError: |
| 1229 | + raise ValueError("Invalid number in diff_fromDelta: " + param) |
| 1230 | + if n < 0: |
| 1231 | + raise ValueError("Negative number in diff_fromDelta: " + param) |
| 1232 | + text = text1[pointer : pointer + n] |
| 1233 | + pointer += n |
| 1234 | + if token[0] == "=": |
| 1235 | + diffs.append((self.DIFF_EQUAL, text)) |
| 1236 | + else: |
| 1237 | + diffs.append((self.DIFF_DELETE, text)) |
| 1238 | + else: |
| 1239 | + # Anything else is an error. |
| 1240 | + raise ValueError("Invalid diff operation in diff_fromDelta: " + |
| 1241 | + token[0]) |
| 1242 | + if pointer != len(text1): |
| 1243 | + raise ValueError( |
| 1244 | + "Delta length (%d) does not equal source text length (%d)." % |
| 1245 | + (pointer, len(text1))) |
| 1246 | + return diffs |
| 1247 | + |
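
A delta round trip, sticking to plain ASCII (for non-ASCII inserts the decoded side comes back as unicode, which still compares equal under Python 2):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    text1 = "jumps over the lazy"
    diffs = dmp.diff_main(text1, "jumped over a lazy", False)
    delta = dmp.diff_toDelta(diffs)  # tab-separated, in the "=3\t-2\t+ing" style
    assert dmp.diff_fromDelta(text1, delta) == diffs
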
| 1248 | + # MATCH FUNCTIONS |
| 1249 | + |
| 1250 | + def match_main(self, text, pattern, loc): |
| 1251 | + """Locate the best instance of 'pattern' in 'text' near 'loc'. |
| 1252 | + |
| 1253 | + Args: |
| 1254 | + text: The text to search. |
| 1255 | + pattern: The pattern to search for. |
| 1256 | + loc: The location to search around. |
| 1257 | + |
| 1258 | + Returns: |
| 1259 | + Best match index or -1. |
| 1260 | + """ |
| 1261 | + # Check for null inputs. |
| 1262 | + if text == None or pattern == None: |
| 1263 | + raise ValueError("Null inputs. (match_main)") |
| 1264 | + |
| 1265 | + loc = max(0, min(loc, len(text))) |
| 1266 | + if text == pattern: |
| 1267 | + # Shortcut (potentially not guaranteed by the algorithm) |
| 1268 | + return 0 |
| 1269 | + elif not text: |
| 1270 | + # Nothing to match. |
| 1271 | + return -1 |
| 1272 | + elif text[loc:loc + len(pattern)] == pattern: |
| 1273 | + # Perfect match at the perfect spot! (Includes case of null pattern) |
| 1274 | + return loc |
| 1275 | + else: |
| 1276 | + # Do a fuzzy compare. |
| 1277 | + match = self.match_bitap(text, pattern, loc) |
| 1278 | + return match |
| 1279 | + |
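
A fuzzy-match sketch. With the stock Match_Threshold and Match_Distance the errorful pattern is expected to land near the requested location (the index below is the value the library's own tests use for this input):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    text = "I am the very model of a modern major general."
    loc = dmp.match_main(text, " that berry ", 5)
    assert loc == 4  # best fuzzy fit: the " the very " region near index 5
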
| 1280 | + def match_bitap(self, text, pattern, loc): |
| 1281 | + """Locate the best instance of 'pattern' in 'text' near 'loc' using the |
| 1282 | + Bitap algorithm. |
| 1283 | + |
| 1284 | + Args: |
| 1285 | + text: The text to search. |
| 1286 | + pattern: The pattern to search for. |
| 1287 | + loc: The location to search around. |
| 1288 | + |
| 1289 | + Returns: |
| 1290 | + Best match index or -1. |
| 1291 | + """ |
| 1292 | + # Python doesn't have a maxint limit, so ignore this check. |
| 1293 | + #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits: |
| 1294 | + # raise ValueError("Pattern too long for this application.") |
| 1295 | + |
| 1296 | + # Initialise the alphabet. |
| 1297 | + s = self.match_alphabet(pattern) |
| 1298 | + |
| 1299 | + def match_bitapScore(e, x): |
| 1300 | + """Compute and return the score for a match with e errors and x location. |
| 1301 | + Accesses loc and pattern through being a closure. |
| 1302 | + |
| 1303 | + Args: |
| 1304 | + e: Number of errors in match. |
| 1305 | + x: Location of match. |
| 1306 | + |
| 1307 | + Returns: |
| 1308 | + Overall score for match (0.0 = good, 1.0 = bad). |
| 1309 | + """ |
| 1310 | + accuracy = float(e) / len(pattern) |
| 1311 | + proximity = abs(loc - x) |
| 1312 | + if not self.Match_Distance: |
| 1313 | + # Dodge divide by zero error. |
| 1314 | + return proximity and 1.0 or accuracy |
| 1315 | + return accuracy + (proximity / float(self.Match_Distance)) |
| 1316 | + |
| 1317 | + # Highest score beyond which we give up. |
| 1318 | + score_threshold = self.Match_Threshold |
| 1319 | + # Is there a nearby exact match? (speedup) |
| 1320 | + best_loc = text.find(pattern, loc) |
| 1321 | + if best_loc != -1: |
| 1322 | + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) |
| 1323 | + # What about in the other direction? (speedup) |
| 1324 | + best_loc = text.rfind(pattern, loc + len(pattern)) |
| 1325 | + if best_loc != -1: |
| 1326 | + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) |
| 1327 | + |
| 1328 | + # Initialise the bit arrays. |
| 1329 | + matchmask = 1 << (len(pattern) - 1) |
| 1330 | + best_loc = -1 |
| 1331 | + |
| 1332 | + bin_max = len(pattern) + len(text) |
| 1333 | + # Empty initialization added to appease pychecker. |
| 1334 | + last_rd = None |
| 1335 | + for d in xrange(len(pattern)): |
| 1336 | + # Scan for the best match each iteration allows for one more error. |
| 1337 | + # Run a binary search to determine how far from 'loc' we can stray at |
| 1338 | + # this error level. |
| 1339 | + bin_min = 0 |
| 1340 | + bin_mid = bin_max |
| 1341 | + while bin_min < bin_mid: |
| 1342 | + if match_bitapScore(d, loc + bin_mid) <= score_threshold: |
| 1343 | + bin_min = bin_mid |
| 1344 | + else: |
| 1345 | + bin_max = bin_mid |
| 1346 | + bin_mid = (bin_max - bin_min) / 2 + bin_min |
| 1347 | + |
| 1348 | + # Use the result from this iteration as the maximum for the next. |
| 1349 | + bin_max = bin_mid |
| 1350 | + start = max(1, loc - bin_mid + 1) |
| 1351 | + finish = min(loc + bin_mid, len(text)) + len(pattern) |
| 1352 | + |
| 1353 | + rd = range(finish + 1) |
| 1354 | + rd.append((1 << d) - 1) |
| 1355 | + for j in xrange(finish, start - 1, -1): |
| 1356 | + if len(text) <= j - 1: |
| 1357 | + # Out of range. |
| 1358 | + charMatch = 0 |
| 1359 | + else: |
| 1360 | + charMatch = s.get(text[j - 1], 0) |
| 1361 | + if d == 0: # First pass: exact match. |
| 1362 | + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch |
| 1363 | + else: # Subsequent passes: fuzzy match. |
| 1364 | + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | ( |
| 1365 | + ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1] |
| 1366 | + if rd[j] & matchmask: |
| 1367 | + score = match_bitapScore(d, j - 1) |
| 1368 | + # This match will almost certainly be better than any existing match. |
| 1369 | + # But check anyway. |
| 1370 | + if score <= score_threshold: |
| 1371 | + # Told you so. |
| 1372 | + score_threshold = score |
| 1373 | + best_loc = j - 1 |
| 1374 | + if best_loc > loc: |
| 1375 | + # When passing loc, don't exceed our current distance from loc. |
| 1376 | + start = max(1, 2 * loc - best_loc) |
| 1377 | + else: |
| 1378 | + # Already passed loc, downhill from here on in. |
| 1379 | + break |
| 1380 | + # No hope for a (better) match at greater error levels. |
| 1381 | + if match_bitapScore(d + 1, loc) > score_threshold: |
| 1382 | + break |
| 1383 | + last_rd = rd |
| 1384 | + return best_loc |
| 1385 | + |
| 1386 | + def match_alphabet(self, pattern): |
| 1387 | + """Initialise the alphabet for the Bitap algorithm. |
| 1388 | + |
| 1389 | + Args: |
| 1390 | + pattern: The text to encode. |
| 1391 | + |
| 1392 | + Returns: |
| 1393 | + Hash of character locations. |
| 1394 | + """ |
| 1395 | + s = {} |
| 1396 | + for char in pattern: |
| 1397 | + s[char] = 0 |
| 1398 | + for i in xrange(len(pattern)): |
| 1399 | + s[pattern[i]] |= 1 << (len(pattern) - i - 1) |
| 1400 | + return s |
| 1401 | + |
| 1402 | + # PATCH FUNCTIONS |
| 1403 | + |
| 1404 | + def patch_addContext(self, patch, text): |
| 1405 | + """Increase the context until it is unique, |
| 1406 | + but don't let the pattern expand beyond Match_MaxBits. |
| 1407 | + |
| 1408 | + Args: |
| 1409 | + patch: The patch to grow. |
| 1410 | + text: Source text. |
| 1411 | + """ |
| 1412 | + if len(text) == 0: |
| 1413 | + return |
| 1414 | + pattern = text[patch.start2 : patch.start2 + patch.length1] |
| 1415 | + padding = 0 |
| 1416 | + |
| 1417 | + # Look for the first and last matches of pattern in text. If two different |
| 1418 | + # matches are found, increase the pattern length. |
| 1419 | + while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits == |
| 1420 | + 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - |
| 1421 | + self.Patch_Margin)): |
| 1422 | + padding += self.Patch_Margin |
| 1423 | + pattern = text[max(0, patch.start2 - padding) : |
| 1424 | + patch.start2 + patch.length1 + padding] |
| 1425 | + # Add one chunk for good luck. |
| 1426 | + padding += self.Patch_Margin |
| 1427 | + |
| 1428 | + # Add the prefix. |
| 1429 | + prefix = text[max(0, patch.start2 - padding) : patch.start2] |
| 1430 | + if prefix: |
| 1431 | + patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] |
| 1432 | + # Add the suffix. |
| 1433 | + suffix = text[patch.start2 + patch.length1 : |
| 1434 | + patch.start2 + patch.length1 + padding] |
| 1435 | + if suffix: |
| 1436 | + patch.diffs.append((self.DIFF_EQUAL, suffix)) |
| 1437 | + |
| 1438 | + # Roll back the start points. |
| 1439 | + patch.start1 -= len(prefix) |
| 1440 | + patch.start2 -= len(prefix) |
| 1441 | + # Extend lengths. |
| 1442 | + patch.length1 += len(prefix) + len(suffix) |
| 1443 | + patch.length2 += len(prefix) + len(suffix) |
| 1444 | + |
| 1445 | + def patch_make(self, a, b=None, c=None): |
| 1446 | + """Compute a list of patches to turn text1 into text2. |
| 1447 | + Use diffs if provided, otherwise compute it ourselves. |
| 1448 | + There are four ways to call this function, depending on what data is |
| 1449 | + available to the caller: |
| 1450 | + Method 1: |
| 1451 | + a = text1, b = text2 |
| 1452 | + Method 2: |
| 1453 | + a = diffs |
| 1454 | + Method 3 (optimal): |
| 1455 | + a = text1, b = diffs |
| 1456 | + Method 4 (deprecated, use method 3): |
| 1457 | + a = text1, b = text2, c = diffs |
| 1458 | + |
| 1459 | + Args: |
| 1460 | + a: text1 (methods 1,3,4) or Array of diff tuples for text1 to |
| 1461 | + text2 (method 2). |
| 1462 | + b: text2 (methods 1,4) or Array of diff tuples for text1 to |
| 1463 | + text2 (method 3) or undefined (method 2). |
| 1464 | + c: Array of diff tuples for text1 to text2 (method 4) or |
| 1465 | + undefined (methods 1,2,3). |
| 1466 | + |
| 1467 | + Returns: |
| 1468 | + Array of patch objects. |
| 1469 | + """ |
| 1470 | + text1 = None |
| 1471 | + diffs = None |
| 1472 | + # Note that texts may arrive as 'str' or 'unicode'. |
| 1473 | + if isinstance(a, basestring) and isinstance(b, basestring) and c is None: |
| 1474 | + # Method 1: text1, text2 |
| 1475 | + # Compute diffs from text1 and text2. |
| 1476 | + text1 = a |
| 1477 | + diffs = self.diff_main(text1, b, True) |
| 1478 | + if len(diffs) > 2: |
| 1479 | + self.diff_cleanupSemantic(diffs) |
| 1480 | + self.diff_cleanupEfficiency(diffs) |
| 1481 | + elif isinstance(a, list) and b is None and c is None: |
| 1482 | + # Method 2: diffs |
| 1483 | + # Compute text1 from diffs. |
| 1484 | + diffs = a |
| 1485 | + text1 = self.diff_text1(diffs) |
| 1486 | + elif isinstance(a, basestring) and isinstance(b, list) and c is None: |
| 1487 | + # Method 3: text1, diffs |
| 1488 | + text1 = a |
| 1489 | + diffs = b |
| 1490 | + elif (isinstance(a, basestring) and isinstance(b, basestring) and |
| 1491 | + isinstance(c, list)): |
| 1492 | + # Method 4: text1, text2, diffs |
| 1493 | + # text2 is not used. |
| 1494 | + text1 = a |
| 1495 | + diffs = c |
| 1496 | + else: |
| 1497 | + raise ValueError("Unknown call format to patch_make.") |
| 1498 | + |
| 1499 | + if not diffs: |
| 1500 | + return [] # Get rid of the None case. |
| 1501 | + patches = [] |
| 1502 | + patch = patch_obj() |
| 1503 | + char_count1 = 0 # Number of characters into the text1 string. |
| 1504 | + char_count2 = 0 # Number of characters into the text2 string. |
| 1505 | + prepatch_text = text1 # Recreate the patches to determine context info. |
| 1506 | + postpatch_text = text1 |
| 1507 | + for x in xrange(len(diffs)): |
| 1508 | + (diff_type, diff_text) = diffs[x] |
| 1509 | + if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL: |
| 1510 | + # A new patch starts here. |
| 1511 | + patch.start1 = char_count1 |
| 1512 | + patch.start2 = char_count2 |
| 1513 | + if diff_type == self.DIFF_INSERT: |
| 1514 | + # Insertion |
| 1515 | + patch.diffs.append(diffs[x]) |
| 1516 | + patch.length2 += len(diff_text) |
| 1517 | + postpatch_text = (postpatch_text[:char_count2] + diff_text + |
| 1518 | + postpatch_text[char_count2:]) |
| 1519 | + elif diff_type == self.DIFF_DELETE: |
| 1520 | + # Deletion. |
| 1521 | + patch.length1 += len(diff_text) |
| 1522 | + patch.diffs.append(diffs[x]) |
| 1523 | + postpatch_text = (postpatch_text[:char_count2] + |
| 1524 | + postpatch_text[char_count2 + len(diff_text):]) |
| 1525 | + elif (diff_type == self.DIFF_EQUAL and |
| 1526 | + len(diff_text) <= 2 * self.Patch_Margin and |
| 1527 | + len(patch.diffs) != 0 and len(diffs) != x + 1): |
| 1528 | + # Small equality inside a patch. |
| 1529 | + patch.diffs.append(diffs[x]) |
| 1530 | + patch.length1 += len(diff_text) |
| 1531 | + patch.length2 += len(diff_text) |
| 1532 | + |
| 1533 | + if (diff_type == self.DIFF_EQUAL and |
| 1534 | + len(diff_text) >= 2 * self.Patch_Margin): |
| 1535 | + # Time for a new patch. |
| 1536 | + if len(patch.diffs) != 0: |
| 1537 | + self.patch_addContext(patch, prepatch_text) |
| 1538 | + patches.append(patch) |
| 1539 | + patch = patch_obj() |
| 1540 | + # Unlike Unidiff, our patch lists have a rolling context. |
| 1541 | + # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff |
| 1542 | + # Update prepatch text & pos to reflect the application of the |
| 1543 | + # just completed patch. |
| 1544 | + prepatch_text = postpatch_text |
| 1545 | + char_count1 = char_count2 |
| 1546 | + |
| 1547 | + # Update the current character count. |
| 1548 | + if diff_type != self.DIFF_INSERT: |
| 1549 | + char_count1 += len(diff_text) |
| 1550 | + if diff_type != self.DIFF_DELETE: |
| 1551 | + char_count2 += len(diff_text) |
| 1552 | + |
| 1553 | + # Pick up the leftover patch if not empty. |
| 1554 | + if len(patch.diffs) != 0: |
| 1555 | + self.patch_addContext(patch, prepatch_text) |
| 1556 | + patches.append(patch) |
| 1557 | + return patches |
| 1558 | + |
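A minimal usage sketch of the call formats above (not part of the committed file; the instance name dmp and the sample texts are illustrative):

    dmp = diff_match_patch()
    # Method 1: compute the diffs internally from two texts.
    patches = dmp.patch_make("The quick brown fox", "The slow brown fox")
    # Method 3 (optimal): reuse diffs that were already computed.
    diffs = dmp.diff_main("The quick brown fox", "The slow brown fox")
    patches = dmp.patch_make("The quick brown fox", diffs)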
| 1559 | + def patch_deepCopy(self, patches): |
| 1560 | + """Given an array of patches, return another array that is identical. |
| 1561 | + |
| 1562 | + Args: |
| 1563 | + patches: Array of patch objects. |
| 1564 | + |
| 1565 | + Returns: |
| 1566 | + Array of patch objects. |
| 1567 | + """ |
| 1568 | + patchesCopy = [] |
| 1569 | + for patch in patches: |
| 1570 | + patchCopy = patch_obj() |
| 1571 | + # No need to deep copy the tuples since they are immutable. |
| 1572 | + patchCopy.diffs = patch.diffs[:] |
| 1573 | + patchCopy.start1 = patch.start1 |
| 1574 | + patchCopy.start2 = patch.start2 |
| 1575 | + patchCopy.length1 = patch.length1 |
| 1576 | + patchCopy.length2 = patch.length2 |
| 1577 | + patchesCopy.append(patchCopy) |
| 1578 | + return patchesCopy |
| 1579 | + |
| 1580 | + def patch_apply(self, patches, text): |
| 1581 | + """Merge a set of patches onto the text. Return a patched text, as well |
| 1582 | + as a list of true/false values indicating which patches were applied. |
| 1583 | + |
| 1584 | + Args: |
| 1585 | + patches: Array of patch objects. |
| 1586 | + text: Old text. |
| 1587 | + |
| 1588 | + Returns: |
| 1589 | + Two element Array, containing the new text and an array of boolean values. |
| 1590 | + """ |
| 1591 | + if not patches: |
| 1592 | + return (text, []) |
| 1593 | + |
| 1594 | + # Deep copy the patches so that no changes are made to originals. |
| 1595 | + patches = self.patch_deepCopy(patches) |
| 1596 | + |
| 1597 | + nullPadding = self.patch_addPadding(patches) |
| 1598 | + text = nullPadding + text + nullPadding |
| 1599 | + self.patch_splitMax(patches) |
| 1600 | + |
| 1601 | + # delta keeps track of the offset between the expected and actual location |
| 1602 | + # of the previous patch. If there are patches expected at positions 10 and |
| 1603 | + # 20, but the first patch was found at 12, delta is 2 and the second patch |
| 1604 | + # has an effective expected position of 22. |
| 1605 | + delta = 0 |
| 1606 | + results = [] |
| 1607 | + for patch in patches: |
| 1608 | + expected_loc = patch.start2 + delta |
| 1609 | + text1 = self.diff_text1(patch.diffs) |
| 1610 | + end_loc = -1 |
| 1611 | + if len(text1) > self.Match_MaxBits: |
| 1612 | + # patch_splitMax will only provide an oversized pattern in the case of |
| 1613 | + # a monster delete. |
| 1614 | + start_loc = self.match_main(text, text1[:self.Match_MaxBits], |
| 1615 | + expected_loc) |
| 1616 | + if start_loc != -1: |
| 1617 | + end_loc = self.match_main(text, text1[-self.Match_MaxBits:], |
| 1618 | + expected_loc + len(text1) - self.Match_MaxBits) |
| 1619 | + if end_loc == -1 or start_loc >= end_loc: |
| 1620 | + # Can't find valid trailing context. Drop this patch. |
| 1621 | + start_loc = -1 |
| 1622 | + else: |
| 1623 | + start_loc = self.match_main(text, text1, expected_loc) |
| 1624 | + if start_loc == -1: |
| 1625 | + # No match found. :( |
| 1626 | + results.append(False) |
| 1627 | + # Subtract the delta for this failed patch from subsequent patches. |
| 1628 | + delta -= patch.length2 - patch.length1 |
| 1629 | + else: |
| 1630 | + # Found a match. :) |
| 1631 | + results.append(True) |
| 1632 | + delta = start_loc - expected_loc |
| 1633 | + if end_loc == -1: |
| 1634 | + text2 = text[start_loc : start_loc + len(text1)] |
| 1635 | + else: |
| 1636 | + text2 = text[start_loc : end_loc + self.Match_MaxBits] |
| 1637 | + if text1 == text2: |
| 1638 | + # Perfect match, just shove the replacement text in. |
| 1639 | + text = (text[:start_loc] + self.diff_text2(patch.diffs) + |
| 1640 | + text[start_loc + len(text1):]) |
| 1641 | + else: |
| 1642 | + # Imperfect match. |
| 1643 | + # Run a diff to get a framework of equivalent indices. |
| 1644 | + diffs = self.diff_main(text1, text2, False) |
| 1645 | + if (len(text1) > self.Match_MaxBits and |
| 1646 | + self.diff_levenshtein(diffs) / float(len(text1)) > |
| 1647 | + self.Patch_DeleteThreshold): |
| 1648 | + # The end points match, but the content is unacceptably bad. |
| 1649 | + results[-1] = False |
| 1650 | + else: |
| 1651 | + self.diff_cleanupSemanticLossless(diffs) |
| 1652 | + index1 = 0 |
| 1653 | + for (op, data) in patch.diffs: |
| 1654 | + if op != self.DIFF_EQUAL: |
| 1655 | + index2 = self.diff_xIndex(diffs, index1) |
| 1656 | + if op == self.DIFF_INSERT: # Insertion |
| 1657 | + text = text[:start_loc + index2] + data + text[start_loc + |
| 1658 | + index2:] |
| 1659 | + elif op == self.DIFF_DELETE: # Deletion |
| 1660 | + text = text[:start_loc + index2] + text[start_loc + |
| 1661 | + self.diff_xIndex(diffs, index1 + len(data)):] |
| 1662 | + if op != self.DIFF_DELETE: |
| 1663 | + index1 += len(data) |
| 1664 | + # Strip the padding off. |
| 1665 | + text = text[len(nullPadding):-len(nullPadding)] |
| 1666 | + return (text, results) |
| 1667 | + |
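Continuing that sketch, patch_apply returns the patched text plus one boolean per patch; the expected values shown in comments are illustrative:

    new_text, results = dmp.patch_apply(patches, "The quick brown fox")
    # new_text == "The slow brown fox", results == [True]
    # The fuzzy matcher tolerates some drift in the base text:
    new_text, results = dmp.patch_apply(patches, "The very quick brown fox")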
| 1668 | + def patch_addPadding(self, patches): |
| 1669 | + """Add some padding on text start and end so that edges can match |
| 1670 | + something. Intended to be called only from within patch_apply. |
| 1671 | + |
| 1672 | + Args: |
| 1673 | + patches: Array of patch objects. |
| 1674 | + |
| 1675 | + Returns: |
| 1676 | + The padding string added to each side. |
| 1677 | + """ |
| 1678 | + paddingLength = self.Patch_Margin |
| 1679 | + nullPadding = "" |
| 1680 | + for x in xrange(1, paddingLength + 1): |
| 1681 | + nullPadding += chr(x) |
| 1682 | + |
| 1683 | + # Bump all the patches forward. |
| 1684 | + for patch in patches: |
| 1685 | + patch.start1 += paddingLength |
| 1686 | + patch.start2 += paddingLength |
| 1687 | + |
| 1688 | + # Add some padding on start of first diff. |
| 1689 | + patch = patches[0] |
| 1690 | + diffs = patch.diffs |
| 1691 | + if not diffs or diffs[0][0] != self.DIFF_EQUAL: |
| 1692 | + # Add nullPadding equality. |
| 1693 | + diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) |
| 1694 | + patch.start1 -= paddingLength # Should be 0. |
| 1695 | + patch.start2 -= paddingLength # Should be 0. |
| 1696 | + patch.length1 += paddingLength |
| 1697 | + patch.length2 += paddingLength |
| 1698 | + elif paddingLength > len(diffs[0][1]): |
| 1699 | + # Grow first equality. |
| 1700 | + extraLength = paddingLength - len(diffs[0][1]) |
| 1701 | + newText = nullPadding[len(diffs[0][1]):] + diffs[0][1] |
| 1702 | + diffs[0] = (diffs[0][0], newText) |
| 1703 | + patch.start1 -= extraLength |
| 1704 | + patch.start2 -= extraLength |
| 1705 | + patch.length1 += extraLength |
| 1706 | + patch.length2 += extraLength |
| 1707 | + |
| 1708 | + # Add some padding on end of last diff. |
| 1709 | + patch = patches[-1] |
| 1710 | + diffs = patch.diffs |
| 1711 | + if not diffs or diffs[-1][0] != self.DIFF_EQUAL: |
| 1712 | + # Add nullPadding equality. |
| 1713 | + diffs.append((self.DIFF_EQUAL, nullPadding)) |
| 1714 | + patch.length1 += paddingLength |
| 1715 | + patch.length2 += paddingLength |
| 1716 | + elif paddingLength > len(diffs[-1][1]): |
| 1717 | + # Grow last equality. |
| 1718 | + extraLength = paddingLength - len(diffs[-1][1]) |
| 1719 | + newText = diffs[-1][1] + nullPadding[:extraLength] |
| 1720 | + diffs[-1] = (diffs[-1][0], newText) |
| 1721 | + patch.length1 += extraLength |
| 1722 | + patch.length2 += extraLength |
| 1723 | + |
| 1724 | + return nullPadding |
| 1725 | + |
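A note on the padding itself: assuming the default Patch_Margin of 4 set earlier in the class, the padding string is four low control characters, chosen because they almost never occur in real text, so the edges have something unique to match against:

    # Equivalent to the loop above with paddingLength == 4 (assumed default).
    nullPadding = "".join(chr(x) for x in xrange(1, 5))  # '\x01\x02\x03\x04'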
| 1726 | + def patch_splitMax(self, patches): |
| 1727 | + """Look through the patches and break up any which are longer than the |
| 1728 | + maximum limit of the match algorithm. |
| 1729 | + Intended to be called only from within patch_apply. |
| 1730 | + |
| 1731 | + Args: |
| 1732 | + patches: Array of patch objects. |
| 1733 | + """ |
| 1734 | + patch_size = self.Match_MaxBits |
| 1735 | + if patch_size == 0: |
| 1736 | + # Python has the option of not splitting strings due to its ability |
| 1737 | + # to handle integers of arbitrary precision. |
| 1738 | + return |
| 1739 | + for x in xrange(len(patches)): |
| 1740 | + if patches[x].length1 > patch_size: |
| 1741 | + bigpatch = patches[x] |
| 1742 | + # Remove the big old patch. |
| 1743 | + del patches[x] |
| 1744 | + x -= 1 |
| 1745 | + start1 = bigpatch.start1 |
| 1746 | + start2 = bigpatch.start2 |
| 1747 | + precontext = '' |
| 1748 | + while len(bigpatch.diffs) != 0: |
| 1749 | + # Create one of several smaller patches. |
| 1750 | + patch = patch_obj() |
| 1751 | + empty = True |
| 1752 | + patch.start1 = start1 - len(precontext) |
| 1753 | + patch.start2 = start2 - len(precontext) |
| 1754 | + if precontext: |
| 1755 | + patch.length1 = patch.length2 = len(precontext) |
| 1756 | + patch.diffs.append((self.DIFF_EQUAL, precontext)) |
| 1757 | + |
| 1758 | + while (len(bigpatch.diffs) != 0 and |
| 1759 | + patch.length1 < patch_size - self.Patch_Margin): |
| 1760 | + (diff_type, diff_text) = bigpatch.diffs[0] |
| 1761 | + if diff_type == self.DIFF_INSERT: |
| 1762 | + # Insertions are harmless. |
| 1763 | + patch.length2 += len(diff_text) |
| 1764 | + start2 += len(diff_text) |
| 1765 | + patch.diffs.append(bigpatch.diffs.pop(0)) |
| 1766 | + empty = False |
| 1767 | + elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and |
| 1768 | + patch.diffs[0][0] == self.DIFF_EQUAL and |
| 1769 | + len(diff_text) > 2 * patch_size): |
| 1770 | + # This is a large deletion. Let it pass in one chunk. |
| 1771 | + patch.length1 += len(diff_text) |
| 1772 | + start1 += len(diff_text) |
| 1773 | + empty = False |
| 1774 | + patch.diffs.append((diff_type, diff_text)) |
| 1775 | + del bigpatch.diffs[0] |
| 1776 | + else: |
| 1777 | + # Deletion or equality. Only take as much as we can stomach. |
| 1778 | + diff_text = diff_text[:patch_size - patch.length1 - |
| 1779 | + self.Patch_Margin] |
| 1780 | + patch.length1 += len(diff_text) |
| 1781 | + start1 += len(diff_text) |
| 1782 | + if diff_type == self.DIFF_EQUAL: |
| 1783 | + patch.length2 += len(diff_text) |
| 1784 | + start2 += len(diff_text) |
| 1785 | + else: |
| 1786 | + empty = False |
| 1787 | + |
| 1788 | + patch.diffs.append((diff_type, diff_text)) |
| 1789 | + if diff_text == bigpatch.diffs[0][1]: |
| 1790 | + del bigpatch.diffs[0] |
| 1791 | + else: |
| 1792 | + bigpatch.diffs[0] = (bigpatch.diffs[0][0], |
| 1793 | + bigpatch.diffs[0][1][len(diff_text):]) |
| 1794 | + |
| 1795 | + # Compute the head context for the next patch. |
| 1796 | + precontext = self.diff_text2(patch.diffs) |
| 1797 | + precontext = precontext[-self.Patch_Margin:] |
| 1798 | + # Append the end context for this patch. |
| 1799 | + postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin] |
| 1800 | + if postcontext: |
| 1801 | + patch.length1 += len(postcontext) |
| 1802 | + patch.length2 += len(postcontext) |
| 1803 | + if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL: |
| 1804 | + patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] + |
| 1805 | + postcontext) |
| 1806 | + else: |
| 1807 | + patch.diffs.append((self.DIFF_EQUAL, postcontext)) |
| 1808 | + |
| 1809 | + if not empty: |
| 1810 | + x += 1 |
| 1811 | + patches.insert(x, patch) |
| 1812 | + |
| 1813 | + def patch_toText(self, patches): |
| 1814 | + """Take a list of patches and return a textual representation. |
| 1815 | + |
| 1816 | + Args: |
| 1817 | + patches: Array of patch objects. |
| 1818 | + |
| 1819 | + Returns: |
| 1820 | + Text representation of patches. |
| 1821 | + """ |
| 1822 | + text = [] |
| 1823 | + for patch in patches: |
| 1824 | + text.append(str(patch)) |
| 1825 | + return "".join(text) |
| 1826 | + |
| 1827 | + def patch_fromText(self, textline): |
| 1828 | + """Parse a textual representation of patches and return a list of patch |
| 1829 | + objects. |
| 1830 | + |
| 1831 | + Args: |
| 1832 | + textline: Text representation of patches. |
| 1833 | + |
| 1834 | + Returns: |
| 1835 | + Array of patch objects. |
| 1836 | + |
| 1837 | + Raises: |
| 1838 | + ValueError: If invalid input. |
| 1839 | + """ |
| 1840 | + if type(textline) == unicode: |
| 1841 | + # Patches should be composed of a subset of ascii chars, Unicode not |
| 1842 | + # required. If this encode raises UnicodeEncodeError, patch is invalid. |
| 1843 | + textline = textline.encode("ascii") |
| 1844 | + patches = [] |
| 1845 | + if not textline: |
| 1846 | + return patches |
| 1847 | + text = textline.split('\n') |
| 1848 | + while len(text) != 0: |
| 1849 | + m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) |
| 1850 | + if not m: |
| 1851 | + raise ValueError("Invalid patch string: " + text[0]) |
| 1852 | + patch = patch_obj() |
| 1853 | + patches.append(patch) |
| 1854 | + patch.start1 = int(m.group(1)) |
| 1855 | + if m.group(2) == '': |
| 1856 | + patch.start1 -= 1 |
| 1857 | + patch.length1 = 1 |
| 1858 | + elif m.group(2) == '0': |
| 1859 | + patch.length1 = 0 |
| 1860 | + else: |
| 1861 | + patch.start1 -= 1 |
| 1862 | + patch.length1 = int(m.group(2)) |
| 1863 | + |
| 1864 | + patch.start2 = int(m.group(3)) |
| 1865 | + if m.group(4) == '': |
| 1866 | + patch.start2 -= 1 |
| 1867 | + patch.length2 = 1 |
| 1868 | + elif m.group(4) == '0': |
| 1869 | + patch.length2 = 0 |
| 1870 | + else: |
| 1871 | + patch.start2 -= 1 |
| 1872 | + patch.length2 = int(m.group(4)) |
| 1873 | + |
| 1874 | + del text[0] |
| 1875 | + |
| 1876 | + while len(text) != 0: |
| 1877 | + if text[0]: |
| 1878 | + sign = text[0][0] |
| 1879 | + else: |
| 1880 | + sign = '' |
| 1881 | + line = urllib.unquote(text[0][1:]) |
| 1882 | + line = line.decode("utf-8") |
| 1883 | + if sign == '+': |
| 1884 | + # Insertion. |
| 1885 | + patch.diffs.append((self.DIFF_INSERT, line)) |
| 1886 | + elif sign == '-': |
| 1887 | + # Deletion. |
| 1888 | + patch.diffs.append((self.DIFF_DELETE, line)) |
| 1889 | + elif sign == ' ': |
| 1890 | + # Minor equality. |
| 1891 | + patch.diffs.append((self.DIFF_EQUAL, line)) |
| 1892 | + elif sign == '@': |
| 1893 | + # Start of next patch. |
| 1894 | + break |
| 1895 | + elif sign == '': |
| 1896 | + # Blank line? Whatever. |
| 1897 | + pass |
| 1898 | + else: |
| 1899 | + # Unrecognized sign. |
| 1900 | + raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line)) |
| 1901 | + del text[0] |
| 1902 | + return patches |
| 1903 | + |
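The two serializers round-trip, which the sketch below relies on (illustrative; assumes patches came from patch_make as above):

    serialized = dmp.patch_toText(patches)
    restored = dmp.patch_fromText(serialized)
    assert dmp.patch_toText(restored) == serialized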
| 1904 | + |
| 1905 | +class patch_obj: |
| 1906 | + """Class representing one patch operation. |
| 1907 | + """ |
| 1908 | + |
| 1909 | + def __init__(self): |
| 1910 | + """Initializes with an empty list of diffs. |
| 1911 | + """ |
| 1912 | + self.diffs = [] |
| 1913 | + self.start1 = None |
| 1914 | + self.start2 = None |
| 1915 | + self.length1 = 0 |
| 1916 | + self.length2 = 0 |
| 1917 | + |
| 1918 | + def __str__(self): |
| 1919 | + """Emmulate GNU diff's format. |
| 1920 | + Header: @@ -382,8 +481,9 @@ |
| 1921 | + Indices are printed as 1-based, not 0-based. |
| 1922 | + |
| 1923 | + Returns: |
| 1924 | + The GNU diff string. |
| 1925 | + """ |
| 1926 | + if self.length1 == 0: |
| 1927 | + coords1 = str(self.start1) + ",0" |
| 1928 | + elif self.length1 == 1: |
| 1929 | + coords1 = str(self.start1 + 1) |
| 1930 | + else: |
| 1931 | + coords1 = str(self.start1 + 1) + "," + str(self.length1) |
| 1932 | + if self.length2 == 0: |
| 1933 | + coords2 = str(self.start2) + ",0" |
| 1934 | + elif self.length2 == 1: |
| 1935 | + coords2 = str(self.start2 + 1) |
| 1936 | + else: |
| 1937 | + coords2 = str(self.start2 + 1) + "," + str(self.length2) |
| 1938 | + text = ["@@ -", coords1, " +", coords2, " @@\n"] |
| 1939 | + # Escape the body of the patch with %xx notation. |
| 1940 | + for (op, data) in self.diffs: |
| 1941 | + if op == diff_match_patch.DIFF_INSERT: |
| 1942 | + text.append("+") |
| 1943 | + elif op == diff_match_patch.DIFF_DELETE: |
| 1944 | + text.append("-") |
| 1945 | + elif op == diff_match_patch.DIFF_EQUAL: |
| 1946 | + text.append(" ") |
| 1947 | + # High ascii will raise UnicodeDecodeError. Use Unicode instead. |
| 1948 | + data = data.encode("utf-8") |
| 1949 | + text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n") |
| 1950 | + return "".join(text) |
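To make the 1-based header concrete, a short sketch (not part of the committed file; values chosen to reproduce the docstring's example):

    p = patch_obj()
    p.start1, p.length1 = 381, 8
    p.start2, p.length2 = 480, 9
    print str(p).splitlines()[0]   # @@ -382,8 +481,9 @@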
Property changes on: trunk/tools/wsor/diffs/diff_match_patch.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 1951 | + * |
Index: trunk/tools/wsor/diffs/page_sample.xml |
— | — | @@ -0,0 +1,37 @@ |
| 2 | + <page> |
| 3 | + <title>Bassist</title> |
| 4 | + <id>60001</id> |
| 5 | + <revision> |
| 6 | + <id>108204</id> |
| 7 | + <timestamp>2002-06-30T02:03:23Z</timestamp> |
| 8 | + <contributor> |
| 9 | + <ip>195.149.37.198</ip> |
| 10 | + </contributor> |
| 11 | + <minor /> |
| 12 | + <comment>stub</comment> |
| 13 | + <text xml:space="preserve">A <b>bassist</b> is somebody who plays a [[bass guitar]] or [[double bass]].</text> |
| 14 | + </revision> |
| 15 | + <revision> |
| 16 | + <id>208937</id> |
| 17 | + <timestamp>2002-06-30T16:00:41Z</timestamp> |
| 18 | + <contributor> |
| 19 | + <username>JeLuF</username> |
| 20 | + <id>733</id> |
| 21 | + </contributor> |
| 22 | + <comment>added list</comment> |
| 23 | + <text xml:space="preserve">A <b>bassist</b> is somebody who plays a [[bass guitar]] or [[double bass]]. |
| 24 | + |
| 25 | +Famous bassists include: |
| 26 | +* [[Ron Carter]] |
| 27 | +* [[Les Claypool]] from [[Primus]] |
| 28 | +* [[John Entwistle]] from [[The Who]] |
| 29 | +* [[Kelly Grouchet]] from [[Electric Light Orchestra]] |
| 30 | +* [[Glenn Hughes]] from [[Deep Purple]] |
| 31 | +* [[Lemmy Kilmister]] from [[Motorhead]] |
| 32 | +* Sir [[Paul McCartney]] from [[The Beatles]] |
| 33 | +* [[Charles Mingus]] |
| 34 | +* [[Jason Newsted]] from [[Metallica]] |
| 35 | +* [[Sting]] from [[The Police]] |
| 36 | +* [[Leon Wilkeson]] from [[Lynyrd Skynyrd]]</text> |
| 37 | + </revision> |
| 38 | + </page> |