r92920 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r92919 | r92920 | r92921 >
Date: 00:22, 23 July 2011
Author: halfak
Status: deferred
Tags:
Comment:
Plots, R and diff stuff
Modified paths:
  • /trunk/tools/wsor/diffs (added) (history)
  • /trunk/tools/wsor/diffs/diff_match_patch.py (added) (history)
  • /trunk/tools/wsor/diffs/example.py (added) (history)
  • /trunk/tools/wsor/diffs/page_sample.xml (added) (history)
  • /trunk/tools/wsor/diffs/revision_differ.py (added) (history)
  • /trunk/tools/wsor/diffs/xml_simulator.py (added) (history)
  • /trunk/tools/wsor/first_session (added) (history)
  • /trunk/tools/wsor/first_session/R (added) (history)
  • /trunk/tools/wsor/first_session/R/.RData (added) (history)
  • /trunk/tools/wsor/first_session/R/.Rhistory (added) (history)
  • /trunk/tools/wsor/first_session/R/Rplots.pdf (added) (history)
  • /trunk/tools/wsor/first_session/R/edit_distributions.R (added) (history)
  • /trunk/tools/wsor/first_session/R/first_session_characteristics.R (added) (history)
  • /trunk/tools/wsor/first_session/R/first_session_survival.R (added) (history)
  • /trunk/tools/wsor/first_session/R/first_sessions.R (added) (history)
  • /trunk/tools/wsor/first_session/R/loader (added) (history)
  • /trunk/tools/wsor/first_session/R/loader/user_sessions.R (added) (history)
  • /trunk/tools/wsor/first_session/R/plots (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year.boxplot.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_10.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_100.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.no_archive.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year.no_vandals.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_first_session.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.no_vandals.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/edit_count_distribution.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/edit_count_distribution.prop.png (added) (history)
  • /trunk/tools/wsor/first_session/R/plots/edit_sessions.by_year_and_es_0_bucket.png (added) (history)
  • /trunk/tools/wsor/first_session/R/util (added) (history)
  • /trunk/tools/wsor/first_session/R/util/env.R (added) (history)
  • /trunk/tools/wsor/first_session/data (added) (history)
  • /trunk/tools/wsor/first_session/foo (added) (history)
  • /trunk/tools/wsor/first_session/get_first_n_sessions.py (added) (history)
  • /trunk/tools/wsor/first_session/testing.sql (added) (history)
  • /trunk/tools/wsor/newbie_warnings/queries.sql (modified) (history)
  • /trunk/tools/wsor/newbie_warnings/track_hugglers.py (modified) (history)
  • /trunk/tools/wsor/newbie_warnings/track_hugglers_ng.py (added) (history)
  • /trunk/tools/wsor/newbie_warnings/track_hugglings.py (added) (history)
  • /trunk/tools/wsor/newbie_warnings/track_messages.py (modified) (history)
  • /trunk/tools/wsor/vandal_conversion/R/conversions.R (modified) (history)
  • /trunk/tools/wsor/vandal_conversion/R/util/env.R (modified) (history)
  • /trunk/tools/wsor/vandal_conversion/get_editor_editcount.py (added) (history)

Diff

Index: trunk/tools/wsor/newbie_warnings/track_hugglers_ng.py
@@ -0,0 +1,192 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time
 3+import wmf
 4+
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+
 14+def emit(event, p, time):
 15+ print(
 16+ "\t".join(encode(v) for v in [
 17+ event,
 18+ p['user_id'],
 19+ p['user_name'],
 20+ time
 21+ ])
 22+ )
 23+ sys.stdout.flush()
 24+
 25+
 26+def main():
 27+ parser = argparse.ArgumentParser(
 28+ description=''
 29+ )
 30+ parser.add_argument(
 31+ '-c', '--cnf',
 32+ metavar="<path>",
 33+ type=str,
 34+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 35+ default=os.path.expanduser("~/.my.cnf")
 36+ )
 37+ parser.add_argument(
 38+ '-s', '--host',
 39+ type=str,
 40+ help='the database host to connect to (defaults to localhost)',
 41+ default="localhost"
 42+ )
 43+ parser.add_argument(
 44+ '-d', '--db',
 45+ type=str,
 46+ help='the language db to run the query in (defaults to enwiki)',
 47+ default="enwiki"
 48+ )
 49+ parser.add_argument(
 50+ '-o', '--out',
 51+ type=lambda fn:open(fn, 'a+'),
 52+ help='Where should output be appended',
 53+ default=sys.stdout
 54+ )
 55+ args = parser.parse_args()
 56+
 57+ LOGGING_STREAM = sys.stderr
 58+ logging.basicConfig(
 59+ level=logging.DEBUG,
 60+ stream=LOGGING_STREAM,
 61+ format='%(asctime)s %(levelname)-8s %(message)s',
 62+ datefmt='%b-%d %H:%M:%S'
 63+ )
 64+
 65+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 66+ db = Database(
 67+ host=args.host,
 68+ db=args.db,
 69+ read_default_file=args.cnf
 70+ )
 71+
 72+ try:
 73+ oldPosts = {}
 74+ lastTime = db.getTime()
 75+ time.sleep(5)
 76+ while True:
 77+ logging.info("Tracking %s posts. Looking for new ones since %s." % (len(oldPosts), lastTime))
 78+ newUsers = set(db.getHugglePostsSince(lastTime))
 79+ currTime = db.getTime()
 80+ currUsers = set()
 81+ for p in db.getWaitingPosts(oldPosts.viewkeys() | newUsers):
 82+ if p['user_name'] not in oldPosts:
 83+ #Found a new posting
 84+ LOGGING_STREAM.write(">")
 85+ p['posting'] = currTime
 86+ oldPosts[p['user_name']] = p
 87+ emit("received", p, currTime)
 88+ elif p['messages'] < oldPosts[p['user_name']]['messages']:
 89+ #Looks like someone checked the message
 90+ LOGGING_STREAM.write("<")
 91+ emit("read", oldPosts[p['user_name']], currTime)
 92+ del oldPosts[p['user_name']]
 93+ else:
 94+ #Same shit, different minute
 95+ pass
 96+
 97+ currUsers.add(p['user_name'])
 98+
 99+ for missing in oldPosts.viewkeys() - currUsers:
 100+ LOGGING_STREAM.write("<")
 101+ emit("read", oldPosts[missing], currTime)
 102+ del oldPosts[missing]
 103+
 104+ lastTime = currTime
 105+ LOGGING_STREAM.write("\n")
 106+ time.sleep(5)
 107+
 108+ except KeyboardInterrupt:
 109+ logging.info("Keyboard interrupt detected. Shutting down.")
 110+ except Exception as e:
 111+ logging.error(str(e))
 112+
 113+ print(repr(oldPosts))
 114+ print(lastTime)
 115+
 116+
 117+
 118+def safe(val):
 119+ return '"' + val.replace('"', '\\"') + '"'
 120+
 121+class Database:
 122+
 123+ def __init__(self, *args, **kwargs):
 124+ self.args = args
 125+ self.kwargs = kwargs
 126+ self.usersConn = MySQLdb.connect(*args, **kwargs)
 127+
 128+
 129+
 130+ def getTime(self):
 131+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 132+ cursor.execute(
 133+ """
 134+ SELECT rc_timestamp AS time
 135+ FROM recentchanges
 136+ ORDER BY rc_timestamp DESC
 137+ LIMIT 1
 138+ """
 139+ )
 140+ self.usersConn.commit()
 141+ for row in cursor:
 142+ return row['time']
 143+
 144+
 145+ def getHugglePostsSince(self, timestamp):
 146+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 147+ cursor.execute("""
 148+ SELECT DISTINCT p.page_title AS title
 149+ FROM revision r
 150+ INNER JOIN page p
 151+ ON r.rev_page = p.page_id
 152+ WHERE p.page_namespace = 3
 153+ AND r.rev_timestamp >= %(timestamp)s
 154+ AND r.rev_comment LIKE %(like)s
 155+ """,
 156+ {
 157+ "timestamp": timestamp,
 158+ "like": "%" + "WP:HG" + "%",
 159+ "clue": "%" + "Warning" + "%"
 160+ }
 161+ )
 162+ return (p['title'].replace("_", " ") for p in cursor)
 163+
 164+ def getWaitingPosts(self, users):
 165+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 166+ userString = ",".join(safe(u) for u in users)
 167+ if len(userString) != 0:
 168+ cursor.execute("""
 169+ SELECT
 170+ u.user_id,
 171+ u.user_name,
 172+ count(*) as messages,
 173+ u.user_touched as last_touched
 174+ FROM user_newtalk nt
 175+ LEFT JOIN user u
 176+ ON u.user_id = nt.user_id
 177+ WHERE u.user_name IN (""" + userString + """)
 178+ GROUP BY u.user_id, u.user_name
 179+ UNION
 180+ SELECT
 181+ NULL as user_id,
 182+ nt.user_ip as user_name,
 183+ count(*) as messages,
 184+ NULL as last_touched
 185+ FROM user_newtalk nt
 186+ WHERE nt.user_ip IN (""" + userString + """)
 187+ GROUP BY nt.user_ip, NULL
 188+ """
 189+ )
 190+ for post in cursor:
 191+ yield post
 192+
 193+if __name__ == "__main__": main()
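Note: the loop above turns successive snapshots of user_newtalk into events by set comparison: a user who appears in the waiting set emits "received", and a user whose message count drops (or who leaves the set entirely) emits "read". A minimal standalone sketch of that logic with hypothetical data (not part of this commit):

    def diff_posts(old_posts, current_posts):
        # Both arguments map user_name -> count of waiting messages.
        events = []
        for user, messages in current_posts.items():
            if user not in old_posts:
                events.append(("received", user))
            elif messages < old_posts[user]:
                events.append(("read", user))
        # Users gone from the waiting set have read their messages.
        for user in set(old_posts) - set(current_posts):
            events.append(("read", user))
        return events

    # Alice just received her first message; Bob checked his talk page.
    print(diff_posts({"Bob": 2}, {"Alice": 1, "Bob": 1}))
    # -> [('received', 'Alice'), ('read', 'Bob')]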
Index: trunk/tools/wsor/newbie_warnings/track_hugglings.py
@@ -0,0 +1,184 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time
 3+import wmf
 4+
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+
 14+def emit(event, p, time):
 15+ print(
 16+ "\t".join(encode(v) for v in [
 17+ event,
 18+ p['user_id'],
 19+ p['user_name'],
 20+ time
 21+ ])
 22+ )
 23+ sys.stdout.flush()
 24+
 25+
 26+def main():
 27+ parser = argparse.ArgumentParser(
 28+ description=''
 29+ )
 30+ parser.add_argument(
 31+ '-c', '--cnf',
 32+ metavar="<path>",
 33+ type=str,
 34+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 35+ default=os.path.expanduser("~/.my.cnf")
 36+ )
 37+ parser.add_argument(
 38+ '-s', '--host',
 39+ type=str,
 40+ help='the database host to connect to (defaults to localhost)',
 41+ default="localhost"
 42+ )
 43+ parser.add_argument(
 44+ '-d', '--db',
 45+ type=str,
 46+ help='the language db to run the query in (defaults to enwiki)',
 47+ default="enwiki"
 48+ )
 49+ parser.add_argument(
 50+ '-o', '--out',
 51+ type=lambda fn:open(fn, 'a+'),
 52+ help='Where should output be appended',
 53+ default=sys.stdout
 54+ )
 55+ args = parser.parse_args()
 56+
 57+ LOGGING_STREAM = sys.stderr
 58+ logging.basicConfig(
 59+ level=logging.DEBUG,
 60+ stream=LOGGING_STREAM,
 61+ format='%(asctime)s %(levelname)-8s %(message)s',
 62+ datefmt='%b-%d %H:%M:%S'
 63+ )
 64+
 65+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 66+ db = Database(
 67+ host=args.host,
 68+ db=args.db,
 69+ read_default_file=args.cnf
 70+ )
 71+
 72+ try:
 73+ oldPosts = {}
 74+ lastTime = db.getTime()
 75+ time.sleep(5)
 76+ while True:
 77+ logging.info("Tracking %s posts. Looking for new ones since %s." % (len(oldPosts), lastTime))
 78+ newUsers = set(db.getHugglePostsSince(lastTime))
 79+ currTime = db.getTime()
 80+ currUsers = set()
 81+ for p in db.getWaitingPosts(oldPosts.viewkeys() | newUsers):
 82+ if p['user_name'] not in oldPosts:
 83+ #Found a new posting
 84+ LOGGING_STREAM.write(">")
 85+ p['posting'] = currTime
 86+ oldPosts[p['user_name']] = p
 87+ emit("received", p, currTime)
 88+ elif p['messages'] < oldPosts[p['user_name']]['messages']:
 89+ #Looks like someone checked the message
 90+ LOGGING_STREAM.write("<")
 91+ emit("read", oldPosts[p['user_name']], currTime)
 92+ del oldPosts[p['user_name']]
 93+ else:
 94+ #Same shit, different minute
 95+ pass
 96+
 97+ currUsers.add(p['user_name'])
 98+
 99+ for missing in oldPosts.viewkeys() - currUsers:
 100+ LOGGING_STREAM.write("<")
 101+ emit("read", oldPosts[missing], currTime)
 102+ del oldPosts[missing]
 103+
 104+ lastTime = currTime
 105+ LOGGING_STREAM.write("\n")
 106+ time.sleep(5)
 107+
 108+ except KeyboardInterrupt:
 109+ logging.info("Keyboard interrupt detected. Shutting down.")
 110+ except Exception as e:
 111+ logging.error(str(e))
 112+
 113+ print(repr(oldPosts))
 114+ print(lastTime)
 115+
 116+
 117+
 118+def safe(val):
 119+ return '"' + val.replace('"', '\\"') + '"'
 120+
 121+class Database:
 122+
 123+ def __init__(self, *args, **kwargs):
 124+ self.args = args
 125+ self.kwargs = kwargs
 126+ self.usersConn = MySQLdb.connect(*args, **kwargs)
 127+
 128+
 129+
 130+ def getTime(self):
 131+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 132+ cursor.execute(
 133+ """
 134+ SELECT rc_timestamp AS time
 135+ FROM recentchanges
 136+ ORDER BY rc_timestamp DESC
 137+ LIMIT 1
 138+ """
 139+ )
 140+ self.usersConn.commit()
 141+ for row in cursor:
 142+ return row['time']
 143+
 144+
 145+ def getHugglePostsSince(self, timestamp):
 146+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 147+ cursor.execute("""
 148+ SELECT DISTINCT p.page_title AS title FROM revision r INNER JOIN page p ON r.rev_page = p.page_id WHERE p.page_namespace = 3 AND r.rev_timestamp >= %(timestamp)s AND r.rev_comment LIKE %(like)s
 149+ """,
 150+ {
 151+ "timestamp": timestamp,
 152+ "like": "%" + "WP:HG" + "%",
 153+ "clue": "%" + "Warning" + "%"
 154+ }
 155+ )
 156+ return (p['title'].replace("_", " ") for p in cursor)
 157+
 158+ def getWaitingPosts(self, users):
 159+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 160+ userString = ",".join(safe(u) for u in users)
 161+ if len(userString) != 0:
 162+ cursor.execute("""
 163+ SELECT
 164+ u.user_id,
 165+ u.user_name,
 166+ count(*) as messages
 167+ FROM user_newtalk nt
 168+ LEFT JOIN user u
 169+ ON u.user_id = nt.user_id
 170+ WHERE u.user_name IN (""" + userString + """)
 171+ GROUP BY u.user_id, u.user_name
 172+ UNION
 173+ SELECT
 174+ NULL as user_id,
 175+ nt.user_ip as user_name,
 176+ count(*) as messages
 177+ FROM user_newtalk nt
 178+ WHERE nt.user_ip IN (""" + userString + """)
 179+ GROUP BY nt.user_ip, NULL
 180+ """
 181+ )
 182+ for post in cursor:
 183+ yield post
 184+
 185+if __name__ == "__main__": main()
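Note: both tracker scripts build the SQL IN (...) list by hand via safe(), which double-quotes each name and backslash-escapes embedded quotes. A small usage sketch (hypothetical names; MySQLdb's own parameter substitution would be the more robust choice):

    def safe(val):
        # Quote a value for a hand-built SQL IN (...) list.
        return '"' + val.replace('"', '\\"') + '"'

    users = ["Alice", 'Bob "The Builder"']
    user_string = ",".join(safe(u) for u in users)
    print(user_string)  # "Alice","Bob \"The Builder\""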
Index: trunk/tools/wsor/newbie_warnings/track_messages.py
@@ -91,7 +91,7 @@
92 92 cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
93 93 cursor.execute(
94 94 """
95 - SELECT rc_timestamp AS time
 95+ SELECT SQL_NO_CACHE rc_timestamp AS time
96 96 FROM recentchanges
97 97 ORDER BY rc_timestamp DESC
98 98 LIMIT 1
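Note: the one-line change above adds MySQL's SQL_NO_CACHE hint so the polling clock query is answered fresh on every cycle instead of being served from the query cache; without it, a tight polling loop can keep reading a stale rc_timestamp. A minimal sketch of the same query from Python (assuming an open MySQLdb connection named conn; not part of this commit):

    def get_time(conn):
        # SQL_NO_CACHE asks MySQL's query cache to skip this statement,
        # so each poll sees the newest recentchanges timestamp.
        cursor = conn.cursor()
        cursor.execute(
            """
            SELECT SQL_NO_CACHE rc_timestamp AS time
            FROM recentchanges
            ORDER BY rc_timestamp DESC
            LIMIT 1
            """
        )
        row = cursor.fetchone()
        return row[0] if row else None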
Index: trunk/tools/wsor/newbie_warnings/queries.sql
@@ -181,3 +181,18 @@
182 182 FROM user_newtalk nt
183 183 WHERE nt.user_ip IN ("EpochFail")
184 184 GROUP BY nt.user_ip, NULL;
 185+
 186+
 187+SELECT
 188+ p.page_id as user_talk_id,
 189+ p.page_title as user_talk_page,
 190+ REPLACE(p.page_title, "_", " ") as user_name,
 191+ tl.tl_title as template
 192+FROM enwiki.templatelinks tl
 193+INNER JOIN enwiki.page p
 194+ ON page_id = tl_from
 195+WHERE tl_title IN ('Z49','Z50','Z51','Z52','Z53','Z54','Z55','Z56')
 196+AND tl_namespace = 10
 197+AND page_namespace = 3
 198+
 199+
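Note: the query added above locates user talk pages (namespace 3) that transclude the Z49-Z56 templates, tracking templates that user-warning templates embed so their postings can be found through templatelinks; REPLACE(p.page_title, "_", " ") recovers the user name from the talk page title. The same normalization in Python (hypothetical input, not part of this commit):

    def title_to_user_name(page_title):
        # User talk page titles store spaces as underscores.
        return page_title.replace("_", " ")

    print(title_to_user_name("Example_user"))  # -> "Example user"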
Index: trunk/tools/wsor/newbie_warnings/track_hugglers.py
@@ -19,6 +19,7 @@
20 20 time
21 21 ])
22 22 )
 23+ sys.stdout.flush()
23 24
24 25
25 26 def main():
@@ -149,10 +150,7 @@
150 151 ON r.rev_page = p.page_id
151 152 WHERE p.page_namespace = 3
152 153 AND r.rev_timestamp >= %(timestamp)s
153 - AND (
154 - r.rev_comment LIKE %(like)s OR
155 - r.rev_comment LIKE %(clue)s
156 - )
 154+ AND r.rev_comment LIKE %(like)s
157 155 """,
158 156 {
159 157 "timestamp": timestamp,
@@ -167,7 +165,7 @@
168 166 userString = ",".join(safe(u) for u in users)
169 167 if len(userString) != 0:
170 168 cursor.execute("""
171 - SELECT
 169+ SELECT
172 170 u.user_id,
173 171 u.user_name,
174 172 count(*) as messages
Index: trunk/tools/wsor/first_session/get_first_n_sessions.py
@@ -0,0 +1,247 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types
 3+import wmf
 4+
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+
 14+def main():
 15+ parser = argparse.ArgumentParser(
 16+ description='Gathers editor data for first and last session'
 17+ )
 18+ parser.add_argument(
 19+ 'n',
 20+ type=int,
 21+ help='the minimum number of edits that editors must have performed to be included'
 22+ )
 23+ parser.add_argument(
 24+ 'session',
 25+ type=int,
 26+ help='maximum time between session edits (in seconds)'
 27+ )
 28+ parser.add_argument(
 29+ '-c', '--cnf',
 30+ metavar="<path>",
 31+ type=str,
 32+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 33+ default=os.path.expanduser("~/.my.cnf")
 34+ )
 35+ parser.add_argument(
 36+ '-s', '--host',
 37+ type=str,
 38+ help='the database host to connect to (defaults to localhost)',
 39+ default="localhost"
 40+ )
 41+ parser.add_argument(
 42+ '-d', '--db',
 43+ type=str,
 44+ help='the language db to run the query in (defaults to enwiki)',
 45+ default="enwiki"
 46+ )
 47+ parser.add_argument(
 48+ '-o', '--out',
 49+ type=lambda fn:open(fn, 'w'),
 50+ help='an output file to write to (defaults to stdout)',
 51+ default=sys.stdout
 52+ )
 53+ args = parser.parse_args()
 54+
 55+ LOGGING_STREAM = sys.stderr
 56+ logging.basicConfig(
 57+ level=logging.DEBUG,
 58+ stream=LOGGING_STREAM,
 59+ format='%(asctime)s %(levelname)-8s %(message)s',
 60+ datefmt='%b-%d %H:%M:%S'
 61+ )
 62+
 63+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 64+ db = Database(
 65+ host=args.host,
 66+ db=args.db,
 67+ read_default_file=args.cnf
 68+ )
 69+ headers = [
 70+ 'user_id',
 71+ 'user_name',
 72+ 'first_edit',
 73+ 'last_edit',
 74+ 'edit_count'
 75+ ]
 76+ for i in range(0, args.n):
 77+ headers.append("es_%s_start" % i)
 78+ headers.append("es_%s_end" % i)
 79+ headers.append("es_%s_edits" % i)
 80+ headers.append("es_%s_reverted" % i)
 81+ headers.append("es_%s_vandalism" % i)
 82+ headers.append("es_%s_deleted" % i)
 83+
 84+
 85+ args.out.write("\t".join(headers) + "\n")
 86+
 87+ logging.info("Loading users:")
 88+
 89+ users = []
 90+ for user in db.getSampledUsers():
 91+ users.append(user)
 92+ LOGGING_STREAM.write(".")
 93+ LOGGING_STREAM.write("\n")
 94+
 95+ logging.info("Processing users:")
 96+ for user in users:
 97+ i = 0
 98+ for session in sessions(db.getEdits(user['user_id']), args.session):
 99+ user['es_%s_start' % i] = session[0]['timestamp']
 100+ user['es_%s_end' % i] = session[-1]['timestamp']
 101+ user['es_%s_edits' % i] = len(session)
 102+ user['es_%s_reverted' % i] = 0
 103+ user['es_%s_vandalism' % i] = 0
 104+ user['es_%s_deleted' % i] = 0
 105+
 106+ for edit in session:
 107+ user['es_%s_reverted' % i] += edit['is_reverted']
 108+ user['es_%s_vandalism' % i] += edit['is_vandalism']
 109+ user['es_%s_deleted' % i] += edit['deleted']
 110+
 111+ i += 1
 112+ if i >= args.n:
 113+ break
 114+
 115+
 116+ args.out.write("\t".join(encode(user.get(h)) for h in headers) + "\n")
 117+ LOGGING_STREAM.write(".")
 118+
 119+ LOGGING_STREAM.write("\n")
 120+
 121+
 122+def sessions(edits, sessionThreshold=3600):
 123+ sessionEdits = []
 124+ for edit in edits:
 125+ edit['timestamp'] = wmf.wp2Timestamp(edit['rev_timestamp'])
 126+ if len(sessionEdits) == 0:
 127+ sessionEdits.append(edit)
 128+ elif (edit['timestamp'] - sessionEdits[-1]['timestamp']) < sessionThreshold:
 129+ sessionEdits.append(edit)
 130+ else:
 131+ yield sessionEdits
 132+ sessionEdits = [edit]
 133+
 134+
 135+ if len(sessionEdits) > 0:
 136+ yield sessionEdits
 137+
 138+
 139+
 140+
 141+class Database:
 142+
 143+ def __init__(self, *args, **kwargs):
 144+ self.args = args
 145+ self.kwargs = kwargs
 146+ self.usersConn = MySQLdb.connect(*args, **kwargs)
 147+ self.revsConn = MySQLdb.connect(*args, **kwargs)
 148+ self.archConn = MySQLdb.connect(*args, **kwargs)
 149+
 150+ def getSampledUsers(self):
 151+ cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor)
 152+ cursor.execute(
 153+ """
 154+ SELECT
 155+ u.user_id,
 156+ u.user_name,
 157+ um.first_edit,
 158+ um.last_edit,
 159+ u.user_editcount as edit_count
 160+ FROM halfak.user_session_sample us
 161+ INNER JOIN user u
 162+ ON u.user_id = us.user_id
 163+ INNER JOIN halfak.user_meta_20110715 um
 164+ ON u.user_id = um.user_id
 165+ """
 166+ )
 167+ for row in cursor:
 168+ yield row
 169+
 170+
 171+
 172+ def getEdits(self, userId, chronologically=True):
 173+ userId = int(userId)
 174+ revisionCursor = self.revsConn.cursor(MySQLdb.cursors.SSDictCursor)
 175+ archiveCursor = self.archConn.cursor(MySQLdb.cursors.SSDictCursor)
 176+
 177+ if chronologically: direction = "ASC"
 178+ else: direction = "DESC"
 179+
 180+ revisionCursor.execute(
 181+ """
 182+ SELECT
 183+ r.rev_id,
 184+ r.rev_timestamp,
 185+ rvtd.revision_id IS NOT NULL AS is_reverted,
 186+ rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE AS is_vandalism,
 187+ False AS deleted
 188+ FROM revision r
 189+ LEFT JOIN halfak.reverted_20110115 rvtd
 190+ ON r.rev_id = rvtd.revision_id
 191+ WHERE rev_user = %(user_id)s
 192+ ORDER BY r.rev_timestamp """ + direction + """
 193+ """,
 194+ {
 195+ 'user_id': userId
 196+ }
 197+ )
 198+ archiveCursor.execute(
 199+ """
 200+ SELECT
 201+ ar_rev_id AS rev_id,
 202+ ar_timestamp AS rev_timestamp,
 203+ False AS is_reverted,
 204+ False AS is_vandalism,
 205+ True AS deleted
 206+ FROM archive
 207+ WHERE ar_user = %(user_id)s
 208+ ORDER BY ar_timestamp """ + direction + """
 209+ """,
 210+ {
 211+ 'user_id': userId
 212+ }
 213+ )
 214+ if chronologically:
 215+ order = lambda t1, t2:t1 < t2
 216+ else:
 217+ order = lambda t1, t2:t1 > t2
 218+
 219+ revPointer = revisionCursor.fetchone()
 220+ archPointer = archiveCursor.fetchone()
 221+ while revPointer != None or archPointer != None: #still something to output
 222+ if revPointer != None and archPointer != None: #both cursors still have something
 223+ if order(revPointer['rev_timestamp'], archPointer['rev_timestamp']):
 224+ yield revPointer
 225+ revPointer = revisionCursor.fetchone()
 226+ else:
 227+ yield archPointer
 228+ archPointer = archiveCursor.fetchone()
 229+ elif revPointer != None: #only revisions left
 230+ yield revPointer
 231+ revPointer = revisionCursor.fetchone()
 232+ elif archPointer != None: #only archives left
 233+ yield archPointer
 234+ archPointer = archiveCursor.fetchone()
 235+
 236+ revisionCursor.close()
 237+ archiveCursor.close()
 238+
 239+
 240+
 241+ def getFirstEdits(self, userId, maximum=10000):
 242+ return self.getEdits(userId, chronologically=True)
 243+
 244+ def getLastEdits(self, userId, maximum=10000):
 245+ return self.getEdits(userId, maximum, chronologically=False)
 246+
 247+
 248+if __name__ == "__main__": main()
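Note: the heart of this script is the sessions() generator above: consecutive edits belong to one session while the gap to the previous edit stays under the threshold, and a larger gap starts a new session. A minimal sketch over plain integer timestamps (hypothetical data, not part of this commit):

    def sessions(timestamps, threshold=3600):
        current = []
        for t in timestamps:
            # Same session while the gap stays under the threshold.
            if not current or t - current[-1] < threshold:
                current.append(t)
            else:
                yield current
                current = [t]
        if current:
            yield current

    # Three edits a minute apart, then one two hours later -> two sessions.
    print(list(sessions([0, 60, 120, 7200])))
    # -> [[0, 60, 120], [7200]]

The two-cursor loop in getEdits that interleaves revision and archive rows is the standard merge of two sorted streams; for the ascending case, Python's heapq.merge implements the same pattern.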
Index: trunk/tools/wsor/first_session/R/.Rhistory
@@ -0,0 +1,512 @@
 2+pch=4,
 3+lty=4
 4+),
 5+"32"=list(
 6+col="#00BBBB",
 7+pch=5,
 8+lty=5
 9+),
 10+"64"=list(
 11+col="#BB00BB",
 12+pch=6,
 13+lty=6
 14+)
 15+)
 16+xyplot(
 17+early_survival ~ year,
 18+data=limited_year_edits_props,
 19+groups=es_0_bucket,
 20+panel=function(x, y, subscripts, groups, ...){
 21+f = limited_year_edits_props[subscripts,]
 22+for(group in groups){
 23+group = as.character(group)
 24+subf = f[f$es_0_bucket == group,]
 25+p = subf$early_survival
 26+x = subf$year
 27+n = subf$n
 28+panel.xyplot(
 29+x, p,
 30+col=params[[group]]$col,
 31+pch=params[[group]]$pch,
 32+...
 33+)
 34+panel.lines(
 35+x, p,
 36+col=params[[group]]$col,
 37+lwd=2,
 38+...
 39+)
 40+se = sqrt(p*(1-p)/n)
 41+panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 42+}
 43+},
 44+ylim=c(0, 1),
 45+main="Early survival proportion for new editors grouped by edits in their first session",
 46+ylab="Proportion of surviving editors",
 47+xlab="Years",
 48+sub="early survival = editing more than 1 month after first session",
 49+auto.key=list(
 50+text=paste("~", names(params), "edits"),
 51+col=c(
 52+"#000000",
 53+"#FF0000",
 54+"#00FF00",
 55+"#0000FF",
 56+"#BBBB00",
 57+"#00BBBB",
 58+"#BB00BB"
 59+)
 60+)
 61+)
 62+dev.off()
 63+user_sessions$es_0_no_arch = 2^round(log((user_sessions$es_0_edits - user_sessions$es_0_deleted)+1, base=2))
 64+no_arch_edits_props = with(
 65+summaryBy(
 66+early_survival ~ year + es_0_no_arch,
 67+data=user_sessions[
 68+!is.na(user_sessions$year) &
 69+user_sessions$es_0_no_arch <= 256,
 70+],
 71+FUN=c(mean, length)
 72+),
 73+data.frame(
 74+year = year,
 75+es_0_no_arch = es_0_no_arch,
 76+early_survival = early_survival.mean,
 77+n = early_survival.length
 78+)
 79+)
 80+png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024)
 81+limited_year_edits_props = no_arch_edits_props[
 82+no_arch_edits_props$n >= 10 &
 83+no_arch_edits_props$es_0_no_arch <= 16,
 84+]
 85+params = list(
 86+"0"=list(
 87+col="#AAAAAA",
 88+pch=0,
 89+lty=0
 90+),
 91+"1"=list(
 92+col="#000000",
 93+pch=0,
 94+lty=0
 95+),
 96+"2"=list(
 97+col="#FF0000",
 98+pch=1,
 99+lty=1
 100+),
 101+"4"=list(
 102+col="#00FF00",
 103+pch=2,
 104+lty=2
 105+),
 106+"8"=list(
 107+col="#0000FF",
 108+pch=3,
 109+lty=3
 110+),
 111+"16"=list(
 112+col="#BBBB00",
 113+pch=4,
 114+lty=4
 115+)
 116+)
 117+xyplot(
 118+early_survival ~ year,
 119+data=limited_year_edits_props,
 120+groups=es_0_no_arch,
 121+panel=function(x, y, subscripts, groups, ...){
 122+f = limited_year_edits_props[subscripts,]
 123+for(group in groups){
 124+group = as.character(group)
 125+subf = f[f$es_0_no_arch == group,]
 126+p = subf$early_survival
 127+x = subf$year
 128+n = subf$n
 129+panel.xyplot(
 130+x, p,
 131+col=params[[group]]$col,
 132+pch=params[[group]]$pch,
 133+...
 134+)
 135+panel.lines(
 136+x, p,
 137+col=params[[group]]$col,
 138+lwd=2,
 139+...
 140+)
 141+se = sqrt(p*(1-p)/n)
 142+panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 143+}
 144+},
 145+ylim=c(0, 1),
 146+main="Early survival proportion for new editors grouped by edits (not deleted) in their first session",
 147+ylab="Proportion of surviving editors",
 148+xlab="Years",
 149+sub="early survival = editing more than 1 month after first session",
 150+auto.key=list(
 151+text=paste("~", names(params), "edits"),
 152+col=c(
 153+"#AAAAAA",
 154+"#000000",
 155+"#FF0000",
 156+"#00FF00",
 157+"#0000FF",
 158+"#BBBB00",
 159+"#00BBBB",
 160+"#BB00BB"
 161+)
 162+)
 163+)
 164+dev.off()
 165+png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024)
 166+limited_year_edits_props = no_arch_edits_props[
 167+no_arch_edits_props$n >= 10 &
 168+no_arch_edits_props$es_0_no_arch <= 16,
 169+]
 170+params = list(
 171+"0"=list(
 172+col="#AAAAAA",
 173+pch=0,
 174+lty=0
 175+),
 176+"1"=list(
 177+col="#000000",
 178+pch=0,
 179+lty=0
 180+),
 181+"2"=list(
 182+col="#FF0000",
 183+pch=1,
 184+lty=1
 185+),
 186+"4"=list(
 187+col="#00FF00",
 188+pch=2,
 189+lty=2
 190+),
 191+"8"=list(
 192+col="#0000FF",
 193+pch=3,
 194+lty=3
 195+),
 196+"16"=list(
 197+col="#BBBB00",
 198+pch=4,
 199+lty=4
 200+)
 201+)
 202+xyplot(
 203+early_survival ~ year,
 204+data=limited_year_edits_props,
 205+groups=es_0_no_arch,
 206+panel=function(x, y, subscripts, groups, ...){
 207+f = limited_year_edits_props[subscripts,]
 208+for(group in groups){
 209+group = as.character(group)
 210+subf = f[f$es_0_no_arch == group,]
 211+p = subf$early_survival
 212+x = subf$year
 213+n = subf$n
 214+panel.xyplot(
 215+x, p,
 216+col=params[[group]]$col,
 217+pch=params[[group]]$pch,
 218+...
 219+)
 220+panel.lines(
 221+x, p,
 222+col=params[[group]]$col,
 223+lwd=2,
 224+...
 225+)
 226+se = sqrt(p*(1-p)/n)
 227+panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 228+}
 229+},
 230+ylim=c(0, 1),
 231+main="Early survival proportion for new editors grouped by edits (not deleted) in their first session",
 232+ylab="Proportion of surviving editors",
 233+xlab="Years",
 234+sub="early survival = editing more than 1 month after first session",
 235+auto.key=list(
 236+text=paste("~", names(params), "edits"),
 237+col=c(
 238+"#AAAAAA",
 239+"#000000",
 240+"#FF0000",
 241+"#00FF00",
 242+"#0000FF",
 243+"#BBBB00",
 244+"#00BBBB",
 245+"#BB00BB"
 246+),
 247+points=F
 248+)
 249+)
 250+dev.off()
 251+png("plots/early_survival.by_year.es_lines.png", height=768, width=1024)
 252+limited_year_edits_props = year_edits_props[
 253+year_edits_props$n >= 10 &
 254+year_edits_props$es_0_bucket <= 16,
 255+]
 256+params = list(
 257+"1"=list(
 258+col="#000000",
 259+pch=0,
 260+lty=0
 261+),
 262+"2"=list(
 263+col="#FF0000",
 264+pch=1,
 265+lty=1
 266+),
 267+"4"=list(
 268+col="#00FF00",
 269+pch=2,
 270+lty=2
 271+),
 272+"8"=list(
 273+col="#0000FF",
 274+pch=3,
 275+lty=3
 276+),
 277+"16"=list(
 278+col="#BBBB00",
 279+pch=4,
 280+lty=4
 281+),
 282+"32"=list(
 283+col="#00BBBB",
 284+pch=5,
 285+lty=5
 286+),
 287+"64"=list(
 288+col="#BB00BB",
 289+pch=6,
 290+lty=6
 291+)
 292+)
 293+xyplot(
 294+early_survival ~ year,
 295+data=limited_year_edits_props,
 296+groups=es_0_bucket,
 297+panel=function(x, y, subscripts, groups, ...){
 298+f = limited_year_edits_props[subscripts,]
 299+for(group in groups){
 300+group = as.character(group)
 301+subf = f[f$es_0_bucket == group,]
 302+p = subf$early_survival
 303+x = subf$year
 304+n = subf$n
 305+panel.xyplot(
 306+x, p,
 307+col=params[[group]]$col,
 308+pch=params[[group]]$pch,
 309+...
 310+)
 311+panel.lines(
 312+x, p,
 313+col=params[[group]]$col,
 314+lwd=2,
 315+...
 316+)
 317+se = sqrt(p*(1-p)/n)
 318+panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 319+}
 320+},
 321+ylim=c(0, 1),
 322+main="Early survival proportion for new editors grouped by edits in their first session",
 323+ylab="Proportion of surviving editors",
 324+xlab="Years",
 325+sub="early survival = editing more than 1 month after first session",
 326+auto.key=list(
 327+text=paste("~", names(params), "edits"),
 328+col=c(
 329+"#000000",
 330+"#FF0000",
 331+"#00FF00",
 332+"#0000FF",
 333+"#BBBB00",
 334+"#00BBBB",
 335+"#BB00BB"
 336+),
 337+points=F
 338+)
 339+)
 340+dev.off()
 341+user_sessions$es_0_no_arch = 2^round(log(user_sessions$es_0_edits - user_sessions$es_0_deleted, base=2))
 342+no_arch_edits_props = with(
 343+summaryBy(
 344+early_survival ~ year + es_0_no_arch,
 345+data=user_sessions[
 346+!is.na(user_sessions$year) &
 347+user_sessions$es_0_no_arch <= 256,
 348+],
 349+FUN=c(mean, length)
 350+),
 351+data.frame(
 352+year = year,
 353+es_0_no_arch = es_0_no_arch,
 354+early_survival = early_survival.mean,
 355+n = early_survival.length
 356+)
 357+)
 358+png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024)
 359+limited_year_edits_props = no_arch_edits_props[
 360+no_arch_edits_props$n >= 10 &
 361+no_arch_edits_props$es_0_no_arch <= 16,
 362+]
 363+params = list(
 364+"0"=list(
 365+col="#AAAAAA",
 366+pch=0,
 367+lty=0
 368+),
 369+"1"=list(
 370+col="#000000",
 371+pch=0,
 372+lty=0
 373+),
 374+"2"=list(
 375+col="#FF0000",
 376+pch=1,
 377+lty=1
 378+),
 379+"4"=list(
 380+col="#00FF00",
 381+pch=2,
 382+lty=2
 383+),
 384+"8"=list(
 385+col="#0000FF",
 386+pch=3,
 387+lty=3
 388+),
 389+"16"=list(
 390+col="#BBBB00",
 391+pch=4,
 392+lty=4
 393+)
 394+)
 395+xyplot(
 396+early_survival ~ year,
 397+data=limited_year_edits_props,
 398+groups=es_0_no_arch,
 399+panel=function(x, y, subscripts, groups, ...){
 400+f = limited_year_edits_props[subscripts,]
 401+for(group in groups){
 402+group = as.character(group)
 403+subf = f[f$es_0_no_arch == group,]
 404+p = subf$early_survival
 405+x = subf$year
 406+n = subf$n
 407+panel.xyplot(
 408+x, p,
 409+col=params[[group]]$col,
 410+pch=params[[group]]$pch,
 411+...
 412+)
 413+panel.lines(
 414+x, p,
 415+col=params[[group]]$col,
 416+lwd=2,
 417+...
 418+)
 419+se = sqrt(p*(1-p)/n)
 420+panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 421+}
 422+},
 423+ylim=c(0, 1),
 424+main="Early survival proportion for new editors grouped by edits (not deleted) in their first session",
 425+ylab="Proportion of surviving editors",
 426+xlab="Years",
 427+sub="early survival = editing more than 1 month after first session",
 428+auto.key=list(
 429+text=paste("~", names(params), "edits"),
 430+col=c(
 431+"#AAAAAA",
 432+"#000000",
 433+"#FF0000",
 434+"#00FF00",
 435+"#0000FF",
 436+"#BBBB00",
 437+"#00BBBB",
 438+"#BB00BB"
 439+),
 440+points=F
 441+)
 442+)
 443+dev.off()
 444+es_0_bucket = 10^floor(log(user_sessions$es_0_edits, base=10))
 445+table(es_0_bucket)
 446+three_es_buckets = with(
 447+summaryBy(
 448+es_0_edits +
 449+es_1_edits +
 450+es_2_edits ~
 451+year + es_0_bucket,
 452+data=user_sessions,
 453+FUN=c(mean, sd, length)
 454+),
 455+data.frame(
 456+year = year
 457+es_0_bucket = es_0_bucket,
 458+es_0_mean = es_0_edits.mean,
 459+es_0_sd = es_0_edits.sd,
 460+es_0_n = es_0_edits.length,
 461+es_1_mean = es_1_edits.mean,
 462+es_1_sd = es_1_edits.sd,
 463+es_1_n = es_1_edits.length,
 464+es_2_mean = es_2_edits.mean,
 465+es_2_sd = es_2_edits.sd,
 466+es_2_n = es_2_edits.length
 467+)
 468+)three_es_buckets = with(
 469+summaryBy(
 470+es_0_edits +
 471+es_1_edits +
 472+es_2_edits ~
 473+year + es_0_bucket,
 474+data=user_sessions,
 475+FUN=c(mean, sd, length)
 476+),
 477+data.frame(
 478+year = year,
 479+bucket = es_0_bucket,
 480+es_0_mean = es_0_edits.mean,
 481+es_0_sd = es_0_edits.sd,
 482+es_0_n = es_0_edits.length,
 483+es_1_mean = es_1_edits.mean,
 484+es_1_sd = es_1_edits.sd,
 485+es_1_n = es_1_edits.length,
 486+es_2_mean = es_2_edits.mean,
 487+es_2_sd = es_2_edits.sd,
 488+es_2_n = es_2_edits.length
 489+)
 490+three_es_buckets = with(
 491+summaryBy(
 492+es_0_edits +
 493+es_1_edits +
 494+es_2_edits ~
 495+year + es_0_bucket,
 496+data=user_sessions,
 497+FUN=c(mean, sd, length)
 498+),
 499+data.frame(
 500+year = year,
 501+bucket = es_0_bucket,
 502+es_0_mean = es_0_edits.mean,
 503+es_0_sd = es_0_edits.sd,
 504+es_0_n = es_0_edits.length,
 505+es_1_mean = es_1_edits.mean,
 506+es_1_sd = es_1_edits.sd,
 507+es_1_n = es_1_edits.length,
 508+es_2_mean = es_2_edits.mean,
 509+es_2_sd = es_2_edits.sd,
 510+es_2_n = es_2_edits.length
 511+)
 512+)
 513+three_es_buckets
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.no_archive.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.no_archive.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_first_session.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_first_session.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.no_vandals.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.no_vandals.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.boxplot.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.boxplot.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_10.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_10.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_100.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_100.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.no_vandals.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.no_vandals.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/edit_sessions.by_year_and_es_0_bucket.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_sessions.by_year_and_es_0_bucket.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.prop.png
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.prop.png
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: trunk/tools/wsor/first_session/R/first_session_characteristics.R
@@ -0,0 +1,132 @@
 2+source("loader/user_sessions.R")
 3+
 4+library(lattice)
 5+library(doBy)
 6+
 7+user_sessions = load_user_sessions()
 8+user_sessions$year = strftime(user_sessions$first_edit, format="%Y")
 9+user_sessions$early_survival = user_sessions$last_edit - user_sessions$es_0_end >= 30
 10+
 11+user_sessions$es_0_bucket = 10^floor(log(user_sessions$es_0_edits, base=10))
 12+user_sessions$es_1_edits = naReplace(user_sessions$es_1_edits, 0)
 13+user_sessions$es_2_edits = naReplace(user_sessions$es_2_edits, 0)
 14+
 15+
 16+
 17+
 18+
 19+three_es_buckets = with(
 20+ summaryBy(
 21+ es_0_edits +
 22+ es_1_edits +
 23+ es_2_edits ~
 24+ year + es_0_bucket,
 25+ data=user_sessions,
 26+ FUN=c(mean, sd, length)
 27+ ),
 28+ rbind(
 29+ data.frame(
 30+ year = year,
 31+ bucket = es_0_bucket,
 32+ es = 0,
 33+ mean = es_0_edits.mean,
 34+ sd = es_0_edits.sd,
 35+ n = es_0_edits.length
 36+ ),
 37+ data.frame(
 38+ year = year,
 39+ bucket = es_0_bucket,
 40+ es = 1,
 41+ mean = es_1_edits.mean,
 42+ sd = es_1_edits.sd,
 43+ n = es_1_edits.length
 44+ ),
 45+ data.frame(
 46+ year = year,
 47+ bucket = es_0_bucket,
 48+ es = 2,
 49+ mean = es_2_edits.mean,
 50+ sd = es_2_edits.sd,
 51+ n = es_2_edits.length
 52+ )
 53+ )
 54+)
 55+
 56+
 57+png("plots/edit_sessions.by_year_and_es_0_bucket.png", height=768, width=1024)
 58+limited_three_es_buckets = three_es_buckets[
 59+ three_es_buckets$n >= 10 &
 60+ three_es_buckets$bucket <= 16,
 61+]
 62+params = list(
 63+ "1"=list(
 64+ col="#000000",
 65+ pch=0,
 66+ lty=0
 67+ ),
 68+ "2"=list(
 69+ col="#FF0000",
 70+ pch=1,
 71+ lty=1
 72+ ),
 73+ "4"=list(
 74+ col="#00FF00",
 75+ pch=2,
 76+ lty=2
 77+ ),
 78+ "8"=list(
 79+ col="#0000FF",
 80+ pch=3,
 81+ lty=3
 82+ ),
 83+ "16"=list(
 84+ col="#BBBB00",
 85+ pch=4,
 86+ lty=4
 87+ )
 88+)
 89+xyplot(
 90+ mean ~ es | as.factor(year),
 91+ data=limited_three_es_buckets,
 92+ groups=bucket,
 93+ panel=function(x, y, subscripts, groups, ...){
 94+ f = limited_three_es_buckets[subscripts,]
 95+ for(group in groups){
 96+ group = as.character(group)
 97+ subf = f[f$bucket == group,]
 98+ y = subf$mean
 99+ x = subf$es
 100+ n = subf$n
 101+ sd = subf$sd
 102+ se = sd/sqrt(n)
 103+ panel.xyplot(
 104+ x, y,
 105+ col=params[[group]]$col,
 106+ pch=params[[group]]$pch,
 107+ ...
 108+ )
 109+ panel.lines(
 110+ x, y,
 111+ col=params[[group]]$col,
 112+ lwd=2,
 113+ ...
 114+ )
 115+ panel.arrows(x, y+se, x, y-se, ends="both", col="#777777", angle=90, length=.01)
 116+ }
 117+ },
 118+ main="Session activity by editor first session group",
 119+ ylab="Average session edits",
 120+ xlab="Edit session",
 121+ auto.key=list(
 122+ text=paste("~", names(params), "edits"),
 123+ col=c(
 124+ "#000000",
 125+ "#FF0000",
 126+ "#00FF00",
 127+ "#0000FF",
 128+ "#BBBB00"
 129+ ),
 130+ points=F
 131+ )
 132+)
 133+dev.off()
Index: trunk/tools/wsor/first_session/R/first_session_survival.R
@@ -0,0 +1,457 @@
 2+source("loader/user_sessions.R")
 3+
 4+library(lattice)
 5+library(doBy)
 6+
 7+user_sessions = load_user_sessions()
 8+user_sessions$year = strftime(user_sessions$first_edit, format="%Y")
 9+user_sessions$early_survival = user_sessions$last_edit - user_sessions$es_0_end >= 30
 10+
 11+year_props = with(
 12+ summaryBy(
 13+ early_survival ~ year,
 14+ data=user_sessions[!is.na(user_sessions$year),],
 15+ FUN=c(mean, length)
 16+ ),
 17+ data.frame(
 18+ year = year,
 19+ early_survival = early_survival.mean,
 20+ n = early_survival.length
 21+ )
 22+)
 23+
 24+png("plots/early_survival.by_year.png", height=768, width=1024)
 25+xyplot(
 26+ early_survival ~ year,
 27+ data=year_props,
 28+ panel=function(x, y, subscripts, ...){
 29+ f = year_props[subscripts,]
 30+ panel.xyplot(x, y, ...)
 31+ panel.lines(x, y, ...)
 32+ x = f$year
 33+ p = f$early_survival
 34+ n = f$n
 35+ se = sqrt(p*(1-p)/n)
 36+ panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1)
 37+ },
 38+ ylim=c(0, 1),
 39+ main="Early survival proportion for new editors",
 40+ ylab="Proportion of surviving editors",
 41+ xlab="Year",
 42+ sub="early survival = editing more than 1 month after first session"
 43+)
 44+dev.off()
 45+
 46+year_props.no_vandal = with(
 47+ summaryBy(
 48+ early_survival ~ year,
 49+ data=user_sessions[
 50+ !is.na(user_sessions$year) &
 51+ user_sessions$es_0_edits >= 2 &
 52+ user_sessions$es_0_vandalism / user_sessions$es_0_edits <= .25,
 53+ ],
 54+ FUN=c(mean, length)
 55+ ),
 56+ data.frame(
 57+ year = year,
 58+ early_survival = early_survival.mean,
 59+ n = early_survival.length
 60+ )
 61+)
 62+
 63+png("plots/early_survival.by_year.no_vandals.png", height=768, width=1024)
 64+xyplot(
 65+ early_survival ~ year,
 66+ data=year_props.no_vandal,
 67+ panel=function(x, y, subscripts, ...){
 68+ f = year_props.no_vandal[subscripts,]
 69+ panel.xyplot(x, y, ...)
 70+ panel.lines(x, y, ...)
 71+ x = f$year
 72+ p = f$early_survival
 73+ n = f$n
 74+ se = sqrt(p*(1-p)/n)
 75+ panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1)
 76+ },
 77+ ylim=c(0, 1),
 78+ main="Early survival proportion for new editors (no vandals)",
 79+ ylab="Proportion of surviving editors",
 80+ xlab="Year",
 81+ sub="early survival = editing more than 1 month after first session"
 82+)
 83+dev.off()
 84+
 85+user_sessions$es_0_bucket = 2^round(log(user_sessions$es_0_edits, base=2))
 86+
 87+year_edits_props = with(
 88+ summaryBy(
 89+ early_survival ~ year + es_0_bucket,
 90+ data=user_sessions[
 91+ !is.na(user_sessions$year) &
 92+ user_sessions$es_0_bucket <= 256,
 93+ ],
 94+ FUN=c(mean, length)
 95+ ),
 96+ data.frame(
 97+ year = year,
 98+ es_0_bucket = es_0_bucket,
 99+ early_survival = early_survival.mean,
 100+ n = early_survival.length
 101+ )
 102+)
 103+
 104+png("plots/early_survival.by_year_and_first_session.png", height=768, width=1024)
 105+xyplot(
 106+ early_survival ~ es_0_bucket | as.factor(year),
 107+ data=year_edits_props,
 108+ panel=function(x, y, subscripts, ...){
 109+ f = year_edits_props[subscripts,]
 110+ panel.xyplot(x, y, ...)
 111+ x = log(f$es_0_bucket, base=2)
 112+ p = f$early_survival
 113+ n = f$n
 114+ se = sqrt(p*(1-p)/n)
 115+ panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1)
 116+ panel.lines(-5:10, .2, col="#BBBBBB")
 117+ panel.lines(-5:10, .4, col="#BBBBBB")
 118+ panel.lines(-5:10, .6, col="#BBBBBB")
 119+ panel.lines(-5:10, .8, col="#BBBBBB")
 120+ },
 121+ ylim=c(0, 1),
 122+ main="Early survival proportion for new editors by first session edits",
 123+ ylab="Proportion of surviving editors",
 124+ xlab="First session edits",
 125+ sub="early survival = editing more than 1 month after first session",
 126+ scales=list(x=list(log=2, at=2^(0:8))),
 127+ xlim=c(.5, 300)
 128+)
 129+dev.off()
 130+
 131+png("plots/early_survival.by_year.es_lines.png", height=768, width=1024)
 132+limited_year_edits_props = year_edits_props[
 133+ year_edits_props$n >= 10 &
 134+ year_edits_props$es_0_bucket <= 16,
 135+]
 136+params = list(
 137+ "1"=list(
 138+ col="#000000",
 139+ pch=0,
 140+ lty=0
 141+ ),
 142+ "2"=list(
 143+ col="#FF0000",
 144+ pch=1,
 145+ lty=1
 146+ ),
 147+ "4"=list(
 148+ col="#00FF00",
 149+ pch=2,
 150+ lty=2
 151+ ),
 152+ "8"=list(
 153+ col="#0000FF",
 154+ pch=3,
 155+ lty=3
 156+ ),
 157+ "16"=list(
 158+ col="#BBBB00",
 159+ pch=4,
 160+ lty=4
 161+ ),
 162+ "32"=list(
 163+ col="#00BBBB",
 164+ pch=5,
 165+ lty=5
 166+ ),
 167+ "64"=list(
 168+ col="#BB00BB",
 169+ pch=6,
 170+ lty=6
 171+ )
 172+)
 173+xyplot(
 174+ early_survival ~ year,
 175+ data=limited_year_edits_props,
 176+ groups=es_0_bucket,
 177+ panel=function(x, y, subscripts, groups, ...){
 178+ f = limited_year_edits_props[subscripts,]
 179+ for(group in groups){
 180+ group = as.character(group)
 181+ subf = f[f$es_0_bucket == group,]
 182+ p = subf$early_survival
 183+ x = subf$year
 184+ n = subf$n
 185+ panel.xyplot(
 186+ x, p,
 187+ col=params[[group]]$col,
 188+ pch=params[[group]]$pch,
 189+ ...
 190+ )
 191+ panel.lines(
 192+ x, p,
 193+ col=params[[group]]$col,
 194+ lwd=2,
 195+ ...
 196+ )
 197+ se = sqrt(p*(1-p)/n)
 198+ panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 199+ }
 200+ },
 201+ ylim=c(0, 1),
 202+ main="Early survival proportion for new editors grouped by edits in their first session",
 203+ ylab="Proportion of surviving editors",
 204+ xlab="Years",
 205+ sub="early survival = editing more than 1 month after first session",
 206+ auto.key=list(
 207+ text=paste("~", names(params), "edits"),
 208+ col=c(
 209+ "#000000",
 210+ "#FF0000",
 211+ "#00FF00",
 212+ "#0000FF",
 213+ "#BBBB00",
 214+ "#00BBBB",
 215+ "#BB00BB"
 216+ ),
 217+ points=F
 218+ )
 219+)
 220+dev.off()
 221+
 222+
 223+user_sessions$es_0_no_arch = 2^round(log(user_sessions$es_0_edits - user_sessions$es_0_deleted, base=2))
 224+
 225+no_arch_edits_props = with(
 226+ summaryBy(
 227+ early_survival ~ year + es_0_no_arch,
 228+ data=user_sessions[
 229+ !is.na(user_sessions$year) &
 230+ user_sessions$es_0_no_arch <= 256,
 231+ ],
 232+ FUN=c(mean, length)
 233+ ),
 234+ data.frame(
 235+ year = year,
 236+ es_0_no_arch = es_0_no_arch,
 237+ early_survival = early_survival.mean,
 238+ n = early_survival.length
 239+ )
 240+)
 241+
 242+
 243+png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024)
 244+limited_year_edits_props = no_arch_edits_props[
 245+ no_arch_edits_props$n >= 10 &
 246+ no_arch_edits_props$es_0_no_arch <= 16,
 247+]
 248+params = list(
 249+ "0"=list(
 250+ col="#AAAAAA",
 251+ pch=0,
 252+ lty=0
 253+ ),
 254+ "1"=list(
 255+ col="#000000",
 256+ pch=0,
 257+ lty=0
 258+ ),
 259+ "2"=list(
 260+ col="#FF0000",
 261+ pch=1,
 262+ lty=1
 263+ ),
 264+ "4"=list(
 265+ col="#00FF00",
 266+ pch=2,
 267+ lty=2
 268+ ),
 269+ "8"=list(
 270+ col="#0000FF",
 271+ pch=3,
 272+ lty=3
 273+ ),
 274+ "16"=list(
 275+ col="#BBBB00",
 276+ pch=4,
 277+ lty=4
 278+ )
 279+)
 280+xyplot(
 281+ early_survival ~ year,
 282+ data=limited_year_edits_props,
 283+ groups=es_0_no_arch,
 284+ panel=function(x, y, subscripts, groups, ...){
 285+ f = limited_year_edits_props[subscripts,]
 286+ for(group in groups){
 287+ group = as.character(group)
 288+ subf = f[f$es_0_no_arch == group,]
 289+ p = subf$early_survival
 290+ x = subf$year
 291+ n = subf$n
 292+ panel.xyplot(
 293+ x, p,
 294+ col=params[[group]]$col,
 295+ pch=params[[group]]$pch,
 296+ ...
 297+ )
 298+ panel.lines(
 299+ x, p,
 300+ col=params[[group]]$col,
 301+ lwd=2,
 302+ ...
 303+ )
 304+ se = sqrt(p*(1-p)/n)
 305+ panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 306+ }
 307+ },
 308+ ylim=c(0, 1),
 309+ main="Early survival proportion for new editors grouped by edits (not deleted) in their first session",
 310+ ylab="Proportion of surviving editors",
 311+ xlab="Years",
 312+ sub="early survival = editing more than 1 month after first session",
 313+ auto.key=list(
 314+ text=paste("~", names(params), "edits"),
 315+ col=c(
 316+ "#AAAAAA",
 317+ "#000000",
 318+ "#FF0000",
 319+ "#00FF00",
 320+ "#0000FF",
 321+ "#BBBB00",
 322+ "#00BBBB",
 323+ "#BB00BB"
 324+ ),
 325+ points=F
 326+ )
 327+)
 328+dev.off()
 329+
 330+
 331+user_sessions$years_since_2001 = as.numeric((user_sessions$first_edit - as.POSIXct("2001-01-01"))/365)
 332+user_sessions$initial_rejection = with(
 333+ user_sessions,
 334+ (
 335+ naReplace(es_0_deleted, 0) + naReplace(es_0_reverted, 0) +
 336+ naReplace(es_1_deleted, 0) + naReplace(es_1_reverted, 0) +
 337+ naReplace(es_2_deleted, 0) + naReplace(es_2_reverted, 0)
 338+ )/(
 339+ naReplace(es_0_edits, 0) +
 340+ naReplace(es_1_edits, 0) +
 341+ naReplace(es_2_edits, 0)
 342+ )
 343+)
 344+sc = scale
 345+summary(glm(
 346+ early_survival ~
 347+ sc(es_0_edits) *
 348+ sc(years_since_2001) *
 349+ sc(initial_rejection),
 350+ data=user_sessions[
 351+ user_sessions$es_0_edits > 3,
 352+ ],
 353+ family=binomial(link="logit")
 354+))
 355+
 356+
 357+user_sessions$initial_rejection_group = round(user_sessions$initial_rejection/2, 1)*2
 358+
 359+survival_by_year_and_rejection = with(
 360+ summaryBy(
 361+ early_survival ~ year + initial_rejection_group,
 362+ data=user_sessions[
 363+ user_sessions$es_0_edits > 3 &
 364+ user_sessions$es_0_vandalism == 0,
 365+ ],
 366+ FUN=c(mean, length)
 367+ ),
 368+ data.frame(
 369+ year = year,
 370+ rejection_group = initial_rejection_group,
 371+ early_survival = early_survival.mean,
 372+ n = early_survival.length
 373+ )
 374+)
 375+
 376+png("plots/early_survival.by_year_and_rejection.no_vandals.png", height=768, width=1024)
 377+limited_frame = survival_by_year_and_rejection[
 378+ survival_by_year_and_rejection$n >= 10,
 379+]
 380+params = list(
 381+ "0"=list(
 382+ col="#AAAAAA",
 383+ pch=0,
 384+ lty=0
 385+ ),
 386+ "0.2"=list(
 387+ col="#FF0000",
 388+ pch=1,
 389+ lty=1
 390+ ),
 391+ "0.4"=list(
 392+ col="#0000FF",
 393+ pch=3,
 394+ lty=3
 395+ ),
 396+ "0.6"=list(
 397+ col="#00BBBB",
 398+ pch=5,
 399+ lty=4
 400+ ),
 401+ "0.8"=list(
 402+ col="#BB0000",
 403+ pch=7,
 404+ lty=4
 405+ ),
 406+ "1"=list(
 407+ col="#00BB00",
 408+ pch=9,
 409+ lty=4
 410+ )
 411+)
 412+xyplot(
 413+ early_survival ~ year,
 414+ data=limited_frame,
 415+ groups=rejection_group,
 416+ panel=function(x, y, subscripts, groups, ...){
 417+ f = limited_frame[subscripts,]
 418+ for(group in groups){
 419+ group = as.character(group)
 420+ subf = f[f$rejection_group == group,]
 421+ p = subf$early_survival
 422+ x = subf$year
 423+ n = subf$n
 424+ panel.xyplot(
 425+ x, p,
 426+ col=params[[group]]$col,
 427+ pch=params[[group]]$pch,
 428+ ...
 429+ )
 430+ panel.lines(
 431+ x, p,
 432+ col=params[[group]]$col,
 433+ lwd=2,
 434+ ...
 435+ )
 436+ se = sqrt(p*(1-p)/n)
 437+ panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05)
 438+ }
 439+ },
 440+ ylim=c(0, 1),
 441+ main="Early survival proportion for new editors grouped by early rejection proportion",
 442+ ylab="Proportion of surviving editors",
 443+ xlab="Years",
 444+ sub="early survival = editing more than 1 month after first session\nrejection = proportion of revisions reverted or deleted in first edit sessions.",
 445+ auto.key=list(
 446+ text=paste("~", names(params), " rejection"),
 447+ col=c(
 448+ "#AAAAAA",
 449+ "#FF0000",
 450+ "#0000FF",
 451+ "#00BBBB",
 452+ "#BB0000",
 453+ "#00BB00"
 454+ ),
 455+ points=F
 456+ )
 457+)
 458+dev.off()
Index: trunk/tools/wsor/first_session/R/loader/user_sessions.R
@@ -0,0 +1,38 @@
 2+source("util/env.R")
 3+
 4+load_user_sessions = function(verbose=T, reload=F){
 5+ filename = paste(DATA_DIR, "user_sessions.3.tsv", sep="/")
 6+ if(!exists("USER_SESSIONS")){
 7+ USER_SESSIONS <<- NULL
 8+ }
 9+ if(is.null(USER_SESSIONS) | reload){
 10+ USER_SESSIONS <<- NULL
 11+ }
 12+ if(is.null(USER_SESSIONS)){
 13+ if(verbose){cat("Loading ", filename, "...")}
 14+ USER_SESSIONS <<- read.table(
 15+ filename,
 16+ header=T, sep="\t",
 17+ quote="", comment.char="",
 18+ na.strings="\\N"
 19+ )
 20+ USER_SESSIONS$first_edit = strptime(
 21+ as.character(USER_SESSIONS$first_edit),
 22+ "%Y%m%d%H%M%S"
 23+ )
 24+ USER_SESSIONS$last_edit = strptime(
 25+ as.character(USER_SESSIONS$last_edit),
 26+ "%Y%m%d%H%M%S"
 27+ )
 28+ USER_SESSIONS$es_0_start = as.POSIXct(USER_SESSIONS$es_0_start, origin="1970-01-01")
 29+ USER_SESSIONS$es_1_start = as.POSIXct(USER_SESSIONS$es_1_start, origin="1970-01-01")
 30+ USER_SESSIONS$es_2_start = as.POSIXct(USER_SESSIONS$es_2_start, origin="1970-01-01")
 31+ USER_SESSIONS$es_0_end = as.POSIXct(USER_SESSIONS$es_0_end, origin="1970-01-01")
 32+ USER_SESSIONS$es_1_end = as.POSIXct(USER_SESSIONS$es_1_end, origin="1970-01-01")
 33+ USER_SESSIONS$es_2_end = as.POSIXct(USER_SESSIONS$es_2_end, origin="1970-01-01")
 34+ if(verbose){cat("DONE!\n")}
 35+ }
 36+ USER_SESSIONS
 37+}
 38+
 39+
Index: trunk/tools/wsor/first_session/R/edit_distributions.R
@@ -0,0 +1,64 @@
 2+source("loader/user_sessions.R")
 3+
 4+library(lattice)
 5+library(doBy)
 6+
 7+user_sessions = load_user_sessions()
 8+user_sessions$year = floor(user_sessions$first_edit/10000000000)
 9+
 10+
 11+year_edits = data.frame()
 12+for(year in unique(user_sessions$year)){
 13+ tab = data.frame(
 14+ table(
 15+ 10^round(
 16+ log(
 17+ user_sessions[user_sessions$year == year,]$edit_count,
 18+ base=10
 19+ )
 20+ )
 21+ )
 22+ )
 23+
 24+ year_edits = rbind(
 25+ year_edits,
 26+ data.frame(
 27+ year = year,
 28+ edits = as.numeric(as.character(tab$Var1)),
 29+ freq = tab$Freq,
 30+ prop = tab$Freq/sum(tab$Freq)
 31+ )
 32+ )
 33+}
 34+
 35+png("plots/edit_count_distribution.png", height=768, width=1024)
 36+xyplot(
 37+ freq ~ edits | as.factor(year),
 38+ data = year_edits[year_edits$edits > 0,],
 39+ type="o",
 40+ scales=list(
 41+ x=list(log=10, at=10^(0:6), labels=10^(0:6))#,
 42+ #y=list(log=10)
 43+ ),
 44+ main="Editor edit count distributions by editor first edit year",
 45+ xlab="Number of edits (log10 bucketed)",
 46+ ylab="Number of editors",
 47+ sub="based on a random sample of <= 10,000 editors from each year"
 48+)
 49+dev.off()
 50+
 51+png("plots/edit_count_distribution.prop.png", height=768, width=1024)
 52+xyplot(
 53+ prop ~ edits | as.factor(year),
 54+ data = year_edits[year_edits$edits > 0,],
 55+ type="o",
 56+ scales=list(
 57+ x=list(log=10, at=10^(0:6), labels=10^(0:6))#,
 58+ #y=list(log=10)
 59+ ),
 60+ main="Editor edit count distributions by editor first edit year",
 61+ xlab="Number of edits (log10 bucketed)",
 62+ ylab="Proportion of editors",
 63+ sub="based on a random sample of <= 10,000 editors from each year"
 64+)
 65+dev.off()
Index: trunk/tools/wsor/first_session/R/.RData
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/tools/wsor/first_session/R/.RData
___________________________________________________________________
Added: svn:mime-type
166 + application/octet-stream
Index: trunk/tools/wsor/first_session/R/Rplots.pdf
@@ -0,0 +1,267 @@
 (267 lines of binary PDF source omitted: an empty "R Graphics Output" document, 0 pages, produced by R 2.13.1 on 2011-07-19; the payload is almost entirely an ASCIIHex-encoded sRGB ICC colour profile)
Index: trunk/tools/wsor/first_session/R/first_sessions.R
@@ -0,0 +1,8 @@
 2+source("loader/user_sessions.R")
 3+
 4+library(lattice)
 5+library(doBy)
 6+
 7+
 8+
 9+
Index: trunk/tools/wsor/first_session/R/util/env.R
@@ -0,0 +1,15 @@
 2+DATA_DIR = "../data"
 3+
 4+
 5+naReplace = function(x, replacement){
 6+ sapply(
 7+ x,
 8+ function(v){
 9+ if(is.na(v)){
 10+ replacement
 11+ }else{
 12+ v
 13+ }
 14+ }
 15+ )
 16+}
Index: trunk/tools/wsor/first_session/foo
@@ -0,0 +1,24 @@
 2+user_id user_name first_edit last_edit edit_count es_0_start es_0_end es_0_edits es_0_reverted es_0_vandalism es_0_deleted es_1_start es_1_end es_1_edits es_1_reverted es_1_vandalism es_1_deleted es_2_start es_2_end es_2_edits es_2_reverted es_2_vandalism es_2_deleted
 3+1 Damian Yerrick 20010929004320 20110715131605 13196 1001724200 1001724224 2 0 0 0 1001735270 1001735270 1 0 0 0 1001778000 1001781732 4 1 0 1
 4+2 AxelBoldt 20010726145009 20110715175901 34804 996159009 996159009 1 0 0 0 996164049 996164049 1 0 0 0 996175238 996177464 3 0 0 0
 5+3 Tobias Hoevekamp 20010326202105 20040329205621 1903 985638065 985638065 1 1 0 0 985683223 985683379 2 0 0 0 985944995 985944995 1 0 0 0
 6+4 Magnus Manske 20010728082538 20110714220907 20038 996308738 996308738 1 0 0 0 996429215 996429215 2 2 0 0 996439880 996439880 1 0 0 0
 7+5 Hoevekam 20030709192137 20041227165610 3 1057778497 1057778497 1 0 0 0 1095446631 1095446631 1 0 0 0 1104166570 1104166570 1 0 0 0
 8+6 Paul Drye 20010919131128 20080605202716 1135 1000905088 1000906103 5 3 0 0 1001350404 1001350920 2 0 0 0 1001359775 1001359775 1 0 0 0
 9+7 Joao 20010826124114 20040606005523 266 998829674 998832110 6 0 0 0 998850967 998852514 3 0 0 0 998856582 998856582 1 0 0 0
 10+8 TwoOneTwo 20010909202356 20110205015022 2135 1000067036 1000067036 1 0 0 0 1000157045 1000157045 1 0 0 0 1000326898 1000326898 1 0 0 0
 11+9 Chenyu 20011118233022 20020124230110 166 1006126222 1006130510 5 0 0 0 1006138034 1006140164 7 3 0 2 1006147796 1006147828 3 0 0 1
 12+10 Tbc 20010803144007 20020105091549 125 996849607 996849607 1 0 0 0 996875902 996875902 1 0 0 0 996967861 996967861 1 1 0 0
 13+11 Kpjas 20010506173149 20110714103545 6302 989170309 989170309 1 0 0 0 990957006 990957006 1 0 0 0 991557466 991557466 1 0 0 0
 14+12 Matthew Woodcraft 20011202215229 20110510203406 725 1007329949 1007336697 7 0 0 1 1007344959 1007344959 1 0 0 0 1007852954 1007852954 1 0 0 0
 15+13 SteveSmith 20020124050249 20030520014515 82 1011848569 1011848569 1 0 0 0 1011879408 1011880799 5 0 0 0 1011892200 1011900596 10 1 0 0
 16+14 RjLesch 20010727142501 20020708073228 872 996243901 996245695 4 1 0 0 996522629 996522629 1 0 0 0 996698056 996698374 2 0 0 0
 17+15 Trelvis 20011213170430 20040906184834 673 1008263070 1008263442 2 1 0 0 1008279572 1008283623 6 3 0 0 1008287606 1008287606 1 0 0 0
 18+16 General Wesc 20010805053252 20110403162228 1505 996989572 996989815 2 0 0 0 996994099 996998763 4 0 0 0 997040223 997044154 11 0 0 1
 19+17 Peter Winnberg 20011110110118 20061124144257 464 1005390078 1005390078 1 0 0 0 1005586430 1005586639 2 1 0 0 1005596572 1005596572 1 0 0 0
 20+18 MichaelTinkler 20010731150633 20020903033518 2468 996591993 996594597 2 0 0 0 996610359 996610359 1 0 0 0 996767279 996767279 1 0 0 0
 21+19 Ignaciovicario 20020126191706 20020225154311 2 1012072626 1012072626 1 0 0 0 1014651791 1014651791 1 0 0 0 \N \N \N \N \N \N
 22+20 Pingos 20020113023235 20040311053834 17 1010889155 1010889303 4 2 0 1 1011038119 1011038119 1 0 0 0 1011044975 1011045028 3 0 0 3
 23+21 Firepink 20020118155248 20020225155115 63 1011369168 1011371693 3 0 0 0 1011487743 1011487946 3 0 0 0 1011545356 1011546803 2 0 0 0
 24+22 Luis Oliveira 20020124235009 20050130223105 27 1011916209 1011918167 4 0 0 0 1011965218 1011965218 1 0 0 0 1011969929 1011971575 3 0 0 0
 25+23 Goran 20020225154311 20030221002037 11 1014651791 1014652275 4 2 0 0 1039118155 1039118752 4 0 0 0 1039135447 1039135447 1 0 0 0
Index: trunk/tools/wsor/first_session/data
@@ -0,0 +1 @@
 2+link /home/halfak/data/first_session
\ No newline at end of file
Property changes on: trunk/tools/wsor/first_session/data
___________________________________________________________________
Added: svn:special
13 + *
Index: trunk/tools/wsor/first_session/testing.sql
@@ -0,0 +1,197 @@
 2+
 3+CREATE TABLE halfak.user_session_sample
 4+SELECT
 5+ user_id,
 6+ YEAR(first_edit) AS year,
 7+ MONTH(first_edit) >= 7 AS semester
 8+FROM halfak.user_meta_20110715
 9+WHERE first_edit BETWEEN "20010000000000" AND "20019999999999"
 10+ORDER BY RAND()
 11+LIMIT 10000;
 12+
 13+INSERT INTO halfak.user_session_sample
 14+SELECT
 15+ user_id,
 16+ YEAR(first_edit) AS year,
 17+ MONTH(first_edit) >= 7 AS semester
 18+FROM halfak.user_meta_20110715
 19+WHERE first_edit BETWEEN "20020000000000" AND "20029999999999"
 20+ORDER BY RAND()
 21+LIMIT 10000;
 22+
 23+INSERT INTO halfak.user_session_sample
 24+SELECT
 25+ user_id,
 26+ YEAR(first_edit) AS year,
 27+ MONTH(first_edit) >= 7 AS semester
 28+FROM halfak.user_meta_20110715
 29+WHERE first_edit BETWEEN "20030000000000" AND "20039999999999"
 30+ORDER BY RAND()
 31+LIMIT 10000;
 32+
 33+INSERT INTO halfak.user_session_sample
 34+SELECT
 35+ user_id,
 36+ YEAR(first_edit) AS year,
 37+ MONTH(first_edit) >= 7 AS semester
 38+FROM halfak.user_meta_20110715
 39+WHERE first_edit BETWEEN "20040000000000" AND "20049999999999"
 40+ORDER BY RAND()
 41+LIMIT 10000;
 42+
 43+INSERT INTO halfak.user_session_sample
 44+SELECT
 45+ user_id,
 46+ YEAR(first_edit) AS year,
 47+ MONTH(first_edit) >= 7 AS semester
 48+FROM halfak.user_meta_20110715
 49+WHERE first_edit BETWEEN "20050000000000" AND "20059999999999"
 50+ORDER BY RAND()
 51+LIMIT 10000;
 52+
 53+INSERT INTO halfak.user_session_sample
 54+SELECT
 55+ user_id,
 56+ YEAR(first_edit) AS year,
 57+ MONTH(first_edit) >= 7 AS semester
 58+FROM halfak.user_meta_20110715
 59+WHERE first_edit BETWEEN "20060000000000" AND "20069999999999"
 60+ORDER BY RAND()
 61+LIMIT 10000;
 62+
 63+INSERT INTO halfak.user_session_sample
 64+SELECT
 65+ user_id,
 66+ YEAR(first_edit) AS year,
 67+ MONTH(first_edit) >= 7 AS semester
 68+FROM halfak.user_meta_20110715
 69+WHERE first_edit BETWEEN "20070000000000" AND "20079999999999"
 70+ORDER BY RAND()
 71+LIMIT 10000;
 72+
 73+INSERT INTO halfak.user_session_sample
 74+SELECT
 75+ user_id,
 76+ YEAR(first_edit) AS year,
 77+ MONTH(first_edit) >= 7 AS semester
 78+FROM halfak.user_meta_20110715
 79+WHERE first_edit BETWEEN "20080000000000" AND "20089999999999"
 80+ORDER BY RAND()
 81+LIMIT 10000;
 82+
 83+INSERT INTO halfak.user_session_sample
 84+SELECT
 85+ user_id,
 86+ YEAR(first_edit) AS year,
 87+ MONTH(first_edit) >= 7 AS semester
 88+FROM halfak.user_meta_20110715
 89+WHERE first_edit BETWEEN "20090000000000" AND "20099999999999"
 90+ORDER BY RAND()
 91+LIMIT 10000;
 92+
 93+INSERT INTO halfak.user_session_sample
 94+SELECT
 95+ user_id,
 96+ YEAR(first_edit) AS year,
 97+ MONTH(first_edit) >= 7 AS semester
 98+FROM halfak.user_meta_20110715
 99+WHERE first_edit BETWEEN "20100000000000" AND "20109999999999"
 100+ORDER BY RAND()
 101+LIMIT 10000;
 102+
 103+
 104+
 105+USE enwiki;
 106+CREATE TABLE zexley.user_meta_firsts
 107+SELECT
 108+ user_id,
 109+ first_edit,
 110+ last_edit,
 111+ SUM(rev_timestamp BETWEEN first_edit AND DATE_ADD(first_edit, INTERVAL .25 YEAR)) AS 1q,
 112+ SUM(rev_timestamp BETWEEN DATE_ADD(first_edit, INTERVAL .25 YEAR) AND DATE_ADD(first_edit, INTERVAL .5 YEAR)) AS 2q,
 113+ SUM(rev_timestamp BETWEEN DATE_ADD(first_edit, INTERVAL .5 YEAR) AND DATE_ADD(first_edit, INTERVAL .75 YEAR)) AS 3q,
 114+ SUM(rev_timestamp > DATE_ADD(first_edit, INTERVAL .75 YEAR)) AS 4q
 115+FROM (
 116+SELECT
 117+ u.user_id AS user_id,
 118+ u.first_edit AS first_edit,
 119+ u.last_edit AS last_edit,
 120+ r.rev_timestamp
 121+FROM halfak.user_meta_20110715 u
 122+LEFT JOIN revision r
 123+ ON u.user_id = r.rev_user AND
 124+ r.rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR)
 125+UNION
 126+SELECT
 127+ u.user_id AS user_id,
 128+ u.first_edit AS first_edit,
 129+ u.last_edit AS last_edit,
 130+ ar_timestamp AS rev_timestamp
 131+FROM halfak.user_meta_20110715 u
 132+LEFT JOIN archive a
 133+ ON u.user_id = ar_user AND
 134+ ar_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR)
 135+) AS r
 136+GROUP BY user_id;
 137+
 138+
 139+CREATE TABLE halfak.rev_len_changed
 140+SELECT
 141+ r.rev_id,
 142+ r.rev_timestamp,
 143+ YEAR(r.rev_timestamp) AS rev_year,
 144+ MONTH(r.rev_timestamp) AS rev_month,
 145+ r.rev_len,
 146+ r.rev_user AS user_id,
 147+ r.rev_user_text AS user_text,
 148+ `change` AS len_change,
 149+ p.page_id AS page_id,
 150+ p.page_namespace AS namespace
 151+FROM revision r
 152+INNER JOIN user u
 153+ ON r.rev_user = u.user_id
 154+INNER JOIN halfak.user_meta_20110715 um
 155+ ON um.user_id = r.rev_user
 156+INNER JOIN halfak.rev_len_change rlc
 157+ ON r.rev_id = rlc.rev_id
 158+INNER JOIN page p
 159+ ON p.page_id = r.rev_page;
 160+
 161+ALTER TABLE halfak.rev_len_changed
 162+ADD COLUMN rev_year INT UNSIGNED
 163+AFTER rev_timestamp;
 164+
 165+ALTER TABLE halfak.rev_len_changed
 166+ADD COLUMN rev_month INT UNSIGNED
 167+AFTER rev_timestamp;
 168+
 169+UPDATE halfak.rev_len_changed
 170+SET
 171+ rev_year = YEAR(rev_timestamp),
 172+ rev_month = MONTH(rev_timestamp);
 173+
 174+
 175+CREATE UNIQUE INDEX rev_idx ON halfak.rev_len_changed_final (rev_id);
 176+CREATE INDEX user_year_month_namespace ON halfak.rev_len_changed_final (user_id, rev_year, rev_month, namespace);
 177+
 178+
 179+
 180+
 181+
 182+SELECT
 183+ user_id,
 184+ rev_year,
 185+ rev_month,
 186+ namespace,
 187+ first_edit,
 188+ COUNT(*) as edits,
 189+ SUM(IF(len_change > 0,len_change,0)) as len_added,
 190+ SUM(IF(len_change < 0,len_change*-1,0)) as len_removed
 191+FROM halfak.rev_len_changed
 192+WHERE user_id = 2356767
 193+GROUP BY
 194+ user_id,
 195+ rev_year,
 196+ rev_month,
 197+ namespace,
 198+ first_edit;
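
The ten sampling statements above are identical except for their timestamp bounds (and the first, which creates the table). A minimal sketch, reusing the same table and column names, of generating the per-year statements from a template instead of copy-pasting:

template = """INSERT INTO halfak.user_session_sample
SELECT
	user_id,
	YEAR(first_edit) AS year,
	MONTH(first_edit) >= 7 AS semester
FROM halfak.user_meta_20110715
WHERE first_edit BETWEEN "%(year)d0000000000" AND "%(year)d9999999999"
ORDER BY RAND()
LIMIT 10000;"""

# The 2001 statement would be CREATE TABLE ... SELECT; the rest are plain INSERTs.
for year in range(2001, 2011):
    print template % {'year': year}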
Index: trunk/tools/wsor/vandal_conversion/R/conversions.R
@@ -1,4 +1,120 @@
22 source("loader/load_editor_first_and_last.R")
 3+source("loader/load_editor_edit_count.R")
34
45 editor_first_and_last = load_editor_first_and_last()
 6+efl = unique(editor_first_and_last)
 7+efl = efl[efl$last10_edits == 10,]
58
 9+editor_edit_count = load_editor_edit_count()
 10+efl = merge(
 11+ efl,
 12+ editor_edit_count,
 13+ by=c("user_id", "user_name")
 14+)
 15+
 16+library(lattice)
 17+
 18+#plot(table(efl$fes_edits))
 19+#xyplot(table(efl$fes_edits)~as.numeric(names(table(efl$fes_edits))), scales=list(x=list(log=2), y=list(log=2)))
 20+
 21+png("plots/fes_discarded.hist.png", height=768, width=1024)
 22+efl$fes_discarded = efl$fes_reverted + efl$fes_deleted
 23+efl$fes_discarded_prop = efl$fes_discarded / efl$fes_edits
 24+plot(
 25+ table(round(efl[efl$fes_edits >= 4,]$fes_discarded_prop, 1)),
 26+ main="Histogram of the proportion of first session edits that were discarded",
 27+ sub="for editors with at least 20 edits and 4 in first session. Discarded edits have been reverted or deleted",
 28+ ylab="Frequency",
 29+ xlab="Proportion of discarded edits"
 30+)
 31+dev.off()
 32+
 33+png("plots/fes_vandalism.hist.png", height=768, width=1024)
 34+efl$fes_vandalism_prop = efl$fes_vandalism / (efl$fes_edits - efl$fes_deleted)
 35+plot(
 36+ table(round(efl[(efl$fes_edits - efl$fes_deleted) >= 1,]$fes_vandalism_prop, 1)),
 37+ main="Histogram of the proportion of kept 1st session edits that were vandalism",
 38+ sub="for editors with at least 20 edits and 1 kept edits in first session.",
 39+ ylab="Frequency",
 40+ xlab="Proportion of vandalism edits"
 41+)
 42+dev.off()
 43+
 44+png("plots/fes_reverted.hist.png", height=768, width=1024)
 45+efl$fes_reverted_prop = efl$fes_reverted / (efl$fes_edits - efl$fes_deleted)
 46+plot(
 47+ table(round(efl[(efl$fes_edits - efl$fes_deleted) >= 1,]$fes_reverted_prop, 1)),
 48+ main="Histogram of the proportion of kept 1st session edits that were reverted",
 49+ sub="for editors with at least 20 edits and 1 kept edits in first session.",
 50+ ylab="Frequency",
 51+ xlab="Proportion of reverted edits"
 52+)
 53+dev.off()
 54+
 55+
 56+png("plots/last10_discarded.hist.png", height=768, width=1024)
 57+efl$last10_discarded = efl$last10_reverted + efl$last10_deleted
 58+efl$last10_discarded_prop = efl$last10_discarded / efl$last10_edits
 59+plot(
 60+ table(round(efl$last10_discarded_prop, 1)),
 61+ main="Histogram of the proportion of the last 10 edits that were discarded",
 62+ sub="for editors with at least 20 edits. Discarded edits have been reverted or deleted",
 63+ ylab="Frequency",
 64+ xlab="Proportion of discarded edits"
 65+)
 66+dev.off()
 67+
 68+
 69+png("plots/future_edits.hist.png", height=768, width=1024)
 70+efl$future_edits = efl$edit_count - efl$fes_edits
 71+plot(
 72+ table(10^round(log(efl$future_edits, base=10), 1)),
 73+ main="Histogram of edits after first session for edits who made at least 20 edits",
 74+ xlab="Edits after first session (log10 bucketed, scaled)",
 75+ ylab="Frequency",
 76+ type="o",
 77+ log="x"
 78+)
 79+dev.off()
 80+
 81+
 82+top_100 = efl[order(efl$edit_count, decreasing=T),][1:100,]
 83+png("plots/fes_discarded.hist.top_100.png", height=768, width=1024)
 84+top_100$fes_discarded = top_100$fes_reverted + top_100$fes_deleted
 85+top_100$fes_discarded_prop = top_100$fes_discarded / top_100$fes_edits
 86+plot(
 87+ table(round(top_100$fes_discarded_prop, 1)),
 88+ main="Histogram of the proportion of the last 10 edits that were discarded",
 89+ sub="for the top 100 editors by edit count. Discarded edits have been reverted or deleted",
 90+ ylab="Frequency",
 91+ xlab="Proportion of discarded edits"
 92+)
 93+dev.off()
 94+
 95+png("plots/fes_reverted.hist.top_100.png", height=768, width=1024)
 96+top_100$fes_reverted_prop = top_100$fes_reverted / (top_100$fes_edits - top_100$fes_deleted)
 97+plot(
 98+ table(round(top_100[top_100$fes_edits - top_100$fes_deleted >= 1,]$fes_reverted_prop, 1)),
 99+ main="Histogram of the proportion of the last 10 edits that were reverted",
 100+ sub="for the top 100 editors by edit count.",
 101+ ylab="Frequency",
 102+ xlab="Proportion of reverted edits"
 103+)
 104+dev.off()
 105+
 106+png("plots/fes_vandal.hist.top_100.png", height=768, width=1024)
 107+top_100$fes_vandalism_prop = top_100$fes_vandalism / (top_100$fes_edits - top_100$fes_deleted)
 108+plot(
 109+ table(round(top_100[top_100$fes_edits - top_100$fes_deleted >= 1,]$fes_vandalism_prop, 1)),
 110+ main="Histogram of the proportion of the last 10 edits that were reverted for vandalism",
 111+ sub="for the top 100 editors by edit count.",
 112+ ylab="Frequency",
 113+ xlab="Proportion of edits reverted for vandalism"
 114+)
 115+dev.off()
 116+
 117+
 118+summary(top_100$fes_vandalism > 0)
 119+summary(top_100$fes_reverted > 0)
 120+summary(top_100$fes_discarded > 0)
 121+
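
The future_edits histogram above (and the plots in edit_distributions.R) bucket counts on a log10 scale via 10^round(log(x, base=10), 1). A hedged sketch of the same bucketing in Python; note R's round() uses banker's rounding, so edge cases may differ slightly:

import math

def log10_bucket(x, digits=1):
    # 10^round(log10(x), digits), mirroring the R expression above
    return 10 ** round(math.log10(x), digits)

print [log10_bucket(x) for x in (3, 47, 520)]  # ~[3.16, 50.12, 501.19]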
Index: trunk/tools/wsor/vandal_conversion/R/util/env.R
@@ -1 +1 @@
2 -DATA_DIR = "/home/aaron/data/vandal_conversion"
 2+DATA_DIR = "../data"
Index: trunk/tools/wsor/vandal_conversion/get_editor_editcount.py
@@ -0,0 +1,109 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types
 3+import wmf
 4+
 5+def encode(v):
 6+ if v is None: return "\N" # MySQL-style TSV NULL marker (a literal backslash-N)
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+
 14+def main():
 15+ parser = argparse.ArgumentParser(
 16+ description='Gathers editor data for first and last session'
 17+ )
 18+ parser.add_argument(
 19+ 'min_edits',
 20+ type=int,
 21+ help='the minimum number of edits that editors must have performed to be included'
 22+ )
 23+ parser.add_argument(
 24+ '-c', '--cnf',
 25+ metavar="<path>",
 26+ type=str,
 27+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 28+ default=os.path.expanduser("~/.my.cnf")
 29+ )
 30+ parser.add_argument(
 31+ '-s', '--host',
 32+ type=str,
 33+ help='the database host to connect to (defaults to localhost)',
 34+ default="localhost"
 35+ )
 36+ parser.add_argument(
 37+ '-d', '--db',
 38+ type=str,
 39+ help='the language db to run the query in (defaults to enwiki)',
 40+ default="enwiki"
 41+ )
 42+ args = parser.parse_args()
 43+
 44+ LOGGING_STREAM = sys.stderr
 45+ logging.basicConfig(
 46+ level=logging.DEBUG,
 47+ stream=LOGGING_STREAM,
 48+ format='%(asctime)s %(levelname)-8s %(message)s',
 49+ datefmt='%b-%d %H:%M:%S'
 50+ )
 51+
 52+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 53+ db = Database(
 54+ host=args.host,
 55+ db=args.db,
 56+ read_default_file=args.cnf
 57+ )
 58+ headers = [
 59+ 'user_id',
 60+ 'user_name',
 61+ 'edit_count'
 62+ ]
 63+ print("\t".join(headers))
 64+
 65+ logging.info("Processing users:")
 66+
 67+ for user in db.getUsers(minimumEdits=args.min_edits):
 68+ print("\t".join(encode(user[h]) for h in headers))
 69+ LOGGING_STREAM.write(".")
 70+
 71+ LOGGING_STREAM.write("\n")
 72+
 73+
 74+class Database:
 75+
 76+ def __init__(self, *args, **kwargs):
 77+ self.args = args
 78+ self.kwargs = kwargs
 79+ self.usersConn = MySQLdb.connect(*args, **kwargs)
 80+ self.revsConn = MySQLdb.connect(*args, **kwargs)
 81+ self.archConn = MySQLdb.connect(*args, **kwargs)
 82+
 83+ def getUsers(self, minimumEdits=0):
 84+ minimumEdits = int(minimumEdits)
 85+ cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor)
 86+ cursor.execute(
 87+ """
 88+ SELECT
 89+ u.user_id,
 90+ u.user_name,
 91+ u.user_editcount as edit_count
 92+ FROM user u
 93+ WHERE u.user_editcount >= %(minimum_edits)s
 94+ """,
 95+ {
 96+ 'minimum_edits': minimumEdits
 97+ }
 98+ )
 99+ for row in cursor:
 100+ yield row
 101+
 102+
 103+ def getFirstEdits(self, userId, maximum=10000): # NOTE: relies on a getEdits() helper that is not defined in this file
 104+ return self.getEdits(userId, maximum, chronologically=True)
 105+
 106+ def getLastEdits(self, userId, maximum=10000):
 107+ return self.getEdits(userId, maximum, chronologically=False)
 108+
 109+
 110+if __name__ == "__main__": main()
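
The encode() helper above defines the script's TSV conventions: None becomes MySQL's NULL marker and unicode is serialized as UTF-8 before string-escaping. A hedged doctest-style illustration (input values invented):

>>> encode(None)        # the two characters backslash-N
'\\N'
>>> encode(u'caf\xe9')  # UTF-8 bytes, then string-escaped
'caf\\xc3\\xa9'
>>> encode(12345)
'12345'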
Index: trunk/tools/wsor/diffs/example.py
@@ -0,0 +1,101 @@
 2+from StringIO import StringIO
 3+from diff_match_patch import diff_match_patch
 4+import re
 5+
 6+revs = [
 7+ {'rev_id': 1, 'content':'Foo derp 263254'},
 8+ {'rev_id': 2, 'content':'Foo derp 26354'}
 9+]
 10+
 11+def tokenize(content):
 12+ return re.findall(
 13+ r"[\w]+" + #Word
 14+ r"|\[\[" + #Opening internal link
 15+ r"|\]\]" + #Closing internal link
 16+ r"|\{\{" + #Opening template
 17+ r"|\}\}" + #Closing template
 18+ r"|\{\{\{" + #Opening template var
 19+ r"|\}\}\}" + #Closing template var
 20+ r"|\n+" + #Line breaks
 21+ r"| +" + #Spaces
 22+ r"|&\w+;" + #HTML escape sequence
 23+ r"|'''" + #Bold
 24+ r"|''" + #Italics
 25+ r"|=+" + #Header
 26+ r"|\{\|" + #Opening table
 27+ r"|\|\}" + #Closing table
 28+ r"|\|\-" + #Table row
 29+ r"|.", #Misc character
 30+ content
 31+ )
 32+
 33+def hashTokens(tokens, hash2Token=[], token2Hash={}): # NB: the mutable defaults persist across calls, sharing one vocabulary process-wide
 34+ hashBuffer = StringIO()
 35+ for t in tokens:
 36+ if t in token2Hash:
 37+ hashBuffer.write(unichr(token2Hash[t]+1))
 38+ else:
 39+ hashId = len(hash2Token)
 40+ hash2Token.append(t)
 41+ token2Hash[t] = hashId
 42+ hashBuffer.write(unichr(hashId+1))
 43+
 44+ return (hashBuffer.getvalue(), hash2Token, token2Hash)
 45+
 46+def unhash(hashes, hash2Token, sep=''):
 47+ return sep.join(hash2Token[ord(h)-1] for h in hashes)
 48+
 49+def simpleDiff(content1, content2, tokenize=tokenize, sep='', report=[-1,0,1]):
 50+ hashes1, h2t, t2h = hashTokens(tokenize(content1))
 51+ hashes2, h2t, t2h = hashTokens(tokenize(content2), h2t, t2h)
 52+
 53+ report = set(report)
 54+
 55+ dmp = diff_match_patch()
 56+
 57+ diffs = dmp.diff_main(hashes1, hashes2, checklines=False)
 58+
 59+ position = 0
 60+ for (ar,hashes) in diffs:
 61+ content = unhash(hashes,h2t,sep=sep)
 62+ if ar in report:
 63+ yield position, ar, content
 64+
 65+ if ar != -1: position += len(content)
 66+
 67+
 68+def main():
 69+
 70+ lastRev = {'content':''}
 71+ content = ''
 72+ for rev in revs:
 73+ buff = StringIO()
 74+ oldPos = 0
 75+ lastPos = 0
 76+ for pos, ar, c in simpleDiff(lastRev['content'], rev['content'], report=[-1,1]):
 77+ equal = content[oldPos:oldPos+pos-lastPos]
 78+ buff.write(equal)
 79+ lastPos += len(equal)
 80+ oldPos += len(equal)
 81+
 82+ if ar == 1:
 83+ buff.write(c)
 84+ lastPos += len(c)
 85+ elif ar == -1:
 86+ oldPos += len(c)
 87+
 88+
 89+ print("%s, %s, %r" % (pos, ar, c))
 90+
 91+ buff.write(content[oldPos:])
 92+
 93+
 94+ content = buff.getvalue()
 95+ print("Rev: id=%s\n\t%r\n\t%r" % (rev['rev_id'], rev['content'], content))
 96+ lastRev = rev
 97+
 98+
 99+
 100+
 101+if __name__ == "__main__": main()
 102+
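
Tracing the two sample revisions in revs through simpleDiff() above: the common prefix 'Foo derp ' (9 characters) is not reported, and the removed and inserted tokens are both yielded at that offset. A hand-computed sketch of the expected output, not captured from a run:

>>> list(simpleDiff('Foo derp 263254', 'Foo derp 26354', report=[-1, 1]))
[(9, -1, '263254'), (9, 1, '26354')]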
Index: trunk/tools/wsor/diffs/revision_differ.py
@@ -0,0 +1,150 @@
 2+#!/usr/local/bin/pypy
 3+
 4+import logging,traceback
 5+import sys, re
 6+from StringIO import StringIO
 7+
 8+from diff_match_patch import diff_match_patch
 9+
 10+from xml_simulator import RecordingFileWrapper
 11+from wmf.dump.iterator import Iterator
 12+import wmf
 13+
 14+def tokenize(content):
 15+ return re.findall(
 16+ r"[\w]+" + #Word
 17+ r"|\[\[" + #Opening internal link
 18+ r"|\]\]" + #Closing internal link
 19+ r"|\{\{" + #Opening template
 20+ r"|\}\}" + #Closing template
 21+ r"|\{\{\{" + #Opening template var
 22+ r"|\}\}\}" + #Closing template var
 23+ r"|\n+" + #Line breaks
 24+ r"| +" + #Spaces
 25+ r"|&\w+;" + #HTML escape sequence
 26+ r"|'''" + #Bold
 27+ r"|''" + #Italics
 28+ r"|=+" + #Header
 29+ r"|\{\|" + #Opening table
 30+ r"|\|\}" + #Closing table
 31+ r"|\|\-" + #Table row
 32+ r"|.", #Misc character
 33+ content
 34+ )
 35+
 36+def hashTokens(tokens, hash2Token=[], token2Hash={}): # NB: the mutable defaults persist across calls, sharing one vocabulary process-wide
 37+ hashBuffer = StringIO()
 38+ for t in tokens:
 39+ if t in token2Hash:
 40+ hashBuffer.write(unichr(token2Hash[t]+1))
 41+ else:
 42+ hashId = len(hash2Token)
 43+ hash2Token.append(t)
 44+ token2Hash[t] = hashId
 45+ hashBuffer.write(unichr(hashId+1))
 46+
 47+ return (hashBuffer.getvalue(), hash2Token, token2Hash)
 48+
 49+def unhash(hashes, hash2Token, sep=''):
 50+ return sep.join(hash2Token[ord(h)-1] for h in hashes)
 51+
 52+def simpleDiff(content1, content2, tokenize=tokenize, sep='', report=[-1,0,1]):
 53+ hashes1, h2t, t2h = hashTokens(tokenize(content1))
 54+ hashes2, h2t, t2h = hashTokens(tokenize(content2), h2t, t2h)
 55+
 56+ report = set(report)
 57+
 58+ dmp = diff_match_patch()
 59+
 60+ diffs = dmp.diff_main(hashes1, hashes2, checklines=False)
 61+
 62+ position = 0
 63+ for (ar,hashes) in diffs:
 64+ content = unhash(hashes,h2t,sep=sep)
 65+ if ar in report:
 66+ yield position, ar, content
 67+
 68+ if ar != -1: position += len(content)
 69+
 70+
 71+metaXML = """
 72+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">
 73+<siteinfo>
 74+<sitename>Wikipedia</sitename>
 75+<base>http://en.wikipedia.org/wiki/Main_Page</base>
 76+<generator>MediaWiki 1.17wmf1</generator>
 77+<case>first-letter</case>
 78+<namespaces>
 79+<namespace key="-2" case="first-letter">Media</namespace>
 80+<namespace key="-1" case="first-letter">Special</namespace>
 81+<namespace key="0" case="first-letter" />
 82+<namespace key="1" case="first-letter">Talk</namespace>
 83+<namespace key="2" case="first-letter">User</namespace>
 84+<namespace key="3" case="first-letter">User talk</namespace>
 85+<namespace key="4" case="first-letter">Wikipedia</namespace>
 86+<namespace key="5" case="first-letter">Wikipedia talk</namespace>
 87+<namespace key="6" case="first-letter">File</namespace>
 88+<namespace key="7" case="first-letter">File talk</namespace>
 89+<namespace key="8" case="first-letter">MediaWiki</namespace>
 90+<namespace key="9" case="first-letter">MediaWiki talk</namespace>
 91+<namespace key="10" case="first-letter">Template</namespace>
 92+<namespace key="11" case="first-letter">Template talk</namespace>
 93+<namespace key="12" case="first-letter">Help</namespace>
 94+<namespace key="13" case="first-letter">Help talk</namespace>
 95+<namespace key="14" case="first-letter">Category</namespace>
 96+<namespace key="15" case="first-letter">Category talk</namespace>
 97+<namespace key="100" case="first-letter">Portal</namespace>
 98+<namespace key="101" case="first-letter">Portal talk</namespace>
 99+<namespace key="108" case="first-letter">Book</namespace>
 100+<namespace key="109" case="first-letter">Book talk</namespace>
 101+</namespaces>
 102+</siteinfo>
 103+"""
 104+xmlSim = RecordingFileWrapper(sys.stdin, pre=metaXML, post='</mediawiki>')
 105+
 106+try:
 107+ dump = Iterator(xmlSim)
 108+except Exception as e:
 109+ sys.stderr.write(str(e) + xmlSim.getHistory())
 110+ sys.exit(1)
 111+
 112+
 113+for page in dump.readPages():
 114+ sys.stderr.write('Processing: %s - %s\n' % (page.getId(), page.getTitle().encode('UTF-8')))
 115+ try:
 116+ lastRev = None
 117+ for revision in page.readRevisions():
 118+ if lastRev == None:
 119+ lastRev = revision
 120+ else:
 121+ namespace, title = wmf.normalizeTitle(page.getTitle(), namespaces=dump.namespaces)
 122+ nsId = dump.namespaces[namespace]
 123+ row = [
 124+ repr(revision.getId()),
 125+ repr(page.getId()),
 126+ repr(nsId),
 127+ repr(title),
 128+ repr(revision.getTimestamp()),
 129+ repr(revision.getComment()),
 130+ repr(revision.getMinor()),
 131+ repr(revision.getContributor().getId()),
 132+ repr(revision.getContributor().getUsername())
 133+ ]
 134+
 135+ for d in simpleDiff(lastRev.getText(), revision.getText(), report=[-1,1]):
 136+ row.append(":".join(repr(v) for v in d))
 137+
 138+ print("\t".join(row))
 139+
 140+ except Exception as e:
 141+ sys.stderr.write('%s' % e)
 142+ #fh.write('%s' % e)
 143+ #logging.error(
 144+ # "Failed to process page %s:%s - %s" % (
 145+ # page.getId(),
 146+ # page.getTitle(),
 147+ # e
 148+ # ))
 149+ #logging.error(traceback.print_exc())
 150+#fh.close()
 151+#sys.exit(0)
Property changes on: trunk/tools/wsor/diffs/revision_differ.py
___________________________________________________________________
Added: svn:executable
1152 + *
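
Each revision handled above becomes one tab-separated stdout row: nine repr()ed metadata fields followed by one position:op:content triple per reported diff segment. A sketch of such a row, with all values invented for illustration:

# Hypothetical output row (values invented; field order as built in `row` above):
row = ['12345', '678', '0', "'Example'", "'20110101000000'", "'fix typo'",
       'False', '42', "'Halfak'", "9:-1:'263254'", "9:1:'26354'"]
print "\t".join(row)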
Index: trunk/tools/wsor/diffs/xml_simulator.py
@@ -0,0 +1,80 @@
 2+import sys
 3+from StringIO import StringIO
 4+from collections import deque
 5+
 6+class FileWrapper:
 7+
 8+ def __init__(self, fp, pre='', post=''):
 9+ self.fp = fp
 10+ self.pre = StringIO(pre)
 11+ self.post = StringIO(post)
 12+ self.closed = False
 13+ self.mode = "r"
 14+
 15+ def read(self, bytes=sys.maxint):
 16+ bytes = int(bytes)
 17+ if self.closed: raise ValueError("I/O operation on closed file")
 18+
 19+ preBytes = self.pre.read(bytes)
 20+ if len(preBytes) < bytes:
 21+ fpBytes = self.fp.read(bytes-len(preBytes))
 22+ else:
 23+ fpBytes = ''
 24+
 25+ if len(preBytes) + len(fpBytes) < bytes:
 26+ postBytes = self.post.read(bytes-(len(preBytes) + len(fpBytes)))
 27+ else:
 28+ postBytes = ''
 29+
 30+ return preBytes + fpBytes + postBytes
 31+
 32+ def readline(self):
 33+ if self.closed: raise ValueError("I/O operation on closed file")
 34+
 35+ output = self.pre.readline()
 36+ if len(output) == 0 or output[-1] != "\n":
 37+ output += self.fp.readline()
 38+ if len(output) == 0 or output[-1] != "\n":
 39+ output += self.post.readline()
 40+
 41+ return output
 42+
 43+ def readlines(self): raise NotImplementedError()
 44+
 45+ def __iter__(self):
 46+
 47+ line = self.readline()
 48+ while line != '':
 49+ yield line
 50+ line = self.readline()
 51+
 52+
 53+ def seek(self): raise NotImplementedError()
 54+ def write(self): raise NotImplementedError()
 55+ def writelines(self): raise NotImplementedError()
 56+ def tell(self):
 57+ return self.pre.tell() + self.fp.tell() + self.post.tell()
 58+
 59+
 60+ def close(self):
 61+ self.closed = True
 62+ self.fp.close()
 63+
 64+class RecordingFileWrapper(FileWrapper):
 65+
 66+ def __init__(self, fp, pre='', post='', record=10000):
 67+ self.history = deque(maxlen=record)
 68+ FileWrapper.__init__(self, fp, pre=pre, post=post)
 69+
 70+ def read(self, bytes=sys.maxint):
 71+ outBytes = FileWrapper.read(self, bytes)
 72+ self.history.extend(outBytes)
 73+ return outBytes
 74+
 75+ def readline(self):
 76+ outBytes = FileWrapper.readline(self)
 77+ self.history.extend(outBytes)
 78+ return outBytes
 79+
 80+ def getHistory(self):
 81+ return ''.join(self.history)
Property changes on: trunk/tools/wsor/diffs/xml_simulator.py
___________________________________________________________________
Added: svn:executable
182 + *
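
A minimal usage sketch, assuming Python 2 and that this module is importable as xml_simulator: RecordingFileWrapper splices a header and footer around a stream, which is how revision_differ.py turns the bare page fragments on stdin into a well-formed dump, while its recording deque preserves the last bytes read for error reports:

from StringIO import StringIO
from xml_simulator import RecordingFileWrapper

fragment = StringIO("<page>...</page>\n")
wrapped = RecordingFileWrapper(fragment, pre="<mediawiki>\n", post="</mediawiki>\n")

print wrapped.read()        # header, fragment, then footer, in order
print wrapped.getHistory()  # the same bytes, replayed from the recording deque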
Index: trunk/tools/wsor/diffs/diff_match_patch.py
@@ -0,0 +1,1949 @@
 2+#!/usr/bin/env python
 3+
 4+"""Diff Match and Patch
 5+
 6+Copyright 2006 Google Inc.
 7+http://code.google.com/p/google-diff-match-patch/
 8+
 9+Licensed under the Apache License, Version 2.0 (the "License");
 10+you may not use this file except in compliance with the License.
 11+You may obtain a copy of the License at
 12+
 13+ http://www.apache.org/licenses/LICENSE-2.0
 14+
 15+Unless required by applicable law or agreed to in writing, software
 16+distributed under the License is distributed on an "AS IS" BASIS,
 17+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18+See the License for the specific language governing permissions and
 19+limitations under the License.
 20+"""
 21+
 22+"""Functions for diff, match and patch.
 23+
 24+Computes the difference between two texts to create a patch.
 25+Applies the patch onto another text, allowing for errors.
 26+"""
 27+
 28+__author__ = 'fraser@google.com (Neil Fraser)'
 29+
 30+import math
 31+import time
 32+import urllib
 33+import re
 34+import sys
 35+
 36+class diff_match_patch:
 37+ """Class containing the diff, match and patch methods.
 38+
 39+ Also contains the behaviour settings.
 40+ """
 41+
 42+ def __init__(self):
 43+ """Inits a diff_match_patch object with default settings.
 44+ Redefine these in your program to override the defaults.
 45+ """
 46+
 47+ # Number of seconds to map a diff before giving up (0 for infinity).
 48+ self.Diff_Timeout = 1.0
 49+ # Cost of an empty edit operation in terms of edit characters.
 50+ self.Diff_EditCost = 4
 51+ # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
 52+ self.Match_Threshold = 0.5
 53+ # How far to search for a match (0 = exact location, 1000+ = broad match).
 54+ # A match this many characters away from the expected location will add
 55+ # 1.0 to the score (0.0 is a perfect match).
 56+ self.Match_Distance = 1000
 57+ # When deleting a large block of text (over ~64 characters), how close does
 58+ # the contents have to match the expected contents. (0.0 = perfection,
 59+ # 1.0 = very loose). Note that Match_Threshold controls how closely the
 60+ # end points of a delete need to match.
 61+ self.Patch_DeleteThreshold = 0.5
 62+ # Chunk size for context length.
 63+ self.Patch_Margin = 4
 64+
 65+ # The number of bits in an int.
 66+ # Python has no maximum, thus to disable patch splitting set to 0.
 67+ # However to avoid long patches in certain pathological cases, use 32.
 68+ # Multiple short patches (using native ints) are much faster than long ones.
 69+ self.Match_MaxBits = 32
 70+
 71+ # DIFF FUNCTIONS
 72+
 73+ # The data structure representing a diff is an array of tuples:
 74+ # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
 75+ # which means: delete "Hello", add "Goodbye" and keep " world."
 76+ DIFF_DELETE = -1
 77+ DIFF_INSERT = 1
 78+ DIFF_EQUAL = 0
 79+
 80+ def diff_main(self, text1, text2, checklines=True, deadline=None):
 81+ """Find the differences between two texts. Simplifies the problem by
 82+ stripping any common prefix or suffix off the texts before diffing.
 83+
 84+ Args:
 85+ text1: Old string to be diffed.
 86+ text2: New string to be diffed.
 87+ checklines: Optional speedup flag. If present and false, then don't run
 88+ a line-level diff first to identify the changed areas.
 89+ Defaults to true, which does a faster, slightly less optimal diff.
 90+ deadline: Optional time when the diff should be complete by. Used
 91+ internally for recursive calls. Users should set DiffTimeout instead.
 92+
 93+ Returns:
 94+ Array of changes.
 95+ """
 96+ # Set a deadline by which time the diff must be complete.
 97+ if deadline == None:
 98+ # Unlike in most languages, Python counts time in seconds.
 99+ if self.Diff_Timeout <= 0:
 100+ deadline = sys.maxint
 101+ else:
 102+ deadline = time.time() + self.Diff_Timeout
 103+
 104+ # Check for null inputs.
 105+ if text1 == None or text2 == None:
 106+ raise ValueError("Null inputs. (diff_main)")
 107+
 108+ # Check for equality (speedup).
 109+ if text1 == text2:
 110+ if text1:
 111+ return [(self.DIFF_EQUAL, text1)]
 112+ return []
 113+
 114+ # Trim off common prefix (speedup).
 115+ commonlength = self.diff_commonPrefix(text1, text2)
 116+ commonprefix = text1[:commonlength]
 117+ text1 = text1[commonlength:]
 118+ text2 = text2[commonlength:]
 119+
 120+ # Trim off common suffix (speedup).
 121+ commonlength = self.diff_commonSuffix(text1, text2)
 122+ if commonlength == 0:
 123+ commonsuffix = ''
 124+ else:
 125+ commonsuffix = text1[-commonlength:]
 126+ text1 = text1[:-commonlength]
 127+ text2 = text2[:-commonlength]
 128+
 129+ # Compute the diff on the middle block.
 130+ diffs = self.diff_compute(text1, text2, checklines, deadline)
 131+
 132+ # Restore the prefix and suffix.
 133+ if commonprefix:
 134+ diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
 135+ if commonsuffix:
 136+ diffs.append((self.DIFF_EQUAL, commonsuffix))
 137+ self.diff_cleanupMerge(diffs)
 138+ return diffs
 139+
 140+ def diff_compute(self, text1, text2, checklines, deadline):
 141+ """Find the differences between two texts. Assumes that the texts do not
 142+ have any common prefix or suffix.
 143+
 144+ Args:
 145+ text1: Old string to be diffed.
 146+ text2: New string to be diffed.
 147+ checklines: Speedup flag. If false, then don't run a line-level diff
 148+ first to identify the changed areas.
 149+ If true, then run a faster, slightly less optimal diff.
 150+ deadline: Time when the diff should be complete by.
 151+
 152+ Returns:
 153+ Array of changes.
 154+ """
 155+ if not text1:
 156+ # Just add some text (speedup).
 157+ return [(self.DIFF_INSERT, text2)]
 158+
 159+ if not text2:
 160+ # Just delete some text (speedup).
 161+ return [(self.DIFF_DELETE, text1)]
 162+
 163+ if len(text1) > len(text2):
 164+ (longtext, shorttext) = (text1, text2)
 165+ else:
 166+ (shorttext, longtext) = (text1, text2)
 167+ i = longtext.find(shorttext)
 168+ if i != -1:
 169+ # Shorter text is inside the longer text (speedup).
 170+ diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
 171+ (self.DIFF_INSERT, longtext[i + len(shorttext):])]
 172+ # Swap insertions for deletions if diff is reversed.
 173+ if len(text1) > len(text2):
 174+ diffs[0] = (self.DIFF_DELETE, diffs[0][1])
 175+ diffs[2] = (self.DIFF_DELETE, diffs[2][1])
 176+ return diffs
 177+
 178+ if len(shorttext) == 1:
 179+ # Single character string.
 180+ # After the previous speedup, the character can't be an equality.
 181+ return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
 182+ longtext = shorttext = None # Garbage collect.
 183+
 184+ # Check to see if the problem can be split in two.
 185+ hm = self.diff_halfMatch(text1, text2)
 186+ if hm:
 187+ # A half-match was found, sort out the return data.
 188+ (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
 189+ # Send both pairs off for separate processing.
 190+ diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
 191+ diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
 192+ # Merge the results.
 193+ return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
 194+
 195+ if checklines and len(text1) > 100 and len(text2) > 100:
 196+ return self.diff_lineMode(text1, text2, deadline)
 197+
 198+ return self.diff_bisect(text1, text2, deadline)
 199+
 200+ def diff_lineMode(self, text1, text2, deadline):
 201+ """Do a quick line-level diff on both strings, then rediff the parts for
 202+ greater accuracy.
 203+ This speedup can produce non-minimal diffs.
 204+
 205+ Args:
 206+ text1: Old string to be diffed.
 207+ text2: New string to be diffed.
 208+ deadline: Time when the diff should be complete by.
 209+
 210+ Returns:
 211+ Array of changes.
 212+ """
 213+
 214+ # Scan the text on a line-by-line basis first.
 215+ (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
 216+
 217+ diffs = self.diff_main(text1, text2, False, deadline)
 218+
 219+ # Convert the diff back to original text.
 220+ self.diff_charsToLines(diffs, linearray)
 221+ # Eliminate freak matches (e.g. blank lines)
 222+ self.diff_cleanupSemantic(diffs)
 223+
 224+ # Rediff any replacement blocks, this time character-by-character.
 225+ # Add a dummy entry at the end.
 226+ diffs.append((self.DIFF_EQUAL, ''))
 227+ pointer = 0
 228+ count_delete = 0
 229+ count_insert = 0
 230+ text_delete = ''
 231+ text_insert = ''
 232+ while pointer < len(diffs):
 233+ if diffs[pointer][0] == self.DIFF_INSERT:
 234+ count_insert += 1
 235+ text_insert += diffs[pointer][1]
 236+ elif diffs[pointer][0] == self.DIFF_DELETE:
 237+ count_delete += 1
 238+ text_delete += diffs[pointer][1]
 239+ elif diffs[pointer][0] == self.DIFF_EQUAL:
 240+ # Upon reaching an equality, check for prior redundancies.
 241+ if count_delete >= 1 and count_insert >= 1:
 242+ # Delete the offending records and add the merged ones.
 243+ a = self.diff_main(text_delete, text_insert, False, deadline)
 244+ diffs[pointer - count_delete - count_insert : pointer] = a
 245+ pointer = pointer - count_delete - count_insert + len(a)
 246+ count_insert = 0
 247+ count_delete = 0
 248+ text_delete = ''
 249+ text_insert = ''
 250+
 251+ pointer += 1
 252+
 253+ diffs.pop() # Remove the dummy entry at the end.
 254+
 255+ return diffs
 256+
 257+ def diff_bisect(self, text1, text2, deadline):
 258+ """Find the 'middle snake' of a diff, split the problem in two
 259+ and return the recursively constructed diff.
 260+ See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
 261+
 262+ Args:
 263+ text1: Old string to be diffed.
 264+ text2: New string to be diffed.
 265+ deadline: Time at which to bail if not yet complete.
 266+
 267+ Returns:
 268+ Array of diff tuples.
 269+ """
 270+
 271+ # Cache the text lengths to prevent multiple calls.
 272+ text1_length = len(text1)
 273+ text2_length = len(text2)
 274+ max_d = (text1_length + text2_length + 1) / 2
 275+ v_offset = max_d
 276+ v_length = 2 * max_d
 277+ v1 = [-1] * v_length
 278+ v1[v_offset + 1] = 0
 279+ v2 = v1[:]
 280+ delta = text1_length - text2_length
 281+ # If the total number of characters is odd, then the front path will
 282+ # collide with the reverse path.
 283+ front = (delta % 2 != 0)
 284+ # Offsets for start and end of k loop.
 285+ # Prevents mapping of space beyond the grid.
 286+ k1start = 0
 287+ k1end = 0
 288+ k2start = 0
 289+ k2end = 0
 290+ for d in xrange(max_d):
 291+ # Bail out if deadline is reached.
 292+ if time.time() > deadline:
 293+ break
 294+
 295+ # Walk the front path one step.
 296+ for k1 in xrange(-d + k1start, d + 1 - k1end, 2):
 297+ k1_offset = v_offset + k1
 298+ if (k1 == -d or k1 != d and
 299+ v1[k1_offset - 1] < v1[k1_offset + 1]):
 300+ x1 = v1[k1_offset + 1]
 301+ else:
 302+ x1 = v1[k1_offset - 1] + 1
 303+ y1 = x1 - k1
 304+ while (x1 < text1_length and y1 < text2_length and
 305+ text1[x1] == text2[y1]):
 306+ x1 += 1
 307+ y1 += 1
 308+ v1[k1_offset] = x1
 309+ if x1 > text1_length:
 310+ # Ran off the right of the graph.
 311+ k1end += 2
 312+ elif y1 > text2_length:
 313+ # Ran off the bottom of the graph.
 314+ k1start += 2
 315+ elif front:
 316+ k2_offset = v_offset + delta - k1
 317+ if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
 318+ # Mirror x2 onto top-left coordinate system.
 319+ x2 = text1_length - v2[k2_offset]
 320+ if x1 >= x2:
 321+ # Overlap detected.
 322+ return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
 323+
 324+ # Walk the reverse path one step.
 325+ for k2 in xrange(-d + k2start, d + 1 - k2end, 2):
 326+ k2_offset = v_offset + k2
 327+ if (k2 == -d or k2 != d and
 328+ v2[k2_offset - 1] < v2[k2_offset + 1]):
 329+ x2 = v2[k2_offset + 1]
 330+ else:
 331+ x2 = v2[k2_offset - 1] + 1
 332+ y2 = x2 - k2
 333+ while (x2 < text1_length and y2 < text2_length and
 334+ text1[-x2 - 1] == text2[-y2 - 1]):
 335+ x2 += 1
 336+ y2 += 1
 337+ v2[k2_offset] = x2
 338+ if x2 > text1_length:
 339+ # Ran off the left of the graph.
 340+ k2end += 2
 341+ elif y2 > text2_length:
 342+ # Ran off the top of the graph.
 343+ k2start += 2
 344+ elif not front:
 345+ k1_offset = v_offset + delta - k2
 346+ if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
 347+ x1 = v1[k1_offset]
 348+ y1 = v_offset + x1 - k1_offset
 349+ # Mirror x2 onto top-left coordinate system.
 350+ x2 = text1_length - x2
 351+ if x1 >= x2:
 352+ # Overlap detected.
 353+ return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
 354+
 355+ # Diff took too long and hit the deadline or
 356+ # number of diffs equals number of characters, no commonality at all.
 357+ return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
 358+
 359+ def diff_bisectSplit(self, text1, text2, x, y, deadline):
 360+ """Given the location of the 'middle snake', split the diff in two parts
 361+ and recurse.
 362+
 363+ Args:
 364+ text1: Old string to be diffed.
 365+ text2: New string to be diffed.
 366+ x: Index of split point in text1.
 367+ y: Index of split point in text2.
 368+ deadline: Time at which to bail if not yet complete.
 369+
 370+ Returns:
 371+ Array of diff tuples.
 372+ """
 373+ text1a = text1[:x]
 374+ text2a = text2[:y]
 375+ text1b = text1[x:]
 376+ text2b = text2[y:]
 377+
 378+ # Compute both diffs serially.
 379+ diffs = self.diff_main(text1a, text2a, False, deadline)
 380+ diffsb = self.diff_main(text1b, text2b, False, deadline)
 381+
 382+ return diffs + diffsb
 383+
 384+ def diff_linesToChars(self, text1, text2):
 385+ """Split two texts into an array of strings. Reduce the texts to a string
 386+ of hashes where each Unicode character represents one line.
 387+
 388+ Args:
 389+ text1: First string.
 390+ text2: Second string.
 391+
 392+ Returns:
 393+ Three element tuple, containing the encoded text1, the encoded text2 and
 394+ the array of unique strings. The zeroth element of the array of unique
 395+ strings is intentionally blank.
 396+ """
 397+ lineArray = [] # e.g. lineArray[4] == "Hello\n"
 398+ lineHash = {} # e.g. lineHash["Hello\n"] == 4
 399+
 400+ # "\x00" is a valid character, but various debuggers don't like it.
 401+ # So we'll insert a junk entry to avoid generating a null character.
 402+ lineArray.append('')
 403+
 404+ def diff_linesToCharsMunge(text):
 405+ """Split a text into an array of strings. Reduce the texts to a string
 406+ of hashes where each Unicode character represents one line.
 407+ Modifies linearray and linehash through being a closure.
 408+
 409+ Args:
 410+ text: String to encode.
 411+
 412+ Returns:
 413+ Encoded string.
 414+ """
 415+ chars = []
 416+ # Walk the text, pulling out a substring for each line.
 417+ # text.split('\n') would temporarily double our memory footprint.
 418+ # Modifying text would create many large strings to garbage collect.
 419+ lineStart = 0
 420+ lineEnd = -1
 421+ while lineEnd < len(text) - 1:
 422+ lineEnd = text.find('\n', lineStart)
 423+ if lineEnd == -1:
 424+ lineEnd = len(text) - 1
 425+ line = text[lineStart:lineEnd + 1]
 426+ lineStart = lineEnd + 1
 427+
 428+ if line in lineHash:
 429+ chars.append(unichr(lineHash[line]))
 430+ else:
 431+ lineArray.append(line)
 432+ lineHash[line] = len(lineArray) - 1
 433+ chars.append(unichr(len(lineArray) - 1))
 434+ return "".join(chars)
 435+
 436+ chars1 = diff_linesToCharsMunge(text1)
 437+ chars2 = diff_linesToCharsMunge(text2)
 438+ return (chars1, chars2, lineArray)
 439+
 440+ def diff_linesToWords(self, text1, text2):
 441+ """
 442+ INSERT BY FABIAN
 443+ Split two texts into an array of strings. Reduce the texts to a string
 444+ of hashes where each Unicode character represents one word.
 445+
 446+ Args:
 447+ text1: First string.
 448+ text2: Second string.
 449+
 450+ Returns:
 451+ Three element tuple, containing the encoded text1, the encoded text2 and
 452+ the array of unique strings. The zeroth element of the array of unique
 453+ strings is intentionally blank.
 454+ """
 455+ lineArray = [] # e.g. lineArray[4] == "Hello "
 456+ lineHash = {} # e.g. lineHash["Hello "] == 4
 457+
 458+ # "\x00" is a valid character, but various debuggers don't like it.
 459+ # So we'll insert a junk entry to avoid generating a null character.
 460+ lineArray.append('')
 461+
 462+ def diff_linesToCharsMunge(text):
 463+ """Split a text into an array of strings. Reduce the texts to a string
 464+ of hashes where each Unicode character represents one word.
 465+ Modifies linearray and linehash through being a closure.
 466+
 467+ Args:
 468+ text: String to encode.
 469+
 470+ Returns:
 471+ Encoded string.
 472+ """
 473+ chars = []
 474+ # Walk the text, pulling out a substring for each word.
 475+ # text.split(' ') would temporarily double our memory footprint.
 476+ # Modifying text would create many large strings to garbage collect.
 477+ lineStart = 0
 478+ lineEnd = -1
 479+ while lineEnd < len(text) - 1:
 480+ lineEnd = text.find(' ', lineStart)
 481+ if lineEnd == -1:
 482+ lineEnd = len(text) - 1
 483+ line = text[lineStart:lineEnd + 1]
 484+ lineStart = lineEnd + 1
 485+
 486+ if line in lineHash:
 487+ chars.append(unichr(lineHash[line]))
 488+ else:
 489+ lineArray.append(line)
 490+ lineHash[line] = len(lineArray) - 1
 491+ chars.append(unichr(len(lineArray) - 1))
 492+ return "".join(chars)
 493+
 494+ chars1 = diff_linesToCharsMunge(text1)
 495+ chars2 = diff_linesToCharsMunge(text2)
 496+ return (chars1, chars2, lineArray)
 497+
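# Usage sketch for the word-level variant (an assumption about its intended
# use): encode words, diff the short encoded strings, then rehydrate with
# diff_charsToLines, which only maps code points back through the array and
# so works for words as well as lines.
dmp = diff_match_patch()
(w1, w2, wordArray) = dmp.diff_linesToWords("the quick fox", "the slow fox")
diffs = dmp.diff_main(w1, w2, False)
dmp.diff_charsToLines(diffs, wordArray)
assert dmp.diff_text2(diffs) == "the slow fox"
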
 498+
 499+
 500+ def diff_charsToLines(self, diffs, lineArray):
 501+ """Rehydrate the text in a diff from a string of line hashes to real lines
 502+ of text.
 503+
 504+ Args:
 505+ diffs: Array of diff tuples.
 506+ lineArray: Array of unique strings.
 507+ """
 508+ for x in xrange(len(diffs)):
 509+ text = []
 510+ for char in diffs[x][1]:
 511+ text.append(lineArray[ord(char)])
 512+ diffs[x] = (diffs[x][0], "".join(text))
 513+
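# Usage sketch: the classic line-mode speedup built from the two helpers
# above, diffing hashed lines and then rehydrating to real lines.
dmp = diff_match_patch()
a = "The quick fox.\nJumped over.\n"
b = "The quick fox.\nLeaped over.\n"
(c1, c2, lineArray) = dmp.diff_linesToChars(a, b)
diffs = dmp.diff_main(c1, c2, False)
dmp.diff_charsToLines(diffs, lineArray)
assert dmp.diff_text1(diffs) == a and dmp.diff_text2(diffs) == b
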
 514+ def diff_commonPrefix(self, text1, text2):
 515+ """Determine the common prefix of two strings.
 516+
 517+ Args:
 518+ text1: First string.
 519+ text2: Second string.
 520+
 521+ Returns:
 522+ The number of characters common to the start of each string.
 523+ """
 524+ # Quick check for common null cases.
 525+ if not text1 or not text2 or text1[0] != text2[0]:
 526+ return 0
 527+ # Binary search.
 528+ # Performance analysis: http://neil.fraser.name/news/2007/10/09/
 529+ pointermin = 0
 530+ pointermax = min(len(text1), len(text2))
 531+ pointermid = pointermax
 532+ pointerstart = 0
 533+ while pointermin < pointermid:
 534+ if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
 535+ pointermin = pointermid
 536+ pointerstart = pointermin
 537+ else:
 538+ pointermax = pointermid
 539+ pointermid = int((pointermax - pointermin) / 2 + pointermin)
 540+ return pointermid
 541+
 542+ def diff_commonSuffix(self, text1, text2):
 543+ """Determine the common suffix of two strings.
 544+
 545+ Args:
 546+ text1: First string.
 547+ text2: Second string.
 548+
 549+ Returns:
 550+ The number of characters common to the end of each string.
 551+ """
 552+ # Quick check for common null cases.
 553+ if not text1 or not text2 or text1[-1] != text2[-1]:
 554+ return 0
 555+ # Binary search.
 556+ # Performance analysis: http://neil.fraser.name/news/2007/10/09/
 557+ pointermin = 0
 558+ pointermax = min(len(text1), len(text2))
 559+ pointermid = pointermax
 560+ pointerend = 0
 561+ while pointermin < pointermid:
 562+ if (text1[-pointermid:len(text1) - pointerend] ==
 563+ text2[-pointermid:len(text2) - pointerend]):
 564+ pointermin = pointermid
 565+ pointerend = pointermin
 566+ else:
 567+ pointermax = pointermid
 568+ pointermid = int((pointermax - pointermin) / 2 + pointermin)
 569+ return pointermid
 570+
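# Sketch: both scans return a character count, not a substring (values from
# the upstream test suite).
dmp = diff_match_patch()
assert dmp.diff_commonPrefix("1234abcdef", "1234xyz") == 4
assert dmp.diff_commonSuffix("abcdef1234", "xyz1234") == 4
assert dmp.diff_commonPrefix("abc", "xyz") == 0
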
 571+ def diff_commonOverlap(self, text1, text2):
 572+ """Determine if the suffix of one string is the prefix of another.
 573+
 574+ Args:
 575+ text1 First string.
 576+ text2 Second string.
 577+
 578+ Returns:
 579+ The number of characters common to the end of the first
 580+ string and the start of the second string.
 581+ """
 582+ # Cache the text lengths to prevent multiple calls.
 583+ text1_length = len(text1)
 584+ text2_length = len(text2)
 585+ # Eliminate the null case.
 586+ if text1_length == 0 or text2_length == 0:
 587+ return 0
 588+ # Truncate the longer string.
 589+ if text1_length > text2_length:
 590+ text1 = text1[-text2_length:]
 591+ elif text1_length < text2_length:
 592+ text2 = text2[:text1_length]
 593+ text_length = min(text1_length, text2_length)
 594+ # Quick check for the worst case.
 595+ if text1 == text2:
 596+ return text_length
 597+
 598+ # Start by looking for a single character match
 599+ # and increase length until no match is found.
 600+ # Performance analysis: http://neil.fraser.name/news/2010/11/04/
 601+ best = 0
 602+ length = 1
 603+ while True:
 604+ pattern = text1[-length:]
 605+ found = text2.find(pattern)
 606+ if found == -1:
 607+ return best
 608+ length += found
 609+ if found == 0 or text1[-length:] == text2[:length]:
 610+ best = length
 611+ length += 1
 612+
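# Sketch: the overlap is measured from the end of text1 into the start of
# text2, matching the <del>abcxxx</del><ins>xxxdef</ins> case handled by
# diff_cleanupSemantic below.
dmp = diff_match_patch()
assert dmp.diff_commonOverlap("abcxxx", "xxxdef") == 3
assert dmp.diff_commonOverlap("abc", "xyz") == 0
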
 613+ def diff_halfMatch(self, text1, text2):
 614+ """Do the two texts share a substring which is at least half the length of
 615+ the longer text?
 616+ This speedup can produce non-minimal diffs.
 617+
 618+ Args:
 619+ text1: First string.
 620+ text2: Second string.
 621+
 622+ Returns:
 623+ Five element Array, containing the prefix of text1, the suffix of text1,
 624+ the prefix of text2, the suffix of text2 and the common middle. Or None
 625+ if there was no match.
 626+ """
 627+ if self.Diff_Timeout <= 0:
 628+ # Don't risk returning a non-optimal diff if we have unlimited time.
 629+ return None
 630+ if len(text1) > len(text2):
 631+ (longtext, shorttext) = (text1, text2)
 632+ else:
 633+ (shorttext, longtext) = (text1, text2)
 634+ if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
 635+ return None # Pointless.
 636+
 637+ def diff_halfMatchI(longtext, shorttext, i):
 638+ """Does a substring of shorttext exist within longtext such that the
 639+ substring is at least half the length of longtext?
 640+ Closure, but does not reference any external variables.
 641+
 642+ Args:
 643+ longtext: Longer string.
 644+ shorttext: Shorter string.
 645+ i: Start index of quarter length substring within longtext.
 646+
 647+ Returns:
 648+ Five element Array, containing the prefix of longtext, the suffix of
 649+ longtext, the prefix of shorttext, the suffix of shorttext and the
 650+ common middle. Or None if there was no match.
 651+ """
 652+ seed = longtext[i:i + len(longtext) / 4]
 653+ best_common = ''
 654+ j = shorttext.find(seed)
 655+ while j != -1:
 656+ prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
 657+ suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
 658+ if len(best_common) < suffixLength + prefixLength:
 659+ best_common = (shorttext[j - suffixLength:j] +
 660+ shorttext[j:j + prefixLength])
 661+ best_longtext_a = longtext[:i - suffixLength]
 662+ best_longtext_b = longtext[i + prefixLength:]
 663+ best_shorttext_a = shorttext[:j - suffixLength]
 664+ best_shorttext_b = shorttext[j + prefixLength:]
 665+ j = shorttext.find(seed, j + 1)
 666+
 667+ if len(best_common) * 2 >= len(longtext):
 668+ return (best_longtext_a, best_longtext_b,
 669+ best_shorttext_a, best_shorttext_b, best_common)
 670+ else:
 671+ return None
 672+
 673+ # First check if the second quarter is the seed for a half-match.
 674+ hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
 675+ # Check again based on the third quarter.
 676+ hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
 677+ if not hm1 and not hm2:
 678+ return None
 679+ elif not hm2:
 680+ hm = hm1
 681+ elif not hm1:
 682+ hm = hm2
 683+ else:
 684+ # Both matched. Select the longest.
 685+ if len(hm1[4]) > len(hm2[4]):
 686+ hm = hm1
 687+ else:
 688+ hm = hm2
 689+
 690+ # A half-match was found, sort out the return data.
 691+ if len(text1) > len(text2):
 692+ (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
 693+ else:
 694+ (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
 695+ return (text1_a, text1_b, text2_a, text2_b, mid_common)
 696+
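# Sketch: the half-match shortcut only fires when a timeout is set; the
# five-tuple is (prefix1, suffix1, prefix2, suffix2, common middle).
# Expected value from the upstream test suite.
dmp = diff_match_patch()
dmp.Diff_Timeout = 1.0
assert (dmp.diff_halfMatch("1234567890", "a345678z") ==
        ("12", "90", "a", "z", "345678"))
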
 697+ def diff_cleanupSemantic(self, diffs):
 698+ """Reduce the number of edits by eliminating semantically trivial
 699+ equalities.
 700+
 701+ Args:
 702+ diffs: Array of diff tuples.
 703+ """
 704+ changes = False
 705+ equalities = [] # Stack of indices where equalities are found.
 706+ lastequality = None # Always equal to equalities[-1][1]
 707+ pointer = 0 # Index of current position.
 708+ # Number of chars that changed prior to the equality.
 709+ length_insertions1, length_deletions1 = 0, 0
 710+ # Number of chars that changed after the equality.
 711+ length_insertions2, length_deletions2 = 0, 0
 712+ while pointer < len(diffs):
 713+ if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
 714+ equalities.append(pointer)
 715+ length_insertions1, length_insertions2 = length_insertions2, 0
 716+ length_deletions1, length_deletions2 = length_deletions2, 0
 717+ lastequality = diffs[pointer][1]
 718+ else: # An insertion or deletion.
 719+ if diffs[pointer][0] == self.DIFF_INSERT:
 720+ length_insertions2 += len(diffs[pointer][1])
 721+ else:
 722+ length_deletions2 += len(diffs[pointer][1])
 723+ # Eliminate an equality that is smaller or equal to the edits on both
 724+ # sides of it.
 725+ if (lastequality != None and (len(lastequality) <=
 726+ max(length_insertions1, length_deletions1)) and
 727+ (len(lastequality) <= max(length_insertions2, length_deletions2))):
 728+ # Duplicate record.
 729+ diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
 730+ # Change second copy to insert.
 731+ diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
 732+ diffs[equalities[-1] + 1][1])
 733+ # Throw away the equality we just deleted.
 734+ equalities.pop()
 735+ # Throw away the previous equality (it needs to be reevaluated).
 736+ if len(equalities):
 737+ equalities.pop()
 738+ if len(equalities):
 739+ pointer = equalities[-1]
 740+ else:
 741+ pointer = -1
 742+ # Reset the counters.
 743+ length_insertions1, length_deletions1 = 0, 0
 744+ length_insertions2, length_deletions2 = 0, 0
 745+ lastequality = None
 746+ changes = True
 747+ pointer += 1
 748+
 749+ # Normalize the diff.
 750+ if changes:
 751+ self.diff_cleanupMerge(diffs)
 752+ self.diff_cleanupSemanticLossless(diffs)
 753+
 754+ # Find any overlaps between deletions and insertions.
 755+ # e.g: <del>abcxxx</del><ins>xxxdef</ins>
 756+ # -> <del>abc</del>xxx<ins>def</ins>
 757+ # Only extract an overlap if it is as big as the edit ahead or behind it.
 758+ pointer = 1
 759+ while pointer < len(diffs):
 760+ if (diffs[pointer - 1][0] == self.DIFF_DELETE and
 761+ diffs[pointer][0] == self.DIFF_INSERT):
 762+ deletion = diffs[pointer - 1][1]
 763+ insertion = diffs[pointer][1]
 764+ overlap_length = self.diff_commonOverlap(deletion, insertion)
 765+ if (overlap_length >= len(deletion) / 2.0 or
 766+ overlap_length >= len(insertion) / 2.0):
 767+ # Overlap found. Insert an equality and trim the surrounding edits.
 768+ diffs.insert(pointer, (self.DIFF_EQUAL, insertion[:overlap_length]))
 769+ diffs[pointer - 1] = (self.DIFF_DELETE,
 770+ deletion[:len(deletion) - overlap_length])
 771+ diffs[pointer + 1] = (self.DIFF_INSERT, insertion[overlap_length:])
 772+ pointer += 1
 773+ pointer += 1
 774+ pointer += 1
 775+
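# Sketch: a one-character equality flanked by deletions is folded away
# (expected result from the upstream test suite).
dmp = diff_match_patch()
diffs = [(dmp.DIFF_DELETE, "a"), (dmp.DIFF_EQUAL, "b"), (dmp.DIFF_DELETE, "c")]
dmp.diff_cleanupSemantic(diffs)
assert diffs == [(dmp.DIFF_DELETE, "abc"), (dmp.DIFF_INSERT, "b")]
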
 776+ def diff_cleanupSemanticLossless(self, diffs):
 777+ """Look for single edits surrounded on both sides by equalities
 778+ which can be shifted sideways to align the edit to a word boundary.
 779+ e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
 780+
 781+ Args:
 782+ diffs: Array of diff tuples.
 783+ """
 784+
 785+ def diff_cleanupSemanticScore(one, two):
 786+ """Given two strings, compute a score representing whether the
 787+ internal boundary falls on logical boundaries.
 788+ Scores range from 5 (best) to 0 (worst).
 789+ Closure, but does not reference any external variables.
 790+
 791+ Args:
 792+ one: First string.
 793+ two: Second string.
 794+
 795+ Returns:
 796+ The score.
 797+ """
 798+ if not one or not two:
 799+ # Edges are the best.
 800+ return 5
 801+
 802+ # Each port of this function behaves slightly differently due to
 803+ # subtle differences in each language's definition of things like
 804+ # 'whitespace'. Since this function's purpose is largely cosmetic,
 805+ # the choice has been made to use each language's native features
 806+ # rather than force total conformity.
 807+ score = 0
 808+ # One point for non-alphanumeric.
 809+ if not one[-1].isalnum() or not two[0].isalnum():
 810+ score += 1
 811+ # Two points for whitespace.
 812+ if one[-1].isspace() or two[0].isspace():
 813+ score += 1
 814+ # Three points for line breaks.
 815+ if (one[-1] == "\r" or one[-1] == "\n" or
 816+ two[0] == "\r" or two[0] == "\n"):
 817+ score += 1
 818+ # Four points for blank lines.
 819+ if (re.search("\\n\\r?\\n$", one) or
 820+ re.match("^\\r?\\n\\r?\\n", two)):
 821+ score += 1
 822+ return score
 823+
 824+ pointer = 1
 825+ # Intentionally ignore the first and last element (don't need checking).
 826+ while pointer < len(diffs) - 1:
 827+ if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
 828+ diffs[pointer + 1][0] == self.DIFF_EQUAL):
 829+ # This is a single edit surrounded by equalities.
 830+ equality1 = diffs[pointer - 1][1]
 831+ edit = diffs[pointer][1]
 832+ equality2 = diffs[pointer + 1][1]
 833+
 834+ # First, shift the edit as far left as possible.
 835+ commonOffset = self.diff_commonSuffix(equality1, edit)
 836+ if commonOffset:
 837+ commonString = edit[-commonOffset:]
 838+ equality1 = equality1[:-commonOffset]
 839+ edit = commonString + edit[:-commonOffset]
 840+ equality2 = commonString + equality2
 841+
 842+ # Second, step character by character right, looking for the best fit.
 843+ bestEquality1 = equality1
 844+ bestEdit = edit
 845+ bestEquality2 = equality2
 846+ bestScore = (diff_cleanupSemanticScore(equality1, edit) +
 847+ diff_cleanupSemanticScore(edit, equality2))
 848+ while edit and equality2 and edit[0] == equality2[0]:
 849+ equality1 += edit[0]
 850+ edit = edit[1:] + equality2[0]
 851+ equality2 = equality2[1:]
 852+ score = (diff_cleanupSemanticScore(equality1, edit) +
 853+ diff_cleanupSemanticScore(edit, equality2))
 854+ # The >= encourages trailing rather than leading whitespace on edits.
 855+ if score >= bestScore:
 856+ bestScore = score
 857+ bestEquality1 = equality1
 858+ bestEdit = edit
 859+ bestEquality2 = equality2
 860+
 861+ if diffs[pointer - 1][1] != bestEquality1:
 862+ # We have an improvement, save it back to the diff.
 863+ if bestEquality1:
 864+ diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
 865+ else:
 866+ del diffs[pointer - 1]
 867+ pointer -= 1
 868+ diffs[pointer] = (diffs[pointer][0], bestEdit)
 869+ if bestEquality2:
 870+ diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
 871+ else:
 872+ del diffs[pointer + 1]
 873+ pointer -= 1
 874+ pointer += 1
 875+
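# Sketch of the docstring's own example: the insertion is slid to a word
# boundary without changing the texts the diff describes.
dmp = diff_match_patch()
diffs = [(dmp.DIFF_EQUAL, "The c"), (dmp.DIFF_INSERT, "at c"),
         (dmp.DIFF_EQUAL, "ame.")]
dmp.diff_cleanupSemanticLossless(diffs)
assert diffs == [(dmp.DIFF_EQUAL, "The "), (dmp.DIFF_INSERT, "cat "),
                 (dmp.DIFF_EQUAL, "came.")]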
 876+ def diff_cleanupEfficiency(self, diffs):
 877+ """Reduce the number of edits by eliminating operationally trivial
 878+ equalities.
 879+
 880+ Args:
 881+ diffs: Array of diff tuples.
 882+ """
 883+ changes = False
 884+ equalities = [] # Stack of indices where equalities are found.
 885+ lastequality = '' # Always equal to equalities[-1][1]
 886+ pointer = 0 # Index of current position.
 887+ pre_ins = False # Is there an insertion operation before the last equality.
 888+ pre_del = False # Is there a deletion operation before the last equality.
 889+ post_ins = False # Is there an insertion operation after the last equality.
 890+ post_del = False # Is there a deletion operation after the last equality.
 891+ while pointer < len(diffs):
 892+ if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
 893+ if (len(diffs[pointer][1]) < self.Diff_EditCost and
 894+ (post_ins or post_del)):
 895+ # Candidate found.
 896+ equalities.append(pointer)
 897+ pre_ins = post_ins
 898+ pre_del = post_del
 899+ lastequality = diffs[pointer][1]
 900+ else:
 901+ # Not a candidate, and can never become one.
 902+ equalities = []
 903+ lastequality = ''
 904+
 905+ post_ins = post_del = False
 906+ else: # An insertion or deletion.
 907+ if diffs[pointer][0] == self.DIFF_DELETE:
 908+ post_del = True
 909+ else:
 910+ post_ins = True
 911+
 912+ # Five types to be split:
 913+ # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
 914+ # <ins>A</ins>X<ins>C</ins><del>D</del>
 915+ # <ins>A</ins><del>B</del>X<ins>C</ins>
 916+ # <ins>A</ins>X<ins>C</ins><del>D</del>
 917+ # <ins>A</ins><del>B</del>X<del>C</del>
 918+
 919+ if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
 920+ ((len(lastequality) < self.Diff_EditCost / 2) and
 921+ (pre_ins + pre_del + post_ins + post_del) == 3)):
 922+ # Duplicate record.
 923+ diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
 924+ # Change second copy to insert.
 925+ diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
 926+ diffs[equalities[-1] + 1][1])
 927+ equalities.pop() # Throw away the equality we just deleted.
 928+ lastequality = ''
 929+ if pre_ins and pre_del:
 930+ # No changes made which could affect previous entry, keep going.
 931+ post_ins = post_del = True
 932+ equalities = []
 933+ else:
 934+ if len(equalities):
 935+ equalities.pop() # Throw away the previous equality.
 936+ if len(equalities):
 937+ pointer = equalities[-1]
 938+ else:
 939+ pointer = -1
 940+ post_ins = post_del = False
 941+ changes = True
 942+ pointer += 1
 943+
 944+ if changes:
 945+ self.diff_cleanupMerge(diffs)
 946+
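# Sketch: with an edit cost of 4, a three-character equality surrounded by
# four edits is cheaper to absorb than to keep (expected result from the
# upstream test suite).
dmp = diff_match_patch()
dmp.Diff_EditCost = 4
diffs = [(dmp.DIFF_DELETE, "ab"), (dmp.DIFF_INSERT, "12"),
         (dmp.DIFF_EQUAL, "xyz"), (dmp.DIFF_DELETE, "cd"),
         (dmp.DIFF_INSERT, "34")]
dmp.diff_cleanupEfficiency(diffs)
assert diffs == [(dmp.DIFF_DELETE, "abxyzcd"), (dmp.DIFF_INSERT, "12xyz34")]
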
 947+ def diff_cleanupMerge(self, diffs):
 948+ """Reorder and merge like edit sections. Merge equalities.
 949+ Any edit section can move as long as it doesn't cross an equality.
 950+
 951+ Args:
 952+ diffs: Array of diff tuples.
 953+ """
 954+ diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
 955+ pointer = 0
 956+ count_delete = 0
 957+ count_insert = 0
 958+ text_delete = ''
 959+ text_insert = ''
 960+ while pointer < len(diffs):
 961+ if diffs[pointer][0] == self.DIFF_INSERT:
 962+ count_insert += 1
 963+ text_insert += diffs[pointer][1]
 964+ pointer += 1
 965+ elif diffs[pointer][0] == self.DIFF_DELETE:
 966+ count_delete += 1
 967+ text_delete += diffs[pointer][1]
 968+ pointer += 1
 969+ elif diffs[pointer][0] == self.DIFF_EQUAL:
 970+ # Upon reaching an equality, check for prior redundancies.
 971+ if count_delete + count_insert > 1:
 972+ if count_delete != 0 and count_insert != 0:
 973+ # Factor out any common prefixes.
 974+ commonlength = self.diff_commonPrefix(text_insert, text_delete)
 975+ if commonlength != 0:
 976+ x = pointer - count_delete - count_insert - 1
 977+ if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
 978+ diffs[x] = (diffs[x][0], diffs[x][1] +
 979+ text_insert[:commonlength])
 980+ else:
 981+ diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
 982+ pointer += 1
 983+ text_insert = text_insert[commonlength:]
 984+ text_delete = text_delete[commonlength:]
 985+ # Factor out any common suffixes.
 986+ commonlength = self.diff_commonSuffix(text_insert, text_delete)
 987+ if commonlength != 0:
 988+ diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
 989+ diffs[pointer][1])
 990+ text_insert = text_insert[:-commonlength]
 991+ text_delete = text_delete[:-commonlength]
 992+ # Delete the offending records and add the merged ones.
 993+ if count_delete == 0:
 994+ diffs[pointer - count_insert : pointer] = [
 995+ (self.DIFF_INSERT, text_insert)]
 996+ elif count_insert == 0:
 997+ diffs[pointer - count_delete : pointer] = [
 998+ (self.DIFF_DELETE, text_delete)]
 999+ else:
 1000+ diffs[pointer - count_delete - count_insert : pointer] = [
 1001+ (self.DIFF_DELETE, text_delete),
 1002+ (self.DIFF_INSERT, text_insert)]
 1003+ pointer = pointer - count_delete - count_insert + 1
 1004+ if count_delete != 0:
 1005+ pointer += 1
 1006+ if count_insert != 0:
 1007+ pointer += 1
 1008+ elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
 1009+ # Merge this equality with the previous one.
 1010+ diffs[pointer - 1] = (diffs[pointer - 1][0],
 1011+ diffs[pointer - 1][1] + diffs[pointer][1])
 1012+ del diffs[pointer]
 1013+ else:
 1014+ pointer += 1
 1015+
 1016+ count_insert = 0
 1017+ count_delete = 0
 1018+ text_delete = ''
 1019+ text_insert = ''
 1020+
 1021+ if diffs[-1][1] == '':
 1022+ diffs.pop() # Remove the dummy entry at the end.
 1023+
 1024+ # Second pass: look for single edits surrounded on both sides by equalities
 1025+ # which can be shifted sideways to eliminate an equality.
 1026+ # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
 1027+ changes = False
 1028+ pointer = 1
 1029+ # Intentionally ignore the first and last element (don't need checking).
 1030+ while pointer < len(diffs) - 1:
 1031+ if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
 1032+ diffs[pointer + 1][0] == self.DIFF_EQUAL):
 1033+ # This is a single edit surrounded by equalities.
 1034+ if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
 1035+ # Shift the edit over the previous equality.
 1036+ diffs[pointer] = (diffs[pointer][0],
 1037+ diffs[pointer - 1][1] +
 1038+ diffs[pointer][1][:-len(diffs[pointer - 1][1])])
 1039+ diffs[pointer + 1] = (diffs[pointer + 1][0],
 1040+ diffs[pointer - 1][1] + diffs[pointer + 1][1])
 1041+ del diffs[pointer - 1]
 1042+ changes = True
 1043+ elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
 1044+ # Shift the edit over the next equality.
 1045+ diffs[pointer - 1] = (diffs[pointer - 1][0],
 1046+ diffs[pointer - 1][1] + diffs[pointer + 1][1])
 1047+ diffs[pointer] = (diffs[pointer][0],
 1048+ diffs[pointer][1][len(diffs[pointer + 1][1]):] +
 1049+ diffs[pointer + 1][1])
 1050+ del diffs[pointer + 1]
 1051+ changes = True
 1052+ pointer += 1
 1053+
 1054+ # If shifts were made, the diff needs reordering and another shift sweep.
 1055+ if changes:
 1056+ self.diff_cleanupMerge(diffs)
 1057+
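# Sketch: interleaved single-character edits are regrouped into one delete,
# one insert and one merged equality (expected result from the upstream
# test suite).
dmp = diff_match_patch()
diffs = [(dmp.DIFF_DELETE, "a"), (dmp.DIFF_INSERT, "b"),
         (dmp.DIFF_DELETE, "c"), (dmp.DIFF_INSERT, "d"),
         (dmp.DIFF_EQUAL, "e"), (dmp.DIFF_EQUAL, "f")]
dmp.diff_cleanupMerge(diffs)
assert diffs == [(dmp.DIFF_DELETE, "ac"), (dmp.DIFF_INSERT, "bd"),
                 (dmp.DIFF_EQUAL, "ef")]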
 1058+ def diff_xIndex(self, diffs, loc):
 1059+ """loc is a location in text1, compute and return the equivalent location
 1060+ in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
 1061+
 1062+ Args:
 1063+ diffs: Array of diff tuples.
 1064+ loc: Location within text1.
 1065+
 1066+ Returns:
 1067+ Location within text2.
 1068+ """
 1069+ chars1 = 0
 1070+ chars2 = 0
 1071+ last_chars1 = 0
 1072+ last_chars2 = 0
 1073+ for x in xrange(len(diffs)):
 1074+ (op, text) = diffs[x]
 1075+ if op != self.DIFF_INSERT: # Equality or deletion.
 1076+ chars1 += len(text)
 1077+ if op != self.DIFF_DELETE: # Equality or insertion.
 1078+ chars2 += len(text)
 1079+ if chars1 > loc: # Overshot the location.
 1080+ break
 1081+ last_chars1 = chars1
 1082+ last_chars2 = chars2
 1083+
 1084+ if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
 1085+ # The location was deleted.
 1086+ return last_chars2
 1087+ # Add the remaining character length.
 1088+ return last_chars2 + (loc - last_chars1)
 1089+
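# Sketch: index 2 in text1 ("axyz") lands on 'y', which sits at index 5 in
# text2 ("1234xyz").
dmp = diff_match_patch()
diffs = [(dmp.DIFF_DELETE, "a"), (dmp.DIFF_INSERT, "1234"),
         (dmp.DIFF_EQUAL, "xyz")]
assert dmp.diff_xIndex(diffs, 2) == 5
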
 1090+ def diff_prettyHtml(self, diffs):
 1091+ """Convert a diff array into a pretty HTML report.
 1092+
 1093+ Args:
 1094+ diffs: Array of diff tuples.
 1095+
 1096+ Returns:
 1097+ HTML representation.
 1098+ """
 1099+ html = []
 1100+ i = 0
 1101+ for (op, data) in diffs:
 1102+ text = (data.replace("&", "&amp;").replace("<", "&lt;")
 1103+ .replace(">", "&gt;").replace("\n", "&para;<br>"))
 1104+ if op == self.DIFF_INSERT:
 1105+ html.append("<ins style=\"background:#e6ffe6;\">%s</ins>" % text)
 1106+ elif op == self.DIFF_DELETE:
 1107+ html.append("<del style=\"background:#ffe6e6;\">%s</del>" % text)
 1108+ elif op == self.DIFF_EQUAL:
 1109+ html.append("<span>%s</span>" % text)
 1110+ if op != self.DIFF_DELETE:
 1111+ i += len(data)
 1112+ return "".join(html)
 1113+
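# Sketch: HTML metacharacters are escaped and newlines rendered as
# "&para;<br>" (expected output from the upstream test suite).
dmp = diff_match_patch()
diffs = [(dmp.DIFF_EQUAL, "a\n"), (dmp.DIFF_DELETE, "<B>b</B>"),
         (dmp.DIFF_INSERT, "c&d")]
assert dmp.diff_prettyHtml(diffs) == (
    '<span>a&para;<br></span>'
    '<del style="background:#ffe6e6;">&lt;B&gt;b&lt;/B&gt;</del>'
    '<ins style="background:#e6ffe6;">c&amp;d</ins>')
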
 1114+ def diff_text1(self, diffs):
 1115+ """Compute and return the source text (all equalities and deletions).
 1116+
 1117+ Args:
 1118+ diffs: Array of diff tuples.
 1119+
 1120+ Returns:
 1121+ Source text.
 1122+ """
 1123+ text = []
 1124+ for (op, data) in diffs:
 1125+ if op != self.DIFF_INSERT:
 1126+ text.append(data)
 1127+ return "".join(text)
 1128+
 1129+ def diff_text2(self, diffs):
 1130+ """Compute and return the destination text (all equalities and insertions).
 1131+
 1132+ Args:
 1133+ diffs: Array of diff tuples.
 1134+
 1135+ Returns:
 1136+ Destination text.
 1137+ """
 1138+ text = []
 1139+ for (op, data) in diffs:
 1140+ if op != self.DIFF_DELETE:
 1141+ text.append(data)
 1142+ return "".join(text)
 1143+
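# Sketch: the two projections recover the source and destination texts from
# one diff (tuples from the upstream test suite).
dmp = diff_match_patch()
diffs = [(dmp.DIFF_EQUAL, "jump"), (dmp.DIFF_DELETE, "s"),
         (dmp.DIFF_INSERT, "ed"), (dmp.DIFF_EQUAL, " over "),
         (dmp.DIFF_DELETE, "the"), (dmp.DIFF_INSERT, "a"),
         (dmp.DIFF_EQUAL, " lazy")]
assert dmp.diff_text1(diffs) == "jumps over the lazy"
assert dmp.diff_text2(diffs) == "jumped over a lazy"
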
 1144+ def diff_levenshtein(self, diffs):
 1145+ """Compute the Levenshtein distance; the number of inserted, deleted or
 1146+ substituted characters.
 1147+
 1148+ Args:
 1149+ diffs: Array of diff tuples.
 1150+
 1151+ Returns:
 1152+ Number of changes.
 1153+ """
 1154+ levenshtein = 0
 1155+ insertions = 0
 1156+ deletions = 0
 1157+ for (op, data) in diffs:
 1158+ if op == self.DIFF_INSERT:
 1159+ insertions += len(data)
 1160+ elif op == self.DIFF_DELETE:
 1161+ deletions += len(data)
 1162+ elif op == self.DIFF_EQUAL:
 1163+ # A deletion and an insertion is one substitution.
 1164+ levenshtein += max(insertions, deletions)
 1165+ insertions = 0
 1166+ deletions = 0
 1167+ levenshtein += max(insertions, deletions)
 1168+ return levenshtein
 1169+
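# Sketch: paired deletions and insertions count as substitutions, but edits
# separated by an equality cannot merge (values from the upstream tests).
dmp = diff_match_patch()
assert dmp.diff_levenshtein([(dmp.DIFF_DELETE, "abc"),
                             (dmp.DIFF_INSERT, "1234"),
                             (dmp.DIFF_EQUAL, "xyz")]) == 4
assert dmp.diff_levenshtein([(dmp.DIFF_DELETE, "abc"),
                             (dmp.DIFF_EQUAL, "xyz"),
                             (dmp.DIFF_INSERT, "1234")]) == 7
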
 1170+ def diff_toDelta(self, diffs):
 1171+ """Crush the diff into an encoded string which describes the operations
 1172+ required to transform text1 into text2.
 1173+ E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
 1174+ Operations are tab-separated. Inserted text is escaped using %xx notation.
 1175+
 1176+ Args:
 1177+ diffs: Array of diff tuples.
 1178+
 1179+ Returns:
 1180+ Delta text.
 1181+ """
 1182+ text = []
 1183+ for (op, data) in diffs:
 1184+ if op == self.DIFF_INSERT:
 1185+ # High ascii will raise UnicodeDecodeError. Use Unicode instead.
 1186+ data = data.encode("utf-8")
 1187+ text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
 1188+ elif op == self.DIFF_DELETE:
 1189+ text.append("-%d" % len(data))
 1190+ elif op == self.DIFF_EQUAL:
 1191+ text.append("=%d" % len(data))
 1192+ return "\t".join(text)
 1193+
 1194+ def diff_fromDelta(self, text1, delta):
 1195+ """Given the original text1, and an encoded string which describes the
 1196+ operations required to transform text1 into text2, compute the full diff.
 1197+
 1198+ Args:
 1199+ text1: Source string for the diff.
 1200+ delta: Delta text.
 1201+
 1202+ Returns:
 1203+ Array of diff tuples.
 1204+
 1205+ Raises:
 1206+ ValueError: If invalid input.
 1207+ """
 1208+ if type(delta) == unicode:
 1209+ # Deltas should be composed of a subset of ascii chars, Unicode not
 1210+ # required. If this encode raises UnicodeEncodeError, delta is invalid.
 1211+ delta = delta.encode("ascii")
 1212+ diffs = []
 1213+ pointer = 0 # Cursor in text1
 1214+ tokens = delta.split("\t")
 1215+ for token in tokens:
 1216+ if token == "":
 1217+ # Blank tokens are ok (from a trailing \t).
 1218+ continue
 1219+ # Each token begins with a one character parameter which specifies the
 1220+ # operation of this token (delete, insert, equality).
 1221+ param = token[1:]
 1222+ if token[0] == "+":
 1223+ param = urllib.unquote(param).decode("utf-8")
 1224+ diffs.append((self.DIFF_INSERT, param))
 1225+ elif token[0] == "-" or token[0] == "=":
 1226+ try:
 1227+ n = int(param)
 1228+ except ValueError:
 1229+ raise ValueError("Invalid number in diff_fromDelta: " + param)
 1230+ if n < 0:
 1231+ raise ValueError("Negative number in diff_fromDelta: " + param)
 1232+ text = text1[pointer : pointer + n]
 1233+ pointer += n
 1234+ if token[0] == "=":
 1235+ diffs.append((self.DIFF_EQUAL, text))
 1236+ else:
 1237+ diffs.append((self.DIFF_DELETE, text))
 1238+ else:
 1239+ # Anything else is an error.
 1240+ raise ValueError("Invalid diff operation in diff_fromDelta: " +
 1241+ token[0])
 1242+ if pointer != len(text1):
 1243+ raise ValueError(
 1244+ "Delta length (%d) does not equal source text length (%d)." %
 1245+ (pointer, len(text1)))
 1246+ return diffs
 1247+
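# Sketch: deltas round-trip as long as the original text1 is supplied (the
# delta string matches the upstream test suite).
dmp = diff_match_patch()
diffs = [(dmp.DIFF_EQUAL, "jump"), (dmp.DIFF_DELETE, "s"),
         (dmp.DIFF_INSERT, "ed"), (dmp.DIFF_EQUAL, " over "),
         (dmp.DIFF_DELETE, "the"), (dmp.DIFF_INSERT, "a"),
         (dmp.DIFF_EQUAL, " lazy")]
text1 = dmp.diff_text1(diffs)      # "jumps over the lazy"
delta = dmp.diff_toDelta(diffs)    # "=4\t-1\t+ed\t=6\t-3\t+a\t=5"
assert dmp.diff_fromDelta(text1, delta) == diffs
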
 1248+ # MATCH FUNCTIONS
 1249+
 1250+ def match_main(self, text, pattern, loc):
 1251+ """Locate the best instance of 'pattern' in 'text' near 'loc'.
 1252+
 1253+ Args:
 1254+ text: The text to search.
 1255+ pattern: The pattern to search for.
 1256+ loc: The location to search around.
 1257+
 1258+ Returns:
 1259+ Best match index or -1.
 1260+ """
 1261+ # Check for null inputs.
 1262+ if text == None or pattern == None:
 1263+ raise ValueError("Null inputs. (match_main)")
 1264+
 1265+ loc = max(0, min(loc, len(text)))
 1266+ if text == pattern:
 1267+ # Shortcut (potentially not guaranteed by the algorithm)
 1268+ return 0
 1269+ elif not text:
 1270+ # Nothing to match.
 1271+ return -1
 1272+ elif text[loc:loc + len(pattern)] == pattern:
 1273+ # Perfect match at the perfect spot! (Includes case of null pattern)
 1274+ return loc
 1275+ else:
 1276+ # Do a fuzzy compare.
 1277+ match = self.match_bitap(text, pattern, loc)
 1278+ return match
 1279+
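# Sketch: an exact substring near loc is returned directly; otherwise the
# bitap search below tolerates errors (values from the upstream tests).
dmp = diff_match_patch()
assert dmp.match_main("abcdef", "de", 3) == 3    # perfect match at loc
assert dmp.match_main("abcdef", "defy", 4) == 3  # fuzzy match, one error
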
 1280+ def match_bitap(self, text, pattern, loc):
 1281+ """Locate the best instance of 'pattern' in 'text' near 'loc' using the
 1282+ Bitap algorithm.
 1283+
 1284+ Args:
 1285+ text: The text to search.
 1286+ pattern: The pattern to search for.
 1287+ loc: The location to search around.
 1288+
 1289+ Returns:
 1290+ Best match index or -1.
 1291+ """
 1292+ # Python doesn't have a maxint limit, so ignore this check.
 1293+ #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
 1294+ # raise ValueError("Pattern too long for this application.")
 1295+
 1296+ # Initialise the alphabet.
 1297+ s = self.match_alphabet(pattern)
 1298+
 1299+ def match_bitapScore(e, x):
 1300+ """Compute and return the score for a match with e errors and x location.
 1301+ Accesses loc and pattern through being a closure.
 1302+
 1303+ Args:
 1304+ e: Number of errors in match.
 1305+ x: Location of match.
 1306+
 1307+ Returns:
 1308+ Overall score for match (0.0 = good, 1.0 = bad).
 1309+ """
 1310+ accuracy = float(e) / len(pattern)
 1311+ proximity = abs(loc - x)
 1312+ if not self.Match_Distance:
 1313+ # Dodge divide by zero error.
 1314+ return proximity and 1.0 or accuracy
 1315+ return accuracy + (proximity / float(self.Match_Distance))
 1316+
 1317+ # Highest score beyond which we give up.
 1318+ score_threshold = self.Match_Threshold
 1319+ # Is there a nearby exact match? (speedup)
 1320+ best_loc = text.find(pattern, loc)
 1321+ if best_loc != -1:
 1322+ score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
 1323+ # What about in the other direction? (speedup)
 1324+ best_loc = text.rfind(pattern, loc + len(pattern))
 1325+ if best_loc != -1:
 1326+ score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
 1327+
 1328+ # Initialise the bit arrays.
 1329+ matchmask = 1 << (len(pattern) - 1)
 1330+ best_loc = -1
 1331+
 1332+ bin_max = len(pattern) + len(text)
 1333+ # Empty initialization added to appease pychecker.
 1334+ last_rd = None
 1335+ for d in xrange(len(pattern)):
 1336+ # Scan for the best match each iteration allows for one more error.
 1337+ # Run a binary search to determine how far from 'loc' we can stray at
 1338+ # this error level.
 1339+ bin_min = 0
 1340+ bin_mid = bin_max
 1341+ while bin_min < bin_mid:
 1342+ if match_bitapScore(d, loc + bin_mid) <= score_threshold:
 1343+ bin_min = bin_mid
 1344+ else:
 1345+ bin_max = bin_mid
 1346+ bin_mid = (bin_max - bin_min) / 2 + bin_min
 1347+
 1348+ # Use the result from this iteration as the maximum for the next.
 1349+ bin_max = bin_mid
 1350+ start = max(1, loc - bin_mid + 1)
 1351+ finish = min(loc + bin_mid, len(text)) + len(pattern)
 1352+
 1353+ rd = range(finish + 1)
 1354+ rd.append((1 << d) - 1)
 1355+ for j in xrange(finish, start - 1, -1):
 1356+ if len(text) <= j - 1:
 1357+ # Out of range.
 1358+ charMatch = 0
 1359+ else:
 1360+ charMatch = s.get(text[j - 1], 0)
 1361+ if d == 0: # First pass: exact match.
 1362+ rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
 1363+ else: # Subsequent passes: fuzzy match.
 1364+ rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
 1365+ ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
 1366+ if rd[j] & matchmask:
 1367+ score = match_bitapScore(d, j - 1)
 1368+ # This match will almost certainly be better than any existing match.
 1369+ # But check anyway.
 1370+ if score <= score_threshold:
 1371+ # Told you so.
 1372+ score_threshold = score
 1373+ best_loc = j - 1
 1374+ if best_loc > loc:
 1375+ # When passing loc, don't exceed our current distance from loc.
 1376+ start = max(1, 2 * loc - best_loc)
 1377+ else:
 1378+ # Already passed loc, downhill from here on in.
 1379+ break
 1380+ # No hope for a (better) match at greater error levels.
 1381+ if match_bitapScore(d + 1, loc) > score_threshold:
 1382+ break
 1383+ last_rd = rd
 1384+ return best_loc
 1385+
 1386+ def match_alphabet(self, pattern):
 1387+ """Initialise the alphabet for the Bitap algorithm.
 1388+
 1389+ Args:
 1390+ pattern: The text to encode.
 1391+
 1392+ Returns:
 1393+ Hash of character locations.
 1394+ """
 1395+ s = {}
 1396+ for char in pattern:
 1397+ s[char] = 0
 1398+ for i in xrange(len(pattern)):
 1399+ s[pattern[i]] |= 1 << (len(pattern) - i - 1)
 1400+ return s
 1401+
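# Sketch: each pattern character gets a bitmask of its positions, with the
# highest bit for the first position (value from the upstream test suite).
dmp = diff_match_patch()
assert dmp.match_alphabet("abc") == {"a": 4, "b": 2, "c": 1}
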
 1402+ # PATCH FUNCTIONS
 1403+
 1404+ def patch_addContext(self, patch, text):
 1405+ """Increase the context until it is unique,
 1406+ but don't let the pattern expand beyond Match_MaxBits.
 1407+
 1408+ Args:
 1409+ patch: The patch to grow.
 1410+ text: Source text.
 1411+ """
 1412+ if len(text) == 0:
 1413+ return
 1414+ pattern = text[patch.start2 : patch.start2 + patch.length1]
 1415+ padding = 0
 1416+
 1417+ # Look for the first and last matches of pattern in text. If two different
 1418+ # matches are found, increase the pattern length.
 1419+ while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
 1420+ 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
 1421+ self.Patch_Margin)):
 1422+ padding += self.Patch_Margin
 1423+ pattern = text[max(0, patch.start2 - padding) :
 1424+ patch.start2 + patch.length1 + padding]
 1425+ # Add one chunk for good luck.
 1426+ padding += self.Patch_Margin
 1427+
 1428+ # Add the prefix.
 1429+ prefix = text[max(0, patch.start2 - padding) : patch.start2]
 1430+ if prefix:
 1431+ patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
 1432+ # Add the suffix.
 1433+ suffix = text[patch.start2 + patch.length1 :
 1434+ patch.start2 + patch.length1 + padding]
 1435+ if suffix:
 1436+ patch.diffs.append((self.DIFF_EQUAL, suffix))
 1437+
 1438+ # Roll back the start points.
 1439+ patch.start1 -= len(prefix)
 1440+ patch.start2 -= len(prefix)
 1441+ # Extend lengths.
 1442+ patch.length1 += len(prefix) + len(suffix)
 1443+ patch.length2 += len(prefix) + len(suffix)
 1444+
 1445+ def patch_make(self, a, b=None, c=None):
 1446+ """Compute a list of patches to turn text1 into text2.
 1447+ Use diffs if provided, otherwise compute it ourselves.
 1448+ There are four ways to call this function, depending on what data is
 1449+ available to the caller:
 1450+ Method 1:
 1451+ a = text1, b = text2
 1452+ Method 2:
 1453+ a = diffs
 1454+ Method 3 (optimal):
 1455+ a = text1, b = diffs
 1456+ Method 4 (deprecated, use method 3):
 1457+ a = text1, b = text2, c = diffs
 1458+
 1459+ Args:
 1460+ a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
 1461+ text2 (method 2).
 1462+ b: text2 (methods 1,4) or Array of diff tuples for text1 to
 1463+ text2 (method 3) or undefined (method 2).
 1464+ c: Array of diff tuples for text1 to text2 (method 4) or
 1465+ undefined (methods 1,2,3).
 1466+
 1467+ Returns:
 1468+ Array of patch objects.
 1469+ """
 1470+ text1 = None
 1471+ diffs = None
 1472+ # Note that texts may arrive as 'str' or 'unicode'.
 1473+ if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
 1474+ # Method 1: text1, text2
 1475+ # Compute diffs from text1 and text2.
 1476+ text1 = a
 1477+ diffs = self.diff_main(text1, b, True)
 1478+ if len(diffs) > 2:
 1479+ self.diff_cleanupSemantic(diffs)
 1480+ self.diff_cleanupEfficiency(diffs)
 1481+ elif isinstance(a, list) and b is None and c is None:
 1482+ # Method 2: diffs
 1483+ # Compute text1 from diffs.
 1484+ diffs = a
 1485+ text1 = self.diff_text1(diffs)
 1486+ elif isinstance(a, basestring) and isinstance(b, list) and c is None:
 1487+ # Method 3: text1, diffs
 1488+ text1 = a
 1489+ diffs = b
 1490+ elif (isinstance(a, basestring) and isinstance(b, basestring) and
 1491+ isinstance(c, list)):
 1492+ # Method 4: text1, text2, diffs
 1493+ # text2 is not used.
 1494+ text1 = a
 1495+ diffs = c
 1496+ else:
 1497+ raise ValueError("Unknown call format to patch_make.")
 1498+
 1499+ if not diffs:
 1500+ return [] # Get rid of the None case.
 1501+ patches = []
 1502+ patch = patch_obj()
 1503+ char_count1 = 0 # Number of characters into the text1 string.
 1504+ char_count2 = 0 # Number of characters into the text2 string.
 1505+ prepatch_text = text1 # Recreate the patches to determine context info.
 1506+ postpatch_text = text1
 1507+ for x in xrange(len(diffs)):
 1508+ (diff_type, diff_text) = diffs[x]
 1509+ if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
 1510+ # A new patch starts here.
 1511+ patch.start1 = char_count1
 1512+ patch.start2 = char_count2
 1513+ if diff_type == self.DIFF_INSERT:
 1514+ # Insertion
 1515+ patch.diffs.append(diffs[x])
 1516+ patch.length2 += len(diff_text)
 1517+ postpatch_text = (postpatch_text[:char_count2] + diff_text +
 1518+ postpatch_text[char_count2:])
 1519+ elif diff_type == self.DIFF_DELETE:
 1520+ # Deletion.
 1521+ patch.length1 += len(diff_text)
 1522+ patch.diffs.append(diffs[x])
 1523+ postpatch_text = (postpatch_text[:char_count2] +
 1524+ postpatch_text[char_count2 + len(diff_text):])
 1525+ elif (diff_type == self.DIFF_EQUAL and
 1526+ len(diff_text) <= 2 * self.Patch_Margin and
 1527+ len(patch.diffs) != 0 and len(diffs) != x + 1):
 1528+ # Small equality inside a patch.
 1529+ patch.diffs.append(diffs[x])
 1530+ patch.length1 += len(diff_text)
 1531+ patch.length2 += len(diff_text)
 1532+
 1533+ if (diff_type == self.DIFF_EQUAL and
 1534+ len(diff_text) >= 2 * self.Patch_Margin):
 1535+ # Time for a new patch.
 1536+ if len(patch.diffs) != 0:
 1537+ self.patch_addContext(patch, prepatch_text)
 1538+ patches.append(patch)
 1539+ patch = patch_obj()
 1540+ # Unlike Unidiff, our patch lists have a rolling context.
 1541+ # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
 1542+ # Update prepatch text & pos to reflect the application of the
 1543+ # just completed patch.
 1544+ prepatch_text = postpatch_text
 1545+ char_count1 = char_count2
 1546+
 1547+ # Update the current character count.
 1548+ if diff_type != self.DIFF_INSERT:
 1549+ char_count1 += len(diff_text)
 1550+ if diff_type != self.DIFF_DELETE:
 1551+ char_count2 += len(diff_text)
 1552+
 1553+ # Pick up the leftover patch if not empty.
 1554+ if len(patch.diffs) != 0:
 1555+ self.patch_addContext(patch, prepatch_text)
 1556+ patches.append(patch)
 1557+ return patches
 1558+
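# Usage sketch for the common case (Method 1): build patches from two texts
# and serialize them; patch_toText emits the GNU-diff-style hunks described
# in patch_obj.__str__ below.
dmp = diff_match_patch()
text1 = "The quick brown fox jumps over the lazy dog."
text2 = "That quick brown fox jumped over a lazy dog."
patches = dmp.patch_make(text1, text2)
assert dmp.patch_toText(patches).startswith("@@ -")
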
 1559+ def patch_deepCopy(self, patches):
 1560+ """Given an array of patches, return another array that is identical.
 1561+
 1562+ Args:
 1563+ patches: Array of patch objects.
 1564+
 1565+ Returns:
 1566+ Array of patch objects.
 1567+ """
 1568+ patchesCopy = []
 1569+ for patch in patches:
 1570+ patchCopy = patch_obj()
 1571+ # No need to deep copy the tuples since they are immutable.
 1572+ patchCopy.diffs = patch.diffs[:]
 1573+ patchCopy.start1 = patch.start1
 1574+ patchCopy.start2 = patch.start2
 1575+ patchCopy.length1 = patch.length1
 1576+ patchCopy.length2 = patch.length2
 1577+ patchesCopy.append(patchCopy)
 1578+ return patchesCopy
 1579+
 1580+ def patch_apply(self, patches, text):
 1581+ """Merge a set of patches onto the text. Return a patched text, as well
 1582+ as a list of true/false values indicating which patches were applied.
 1583+
 1584+ Args:
 1585+ patches: Array of patch objects.
 1586+ text: Old text.
 1587+
 1588+ Returns:
 1589+ Two element Array, containing the new text and an array of boolean values.
 1590+ """
 1591+ if not patches:
 1592+ return (text, [])
 1593+
 1594+ # Deep copy the patches so that no changes are made to originals.
 1595+ patches = self.patch_deepCopy(patches)
 1596+
 1597+ nullPadding = self.patch_addPadding(patches)
 1598+ text = nullPadding + text + nullPadding
 1599+ self.patch_splitMax(patches)
 1600+
 1601+ # delta keeps track of the offset between the expected and actual location
 1602+ # of the previous patch. If there are patches expected at positions 10 and
 1603+ # 20, but the first patch was found at 12, delta is 2 and the second patch
 1604+ # has an effective expected position of 22.
 1605+ delta = 0
 1606+ results = []
 1607+ for patch in patches:
 1608+ expected_loc = patch.start2 + delta
 1609+ text1 = self.diff_text1(patch.diffs)
 1610+ end_loc = -1
 1611+ if len(text1) > self.Match_MaxBits:
 1612+ # patch_splitMax will only provide an oversized pattern in the case of
 1613+ # a monster delete.
 1614+ start_loc = self.match_main(text, text1[:self.Match_MaxBits],
 1615+ expected_loc)
 1616+ if start_loc != -1:
 1617+ end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
 1618+ expected_loc + len(text1) - self.Match_MaxBits)
 1619+ if end_loc == -1 or start_loc >= end_loc:
 1620+ # Can't find valid trailing context. Drop this patch.
 1621+ start_loc = -1
 1622+ else:
 1623+ start_loc = self.match_main(text, text1, expected_loc)
 1624+ if start_loc == -1:
 1625+ # No match found. :(
 1626+ results.append(False)
 1627+ # Subtract the delta for this failed patch from subsequent patches.
 1628+ delta -= patch.length2 - patch.length1
 1629+ else:
 1630+ # Found a match. :)
 1631+ results.append(True)
 1632+ delta = start_loc - expected_loc
 1633+ if end_loc == -1:
 1634+ text2 = text[start_loc : start_loc + len(text1)]
 1635+ else:
 1636+ text2 = text[start_loc : end_loc + self.Match_MaxBits]
 1637+ if text1 == text2:
 1638+ # Perfect match, just shove the replacement text in.
 1639+ text = (text[:start_loc] + self.diff_text2(patch.diffs) +
 1640+ text[start_loc + len(text1):])
 1641+ else:
 1642+ # Imperfect match.
 1643+ # Run a diff to get a framework of equivalent indices.
 1644+ diffs = self.diff_main(text1, text2, False)
 1645+ if (len(text1) > self.Match_MaxBits and
 1646+ self.diff_levenshtein(diffs) / float(len(text1)) >
 1647+ self.Patch_DeleteThreshold):
 1648+ # The end points match, but the content is unacceptably bad.
 1649+ results[-1] = False
 1650+ else:
 1651+ self.diff_cleanupSemanticLossless(diffs)
 1652+ index1 = 0
 1653+ for (op, data) in patch.diffs:
 1654+ if op != self.DIFF_EQUAL:
 1655+ index2 = self.diff_xIndex(diffs, index1)
 1656+ if op == self.DIFF_INSERT: # Insertion
 1657+ text = text[:start_loc + index2] + data + text[start_loc +
 1658+ index2:]
 1659+ elif op == self.DIFF_DELETE: # Deletion
 1660+ text = text[:start_loc + index2] + text[start_loc +
 1661+ self.diff_xIndex(diffs, index1 + len(data)):]
 1662+ if op != self.DIFF_DELETE:
 1663+ index1 += len(data)
 1664+ # Strip the padding off.
 1665+ text = text[len(nullPadding):-len(nullPadding)]
 1666+ return (text, results)
 1667+
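# Sketch: applying patches made from (text1, text2) to text1 should
# reproduce text2, with one True per successfully applied patch.
dmp = diff_match_patch()
text1 = "The quick brown fox jumps over the lazy dog."
text2 = "That quick brown fox jumped over a lazy dog."
patches = dmp.patch_make(text1, text2)
(newText, results) = dmp.patch_apply(patches, text1)
assert newText == text2 and all(results)
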
 1668+ def patch_addPadding(self, patches):
 1669+ """Add some padding on text start and end so that edges can match
 1670+ something. Intended to be called only from within patch_apply.
 1671+
 1672+ Args:
 1673+ patches: Array of patch objects.
 1674+
 1675+ Returns:
 1676+ The padding string added to each side.
 1677+ """
 1678+ paddingLength = self.Patch_Margin
 1679+ nullPadding = ""
 1680+ for x in xrange(1, paddingLength + 1):
 1681+ nullPadding += chr(x)
 1682+
 1683+ # Bump all the patches forward.
 1684+ for patch in patches:
 1685+ patch.start1 += paddingLength
 1686+ patch.start2 += paddingLength
 1687+
 1688+ # Add some padding on start of first diff.
 1689+ patch = patches[0]
 1690+ diffs = patch.diffs
 1691+ if not diffs or diffs[0][0] != self.DIFF_EQUAL:
 1692+ # Add nullPadding equality.
 1693+ diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
 1694+ patch.start1 -= paddingLength # Should be 0.
 1695+ patch.start2 -= paddingLength # Should be 0.
 1696+ patch.length1 += paddingLength
 1697+ patch.length2 += paddingLength
 1698+ elif paddingLength > len(diffs[0][1]):
 1699+ # Grow first equality.
 1700+ extraLength = paddingLength - len(diffs[0][1])
 1701+ newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
 1702+ diffs[0] = (diffs[0][0], newText)
 1703+ patch.start1 -= extraLength
 1704+ patch.start2 -= extraLength
 1705+ patch.length1 += extraLength
 1706+ patch.length2 += extraLength
 1707+
 1708+ # Add some padding on end of last diff.
 1709+ patch = patches[-1]
 1710+ diffs = patch.diffs
 1711+ if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
 1712+ # Add nullPadding equality.
 1713+ diffs.append((self.DIFF_EQUAL, nullPadding))
 1714+ patch.length1 += paddingLength
 1715+ patch.length2 += paddingLength
 1716+ elif paddingLength > len(diffs[-1][1]):
 1717+ # Grow last equality.
 1718+ extraLength = paddingLength - len(diffs[-1][1])
 1719+ newText = diffs[-1][1] + nullPadding[:extraLength]
 1720+ diffs[-1] = (diffs[-1][0], newText)
 1721+ patch.length1 += extraLength
 1722+ patch.length2 += extraLength
 1723+
 1724+ return nullPadding
 1725+
 1726+ def patch_splitMax(self, patches):
 1727+ """Look through the patches and break up any which are longer than the
 1728+ maximum limit of the match algorithm.
 1729+ Intended to be called only from within patch_apply.
 1730+
 1731+ Args:
 1732+ patches: Array of patch objects.
 1733+ """
 1734+ patch_size = self.Match_MaxBits
 1735+ if patch_size == 0:
 1736+ # Python has the option of not splitting strings due to its ability
 1737+ # to handle integers of arbitrary precision.
 1738+ return
 1739+ for x in xrange(len(patches)):
 1740+ if patches[x].length1 > patch_size:
 1741+ bigpatch = patches[x]
 1742+ # Remove the big old patch.
 1743+ del patches[x]
 1744+ x -= 1
 1745+ start1 = bigpatch.start1
 1746+ start2 = bigpatch.start2
 1747+ precontext = ''
 1748+ while len(bigpatch.diffs) != 0:
 1749+ # Create one of several smaller patches.
 1750+ patch = patch_obj()
 1751+ empty = True
 1752+ patch.start1 = start1 - len(precontext)
 1753+ patch.start2 = start2 - len(precontext)
 1754+ if precontext:
 1755+ patch.length1 = patch.length2 = len(precontext)
 1756+ patch.diffs.append((self.DIFF_EQUAL, precontext))
 1757+
 1758+ while (len(bigpatch.diffs) != 0 and
 1759+ patch.length1 < patch_size - self.Patch_Margin):
 1760+ (diff_type, diff_text) = bigpatch.diffs[0]
 1761+ if diff_type == self.DIFF_INSERT:
 1762+ # Insertions are harmless.
 1763+ patch.length2 += len(diff_text)
 1764+ start2 += len(diff_text)
 1765+ patch.diffs.append(bigpatch.diffs.pop(0))
 1766+ empty = False
 1767+ elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
 1768+ patch.diffs[0][0] == self.DIFF_EQUAL and
 1769+ len(diff_text) > 2 * patch_size):
 1770+ # This is a large deletion. Let it pass in one chunk.
 1771+ patch.length1 += len(diff_text)
 1772+ start1 += len(diff_text)
 1773+ empty = False
 1774+ patch.diffs.append((diff_type, diff_text))
 1775+ del bigpatch.diffs[0]
 1776+ else:
 1777+ # Deletion or equality. Only take as much as we can stomach.
 1778+ diff_text = diff_text[:patch_size - patch.length1 -
 1779+ self.Patch_Margin]
 1780+ patch.length1 += len(diff_text)
 1781+ start1 += len(diff_text)
 1782+ if diff_type == self.DIFF_EQUAL:
 1783+ patch.length2 += len(diff_text)
 1784+ start2 += len(diff_text)
 1785+ else:
 1786+ empty = False
 1787+
 1788+ patch.diffs.append((diff_type, diff_text))
 1789+ if diff_text == bigpatch.diffs[0][1]:
 1790+ del bigpatch.diffs[0]
 1791+ else:
 1792+ bigpatch.diffs[0] = (bigpatch.diffs[0][0],
 1793+ bigpatch.diffs[0][1][len(diff_text):])
 1794+
 1795+ # Compute the head context for the next patch.
 1796+ precontext = self.diff_text2(patch.diffs)
 1797+ precontext = precontext[-self.Patch_Margin:]
 1798+ # Append the end context for this patch.
 1799+ postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
 1800+ if postcontext:
 1801+ patch.length1 += len(postcontext)
 1802+ patch.length2 += len(postcontext)
 1803+ if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
 1804+ patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
 1805+ postcontext)
 1806+ else:
 1807+ patch.diffs.append((self.DIFF_EQUAL, postcontext))
 1808+
 1809+ if not empty:
 1810+ x += 1
 1811+ patches.insert(x, patch)
 1812+
 1813+ def patch_toText(self, patches):
 1814+ """Take a list of patches and return a textual representation.
 1815+
 1816+ Args:
 1817+ patches: Array of patch objects.
 1818+
 1819+ Returns:
 1820+ Text representation of patches.
 1821+ """
 1822+ text = []
 1823+ for patch in patches:
 1824+ text.append(str(patch))
 1825+ return "".join(text)
 1826+
 1827+ def patch_fromText(self, textline):
 1828+ """Parse a textual representation of patches and return a list of patch
 1829+ objects.
 1830+
 1831+ Args:
 1832+ textline: Text representation of patches.
 1833+
 1834+ Returns:
 1835+ Array of patch objects.
 1836+
 1837+ Raises:
 1838+ ValueError: If invalid input.
 1839+ """
 1840+ if type(textline) == unicode:
 1841+ # Patches should be composed of a subset of ascii chars, Unicode not
 1842+ # required. If this encode raises UnicodeEncodeError, patch is invalid.
 1843+ textline = textline.encode("ascii")
 1844+ patches = []
 1845+ if not textline:
 1846+ return patches
 1847+ text = textline.split('\n')
 1848+ while len(text) != 0:
 1849+ m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
 1850+ if not m:
 1851+ raise ValueError("Invalid patch string: " + text[0])
 1852+ patch = patch_obj()
 1853+ patches.append(patch)
 1854+ patch.start1 = int(m.group(1))
 1855+ if m.group(2) == '':
 1856+ patch.start1 -= 1
 1857+ patch.length1 = 1
 1858+ elif m.group(2) == '0':
 1859+ patch.length1 = 0
 1860+ else:
 1861+ patch.start1 -= 1
 1862+ patch.length1 = int(m.group(2))
 1863+
 1864+ patch.start2 = int(m.group(3))
 1865+ if m.group(4) == '':
 1866+ patch.start2 -= 1
 1867+ patch.length2 = 1
 1868+ elif m.group(4) == '0':
 1869+ patch.length2 = 0
 1870+ else:
 1871+ patch.start2 -= 1
 1872+ patch.length2 = int(m.group(4))
 1873+
 1874+ del text[0]
 1875+
 1876+ while len(text) != 0:
 1877+ if text[0]:
 1878+ sign = text[0][0]
 1879+ else:
 1880+ sign = ''
 1881+ line = urllib.unquote(text[0][1:])
 1882+ line = line.decode("utf-8")
 1883+ if sign == '+':
 1884+ # Insertion.
 1885+ patch.diffs.append((self.DIFF_INSERT, line))
 1886+ elif sign == '-':
 1887+ # Deletion.
 1888+ patch.diffs.append((self.DIFF_DELETE, line))
 1889+ elif sign == ' ':
 1890+ # Minor equality.
 1891+ patch.diffs.append((self.DIFF_EQUAL, line))
 1892+ elif sign == '@':
 1893+ # Start of next patch.
 1894+ break
 1895+ elif sign == '':
 1896+ # Blank line? Whatever.
 1897+ pass
 1898+ else:
 1899+ # Anything else is invalid.
 1900+ raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
 1901+ del text[0]
 1902+ return patches
 1903+
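# Sketch: the textual form round-trips through patch_fromText, since the
# serialization is canonical.
dmp = diff_match_patch()
patches = dmp.patch_make("The quick brown fox.", "That quick red fox.")
serialized = dmp.patch_toText(patches)
assert dmp.patch_toText(dmp.patch_fromText(serialized)) == serialized
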
 1904+
 1905+class patch_obj:
 1906+ """Class representing one patch operation.
 1907+ """
 1908+
 1909+ def __init__(self):
 1910+ """Initializes with an empty list of diffs.
 1911+ """
 1912+ self.diffs = []
 1913+ self.start1 = None
 1914+ self.start2 = None
 1915+ self.length1 = 0
 1916+ self.length2 = 0
 1917+
 1918+ def __str__(self):
 1919+ """Emmulate GNU diff's format.
 1920+ Header: @@ -382,8 +481,9 @@
 1921+ Indicies are printed as 1-based, not 0-based.
 1922+
 1923+ Returns:
 1924+ The GNU diff string.
 1925+ """
 1926+ if self.length1 == 0:
 1927+ coords1 = str(self.start1) + ",0"
 1928+ elif self.length1 == 1:
 1929+ coords1 = str(self.start1 + 1)
 1930+ else:
 1931+ coords1 = str(self.start1 + 1) + "," + str(self.length1)
 1932+ if self.length2 == 0:
 1933+ coords2 = str(self.start2) + ",0"
 1934+ elif self.length2 == 1:
 1935+ coords2 = str(self.start2 + 1)
 1936+ else:
 1937+ coords2 = str(self.start2 + 1) + "," + str(self.length2)
 1938+ text = ["@@ -", coords1, " +", coords2, " @@\n"]
 1939+ # Escape the body of the patch with %xx notation.
 1940+ for (op, data) in self.diffs:
 1941+ if op == diff_match_patch.DIFF_INSERT:
 1942+ text.append("+")
 1943+ elif op == diff_match_patch.DIFF_DELETE:
 1944+ text.append("-")
 1945+ elif op == diff_match_patch.DIFF_EQUAL:
 1946+ text.append(" ")
 1947+ # High ascii will raise UnicodeDecodeError. Use Unicode instead.
 1948+ data = data.encode("utf-8")
 1949+ text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
 1950+ return "".join(text)
Property changes on: trunk/tools/wsor/diffs/diff_match_patch.py
___________________________________________________________________
Added: svn:executable
+ *
Index: trunk/tools/wsor/diffs/page_sample.xml
@@ -0,0 +1,37 @@
 2+ <page>
 3+ <title>Bassist</title>
 4+ <id>60001</id>
 5+ <revision>
 6+ <id>108204</id>
 7+ <timestamp>2002-06-30T02:03:23Z</timestamp>
 8+ <contributor>
 9+ <ip>195.149.37.198</ip>
 10+ </contributor>
 11+ <minor />
 12+ <comment>stub</comment>
 13+ <text xml:space="preserve">A &lt;b&gt;bassist&lt;/b&gt; is somebody who plays a [[bass guitar]] or [[double bass]].</text>
 14+ </revision>
 15+ <revision>
 16+ <id>208937</id>
 17+ <timestamp>2002-06-30T16:00:41Z</timestamp>
 18+ <contributor>
 19+ <username>JeLuF</username>
 20+ <id>733</id>
 21+ </contributor>
 22+ <comment>added list</comment>
 23+ <text xml:space="preserve">A &lt;b&gt;bassist&lt;/b&gt; is somebody who plays a [[bass guitar]] or [[double bass]].
 24+
 25+Famous bassists include:
 26+* [[Ron Carter]]
 27+* [[Les Claypool]] from [[Primus]]
 28+* [[John Entwistle]] from [[The Who]]
 29+* [[Kelly Grouchet]] from [[Electric Light Orchestra]]
 30+* [[Glenn Hughes]] from [[Deep Purple]]
 31+* [[Lemmy Kilmister]] from [[Motorhead]]
 32+* Sir [[Paul McCartney]] from [[The Beatles]]
 33+* [[Charles Mingus]]
 34+* [[Jason Newsted]] from [[Metallica]]
 35+* [[Sting]] from [[The Police]]
 36+* [[Leon Wilkeson]] from [[Lynyrd Skynyrd]]</text>
 37+ </revision>
 38+ </page>
